Add deepfilternet

This commit is contained in:
Xin Wang
2026-05-27 16:37:14 +08:00
parent c4a53b5205
commit 673a54049a
6 changed files with 307 additions and 0 deletions

61
docs/deepfilternet.md Normal file
View File

@@ -0,0 +1,61 @@
# DeepFilterNet Input Filter
The engine can optionally run DeepFilterNet on inbound microphone audio before
Pipecat VAD and STT. The integration uses DeepFilterNet's real-time `libDF` C
API (`df_process_frame`) rather than the Python `df.enhance()` batch helper.
## Build DeepFilterNet
From the DeepFilterNet checkout:
```bash
cd /Users/wangx/Code/DeepFilterNet
cargo build --release -p deep_filter --features capi
```
Use the generated native library path as `audio_filter.lib_path`. On macOS this
is usually:
```text
/Users/wangx/Code/DeepFilterNet/target/release/libdf.dylib
```
Use an ONNX tar.gz model as `audio_filter.model_path`, for example:
```text
/Users/wangx/Code/DeepFilterNet/models/DeepFilterNet3_ll_onnx.tar.gz
```
The low-latency model is preferred for a live voice endpoint.
## Install Optional Python Dependencies
```bash
uv pip install -r requirements-deepfilternet.txt
```
## Enable
```json
"audio_filter": {
"enabled": true,
"provider": "deepfilternet",
"lib_path": "/Users/wangx/Code/DeepFilterNet/target/release/libdf.dylib",
"model_path": "/Users/wangx/Code/DeepFilterNet/models/DeepFilterNet3_ll_onnx.tar.gz",
"model_sample_rate_hz": 48000,
"atten_lim_db": 100.0,
"post_filter_beta": 0.0,
"log_level": null
}
```
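`model_sample_rate_hz` defaults to `48000`, matching the bundled DeepFilterNet
models. The filter resamples from the engine sample rate to the model sample
rate, processes hop-sized frames, then resamples back to the engine sample rate.
Only mono audio is supported: the filter refuses to start when `audio.channels`
is not `1`. `post_filter_beta` is applied only when it is greater than `0`.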
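You can also provide the paths through environment variables. Each variable is
used only when the corresponding `lib_path` or `model_path` setting is unset
(`null` or an empty string):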
```bash
export DEEPFILTERNET_LIB_PATH=/Users/wangx/Code/DeepFilterNet/target/release/libdf.dylib
export DEEPFILTERNET_MODEL_PATH=/Users/wangx/Code/DeepFilterNet/models/DeepFilterNet3_ll_onnx.tar.gz
```

32
engine/audio_filters.py Normal file
View File

@@ -0,0 +1,32 @@
from __future__ import annotations
from pipecat.audio.filters.base_audio_filter import BaseAudioFilter
from .config import AudioConfig, AudioFilterConfig
def create_audio_input_filter(
    config: AudioFilterConfig,
    audio: AudioConfig,
) -> BaseAudioFilter | None:
    """Build the optional transport-level input audio filter.

    Args:
        config: Audio filter settings (enable flag, provider, provider options).
        audio: Engine audio settings; only ``channels`` is read here.

    Returns:
        ``None`` when the filter is disabled, otherwise a configured filter
        instance for the selected provider.

    Raises:
        ValueError: If the filter is enabled with a provider other than
            ``"deepfilternet"``.
    """
    if not config.enabled:
        return None

    provider = config.provider
    if provider != "deepfilternet":
        raise ValueError(
            f"Unsupported audio_filter provider {provider!r}; expected 'deepfilternet'"
        )

    # Imported only on this path, so the DeepFilterNet module is not loaded
    # unless the filter is actually requested.
    from .deepfilternet_filter import DeepFilterNetAudioFilter

    filter_kwargs = {
        "lib_path": config.lib_path,
        "model_path": config.model_path,
        "model_sample_rate": config.model_sample_rate_hz,
        "channels": audio.channels,
        "atten_lim_db": config.atten_lim_db,
        "post_filter_beta": config.post_filter_beta,
        "log_level": config.log_level,
    }
    return DeepFilterNetAudioFilter(**filter_kwargs)

View File

@@ -28,6 +28,20 @@ class AudioConfig:
return int(self.sample_rate_hz * self.frame_ms / 1000) * self.channels * 2
@dataclass(frozen=True)
class AudioFilterConfig:
    """Optional input audio filter applied by the Pipecat transport before VAD/STT.

    The defaults describe a disabled filter. Values are forwarded to the
    provider-specific filter by ``create_audio_input_filter``.
    """

    # Master switch: when False, no input filter is created at all.
    enabled: bool = False
    # Filter backend name; the factory currently accepts only "deepfilternet".
    provider: str = "none"
    # Path to the native DeepFilterNet library. When None, the filter falls
    # back to the DEEPFILTERNET_LIB_PATH environment variable.
    lib_path: str | None = None
    # Path to the DeepFilterNet model archive. When None, the filter falls
    # back to the DEEPFILTERNET_MODEL_PATH environment variable.
    model_path: str | None = None
    # Sample rate the audio is resampled to before it is handed to the model.
    model_sample_rate_hz: int = 48000
    # Forwarded to libDF's df_create as its attenuation-limit argument.
    atten_lim_db: float = 100.0
    # Forwarded to df_set_post_filter_beta, but only when greater than 0.
    post_filter_beta: float = 0.0
    # Forwarded to df_create as the log level; None passes a NULL pointer.
    log_level: str | None = None
@dataclass(frozen=True)
class SessionConfig:
inactivity_timeout_sec: int = 60
@@ -180,6 +194,7 @@ class ServicesConfig:
class EngineConfig:
server: ServerConfig = field(default_factory=ServerConfig)
audio: AudioConfig = field(default_factory=AudioConfig)
audio_filter: AudioFilterConfig = field(default_factory=AudioFilterConfig)
session: SessionConfig = field(default_factory=SessionConfig)
turn: TurnConfig = field(default_factory=TurnConfig)
agent: AgentConfig = field(default_factory=AgentConfig)
@@ -223,6 +238,7 @@ def config_from_dict(data: dict) -> EngineConfig:
return EngineConfig(
server=ServerConfig(**_dict(data.get("server"))),
audio=AudioConfig(**_dict(data.get("audio"))),
audio_filter=AudioFilterConfig(**_normalize_audio_filter(_dict(data.get("audio_filter")))),
session=SessionConfig(**_dict(data.get("session"))),
turn=TurnConfig(
vad=VADConfig(**vad),
@@ -255,6 +271,18 @@ def _dict(value: object) -> dict:
return dict(value) if isinstance(value, dict) else {}
def _normalize_audio_filter(value: dict) -> dict:
    """Normalize raw ``audio_filter`` settings in place and return the same dict.

    Empty-string ``lib_path``, ``model_path`` and ``log_level`` entries become
    ``None``; a present ``provider`` is stringified, stripped and lower-cased.
    Other keys are left untouched.
    """
    blank_keys = [
        key
        for key in ("lib_path", "model_path", "log_level")
        if value.get(key) == ""
    ]
    # dict.fromkeys maps every listed key to None.
    value.update(dict.fromkeys(blank_keys))

    if "provider" in value:
        provider_text = str(value["provider"])
        value["provider"] = provider_text.strip().lower()
    return value
def _normalize_llm_provider(value: object) -> str:
provider = str(value or LLMConfig().provider).strip().lower()
normalized = _LLM_PROVIDER_ALIASES.get(provider)

View File

@@ -0,0 +1,182 @@
from __future__ import annotations
import ctypes
import os
from pathlib import Path
import numpy as np
from loguru import logger
from pipecat.audio.filters.base_audio_filter import BaseAudioFilter
from pipecat.audio.resamplers.soxr_stream_resampler import SOXRStreamAudioResampler
from pipecat.frames.frames import FilterControlFrame, FilterEnableFrame
class DeepFilterNetAudioFilter(BaseAudioFilter):
    """DeepFilterNet transport filter backed by libDF's real-time C API.

    The DeepFilterNet Python ``enhance`` helper is file/batch oriented. This
    filter uses ``df_process_frame`` instead, which keeps the model, STFT, and
    rolling lookahead state alive across hop-sized frames for one voice session.

    Audio is handled as mono 16-bit PCM: it is resampled to the model sample
    rate, cut into frames of the length reported by the library, processed one
    frame at a time, and resampled back to the transport sample rate.
    """

    def __init__(
        self,
        *,
        lib_path: str | None,
        model_path: str | None,
        model_sample_rate: int = 48000,
        channels: int = 1,
        atten_lim_db: float = 100.0,
        post_filter_beta: float = 0.0,
        log_level: str | None = None,
    ) -> None:
        """Store the configuration; nothing native is loaded until ``start``.

        Args:
            lib_path: Path to the native library; falls back to the
                ``DEEPFILTERNET_LIB_PATH`` environment variable when falsy.
            model_path: Path to the model archive; falls back to the
                ``DEEPFILTERNET_MODEL_PATH`` environment variable when falsy.
            model_sample_rate: Sample rate audio is resampled to for the model.
            channels: Channel count of the transport audio (must be 1 to start).
            atten_lim_db: Forwarded to ``df_create``.
            post_filter_beta: Forwarded to ``df_set_post_filter_beta`` when > 0.
            log_level: Forwarded to ``df_create``; ``None`` passes NULL.
        """
        self._lib_path = lib_path or os.environ.get("DEEPFILTERNET_LIB_PATH")
        self._model_path = model_path or os.environ.get("DEEPFILTERNET_MODEL_PATH")
        self._model_sample_rate = model_sample_rate
        self._channels = channels
        self._atten_lim_db = atten_lim_db
        self._post_filter_beta = post_filter_beta
        self._log_level = log_level
        self._filtering = True
        self._sample_rate = 0
        self._lib = None
        self._state = None
        self._frame_length = 0
        self._input_resampler = SOXRStreamAudioResampler()
        self._output_resampler = SOXRStreamAudioResampler()
        # Model-rate PCM16 bytes that do not yet add up to one full model frame.
        self._pending_model_bytes = bytearray()

    async def start(self, sample_rate: int) -> None:
        """Load the library and create the model state for ``sample_rate`` audio.

        Raises:
            ValueError: If the filter was configured with more than one channel.
            RuntimeError: If a path is missing, the state cannot be created, or
                the library reports an invalid frame length.
            FileNotFoundError: If the library or model path does not exist.
        """
        if self._channels != 1:
            raise ValueError("DeepFilterNet audio filter currently supports mono PCM only")

        # A repeated start() without stop() must not overwrite, and thereby
        # leak, a native state that is still allocated.
        self._release_state()

        self._sample_rate = sample_rate
        self._lib = self._load_library()
        try:
            self._state = self._create_state()
            self._frame_length = int(self._lib.df_get_frame_length(self._state))
            if self._frame_length <= 0:
                raise RuntimeError("DeepFilterNet returned an invalid frame length")
            if self._post_filter_beta > 0:
                self._lib.df_set_post_filter_beta(
                    self._state, ctypes.c_float(self._post_filter_beta)
                )
        except Exception:
            # Never leave a half-initialised filter behind: a live state with a
            # zero frame length would make filter() loop forever.
            self._release_state()
            raise

        logger.info(
            "DeepFilterNet audio filter enabled "
            f"sample_rate={sample_rate} model_sample_rate={self._model_sample_rate} "
            f"frame_length={self._frame_length}"
        )

    async def stop(self) -> None:
        """Free the native model state and drop any buffered audio."""
        self._release_state()

    async def process_frame(self, frame: FilterControlFrame) -> None:
        """Handle control frames; only ``FilterEnableFrame`` has an effect."""
        if isinstance(frame, FilterEnableFrame):
            self._filtering = frame.enable
            if not self._filtering:
                # While disabled, filter() passes audio straight through, so a
                # buffered partial frame would be emitted out of order once
                # filtering is re-enabled. Drop it instead.
                self._pending_model_bytes.clear()

    async def filter(self, audio: bytes) -> bytes:
        """Denoise a chunk of mono PCM16 audio.

        Returns the input unchanged when filtering is disabled or the filter has
        not been started. Returns ``b""`` while less than one full model frame
        is buffered; the buffered audio is emitted with a later call.
        """
        if not self._filtering or not self._lib or not self._state:
            return audio
        if not audio:
            return b""

        model_rate_audio = await self._input_resampler.resample(
            audio,
            self._sample_rate,
            self._model_sample_rate,
        )
        self._pending_model_bytes.extend(model_rate_audio)

        # Two bytes per PCM16 sample; mono is enforced in start().
        frame_bytes = self._frame_length * 2
        processed_chunks: list[bytes] = []
        while len(self._pending_model_bytes) >= frame_bytes:
            chunk = bytes(self._pending_model_bytes[:frame_bytes])
            del self._pending_model_bytes[:frame_bytes]
            processed_chunks.append(self._process_model_frame(chunk))

        if not processed_chunks:
            return b""

        processed_audio = b"".join(processed_chunks)
        return await self._output_resampler.resample(
            processed_audio,
            self._model_sample_rate,
            self._sample_rate,
        )

    def _release_state(self) -> None:
        """Free the native state, if any, and reset all per-session fields."""
        if self._lib and self._state:
            self._lib.df_free(self._state)
        self._lib = None
        self._state = None
        self._frame_length = 0
        self._pending_model_bytes.clear()

    def _load_library(self):
        """Load the native library and declare the C signatures used here.

        Raises:
            RuntimeError: If no library path was configured.
            FileNotFoundError: If the configured path does not exist.
        """
        if not self._lib_path:
            raise RuntimeError(
                "DeepFilterNet audio filter requires audio_filter.lib_path "
                "or DEEPFILTERNET_LIB_PATH"
            )
        lib_path = Path(self._lib_path).expanduser()
        if not lib_path.exists():
            raise FileNotFoundError(f"DeepFilterNet library not found: {lib_path}")

        lib = ctypes.CDLL(str(lib_path))
        # Explicit argtypes/restype keep pointers and floats from being passed
        # or returned as C ints, which is ctypes' default.
        lib.df_create.argtypes = [ctypes.c_char_p, ctypes.c_float, ctypes.c_char_p]
        lib.df_create.restype = ctypes.c_void_p
        lib.df_get_frame_length.argtypes = [ctypes.c_void_p]
        lib.df_get_frame_length.restype = ctypes.c_size_t
        lib.df_set_post_filter_beta.argtypes = [ctypes.c_void_p, ctypes.c_float]
        lib.df_set_post_filter_beta.restype = None
        lib.df_process_frame.argtypes = [
            ctypes.c_void_p,
            ctypes.POINTER(ctypes.c_float),
            ctypes.POINTER(ctypes.c_float),
        ]
        lib.df_process_frame.restype = ctypes.c_float
        lib.df_free.argtypes = [ctypes.c_void_p]
        lib.df_free.restype = None
        return lib

    def _create_state(self):
        """Create the native model state via ``df_create``.

        Raises:
            RuntimeError: If no model path was configured or ``df_create``
                returns a null state.
            FileNotFoundError: If the configured model path does not exist.
        """
        if not self._model_path:
            raise RuntimeError(
                "DeepFilterNet audio filter requires audio_filter.model_path "
                "or DEEPFILTERNET_MODEL_PATH"
            )
        model_path = Path(self._model_path).expanduser()
        if not model_path.exists():
            raise FileNotFoundError(f"DeepFilterNet model not found: {model_path}")

        # None is passed to the C function as a NULL pointer.
        log_level = self._log_level.encode("utf-8") if self._log_level else None
        state = self._lib.df_create(
            str(model_path).encode("utf-8"),
            ctypes.c_float(self._atten_lim_db),
            log_level,
        )
        if not state:
            raise RuntimeError("DeepFilterNet failed to create model state")
        return state

    def _process_model_frame(self, pcm16_audio: bytes) -> bytes:
        """Run one model-length PCM16 frame through ``df_process_frame``.

        The float returned by the native call is ignored.
        """
        input_f32 = _pcm16_to_float32(pcm16_audio)
        output_f32 = np.zeros(self._frame_length, dtype=np.float32)
        self._lib.df_process_frame(
            self._state,
            input_f32.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            output_f32.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        )
        return _float32_to_pcm16(output_f32)
def _pcm16_to_float32(audio: bytes) -> np.ndarray:
    """Decode native-endian PCM16 bytes into float32 samples scaled by 1/32768.

    The result is a fresh, writable array that does not alias ``audio``.
    """
    samples = np.frombuffer(audio, dtype=np.int16)
    normalized = samples.astype(np.float32) / 32768.0
    return normalized.copy()
def _float32_to_pcm16(audio: np.ndarray) -> bytes:
    """Encode float samples in [-1.0, 1.0] as native-endian PCM16 bytes.

    This is the inverse of ``_pcm16_to_float32``: samples are scaled by 32768
    and rounded to the nearest integer, so decoding and re-encoding any int16
    value reproduces it exactly. Out-of-range input is clipped, and NaN is
    mapped to silence rather than reaching the integer cast, where the result
    would be undefined.
    """
    finite = np.nan_to_num(audio, nan=0.0, posinf=1.0, neginf=-1.0)
    scaled = np.rint(np.clip(finite, -1.0, 1.0) * 32768.0)
    # +1.0 scales to 32768, one past the int16 maximum, so clip again.
    return np.clip(scaled, -32768.0, 32767.0).astype(np.int16).tobytes()

View File

@@ -32,6 +32,7 @@ from pipecat.turns.user_stop.speech_timeout_user_turn_stop_strategy import (
)
from pipecat.turns.user_turn_strategies import UserTurnStrategies
from .audio_filters import create_audio_input_filter
from .config import EngineConfig
from .context_sync import AssistantContextSyncProcessor
from .fastgpt_llm import FastGPTLLMService
@@ -80,6 +81,7 @@ async def run_pipeline_with_serializer(
audio_out_sample_rate=config.audio.sample_rate_hz,
audio_in_channels=config.audio.channels,
audio_out_channels=config.audio.channels,
audio_in_filter=create_audio_input_filter(config.audio_filter, config.audio),
serializer=serializer,
session_timeout=None,
),

View File

@@ -0,0 +1,2 @@
numpy>=1.26
soxr>=0.5