From 673a54049a56e8215b1665072f58b00df859b334 Mon Sep 17 00:00:00 2001
From: Xin Wang
Date: Wed, 27 May 2026 16:37:14 +0800
Subject: [PATCH] Add deepfilternet

---
 docs/deepfilternet.md          |  61 +++++++++
 engine/audio_filters.py        |  32 +++++
 engine/config.py               |  28 +++++
 engine/deepfilternet_filter.py | 217 +++++++++++++++++++++++++++++++++
 engine/pipeline.py             |   2 +
 requirements-deepfilternet.txt |   2 +
 6 files changed, 342 insertions(+)
 create mode 100644 docs/deepfilternet.md
 create mode 100644 engine/audio_filters.py
 create mode 100644 engine/deepfilternet_filter.py
 create mode 100644 requirements-deepfilternet.txt

diff --git a/docs/deepfilternet.md b/docs/deepfilternet.md
new file mode 100644
index 0000000..5544389
--- /dev/null
+++ b/docs/deepfilternet.md
@@ -0,0 +1,61 @@
+# DeepFilterNet Input Filter
+
+The engine can optionally run DeepFilterNet on inbound microphone audio before
+Pipecat VAD and STT. The integration uses DeepFilterNet's real-time `libDF` C
+API (`df_process_frame`) rather than the Python `df.enhance()` batch helper.
+
+## Build DeepFilterNet
+
+From the DeepFilterNet checkout:
+
+```bash
+cd /path/to/DeepFilterNet
+cargo build --release -p deep_filter --features capi
+```
+
+Use the generated native library path as `audio_filter.lib_path`. On macOS this
+is usually:
+
+```text
+/path/to/DeepFilterNet/target/release/libdf.dylib
+```
+
+Use an ONNX tar.gz model as `audio_filter.model_path`, for example:
+
+```text
+/path/to/DeepFilterNet/models/DeepFilterNet3_ll_onnx.tar.gz
+```
+
+The low-latency model is preferred for a live voice endpoint.
+
+## Install Optional Python Dependencies
+
+```bash
+uv pip install -r requirements-deepfilternet.txt
+```
+
+## Enable
+
+```json
+"audio_filter": {
+  "enabled": true,
+  "provider": "deepfilternet",
+  "lib_path": "/path/to/DeepFilterNet/target/release/libdf.dylib",
+  "model_path": "/path/to/DeepFilterNet/models/DeepFilterNet3_ll_onnx.tar.gz",
+  "model_sample_rate_hz": 48000,
+  "atten_lim_db": 100.0,
+  "post_filter_beta": 0.0,
+  "log_level": null
+}
+```
+
+`model_sample_rate_hz` defaults to `48000`, matching the bundled DeepFilterNet
+models. The filter resamples from the engine sample rate to the model sample
+rate, processes hop-sized frames, then resamples back to the engine sample rate.
+
+You can also provide paths through environment variables:
+
+```bash
+export DEEPFILTERNET_LIB_PATH=/path/to/DeepFilterNet/target/release/libdf.dylib
+export DEEPFILTERNET_MODEL_PATH=/path/to/DeepFilterNet/models/DeepFilterNet3_ll_onnx.tar.gz
+```
diff --git a/engine/audio_filters.py b/engine/audio_filters.py
new file mode 100644
index 0000000..190cbff
--- /dev/null
+++ b/engine/audio_filters.py
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+from pipecat.audio.filters.base_audio_filter import BaseAudioFilter
+
+from .config import AudioConfig, AudioFilterConfig
+
+
+def create_audio_input_filter(
+    config: AudioFilterConfig,
+    audio: AudioConfig,
+) -> BaseAudioFilter | None:
+    """Create the optional transport-level input audio filter."""
+
+    if not config.enabled:
+        return None
+
+    if config.provider == "deepfilternet":
+        from .deepfilternet_filter import DeepFilterNetAudioFilter
+
+        return DeepFilterNetAudioFilter(
+            lib_path=config.lib_path,
+            model_path=config.model_path,
+            model_sample_rate=config.model_sample_rate_hz,
+            channels=audio.channels,
+            atten_lim_db=config.atten_lim_db,
+            post_filter_beta=config.post_filter_beta,
+            log_level=config.log_level,
+        )
+
+    raise ValueError(
+        f"Unsupported audio_filter provider {config.provider!r}; expected 'deepfilternet'"
+    )
diff --git a/engine/config.py b/engine/config.py
index edc4518..4c8b628 100644
--- a/engine/config.py
+++ b/engine/config.py
@@ -28,6 +28,20 @@ class AudioConfig:
         return int(self.sample_rate_hz * self.frame_ms / 1000) * self.channels * 2
 
 
+@dataclass(frozen=True)
+class AudioFilterConfig:
+    """Optional input audio filter applied by the Pipecat transport before VAD/STT."""
+
+    enabled: bool = False
+    provider: str = "none"
+    lib_path: str | None = None
+    model_path: str | None = None
+    model_sample_rate_hz: int = 48000
+    atten_lim_db: float = 100.0
+    post_filter_beta: float = 0.0
+    log_level: str | None = None
+
+
 @dataclass(frozen=True)
 class SessionConfig:
     inactivity_timeout_sec: int = 60
@@ -180,6 +194,7 @@ class ServicesConfig:
 class EngineConfig:
     server: ServerConfig = field(default_factory=ServerConfig)
     audio: AudioConfig = field(default_factory=AudioConfig)
+    audio_filter: AudioFilterConfig = field(default_factory=AudioFilterConfig)
     session: SessionConfig = field(default_factory=SessionConfig)
     turn: TurnConfig = field(default_factory=TurnConfig)
     agent: AgentConfig = field(default_factory=AgentConfig)
@@ -223,6 +238,7 @@ def config_from_dict(data: dict) -> EngineConfig:
     return EngineConfig(
         server=ServerConfig(**_dict(data.get("server"))),
         audio=AudioConfig(**_dict(data.get("audio"))),
+        audio_filter=AudioFilterConfig(**_normalize_audio_filter(_dict(data.get("audio_filter")))),
         session=SessionConfig(**_dict(data.get("session"))),
         turn=TurnConfig(
             vad=VADConfig(**vad),
@@ -255,6 +271,18 @@ def _dict(value: object) -> dict:
     return dict(value) if isinstance(value, dict) else {}
 
 
+def _normalize_audio_filter(value: dict) -> dict:
+    if value.get("lib_path") == "":
+        value["lib_path"] = None
+    if value.get("model_path") == "":
+        value["model_path"] = None
+    if value.get("log_level") == "":
+        value["log_level"] = None
+    if "provider" in value:
+        value["provider"] = str(value["provider"]).strip().lower()
+    return value
+
+
 def _normalize_llm_provider(value: object) -> str:
     provider = str(value or LLMConfig().provider).strip().lower()
     normalized = _LLM_PROVIDER_ALIASES.get(provider)
diff --git a/engine/deepfilternet_filter.py b/engine/deepfilternet_filter.py
new file mode 100644
index 0000000..3ae1823
--- /dev/null
+++ b/engine/deepfilternet_filter.py
@@ -0,0 +1,217 @@
+from __future__ import annotations
+
+import ctypes
+import os
+from pathlib import Path
+
+import numpy as np
+from loguru import logger
+
+from pipecat.audio.filters.base_audio_filter import BaseAudioFilter
+from pipecat.audio.resamplers.soxr_stream_resampler import SOXRStreamAudioResampler
+from pipecat.frames.frames import FilterControlFrame, FilterEnableFrame
+
+
+class DeepFilterNetAudioFilter(BaseAudioFilter):
+    """DeepFilterNet transport filter backed by libDF's real-time C API.
+
+    The DeepFilterNet Python ``enhance`` helper is file/batch oriented. This
+    filter uses ``df_process_frame`` instead, which keeps the model, STFT, and
+    rolling lookahead state alive across hop-sized frames for one voice session.
+    """
+
+    def __init__(
+        self,
+        *,
+        lib_path: str | None,
+        model_path: str | None,
+        model_sample_rate: int = 48000,
+        channels: int = 1,
+        atten_lim_db: float = 100.0,
+        post_filter_beta: float = 0.0,
+        log_level: str | None = None,
+    ) -> None:
+        """Store settings; the native library and model are loaded in ``start``."""
+        # Explicit paths win; the environment variables are only a fallback.
+        self._lib_path = lib_path or os.environ.get("DEEPFILTERNET_LIB_PATH")
+        self._model_path = model_path or os.environ.get("DEEPFILTERNET_MODEL_PATH")
+        self._model_sample_rate = model_sample_rate
+        self._channels = channels
+        self._atten_lim_db = atten_lim_db
+        self._post_filter_beta = post_filter_beta
+        self._log_level = log_level
+
+        self._filtering = True
+        self._sample_rate = 0
+        self._lib = None
+        self._state = None
+        self._frame_length = 0
+        self._input_resampler = SOXRStreamAudioResampler()
+        self._output_resampler = SOXRStreamAudioResampler()
+        # Model-rate PCM16 bytes that do not yet fill a whole libDF frame.
+        self._pending_model_bytes = bytearray()
+
+    async def start(self, sample_rate: int) -> None:
+        """Load libDF, create the model state, and record the engine sample rate.
+
+        Raises:
+            ValueError: If the filter was not configured for exactly one channel.
+            RuntimeError: If a path is not configured, or libDF fails to create
+                the state or reports a non-positive frame length.
+            FileNotFoundError: If the library or model path does not exist.
+        """
+        if self._channels != 1:
+            raise ValueError("DeepFilterNet audio filter currently supports mono PCM only")
+
+        # A repeated start() must not orphan the previous native state.
+        self._release_state()
+
+        self._sample_rate = sample_rate
+        self._lib = self._load_library()
+        self._state = self._create_state()
+        self._frame_length = int(self._lib.df_get_frame_length(self._state))
+        if self._frame_length <= 0:
+            # Free the half-initialized state; stop() may never be called.
+            self._release_state()
+            raise RuntimeError("DeepFilterNet returned an invalid frame length")
+
+        if self._post_filter_beta > 0:
+            self._lib.df_set_post_filter_beta(self._state, ctypes.c_float(self._post_filter_beta))
+
+        logger.info(
+            "DeepFilterNet audio filter enabled "
+            f"sample_rate={sample_rate} model_sample_rate={self._model_sample_rate} "
+            f"frame_length={self._frame_length}"
+        )
+
+    async def stop(self) -> None:
+        """Free the native model state and drop any buffered audio."""
+        self._release_state()
+
+    async def process_frame(self, frame: FilterControlFrame) -> None:
+        """Toggle filtering on a ``FilterEnableFrame``; ignore other frames."""
+        if isinstance(frame, FilterEnableFrame):
+            self._filtering = frame.enable
+
+    async def filter(self, audio: bytes) -> bytes:
+        """Denoise PCM16 audio given at the engine sample rate.
+
+        Input is resampled to the model rate and consumed in whole libDF
+        frames; a trailing partial frame is buffered for the next call, so the
+        result may be empty. Audio is returned unchanged while filtering is
+        disabled or before ``start`` has created the model state.
+        """
+        if not self._filtering or not self._lib or not self._state:
+            return audio
+        if not audio:
+            return b""
+
+        model_rate_audio = await self._input_resampler.resample(
+            audio,
+            self._sample_rate,
+            self._model_sample_rate,
+        )
+        self._pending_model_bytes.extend(model_rate_audio)
+
+        frame_bytes = self._frame_length * 2  # int16 samples: 2 bytes each
+        processed_chunks: list[bytes] = []
+        while len(self._pending_model_bytes) >= frame_bytes:
+            chunk = bytes(self._pending_model_bytes[:frame_bytes])
+            del self._pending_model_bytes[:frame_bytes]
+            processed_chunks.append(self._process_model_frame(chunk))
+
+        if not processed_chunks:
+            return b""
+
+        processed_audio = b"".join(processed_chunks)
+        return await self._output_resampler.resample(
+            processed_audio,
+            self._model_sample_rate,
+            self._sample_rate,
+        )
+
+    def _release_state(self) -> None:
+        """Free the libDF state, if any, and reset the per-session fields."""
+        if self._lib and self._state:
+            self._lib.df_free(self._state)
+        self._lib = None
+        self._state = None
+        self._frame_length = 0
+        self._pending_model_bytes.clear()
+
+    def _load_library(self):
+        """Load libDF with ctypes and declare the C signatures used here."""
+        if not self._lib_path:
+            raise RuntimeError(
+                "DeepFilterNet audio filter requires audio_filter.lib_path "
+                "or DEEPFILTERNET_LIB_PATH"
+            )
+
+        lib_path = Path(self._lib_path).expanduser()
+        if not lib_path.exists():
+            raise FileNotFoundError(f"DeepFilterNet library not found: {lib_path}")
+
+        lib = ctypes.CDLL(str(lib_path))
+        lib.df_create.argtypes = [ctypes.c_char_p, ctypes.c_float, ctypes.c_char_p]
+        lib.df_create.restype = ctypes.c_void_p
+        lib.df_get_frame_length.argtypes = [ctypes.c_void_p]
+        lib.df_get_frame_length.restype = ctypes.c_size_t
+        lib.df_set_post_filter_beta.argtypes = [ctypes.c_void_p, ctypes.c_float]
+        lib.df_set_post_filter_beta.restype = None
+        lib.df_process_frame.argtypes = [
+            ctypes.c_void_p,
+            ctypes.POINTER(ctypes.c_float),
+            ctypes.POINTER(ctypes.c_float),
+        ]
+        lib.df_process_frame.restype = ctypes.c_float
+        lib.df_free.argtypes = [ctypes.c_void_p]
+        lib.df_free.restype = None
+        return lib
+
+    def _create_state(self):
+        """Create a libDF state for the configured model and attenuation limit."""
+        if not self._model_path:
+            raise RuntimeError(
+                "DeepFilterNet audio filter requires audio_filter.model_path "
+                "or DEEPFILTERNET_MODEL_PATH"
+            )
+
+        model_path = Path(self._model_path).expanduser()
+        if not model_path.exists():
+            raise FileNotFoundError(f"DeepFilterNet model not found: {model_path}")
+
+        log_level = self._log_level.encode("utf-8") if self._log_level else None
+        state = self._lib.df_create(
+            str(model_path).encode("utf-8"),
+            ctypes.c_float(self._atten_lim_db),
+            log_level,
+        )
+        if not state:
+            raise RuntimeError("DeepFilterNet failed to create model state")
+        return state
+
+    def _process_model_frame(self, pcm16_audio: bytes) -> bytes:
+        """Run one frame of model-rate PCM16 through ``df_process_frame``."""
+        input_f32 = _pcm16_to_float32(pcm16_audio)
+        output_f32 = np.zeros(self._frame_length, dtype=np.float32)
+
+        self._lib.df_process_frame(
+            self._state,
+            input_f32.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+            output_f32.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+        )
+
+        return _float32_to_pcm16(output_f32)
+
+
+def _pcm16_to_float32(audio: bytes) -> np.ndarray:
+    """Decode native-endian int16 PCM bytes into float32 samples in [-1.0, 1.0)."""
+    # The division already yields a fresh contiguous array; no extra copy needed.
+    return np.frombuffer(audio, dtype=np.int16).astype(np.float32) / 32768.0
+
+
+def _float32_to_pcm16(audio: np.ndarray) -> bytes:
+    """Encode float32 samples as int16 PCM bytes, clipping to [-1.0, 1.0]."""
+    # NaN passes through np.clip and has no defined int16 value; zero it first.
+    finite = np.nan_to_num(audio, nan=0.0, posinf=1.0, neginf=-1.0)
+    return (np.clip(finite, -1.0, 1.0) * 32767.0).astype(np.int16).tobytes()
diff --git a/engine/pipeline.py b/engine/pipeline.py
index ccf13e4..97dc646 100644
--- a/engine/pipeline.py
+++ b/engine/pipeline.py
@@ -32,6 +32,7 @@ from pipecat.turns.user_stop.speech_timeout_user_turn_stop_strategy import (
 )
 from pipecat.turns.user_turn_strategies import UserTurnStrategies
 
+from .audio_filters import create_audio_input_filter
 from .config import EngineConfig
 from .context_sync import AssistantContextSyncProcessor
 from .fastgpt_llm import FastGPTLLMService
@@ -80,6 +81,7 @@ async def run_pipeline_with_serializer(
             audio_out_sample_rate=config.audio.sample_rate_hz,
             audio_in_channels=config.audio.channels,
             audio_out_channels=config.audio.channels,
+            audio_in_filter=create_audio_input_filter(config.audio_filter, config.audio),
             serializer=serializer,
             session_timeout=None,
         ),
diff --git a/requirements-deepfilternet.txt b/requirements-deepfilternet.txt
new file mode 100644
index 0000000..667bf69
--- /dev/null
+++ b/requirements-deepfilternet.txt
@@ -0,0 +1,2 @@
+numpy>=1.26
+soxr>=0.5