Add deepfilternet
This commit is contained in:
61
docs/deepfilternet.md
Normal file
61
docs/deepfilternet.md
Normal file
@@ -0,0 +1,61 @@
|
||||
# DeepFilterNet Input Filter
|
||||
|
||||
The engine can optionally run DeepFilterNet on inbound microphone audio before
|
||||
Pipecat VAD and STT. The integration uses DeepFilterNet's real-time `libDF` C
|
||||
API (`df_process_frame`) rather than the Python `df.enhance()` batch helper.
|
||||
|
||||
## Build DeepFilterNet
|
||||
|
||||
From the DeepFilterNet checkout:
|
||||
|
||||
```bash
|
||||
cd /Users/wangx/Code/DeepFilterNet
|
||||
cargo build --release -p deep_filter --features capi
|
||||
```
|
||||
|
||||
Use the generated native library path as `audio_filter.lib_path`. On macOS this
|
||||
is usually:
|
||||
|
||||
```text
|
||||
/Users/wangx/Code/DeepFilterNet/target/release/libdf.dylib
|
||||
```
|
||||
|
||||
Use an ONNX tar.gz model as `audio_filter.model_path`, for example:
|
||||
|
||||
```text
|
||||
/Users/wangx/Code/DeepFilterNet/models/DeepFilterNet3_ll_onnx.tar.gz
|
||||
```
|
||||
|
||||
The low-latency model is preferred for a live voice endpoint.
|
||||
|
||||
## Install Optional Python Dependencies
|
||||
|
||||
```bash
|
||||
uv pip install -r requirements-deepfilternet.txt
|
||||
```
|
||||
|
||||
## Enable

Add an `audio_filter` object to the top level of the engine configuration:
|
||||
|
||||
```json
|
||||
"audio_filter": {
|
||||
"enabled": true,
|
||||
"provider": "deepfilternet",
|
||||
"lib_path": "/Users/wangx/Code/DeepFilterNet/target/release/libdf.dylib",
|
||||
"model_path": "/Users/wangx/Code/DeepFilterNet/models/DeepFilterNet3_ll_onnx.tar.gz",
|
||||
"model_sample_rate_hz": 48000,
|
||||
"atten_lim_db": 100.0,
|
||||
"post_filter_beta": 0.0,
|
||||
"log_level": null
|
||||
}
|
||||
```
|
||||
|
||||
`model_sample_rate_hz` defaults to `48000`, matching the bundled DeepFilterNet
|
||||
models. The filter resamples from the engine sample rate to the model sample
|
||||
rate, processes hop-sized frames, then resamples back to the engine sample rate.
|
||||
|
||||
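You can also provide the paths through environment variables. They are used only
when `lib_path` / `model_path` are unset, `null`, or empty in the configuration: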
|
||||
|
||||
```bash
|
||||
export DEEPFILTERNET_LIB_PATH=/Users/wangx/Code/DeepFilterNet/target/release/libdf.dylib
|
||||
export DEEPFILTERNET_MODEL_PATH=/Users/wangx/Code/DeepFilterNet/models/DeepFilterNet3_ll_onnx.tar.gz
|
||||
```
|
||||
32
engine/audio_filters.py
Normal file
32
engine/audio_filters.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pipecat.audio.filters.base_audio_filter import BaseAudioFilter
|
||||
|
||||
from .config import AudioConfig, AudioFilterConfig
|
||||
|
||||
|
||||
def create_audio_input_filter(
    config: AudioFilterConfig,
    audio: AudioConfig,
) -> BaseAudioFilter | None:
    """Build the transport-level input audio filter selected by ``config``.

    Args:
        config: Audio filter settings; ``enabled`` and ``provider`` decide
            whether and which filter is built.
        audio: Engine audio settings; only ``channels`` is read here.

    Returns:
        ``None`` when the filter is disabled, otherwise a configured
        ``DeepFilterNetAudioFilter``.

    Raises:
        ValueError: If the filter is enabled with a provider other than
            ``"deepfilternet"``.
    """
    if not config.enabled:
        return None

    provider = config.provider
    if provider != "deepfilternet":
        raise ValueError(
            f"Unsupported audio_filter provider {provider!r}; expected 'deepfilternet'"
        )

    # Imported lazily so the DeepFilterNet module (and what it imports) is only
    # loaded when this provider is actually selected.
    from .deepfilternet_filter import DeepFilterNetAudioFilter

    filter_kwargs = {
        "lib_path": config.lib_path,
        "model_path": config.model_path,
        "model_sample_rate": config.model_sample_rate_hz,
        "channels": audio.channels,
        "atten_lim_db": config.atten_lim_db,
        "post_filter_beta": config.post_filter_beta,
        "log_level": config.log_level,
    }
    return DeepFilterNetAudioFilter(**filter_kwargs)
|
||||
@@ -28,6 +28,20 @@ class AudioConfig:
|
||||
return int(self.sample_rate_hz * self.frame_ms / 1000) * self.channels * 2
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class AudioFilterConfig:
    """Settings for the optional input audio filter.

    The filter is applied by the Pipecat transport before VAD/STT. Instances
    are immutable; with the defaults no filter is enabled.
    """

    # Master switch; when False no filter is created.
    enabled: bool = False
    # Filter implementation name; "none" is the placeholder default.
    provider: str = "none"
    # Path to the native filter library, or None when not configured.
    lib_path: str | None = None
    # Path to the model file, or None when not configured.
    model_path: str | None = None
    # Sample rate (Hz) the model operates at.
    model_sample_rate_hz: int = 48000
    # Attenuation limit in dB handed to the filter.
    atten_lim_db: float = 100.0
    # Post-filter beta handed to the filter.
    post_filter_beta: float = 0.0
    # Log level string for the native library, or None when not configured.
    log_level: str | None = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class SessionConfig:
|
||||
inactivity_timeout_sec: int = 60
|
||||
@@ -180,6 +194,7 @@ class ServicesConfig:
|
||||
class EngineConfig:
|
||||
server: ServerConfig = field(default_factory=ServerConfig)
|
||||
audio: AudioConfig = field(default_factory=AudioConfig)
|
||||
audio_filter: AudioFilterConfig = field(default_factory=AudioFilterConfig)
|
||||
session: SessionConfig = field(default_factory=SessionConfig)
|
||||
turn: TurnConfig = field(default_factory=TurnConfig)
|
||||
agent: AgentConfig = field(default_factory=AgentConfig)
|
||||
@@ -223,6 +238,7 @@ def config_from_dict(data: dict) -> EngineConfig:
|
||||
return EngineConfig(
|
||||
server=ServerConfig(**_dict(data.get("server"))),
|
||||
audio=AudioConfig(**_dict(data.get("audio"))),
|
||||
audio_filter=AudioFilterConfig(**_normalize_audio_filter(_dict(data.get("audio_filter")))),
|
||||
session=SessionConfig(**_dict(data.get("session"))),
|
||||
turn=TurnConfig(
|
||||
vad=VADConfig(**vad),
|
||||
@@ -255,6 +271,18 @@ def _dict(value: object) -> dict:
|
||||
return dict(value) if isinstance(value, dict) else {}
|
||||
|
||||
|
||||
def _normalize_audio_filter(value: dict) -> dict:
|
||||
if value.get("lib_path") == "":
|
||||
value["lib_path"] = None
|
||||
if value.get("model_path") == "":
|
||||
value["model_path"] = None
|
||||
if value.get("log_level") == "":
|
||||
value["log_level"] = None
|
||||
if "provider" in value:
|
||||
value["provider"] = str(value["provider"]).strip().lower()
|
||||
return value
|
||||
|
||||
|
||||
def _normalize_llm_provider(value: object) -> str:
|
||||
provider = str(value or LLMConfig().provider).strip().lower()
|
||||
normalized = _LLM_PROVIDER_ALIASES.get(provider)
|
||||
|
||||
182
engine/deepfilternet_filter.py
Normal file
182
engine/deepfilternet_filter.py
Normal file
@@ -0,0 +1,182 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import ctypes
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.filters.base_audio_filter import BaseAudioFilter
|
||||
from pipecat.audio.resamplers.soxr_stream_resampler import SOXRStreamAudioResampler
|
||||
from pipecat.frames.frames import FilterControlFrame, FilterEnableFrame
|
||||
|
||||
|
||||
class DeepFilterNetAudioFilter(BaseAudioFilter):
    """DeepFilterNet transport filter backed by libDF's real-time C API.

    The DeepFilterNet Python ``enhance`` helper is file/batch oriented. This
    filter uses ``df_process_frame`` instead, which keeps the model, STFT, and
    rolling lookahead state alive across hop-sized frames for one voice session.

    Audio given to :meth:`filter` is handled as mono 16-bit PCM at the rate
    passed to :meth:`start`. It is resampled to ``model_sample_rate``, run
    through libDF one model frame at a time, and resampled back. Samples that
    do not fill a whole model frame stay buffered until the next call, so a
    call may return fewer bytes than it received (possibly ``b""``).
    """

    def __init__(
        self,
        *,
        lib_path: str | None,
        model_path: str | None,
        model_sample_rate: int = 48000,
        channels: int = 1,
        atten_lim_db: float = 100.0,
        post_filter_beta: float = 0.0,
        log_level: str | None = None,
    ) -> None:
        """Store the configuration; native resources are created in ``start``.

        Args:
            lib_path: Path to the libDF shared library. Falls back to the
                ``DEEPFILTERNET_LIB_PATH`` environment variable when falsy.
            model_path: Path to the model archive. Falls back to the
                ``DEEPFILTERNET_MODEL_PATH`` environment variable when falsy.
            model_sample_rate: Sample rate (Hz) audio is resampled to before
                it is handed to libDF.
            channels: Channel count of the audio; only ``1`` is accepted by
                ``start``.
            atten_lim_db: Attenuation limit passed to ``df_create``.
            post_filter_beta: Post-filter beta; only applied when ``> 0``.
            log_level: Optional log level string passed to ``df_create``.
        """
        self._lib_path = lib_path or os.environ.get("DEEPFILTERNET_LIB_PATH")
        self._model_path = model_path or os.environ.get("DEEPFILTERNET_MODEL_PATH")
        self._model_sample_rate = model_sample_rate
        self._channels = channels
        self._atten_lim_db = atten_lim_db
        self._post_filter_beta = post_filter_beta
        self._log_level = log_level

        self._filtering = True
        self._sample_rate = 0
        self._lib = None
        self._state = None
        self._frame_length = 0
        self._input_resampler = SOXRStreamAudioResampler()
        self._output_resampler = SOXRStreamAudioResampler()
        # Model-rate PCM16 bytes that do not yet fill a complete model frame.
        self._pending_model_bytes = bytearray()

    async def start(self, sample_rate: int) -> None:
        """Load libDF, create the model state, and get ready to filter.

        Args:
            sample_rate: Sample rate (Hz) of the audio later passed to
                :meth:`filter`.

        Raises:
            ValueError: If the configured channel count is not 1.
            RuntimeError: If no library/model path is configured, the model
                state cannot be created, or libDF reports a non-positive
                frame length.
            FileNotFoundError: If the library or model file does not exist.
        """
        if self._channels != 1:
            raise ValueError("DeepFilterNet audio filter currently supports mono PCM only")

        # A repeated start() must not orphan the previous native state.
        self._release_native_state()

        self._sample_rate = sample_rate
        self._lib = self._load_library()
        try:
            self._state = self._create_state()
            self._frame_length = int(self._lib.df_get_frame_length(self._state))
            if self._frame_length <= 0:
                raise RuntimeError("DeepFilterNet returned an invalid frame length")

            if self._post_filter_beta > 0:
                self._lib.df_set_post_filter_beta(
                    self._state, ctypes.c_float(self._post_filter_beta)
                )
        except Exception:
            # Never leave a half-initialised filter behind: filter() relies on
            # "state is set" implying "frame length is positive".
            self._release_native_state()
            raise

        logger.info(
            "DeepFilterNet audio filter enabled "
            f"sample_rate={sample_rate} model_sample_rate={self._model_sample_rate} "
            f"frame_length={self._frame_length}"
        )

    async def stop(self) -> None:
        """Free the native model state and drop any buffered audio."""
        self._release_native_state()

    async def process_frame(self, frame: FilterControlFrame) -> None:
        """Handle filter control frames; ``FilterEnableFrame`` toggles filtering."""
        if isinstance(frame, FilterEnableFrame):
            self._filtering = frame.enable
            if not self._filtering:
                # While disabled, audio bypasses the filter. Drop the partial
                # frame so it is not spliced in front of unrelated audio once
                # filtering is re-enabled.
                self._pending_model_bytes.clear()

    async def filter(self, audio: bytes) -> bytes:
        """Denoise ``audio`` and return the processed PCM.

        Args:
            audio: PCM16 bytes at the sample rate given to :meth:`start`.

        Returns:
            ``audio`` unchanged when filtering is disabled or the filter is not
            started; ``b""`` when the input is empty or no complete model frame
            is available yet; otherwise the processed audio resampled back to
            the input sample rate.
        """
        if not self._filtering or not self._lib or not self._state:
            return audio
        if not audio:
            return b""

        model_rate_audio = await self._input_resampler.resample(
            audio,
            self._sample_rate,
            self._model_sample_rate,
        )
        pending = self._pending_model_bytes
        pending.extend(model_rate_audio)

        # Two bytes per int16 sample, mono. start() guarantees a positive
        # frame length whenever the state is set.
        frame_bytes = self._frame_length * 2
        ready_bytes = len(pending) - len(pending) % frame_bytes
        if ready_bytes == 0:
            return b""

        # Take every complete frame in one slice instead of shifting the
        # bytearray once per frame.
        ready = bytes(pending[:ready_bytes])
        del pending[:ready_bytes]

        processed_audio = b"".join(
            self._process_model_frame(ready[offset : offset + frame_bytes])
            for offset in range(0, ready_bytes, frame_bytes)
        )
        return await self._output_resampler.resample(
            processed_audio,
            self._model_sample_rate,
            self._sample_rate,
        )

    def _release_native_state(self) -> None:
        """Free the libDF state, if any, and reset the per-session fields."""
        if self._lib and self._state:
            self._lib.df_free(self._state)
        self._lib = None
        self._state = None
        self._frame_length = 0
        self._pending_model_bytes.clear()

    def _load_library(self):
        """Load the libDF shared library and declare the C prototypes used here.

        Raises:
            RuntimeError: If no library path is configured.
            FileNotFoundError: If the configured path does not exist.
        """
        if not self._lib_path:
            raise RuntimeError(
                "DeepFilterNet audio filter requires audio_filter.lib_path "
                "or DEEPFILTERNET_LIB_PATH"
            )

        lib_path = Path(self._lib_path).expanduser()
        if not lib_path.exists():
            raise FileNotFoundError(f"DeepFilterNet library not found: {lib_path}")

        lib = ctypes.CDLL(str(lib_path))
        # Explicit argtypes/restype: without them ctypes assumes C int, which
        # would mangle the opaque state pointer and float arguments.
        lib.df_create.argtypes = [ctypes.c_char_p, ctypes.c_float, ctypes.c_char_p]
        lib.df_create.restype = ctypes.c_void_p
        lib.df_get_frame_length.argtypes = [ctypes.c_void_p]
        lib.df_get_frame_length.restype = ctypes.c_size_t
        lib.df_set_post_filter_beta.argtypes = [ctypes.c_void_p, ctypes.c_float]
        lib.df_set_post_filter_beta.restype = None
        lib.df_process_frame.argtypes = [
            ctypes.c_void_p,
            ctypes.POINTER(ctypes.c_float),
            ctypes.POINTER(ctypes.c_float),
        ]
        lib.df_process_frame.restype = ctypes.c_float
        lib.df_free.argtypes = [ctypes.c_void_p]
        lib.df_free.restype = None
        return lib

    def _create_state(self):
        """Create the libDF model state via ``df_create``.

        Requires ``self._lib`` to be loaded already.

        Raises:
            RuntimeError: If no model path is configured or ``df_create``
                returns a null state.
            FileNotFoundError: If the configured model path does not exist.
        """
        if not self._model_path:
            raise RuntimeError(
                "DeepFilterNet audio filter requires audio_filter.model_path "
                "or DEEPFILTERNET_MODEL_PATH"
            )

        model_path = Path(self._model_path).expanduser()
        if not model_path.exists():
            raise FileNotFoundError(f"DeepFilterNet model not found: {model_path}")

        # None is passed through as a NULL char pointer.
        log_level = self._log_level.encode("utf-8") if self._log_level else None
        state = self._lib.df_create(
            str(model_path).encode("utf-8"),
            ctypes.c_float(self._atten_lim_db),
            log_level,
        )
        if not state:
            raise RuntimeError("DeepFilterNet failed to create model state")
        return state

    def _process_model_frame(self, pcm16_audio: bytes) -> bytes:
        """Run one model frame of PCM16 audio through ``df_process_frame``.

        Args:
            pcm16_audio: Exactly one model frame (``frame_length`` samples) of
                mono PCM16 at the model sample rate.

        Returns:
            The processed frame as PCM16 bytes.
        """
        input_f32 = _pcm16_to_float32(pcm16_audio)
        output_f32 = np.zeros(self._frame_length, dtype=np.float32)

        # The float returned by df_process_frame is not used here.
        self._lib.df_process_frame(
            self._state,
            input_f32.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            output_f32.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        )

        return _float32_to_pcm16(output_f32)
|
||||
|
||||
|
||||
def _pcm16_to_float32(audio: bytes) -> np.ndarray:
|
||||
return (np.frombuffer(audio, dtype=np.int16).astype(np.float32) / 32768.0).copy()
|
||||
|
||||
|
||||
def _float32_to_pcm16(audio: np.ndarray) -> bytes:
|
||||
clipped = np.clip(audio, -1.0, 1.0)
|
||||
return (clipped * 32767.0).astype(np.int16).tobytes()
|
||||
@@ -32,6 +32,7 @@ from pipecat.turns.user_stop.speech_timeout_user_turn_stop_strategy import (
|
||||
)
|
||||
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
|
||||
from .audio_filters import create_audio_input_filter
|
||||
from .config import EngineConfig
|
||||
from .context_sync import AssistantContextSyncProcessor
|
||||
from .fastgpt_llm import FastGPTLLMService
|
||||
@@ -80,6 +81,7 @@ async def run_pipeline_with_serializer(
|
||||
audio_out_sample_rate=config.audio.sample_rate_hz,
|
||||
audio_in_channels=config.audio.channels,
|
||||
audio_out_channels=config.audio.channels,
|
||||
audio_in_filter=create_audio_input_filter(config.audio_filter, config.audio),
|
||||
serializer=serializer,
|
||||
session_timeout=None,
|
||||
),
|
||||
|
||||
2
requirements-deepfilternet.txt
Normal file
2
requirements-deepfilternet.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
numpy>=1.26
|
||||
soxr>=0.5
|
||||
Reference in New Issue
Block a user