Add deepfilternet
This commit is contained in:
61
docs/deepfilternet.md
Normal file
61
docs/deepfilternet.md
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
# DeepFilterNet Input Filter
|
||||||
|
|
||||||
|
The engine can optionally run DeepFilterNet on inbound microphone audio before
|
||||||
|
Pipecat VAD and STT. The integration uses DeepFilterNet's real-time `libDF` C
|
||||||
|
API (`df_process_frame`) rather than the Python `df.enhance()` batch helper.
|
||||||
|
|
||||||
|
## Build DeepFilterNet
|
||||||
|
|
||||||
|
From the DeepFilterNet checkout:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /Users/wangx/Code/DeepFilterNet
|
||||||
|
cargo build --release -p deep_filter --features capi
|
||||||
|
```
|
||||||
|
|
||||||
|
Use the generated native library path as `audio_filter.lib_path`. On macOS this
|
||||||
|
is usually:
|
||||||
|
|
||||||
|
```text
|
||||||
|
/Users/wangx/Code/DeepFilterNet/target/release/libdf.dylib
|
||||||
|
```
|
||||||
|
|
||||||
|
Use an ONNX tar.gz model as `audio_filter.model_path`, for example:
|
||||||
|
|
||||||
|
```text
|
||||||
|
/Users/wangx/Code/DeepFilterNet/models/DeepFilterNet3_ll_onnx.tar.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
The low-latency model is preferred for a live voice endpoint.
|
||||||
|
|
||||||
|
## Install Optional Python Dependencies
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uv pip install -r requirements-deepfilternet.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
## Enable
|
||||||
|
|
||||||
|
```json
|
||||||
|
"audio_filter": {
|
||||||
|
"enabled": true,
|
||||||
|
"provider": "deepfilternet",
|
||||||
|
"lib_path": "/Users/wangx/Code/DeepFilterNet/target/release/libdf.dylib",
|
||||||
|
"model_path": "/Users/wangx/Code/DeepFilterNet/models/DeepFilterNet3_ll_onnx.tar.gz",
|
||||||
|
"model_sample_rate_hz": 48000,
|
||||||
|
"atten_lim_db": 100.0,
|
||||||
|
"post_filter_beta": 0.0,
|
||||||
|
"log_level": null
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`model_sample_rate_hz` defaults to `48000`, matching the bundled DeepFilterNet
|
||||||
|
models. The filter resamples from the engine sample rate to the model sample
|
||||||
|
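rate, processes hop-sized frames, then resamples back to the engine sample rate. Only mono input (`audio.channels` = 1) is supported; starting the filter with any other channel count raises a `ValueError`.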
|
||||||
|
|
||||||
|
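You can also provide the paths through environment variables. Each variable is used only as a fallback, when the corresponding `audio_filter.lib_path` or `audio_filter.model_path` is unset, `null`, or an empty string: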
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export DEEPFILTERNET_LIB_PATH=/Users/wangx/Code/DeepFilterNet/target/release/libdf.dylib
|
||||||
|
export DEEPFILTERNET_MODEL_PATH=/Users/wangx/Code/DeepFilterNet/models/DeepFilterNet3_ll_onnx.tar.gz
|
||||||
|
```
|
||||||
32
engine/audio_filters.py
Normal file
32
engine/audio_filters.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pipecat.audio.filters.base_audio_filter import BaseAudioFilter
|
||||||
|
|
||||||
|
from .config import AudioConfig, AudioFilterConfig
|
||||||
|
|
||||||
|
|
||||||
|
def create_audio_input_filter(
    config: AudioFilterConfig,
    audio: AudioConfig,
) -> BaseAudioFilter | None:
    """Build the optional input filter handed to the Pipecat transport.

    Args:
        config: Audio filter settings; ``enabled`` and ``provider`` decide
            which filter (if any) is built.
        audio: Engine audio settings; only ``channels`` is read here.

    Returns:
        ``None`` when the filter is disabled, otherwise a configured
        ``DeepFilterNetAudioFilter``.

    Raises:
        ValueError: If the filter is enabled with a provider other than
            ``"deepfilternet"``.
    """
    if not config.enabled:
        return None

    if config.provider != "deepfilternet":
        raise ValueError(
            f"Unsupported audio_filter provider {config.provider!r}; expected 'deepfilternet'"
        )

    # Imported lazily so the filter module (and its numpy/soxr requirements)
    # is only loaded when the filter is actually enabled.
    from .deepfilternet_filter import DeepFilterNetAudioFilter

    filter_kwargs = {
        "lib_path": config.lib_path,
        "model_path": config.model_path,
        "model_sample_rate": config.model_sample_rate_hz,
        "channels": audio.channels,
        "atten_lim_db": config.atten_lim_db,
        "post_filter_beta": config.post_filter_beta,
        "log_level": config.log_level,
    }
    return DeepFilterNetAudioFilter(**filter_kwargs)
|
||||||
@@ -28,6 +28,20 @@ class AudioConfig:
|
|||||||
return int(self.sample_rate_hz * self.frame_ms / 1000) * self.channels * 2
|
return int(self.sample_rate_hz * self.frame_ms / 1000) * self.channels * 2
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class AudioFilterConfig:
    """Settings for the optional input audio filter run before VAD/STT.

    Instances are immutable. With the defaults the filter is disabled.
    """

    # Master switch; when False no filter is created.
    enabled: bool = False
    # Filter backend name; "deepfilternet" is the only one the factory accepts.
    provider: str = "none"
    # Path to the native libDF library. When None the DeepFilterNet filter
    # falls back to the DEEPFILTERNET_LIB_PATH environment variable.
    lib_path: str | None = None
    # Path to the model archive. When None the DeepFilterNet filter falls
    # back to the DEEPFILTERNET_MODEL_PATH environment variable.
    model_path: str | None = None
    # Sample rate the filter resamples to before running the model.
    model_sample_rate_hz: int = 48000
    # Attenuation limit in dB handed to libDF's df_create.
    atten_lim_db: float = 100.0
    # Post-filter beta; only sent to libDF when greater than zero.
    post_filter_beta: float = 0.0
    # Log level string passed to libDF; None passes NULL.
    log_level: str | None = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class SessionConfig:
|
class SessionConfig:
|
||||||
inactivity_timeout_sec: int = 60
|
inactivity_timeout_sec: int = 60
|
||||||
@@ -180,6 +194,7 @@ class ServicesConfig:
|
|||||||
class EngineConfig:
|
class EngineConfig:
|
||||||
server: ServerConfig = field(default_factory=ServerConfig)
|
server: ServerConfig = field(default_factory=ServerConfig)
|
||||||
audio: AudioConfig = field(default_factory=AudioConfig)
|
audio: AudioConfig = field(default_factory=AudioConfig)
|
||||||
|
audio_filter: AudioFilterConfig = field(default_factory=AudioFilterConfig)
|
||||||
session: SessionConfig = field(default_factory=SessionConfig)
|
session: SessionConfig = field(default_factory=SessionConfig)
|
||||||
turn: TurnConfig = field(default_factory=TurnConfig)
|
turn: TurnConfig = field(default_factory=TurnConfig)
|
||||||
agent: AgentConfig = field(default_factory=AgentConfig)
|
agent: AgentConfig = field(default_factory=AgentConfig)
|
||||||
@@ -223,6 +238,7 @@ def config_from_dict(data: dict) -> EngineConfig:
|
|||||||
return EngineConfig(
|
return EngineConfig(
|
||||||
server=ServerConfig(**_dict(data.get("server"))),
|
server=ServerConfig(**_dict(data.get("server"))),
|
||||||
audio=AudioConfig(**_dict(data.get("audio"))),
|
audio=AudioConfig(**_dict(data.get("audio"))),
|
||||||
|
audio_filter=AudioFilterConfig(**_normalize_audio_filter(_dict(data.get("audio_filter")))),
|
||||||
session=SessionConfig(**_dict(data.get("session"))),
|
session=SessionConfig(**_dict(data.get("session"))),
|
||||||
turn=TurnConfig(
|
turn=TurnConfig(
|
||||||
vad=VADConfig(**vad),
|
vad=VADConfig(**vad),
|
||||||
@@ -255,6 +271,18 @@ def _dict(value: object) -> dict:
|
|||||||
return dict(value) if isinstance(value, dict) else {}
|
return dict(value) if isinstance(value, dict) else {}
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_audio_filter(value: dict) -> dict:
|
||||||
|
if value.get("lib_path") == "":
|
||||||
|
value["lib_path"] = None
|
||||||
|
if value.get("model_path") == "":
|
||||||
|
value["model_path"] = None
|
||||||
|
if value.get("log_level") == "":
|
||||||
|
value["log_level"] = None
|
||||||
|
if "provider" in value:
|
||||||
|
value["provider"] = str(value["provider"]).strip().lower()
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
||||||
def _normalize_llm_provider(value: object) -> str:
|
def _normalize_llm_provider(value: object) -> str:
|
||||||
provider = str(value or LLMConfig().provider).strip().lower()
|
provider = str(value or LLMConfig().provider).strip().lower()
|
||||||
normalized = _LLM_PROVIDER_ALIASES.get(provider)
|
normalized = _LLM_PROVIDER_ALIASES.get(provider)
|
||||||
|
|||||||
182
engine/deepfilternet_filter.py
Normal file
182
engine/deepfilternet_filter.py
Normal file
@@ -0,0 +1,182 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import ctypes
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from pipecat.audio.filters.base_audio_filter import BaseAudioFilter
|
||||||
|
from pipecat.audio.resamplers.soxr_stream_resampler import SOXRStreamAudioResampler
|
||||||
|
from pipecat.frames.frames import FilterControlFrame, FilterEnableFrame
|
||||||
|
|
||||||
|
|
||||||
|
class DeepFilterNetAudioFilter(BaseAudioFilter):
    """DeepFilterNet transport filter backed by libDF's real-time C API.

    The DeepFilterNet Python ``enhance`` helper is file/batch oriented. This
    filter uses ``df_process_frame`` instead, which keeps the model, STFT, and
    rolling lookahead state alive across hop-sized frames for one voice session.

    Audio is handled as 16-bit PCM bytes: it is resampled to the model sample
    rate, buffered until whole model frames are available, run through libDF
    one frame at a time, and resampled back to the transport sample rate.
    """

    def __init__(
        self,
        *,
        lib_path: str | None,
        model_path: str | None,
        model_sample_rate: int = 48000,
        channels: int = 1,
        atten_lim_db: float = 100.0,
        post_filter_beta: float = 0.0,
        log_level: str | None = None,
    ) -> None:
        """Store the configuration; nothing native is loaded until ``start``.

        Args:
            lib_path: Path to the libDF shared library. Falls back to the
                ``DEEPFILTERNET_LIB_PATH`` environment variable when falsy.
            model_path: Path to the model archive. Falls back to the
                ``DEEPFILTERNET_MODEL_PATH`` environment variable when falsy.
            model_sample_rate: Sample rate audio is resampled to before it is
                handed to libDF.
            channels: Channel count of the transport audio; only ``1`` is
                accepted by ``start``.
            atten_lim_db: Attenuation limit passed to ``df_create``.
            post_filter_beta: Post-filter beta; only applied when > 0.
            log_level: Optional log level string passed to ``df_create``.
        """
        self._lib_path = lib_path or os.environ.get("DEEPFILTERNET_LIB_PATH")
        self._model_path = model_path or os.environ.get("DEEPFILTERNET_MODEL_PATH")
        self._model_sample_rate = model_sample_rate
        self._channels = channels
        self._atten_lim_db = atten_lim_db
        self._post_filter_beta = post_filter_beta
        self._log_level = log_level

        self._filtering = True
        self._sample_rate = 0
        self._lib = None
        self._state = None
        self._frame_length = 0
        self._input_resampler = SOXRStreamAudioResampler()
        self._output_resampler = SOXRStreamAudioResampler()
        # Model-rate PCM16 bytes that do not yet fill a whole model frame.
        self._pending_model_bytes = bytearray()

    async def start(self, sample_rate: int) -> None:
        """Load libDF, create the model state and record the transport rate.

        Args:
            sample_rate: Sample rate of the audio later passed to ``filter``.

        Raises:
            ValueError: If the filter was configured with more than one channel.
            RuntimeError: If a path is missing, the state cannot be created,
                or libDF reports a non-positive frame length.
            FileNotFoundError: If the library or model file does not exist.
        """
        if self._channels != 1:
            raise ValueError("DeepFilterNet audio filter currently supports mono PCM only")

        # A second start() without stop() must not leak the previous native state.
        self._release_state()

        self._sample_rate = sample_rate
        self._lib = self._load_library()
        try:
            self._state = self._create_state()
            self._frame_length = int(self._lib.df_get_frame_length(self._state))
            if self._frame_length <= 0:
                raise RuntimeError("DeepFilterNet returned an invalid frame length")

            if self._post_filter_beta > 0:
                self._lib.df_set_post_filter_beta(
                    self._state, ctypes.c_float(self._post_filter_beta)
                )
        except Exception:
            # Leave the filter in a clean pass-through state: a live state with
            # a zero frame length would make filter() loop forever.
            self._release_state()
            raise

        logger.info(
            "DeepFilterNet audio filter enabled "
            f"sample_rate={sample_rate} model_sample_rate={self._model_sample_rate} "
            f"frame_length={self._frame_length}"
        )

    async def stop(self) -> None:
        """Free the native model state and drop any buffered audio."""
        self._release_state()

    async def process_frame(self, frame: FilterControlFrame) -> None:
        """Handle control frames; ``FilterEnableFrame`` toggles filtering.

        Args:
            frame: Control frame from the pipeline. Frames other than
                ``FilterEnableFrame`` are ignored.
        """
        if isinstance(frame, FilterEnableFrame):
            if bool(frame.enable) != bool(self._filtering):
                # Drop the partial frame buffered before the toggle so stale
                # audio is not glued onto audio that arrives after re-enabling.
                self._pending_model_bytes.clear()
            self._filtering = frame.enable

    async def filter(self, audio: bytes) -> bytes:
        """Denoise a chunk of transport-rate PCM16 audio.

        Args:
            audio: 16-bit PCM bytes at the sample rate given to ``start``.

        Returns:
            The input unchanged when filtering is disabled or the filter is not
            started; otherwise the processed audio resampled back to the
            transport rate. Returns ``b""`` for empty input or while less than
            one whole model frame has been buffered.
        """
        if not self._filtering or not self._lib or not self._state:
            return audio
        if not audio:
            return b""

        model_rate_audio = await self._input_resampler.resample(
            audio,
            self._sample_rate,
            self._model_sample_rate,
        )
        self._pending_model_bytes.extend(model_rate_audio)

        # Two bytes per 16-bit sample.
        frame_bytes = self._frame_length * 2
        processed_chunks: list[bytes] = []
        while len(self._pending_model_bytes) >= frame_bytes:
            chunk = bytes(self._pending_model_bytes[:frame_bytes])
            del self._pending_model_bytes[:frame_bytes]
            processed_chunks.append(self._process_model_frame(chunk))

        if not processed_chunks:
            return b""

        processed_audio = b"".join(processed_chunks)
        return await self._output_resampler.resample(
            processed_audio,
            self._model_sample_rate,
            self._sample_rate,
        )

    def _release_state(self) -> None:
        """Free the native state (if any) and reset all per-session fields."""
        if self._lib is not None and self._state:
            self._lib.df_free(self._state)
        self._lib = None
        self._state = None
        self._frame_length = 0
        self._pending_model_bytes.clear()

    def _load_library(self) -> ctypes.CDLL:
        """Open the libDF shared library and declare the C signatures used.

        Raises:
            RuntimeError: If no library path was configured.
            FileNotFoundError: If the configured library path does not exist.
        """
        if not self._lib_path:
            raise RuntimeError(
                "DeepFilterNet audio filter requires audio_filter.lib_path "
                "or DEEPFILTERNET_LIB_PATH"
            )

        lib_path = Path(self._lib_path).expanduser()
        if not lib_path.exists():
            raise FileNotFoundError(f"DeepFilterNet library not found: {lib_path}")

        lib = ctypes.CDLL(str(lib_path))
        # Explicit signatures keep ctypes from defaulting to int arguments and
        # return values, which would truncate pointers and floats.
        lib.df_create.argtypes = [ctypes.c_char_p, ctypes.c_float, ctypes.c_char_p]
        lib.df_create.restype = ctypes.c_void_p
        lib.df_get_frame_length.argtypes = [ctypes.c_void_p]
        lib.df_get_frame_length.restype = ctypes.c_size_t
        lib.df_set_post_filter_beta.argtypes = [ctypes.c_void_p, ctypes.c_float]
        lib.df_set_post_filter_beta.restype = None
        lib.df_process_frame.argtypes = [
            ctypes.c_void_p,
            ctypes.POINTER(ctypes.c_float),
            ctypes.POINTER(ctypes.c_float),
        ]
        lib.df_process_frame.restype = ctypes.c_float
        lib.df_free.argtypes = [ctypes.c_void_p]
        lib.df_free.restype = None
        return lib

    def _create_state(self):
        """Create the libDF model state for the configured model.

        Returns:
            The opaque state handle returned by ``df_create``.

        Raises:
            RuntimeError: If no model path was configured or ``df_create``
                returns a null handle.
            FileNotFoundError: If the configured model path does not exist.
        """
        if not self._model_path:
            raise RuntimeError(
                "DeepFilterNet audio filter requires audio_filter.model_path "
                "or DEEPFILTERNET_MODEL_PATH"
            )

        model_path = Path(self._model_path).expanduser()
        if not model_path.exists():
            raise FileNotFoundError(f"DeepFilterNet model not found: {model_path}")

        # None is marshalled as a NULL char pointer.
        log_level = self._log_level.encode("utf-8") if self._log_level else None
        state = self._lib.df_create(
            str(model_path).encode("utf-8"),
            ctypes.c_float(self._atten_lim_db),
            log_level,
        )
        if not state:
            raise RuntimeError("DeepFilterNet failed to create model state")
        return state

    def _process_model_frame(self, pcm16_audio: bytes) -> bytes:
        """Run one model-rate PCM16 frame through ``df_process_frame``.

        Args:
            pcm16_audio: Exactly one model frame of 16-bit PCM bytes.

        Returns:
            The processed frame as 16-bit PCM bytes.
        """
        input_f32 = _pcm16_to_float32(pcm16_audio)
        output_f32 = np.zeros(self._frame_length, dtype=np.float32)

        # The float returned by df_process_frame is not used.
        self._lib.df_process_frame(
            self._state,
            input_f32.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            output_f32.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        )

        return _float32_to_pcm16(output_f32)
|
||||||
|
|
||||||
|
|
||||||
|
def _pcm16_to_float32(audio: bytes) -> np.ndarray:
|
||||||
|
return (np.frombuffer(audio, dtype=np.int16).astype(np.float32) / 32768.0).copy()
|
||||||
|
|
||||||
|
|
||||||
|
def _float32_to_pcm16(audio: np.ndarray) -> bytes:
|
||||||
|
clipped = np.clip(audio, -1.0, 1.0)
|
||||||
|
return (clipped * 32767.0).astype(np.int16).tobytes()
|
||||||
@@ -32,6 +32,7 @@ from pipecat.turns.user_stop.speech_timeout_user_turn_stop_strategy import (
|
|||||||
)
|
)
|
||||||
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||||
|
|
||||||
|
from .audio_filters import create_audio_input_filter
|
||||||
from .config import EngineConfig
|
from .config import EngineConfig
|
||||||
from .context_sync import AssistantContextSyncProcessor
|
from .context_sync import AssistantContextSyncProcessor
|
||||||
from .fastgpt_llm import FastGPTLLMService
|
from .fastgpt_llm import FastGPTLLMService
|
||||||
@@ -80,6 +81,7 @@ async def run_pipeline_with_serializer(
|
|||||||
audio_out_sample_rate=config.audio.sample_rate_hz,
|
audio_out_sample_rate=config.audio.sample_rate_hz,
|
||||||
audio_in_channels=config.audio.channels,
|
audio_in_channels=config.audio.channels,
|
||||||
audio_out_channels=config.audio.channels,
|
audio_out_channels=config.audio.channels,
|
||||||
|
audio_in_filter=create_audio_input_filter(config.audio_filter, config.audio),
|
||||||
serializer=serializer,
|
serializer=serializer,
|
||||||
session_timeout=None,
|
session_timeout=None,
|
||||||
),
|
),
|
||||||
|
|||||||
2
requirements-deepfilternet.txt
Normal file
2
requirements-deepfilternet.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
numpy>=1.26
|
||||||
|
soxr>=0.5
|
||||||
Reference in New Issue
Block a user