Cleanup engine

This commit is contained in:
Xin Wang
2026-02-12 17:42:21 +08:00
parent 838c19bf9c
commit 35bd83767e
2 changed files with 20 additions and 16 deletions

View File

@@ -25,6 +25,10 @@ class Settings(BaseSettings):
sample_rate: int = Field(default=16000, description="Audio sample rate in Hz") sample_rate: int = Field(default=16000, description="Audio sample rate in Hz")
chunk_size_ms: int = Field(default=20, description="Audio chunk duration in milliseconds") chunk_size_ms: int = Field(default=20, description="Audio chunk duration in milliseconds")
default_codec: str = Field(default="pcm", description="Default audio codec") default_codec: str = Field(default="pcm", description="Default audio codec")
max_audio_buffer_seconds: int = Field(
default=30,
description="Maximum buffered user audio duration kept in memory for current turn"
)
# VAD Configuration # VAD Configuration
vad_type: str = Field(default="silero", description="VAD algorithm type") vad_type: str = Field(default="silero", description="VAD algorithm type")
@@ -79,6 +83,10 @@ class Settings(BaseSettings):
default=200, default=200,
description="Minimum speech duration (ms) required to trigger barge-in. Lower=more sensitive." description="Minimum speech duration (ms) required to trigger barge-in. Lower=more sensitive."
) )
barge_in_silence_tolerance_ms: int = Field(
default=60,
description="How much silence (ms) is tolerated during potential barge-in before reset"
)
# Logging # Logging
log_level: str = Field(default="INFO", description="Logging level") log_level: str = Field(default="INFO", description="Logging level")

View File

@@ -228,21 +228,19 @@ class DuplexPipeline:
self._is_bot_speaking = False self._is_bot_speaking = False
self._current_turn_task: Optional[asyncio.Task] = None self._current_turn_task: Optional[asyncio.Task] = None
self._audio_buffer: bytes = b"" self._audio_buffer: bytes = b""
max_buffer_seconds = settings.max_audio_buffer_seconds if hasattr(settings, "max_audio_buffer_seconds") else 30 max_buffer_seconds = settings.max_audio_buffer_seconds
self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds) self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
self._asr_start_min_speech_ms: int = ( self._asr_start_min_speech_ms: int = settings.asr_start_min_speech_ms
settings.asr_start_min_speech_ms if hasattr(settings, "asr_start_min_speech_ms") else 160
)
self._asr_capture_active: bool = False self._asr_capture_active: bool = False
self._pending_speech_audio: bytes = b"" self._pending_speech_audio: bytes = b""
# Keep a short rolling pre-speech window so VAD transition latency # Keep a short rolling pre-speech window so VAD transition latency
# does not clip the first phoneme/character sent to ASR. # does not clip the first phoneme/character sent to ASR.
pre_speech_ms = settings.asr_pre_speech_ms if hasattr(settings, "asr_pre_speech_ms") else 240 pre_speech_ms = settings.asr_pre_speech_ms
self._asr_pre_speech_bytes = int(settings.sample_rate * 2 * (pre_speech_ms / 1000.0)) self._asr_pre_speech_bytes = int(settings.sample_rate * 2 * (pre_speech_ms / 1000.0))
self._pre_speech_buffer: bytes = b"" self._pre_speech_buffer: bytes = b""
# Add a tiny trailing silence tail before final ASR to avoid # Add a tiny trailing silence tail before final ASR to avoid
# clipping the last phoneme at utterance boundaries. # clipping the last phoneme at utterance boundaries.
asr_final_tail_ms = settings.asr_final_tail_ms if hasattr(settings, "asr_final_tail_ms") else 120 asr_final_tail_ms = settings.asr_final_tail_ms
self._asr_final_tail_bytes = int(settings.sample_rate * 2 * (asr_final_tail_ms / 1000.0)) self._asr_final_tail_bytes = int(settings.sample_rate * 2 * (asr_final_tail_ms / 1000.0))
self._last_vad_status: str = "Silence" self._last_vad_status: str = "Silence"
self._process_lock = asyncio.Lock() self._process_lock = asyncio.Lock()
@@ -261,10 +259,10 @@ class DuplexPipeline:
# Barge-in filtering - require minimum speech duration to interrupt # Barge-in filtering - require minimum speech duration to interrupt
self._barge_in_speech_start_time: Optional[float] = None self._barge_in_speech_start_time: Optional[float] = None
self._barge_in_min_duration_ms: int = settings.barge_in_min_duration_ms if hasattr(settings, 'barge_in_min_duration_ms') else 50 self._barge_in_min_duration_ms: int = settings.barge_in_min_duration_ms
self._barge_in_silence_tolerance_ms: int = settings.barge_in_silence_tolerance_ms
self._barge_in_speech_frames: int = 0 # Count speech frames self._barge_in_speech_frames: int = 0 # Count speech frames
self._barge_in_silence_frames: int = 0 # Count silence frames during potential barge-in self._barge_in_silence_frames: int = 0 # Count silence frames during potential barge-in
self._barge_in_silence_tolerance: int = 3 # Allow up to 3 silence frames (60ms at 20ms chunks)
# Runtime overrides injected from session.start metadata # Runtime overrides injected from session.start metadata
self._runtime_llm: Dict[str, Any] = {} self._runtime_llm: Dict[str, Any] = {}
@@ -415,6 +413,11 @@ class DuplexPipeline:
return self._runtime_barge_in_min_duration_ms return self._runtime_barge_in_min_duration_ms
return self._barge_in_min_duration_ms return self._barge_in_min_duration_ms
def _barge_in_silence_tolerance_frames(self) -> int:
"""Convert silence tolerance from ms to frame count using current chunk size."""
chunk_ms = max(1, settings.chunk_size_ms)
return max(1, int(np.ceil(self._barge_in_silence_tolerance_ms / chunk_ms)))
async def _generate_runtime_greeting(self) -> Optional[str]: async def _generate_runtime_greeting(self) -> Optional[str]:
if not self.llm_service: if not self.llm_service:
return None return None
@@ -679,7 +682,7 @@ class DuplexPipeline:
if self._barge_in_speech_start_time is not None: if self._barge_in_speech_start_time is not None:
self._barge_in_silence_frames += 1 self._barge_in_silence_frames += 1
# Allow brief silence gaps (VAD flickering) # Allow brief silence gaps (VAD flickering)
if self._barge_in_silence_frames > self._barge_in_silence_tolerance: if self._barge_in_silence_frames > self._barge_in_silence_tolerance_frames():
# Too much silence - reset barge-in tracking # Too much silence - reset barge-in tracking
logger.debug(f"Barge-in cancelled after {self._barge_in_silence_frames} silence frames") logger.debug(f"Barge-in cancelled after {self._barge_in_silence_frames} silence frames")
self._barge_in_speech_start_time = None self._barge_in_speech_start_time = None
@@ -927,9 +930,6 @@ class DuplexPipeline:
fn = item.get("function") fn = item.get("function")
if isinstance(fn, dict) and fn.get("name"): if isinstance(fn, dict) and fn.get("name"):
fn_name = str(fn.get("name")) fn_name = str(fn.get("name"))
executor = str(item.get("executor") or item.get("run_on") or "").strip().lower()
if executor in {"client", "server"}:
self._runtime_tool_executor[fn_name] = executor
schemas.append( schemas.append(
{ {
"type": "function", "type": "function",
@@ -943,10 +943,6 @@ class DuplexPipeline:
continue continue
if item.get("name"): if item.get("name"):
fn_name = str(item.get("name"))
executor = str(item.get("executor") or item.get("run_on") or "").strip().lower()
if executor in {"client", "server"}:
self._runtime_tool_executor[fn_name] = executor
schemas.append( schemas.append(
{ {
"type": "function", "type": "function",