From 35bd83767e5aeb6e43fb2d8f33dd926e1910b222 Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Thu, 12 Feb 2026 17:42:21 +0800 Subject: [PATCH] Cleanup engine --- engine/app/config.py | 8 ++++++++ engine/core/duplex_pipeline.py | 28 ++++++++++++---------------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/engine/app/config.py b/engine/app/config.py index 2eb4bbe..609b3ad 100644 --- a/engine/app/config.py +++ b/engine/app/config.py @@ -25,6 +25,10 @@ class Settings(BaseSettings): sample_rate: int = Field(default=16000, description="Audio sample rate in Hz") chunk_size_ms: int = Field(default=20, description="Audio chunk duration in milliseconds") default_codec: str = Field(default="pcm", description="Default audio codec") + max_audio_buffer_seconds: int = Field( + default=30, + description="Maximum buffered user audio duration kept in memory for current turn" + ) # VAD Configuration vad_type: str = Field(default="silero", description="VAD algorithm type") @@ -79,6 +83,10 @@ class Settings(BaseSettings): default=200, description="Minimum speech duration (ms) required to trigger barge-in. Lower=more sensitive." ) + barge_in_silence_tolerance_ms: int = Field( + default=60, + description="How much silence (ms) is tolerated during potential barge-in before reset" + ) # Logging log_level: str = Field(default="INFO", description="Logging level") diff --git a/engine/core/duplex_pipeline.py b/engine/core/duplex_pipeline.py index 6e7a0ea..61ea397 100644 --- a/engine/core/duplex_pipeline.py +++ b/engine/core/duplex_pipeline.py @@ -228,21 +228,19 @@ class DuplexPipeline: self._is_bot_speaking = False self._current_turn_task: Optional[asyncio.Task] = None self._audio_buffer: bytes = b"" - max_buffer_seconds = settings.max_audio_buffer_seconds if hasattr(settings, "max_audio_buffer_seconds") else 30 + max_buffer_seconds = settings.max_audio_buffer_seconds self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds) - self._asr_start_min_speech_ms: int = ( - settings.asr_start_min_speech_ms if hasattr(settings, "asr_start_min_speech_ms") else 160 - ) + self._asr_start_min_speech_ms: int = settings.asr_start_min_speech_ms self._asr_capture_active: bool = False self._pending_speech_audio: bytes = b"" # Keep a short rolling pre-speech window so VAD transition latency # does not clip the first phoneme/character sent to ASR. - pre_speech_ms = settings.asr_pre_speech_ms if hasattr(settings, "asr_pre_speech_ms") else 240 + pre_speech_ms = settings.asr_pre_speech_ms self._asr_pre_speech_bytes = int(settings.sample_rate * 2 * (pre_speech_ms / 1000.0)) self._pre_speech_buffer: bytes = b"" # Add a tiny trailing silence tail before final ASR to avoid # clipping the last phoneme at utterance boundaries. - asr_final_tail_ms = settings.asr_final_tail_ms if hasattr(settings, "asr_final_tail_ms") else 120 + asr_final_tail_ms = settings.asr_final_tail_ms self._asr_final_tail_bytes = int(settings.sample_rate * 2 * (asr_final_tail_ms / 1000.0)) self._last_vad_status: str = "Silence" self._process_lock = asyncio.Lock() @@ -261,10 +259,10 @@ class DuplexPipeline: # Barge-in filtering - require minimum speech duration to interrupt self._barge_in_speech_start_time: Optional[float] = None - self._barge_in_min_duration_ms: int = settings.barge_in_min_duration_ms if hasattr(settings, 'barge_in_min_duration_ms') else 50 + self._barge_in_min_duration_ms: int = settings.barge_in_min_duration_ms + self._barge_in_silence_tolerance_ms: int = settings.barge_in_silence_tolerance_ms self._barge_in_speech_frames: int = 0 # Count speech frames self._barge_in_silence_frames: int = 0 # Count silence frames during potential barge-in - self._barge_in_silence_tolerance: int = 3 # Allow up to 3 silence frames (60ms at 20ms chunks) # Runtime overrides injected from session.start metadata self._runtime_llm: Dict[str, Any] = {} @@ -415,6 +413,11 @@ class DuplexPipeline: return self._runtime_barge_in_min_duration_ms return self._barge_in_min_duration_ms + def _barge_in_silence_tolerance_frames(self) -> int: + """Convert silence tolerance from ms to frame count using current chunk size.""" + chunk_ms = max(1, settings.chunk_size_ms) + return max(1, int(np.ceil(self._barge_in_silence_tolerance_ms / chunk_ms))) + async def _generate_runtime_greeting(self) -> Optional[str]: if not self.llm_service: return None @@ -679,7 +682,7 @@ class DuplexPipeline: if self._barge_in_speech_start_time is not None: self._barge_in_silence_frames += 1 # Allow brief silence gaps (VAD flickering) - if self._barge_in_silence_frames > self._barge_in_silence_tolerance: + if self._barge_in_silence_frames > self._barge_in_silence_tolerance_frames(): # Too much silence - reset barge-in tracking logger.debug(f"Barge-in cancelled after {self._barge_in_silence_frames} silence frames") self._barge_in_speech_start_time = None @@ -927,9 +930,6 @@ class DuplexPipeline: fn = item.get("function") if isinstance(fn, dict) and fn.get("name"): fn_name = str(fn.get("name")) - executor = str(item.get("executor") or item.get("run_on") or "").strip().lower() - if executor in {"client", "server"}: - self._runtime_tool_executor[fn_name] = executor schemas.append( { "type": "function", @@ -943,10 +943,6 @@ class DuplexPipeline: continue if item.get("name"): - fn_name = str(item.get("name")) - executor = str(item.get("executor") or item.get("run_on") or "").strip().lower() - if executor in {"client", "server"}: - self._runtime_tool_executor[fn_name] = executor schemas.append( { "type": "function",