diff --git a/engine/core/duplex_pipeline.py b/engine/core/duplex_pipeline.py index e360951..3fe5e26 100644 --- a/engine/core/duplex_pipeline.py +++ b/engine/core/duplex_pipeline.py @@ -72,6 +72,7 @@ class DuplexPipeline: _PCM_FRAME_BYTES = 640 # 16k mono pcm_s16le, 20ms _ASR_DELTA_THROTTLE_MS = 300 _LLM_DELTA_THROTTLE_MS = 80 + _ASR_CAPTURE_MAX_MS = 15000 _DEFAULT_TOOL_SCHEMAS: Dict[str, Dict[str, Any]] = { "current_time": { "name": "current_time", @@ -166,6 +167,7 @@ class DuplexPipeline: self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds) self._asr_start_min_speech_ms: int = settings.asr_start_min_speech_ms self._asr_capture_active: bool = False + self._asr_capture_started_ms: float = 0.0 self._pending_speech_audio: bytes = b"" # Keep a short rolling pre-speech window so VAD transition latency # does not clip the first phoneme/character sent to ASR. @@ -1078,6 +1080,15 @@ class DuplexPipeline: # 4. Check for End of Utterance - this triggers LLM response if self.eou_detector.process(vad_status, force_eligible=self._asr_capture_active): await self._on_end_of_utterance() + elif ( + self._asr_capture_active + and self._asr_capture_started_ms > 0.0 + and (time.monotonic() * 1000.0 - self._asr_capture_started_ms) >= self._ASR_CAPTURE_MAX_MS + ): + logger.warning( + f"[EOU] Force finalize after ASR capture timeout: {self._ASR_CAPTURE_MAX_MS}ms" + ) + await self._on_end_of_utterance() elif ( vad_status == "Silence" and not self.eou_detector.is_speaking @@ -1180,6 +1191,7 @@ class DuplexPipeline: self._last_sent_transcript = "" self.eou_detector.reset() self._asr_capture_active = False + self._asr_capture_started_ms = 0.0 self._pending_speech_audio = b"" # Clear ASR buffer. Interim starts only after ASR capture is activated. @@ -1211,6 +1223,7 @@ class DuplexPipeline: self._audio_buffer = capture_audio[-self._max_audio_buffer_bytes:] self._asr_capture_active = True + self._asr_capture_started_ms = time.monotonic() * 1000.0 logger.debug( f"ASR capture started after speech gate ({self._asr_start_min_speech_ms}ms), " f"capture={len(capture_audio)} bytes" @@ -1247,6 +1260,7 @@ class DuplexPipeline: self._audio_buffer = b"" self._last_sent_transcript = "" self._asr_capture_active = False + self._asr_capture_started_ms = 0.0 self._pending_speech_audio = b"" # Return to idle; don't force LISTENING which causes buffering on silence await self.conversation.set_state(ConversationState.IDLE) @@ -1272,6 +1286,7 @@ class DuplexPipeline: self._pending_transcript_delta = "" self._last_transcript_delta_emit_ms = 0.0 self._asr_capture_active = False + self._asr_capture_started_ms = 0.0 self._pending_speech_audio = b"" # Process the turn - trigger LLM response @@ -2017,6 +2032,7 @@ class DuplexPipeline: self._audio_buffer = b"" self.eou_detector.reset() self._asr_capture_active = False + self._asr_capture_started_ms = 0.0 self._pending_speech_audio = b"" async def _stop_current_speech(self) -> None: diff --git a/engine/processors/eou.py b/engine/processors/eou.py index 22d104f..7a3758b 100644 --- a/engine/processors/eou.py +++ b/engine/processors/eou.py @@ -53,6 +53,18 @@ class EouDetector: if vad_status == "Silence": if not self.is_speaking: + # Fallback path: ASR has already started but VAD speaking state + # was lost (e.g. VAD flicker). Allow silence-only EOU so the + # turn can still close instead of stalling forever. + if not force_eligible: + return False + if self.silence_start_time is None: + self.silence_start_time = now + silence_duration = now - self.silence_start_time + if silence_duration >= self.threshold and not self.triggered: + self.triggered = True + self.silence_start_time = None + return True return False if self.silence_start_time is None: self.silence_start_time = now