Add ASR capture timeout handling in DuplexPipeline and enhance EOU detection logic. Introduce _ASR_CAPTURE_MAX_MS constant and manage capture state timing to ensure timely end of utterance processing, even during silence. Update EouDetector to allow silence-only EOU when VAD state is lost.

2026-02-27 09:59:54 +08:00
parent 0b308f9bce
commit 403b4b93c7
2 changed files with 28 additions and 0 deletions
--- a/engine/core/duplex_pipeline.py
+++ b/engine/core/duplex_pipeline.py
@@ -72,6 +72,7 @@ class DuplexPipeline:
    _PCM_FRAME_BYTES = 640  # 16k mono pcm_s16le, 20ms
    _ASR_DELTA_THROTTLE_MS = 300
    _LLM_DELTA_THROTTLE_MS = 80
+    _ASR_CAPTURE_MAX_MS = 15000
    _DEFAULT_TOOL_SCHEMAS: Dict[str, Dict[str, Any]] = {
        "current_time": {
            "name": "current_time",
@@ -166,6 +167,7 @@ class DuplexPipeline:
        self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
        self._asr_start_min_speech_ms: int = settings.asr_start_min_speech_ms
        self._asr_capture_active: bool = False
+        self._asr_capture_started_ms: float = 0.0
        self._pending_speech_audio: bytes = b""
        # Keep a short rolling pre-speech window so VAD transition latency
        # does not clip the first phoneme/character sent to ASR.
@@ -1078,6 +1080,15 @@ class DuplexPipeline:
                # 4. Check for End of Utterance - this triggers LLM response
                if self.eou_detector.process(vad_status, force_eligible=self._asr_capture_active):
                    await self._on_end_of_utterance()
+                elif (
+                    self._asr_capture_active
+                    and self._asr_capture_started_ms > 0.0
+                    and (time.monotonic() * 1000.0 - self._asr_capture_started_ms) >= self._ASR_CAPTURE_MAX_MS
+                ):
+                    logger.warning(
+                        f"[EOU] Force finalize after ASR capture timeout: {self._ASR_CAPTURE_MAX_MS}ms"
+                    )
+                    await self._on_end_of_utterance()
                elif (
                    vad_status == "Silence"
                    and not self.eou_detector.is_speaking
@@ -1180,6 +1191,7 @@ class DuplexPipeline:
            self._last_sent_transcript = ""
            self.eou_detector.reset()
            self._asr_capture_active = False
+            self._asr_capture_started_ms = 0.0
            self._pending_speech_audio = b""

            # Clear ASR buffer. Interim starts only after ASR capture is activated.
@@ -1211,6 +1223,7 @@ class DuplexPipeline:
            self._audio_buffer = capture_audio[-self._max_audio_buffer_bytes:]

        self._asr_capture_active = True
+        self._asr_capture_started_ms = time.monotonic() * 1000.0
        logger.debug(
            f"ASR capture started after speech gate ({self._asr_start_min_speech_ms}ms), "
            f"capture={len(capture_audio)} bytes"
@@ -1247,6 +1260,7 @@ class DuplexPipeline:
            self._audio_buffer = b""
            self._last_sent_transcript = ""
            self._asr_capture_active = False
+            self._asr_capture_started_ms = 0.0
            self._pending_speech_audio = b""
            # Return to idle; don't force LISTENING which causes buffering on silence
            await self.conversation.set_state(ConversationState.IDLE)
@@ -1272,6 +1286,7 @@ class DuplexPipeline:
        self._pending_transcript_delta = ""
        self._last_transcript_delta_emit_ms = 0.0
        self._asr_capture_active = False
+        self._asr_capture_started_ms = 0.0
        self._pending_speech_audio = b""

        # Process the turn - trigger LLM response
@@ -2017,6 +2032,7 @@ class DuplexPipeline:
        self._audio_buffer = b""
        self.eou_detector.reset()
        self._asr_capture_active = False
+        self._asr_capture_started_ms = 0.0
        self._pending_speech_audio = b""

    async def _stop_current_speech(self) -> None:
--- a/engine/processors/eou.py
+++ b/engine/processors/eou.py
@@ -53,6 +53,18 @@ class EouDetector:

        if vad_status == "Silence":
            if not self.is_speaking:
+                # Fallback path: ASR has already started but VAD speaking state
+                # was lost (e.g. VAD flicker). Allow silence-only EOU so the
+                # turn can still close instead of stalling forever.
+                if not force_eligible:
+                    return False
+                if self.silence_start_time is None:
+                    self.silence_start_time = now
+                silence_duration = now - self.silence_start_time
+                if silence_duration >= self.threshold and not self.triggered:
+                    self.triggered = True
+                    self.silence_start_time = None
+                    return True
                return False
            if self.silence_start_time is None:
                self.silence_start_time = now