Add ASR capture timeout handling in DuplexPipeline and enhance EOU detection logic. Introduce _ASR_CAPTURE_MAX_MS constant and manage capture state timing to ensure timely end of utterance processing, even during silence. Update EouDetector to allow silence-only EOU when VAD state is lost.

This commit is contained in:
Xin Wang
2026-02-27 09:59:54 +08:00
parent 0b308f9bce
commit 403b4b93c7
2 changed files with 28 additions and 0 deletions

View File

@@ -72,6 +72,7 @@ class DuplexPipeline:
     _PCM_FRAME_BYTES = 640  # 16k mono pcm_s16le, 20ms
     _ASR_DELTA_THROTTLE_MS = 300
     _LLM_DELTA_THROTTLE_MS = 80
+    _ASR_CAPTURE_MAX_MS = 15000
     _DEFAULT_TOOL_SCHEMAS: Dict[str, Dict[str, Any]] = {
         "current_time": {
             "name": "current_time",
@@ -166,6 +167,7 @@ class DuplexPipeline:
         self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
         self._asr_start_min_speech_ms: int = settings.asr_start_min_speech_ms
         self._asr_capture_active: bool = False
+        self._asr_capture_started_ms: float = 0.0
         self._pending_speech_audio: bytes = b""
         # Keep a short rolling pre-speech window so VAD transition latency
         # does not clip the first phoneme/character sent to ASR.
@@ -1078,6 +1080,15 @@ class DuplexPipeline:
         # 4. Check for End of Utterance - this triggers LLM response
         if self.eou_detector.process(vad_status, force_eligible=self._asr_capture_active):
             await self._on_end_of_utterance()
+        elif (
+            self._asr_capture_active
+            and self._asr_capture_started_ms > 0.0
+            and (time.monotonic() * 1000.0 - self._asr_capture_started_ms) >= self._ASR_CAPTURE_MAX_MS
+        ):
+            logger.warning(
+                f"[EOU] Force finalize after ASR capture timeout: {self._ASR_CAPTURE_MAX_MS}ms"
+            )
+            await self._on_end_of_utterance()
         elif (
             vad_status == "Silence"
             and not self.eou_detector.is_speaking
@@ -1180,6 +1191,7 @@ class DuplexPipeline:
         self._last_sent_transcript = ""
         self.eou_detector.reset()
         self._asr_capture_active = False
+        self._asr_capture_started_ms = 0.0
         self._pending_speech_audio = b""
         # Clear ASR buffer. Interim starts only after ASR capture is activated.
@@ -1211,6 +1223,7 @@ class DuplexPipeline:
         self._audio_buffer = capture_audio[-self._max_audio_buffer_bytes:]
         self._asr_capture_active = True
+        self._asr_capture_started_ms = time.monotonic() * 1000.0
         logger.debug(
             f"ASR capture started after speech gate ({self._asr_start_min_speech_ms}ms), "
             f"capture={len(capture_audio)} bytes"
@@ -1247,6 +1260,7 @@ class DuplexPipeline:
         self._audio_buffer = b""
         self._last_sent_transcript = ""
         self._asr_capture_active = False
+        self._asr_capture_started_ms = 0.0
         self._pending_speech_audio = b""
         # Return to idle; don't force LISTENING which causes buffering on silence
         await self.conversation.set_state(ConversationState.IDLE)
@@ -1272,6 +1286,7 @@ class DuplexPipeline:
         self._pending_transcript_delta = ""
         self._last_transcript_delta_emit_ms = 0.0
         self._asr_capture_active = False
+        self._asr_capture_started_ms = 0.0
         self._pending_speech_audio = b""
         # Process the turn - trigger LLM response
@@ -2017,6 +2032,7 @@ class DuplexPipeline:
         self._audio_buffer = b""
         self.eou_detector.reset()
         self._asr_capture_active = False
+        self._asr_capture_started_ms = 0.0
         self._pending_speech_audio = b""

     async def _stop_current_speech(self) -> None:

View File

@@ -53,6 +53,18 @@ class EouDetector:
         if vad_status == "Silence":
             if not self.is_speaking:
+                # Fallback path: ASR has already started but VAD speaking state
+                # was lost (e.g. VAD flicker). Allow silence-only EOU so the
+                # turn can still close instead of stalling forever.
+                if not force_eligible:
+                    return False
+                if self.silence_start_time is None:
+                    self.silence_start_time = now
+                silence_duration = now - self.silence_start_time
+                if silence_duration >= self.threshold and not self.triggered:
+                    self.triggered = True
+                    self.silence_start_time = None
+                    return True
                 return False
             if self.silence_start_time is None:
                 self.silence_start_time = now