Add ASR capture timeout handling in DuplexPipeline and enhance EOU detection logic. Introduce _ASR_CAPTURE_MAX_MS constant and manage capture state timing to ensure timely end of utterance processing, even during silence. Update EouDetector to allow silence-only EOU when VAD state is lost.

This commit is contained in:
Xin Wang
2026-02-27 09:59:54 +08:00
parent 0b308f9bce
commit 403b4b93c7
2 changed files with 28 additions and 0 deletions

View File

@@ -72,6 +72,7 @@ class DuplexPipeline:
_PCM_FRAME_BYTES = 640 # 16k mono pcm_s16le, 20ms
_ASR_DELTA_THROTTLE_MS = 300
_LLM_DELTA_THROTTLE_MS = 80
_ASR_CAPTURE_MAX_MS = 15000
_DEFAULT_TOOL_SCHEMAS: Dict[str, Dict[str, Any]] = {
"current_time": {
"name": "current_time",
@@ -166,6 +167,7 @@ class DuplexPipeline:
self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
self._asr_start_min_speech_ms: int = settings.asr_start_min_speech_ms
self._asr_capture_active: bool = False
self._asr_capture_started_ms: float = 0.0
self._pending_speech_audio: bytes = b""
# Keep a short rolling pre-speech window so VAD transition latency
# does not clip the first phoneme/character sent to ASR.
@@ -1078,6 +1080,15 @@ class DuplexPipeline:
# 4. Check for End of Utterance - this triggers LLM response
if self.eou_detector.process(vad_status, force_eligible=self._asr_capture_active):
await self._on_end_of_utterance()
elif (
self._asr_capture_active
and self._asr_capture_started_ms > 0.0
and (time.monotonic() * 1000.0 - self._asr_capture_started_ms) >= self._ASR_CAPTURE_MAX_MS
):
logger.warning(
f"[EOU] Force finalize after ASR capture timeout: {self._ASR_CAPTURE_MAX_MS}ms"
)
await self._on_end_of_utterance()
elif (
vad_status == "Silence"
and not self.eou_detector.is_speaking
@@ -1180,6 +1191,7 @@ class DuplexPipeline:
self._last_sent_transcript = ""
self.eou_detector.reset()
self._asr_capture_active = False
self._asr_capture_started_ms = 0.0
self._pending_speech_audio = b""
# Clear ASR buffer. Interim starts only after ASR capture is activated.
@@ -1211,6 +1223,7 @@ class DuplexPipeline:
self._audio_buffer = capture_audio[-self._max_audio_buffer_bytes:]
self._asr_capture_active = True
self._asr_capture_started_ms = time.monotonic() * 1000.0
logger.debug(
f"ASR capture started after speech gate ({self._asr_start_min_speech_ms}ms), "
f"capture={len(capture_audio)} bytes"
@@ -1247,6 +1260,7 @@ class DuplexPipeline:
self._audio_buffer = b""
self._last_sent_transcript = ""
self._asr_capture_active = False
self._asr_capture_started_ms = 0.0
self._pending_speech_audio = b""
# Return to idle; don't force LISTENING which causes buffering on silence
await self.conversation.set_state(ConversationState.IDLE)
@@ -1272,6 +1286,7 @@ class DuplexPipeline:
self._pending_transcript_delta = ""
self._last_transcript_delta_emit_ms = 0.0
self._asr_capture_active = False
self._asr_capture_started_ms = 0.0
self._pending_speech_audio = b""
# Process the turn - trigger LLM response
@@ -2017,6 +2032,7 @@ class DuplexPipeline:
self._audio_buffer = b""
self.eou_detector.reset()
self._asr_capture_active = False
self._asr_capture_started_ms = 0.0
self._pending_speech_audio = b""
async def _stop_current_speech(self) -> None:

View File

@@ -53,6 +53,18 @@ class EouDetector:
if vad_status == "Silence":
if not self.is_speaking:
# Fallback path: ASR has already started but VAD speaking state
# was lost (e.g. VAD flicker). Allow silence-only EOU so the
# turn can still close instead of stalling forever.
if not force_eligible:
return False
if self.silence_start_time is None:
self.silence_start_time = now
silence_duration = now - self.silence_start_time
if silence_duration >= self.threshold and not self.triggered:
self.triggered = True
self.silence_start_time = None
return True
return False
if self.silence_start_time is None:
self.silence_start_time = now