Add ASR capture timeout handling in DuplexPipeline and enhance EOU detection.

Introduce the _ASR_CAPTURE_MAX_MS constant and track capture start time so that
end-of-utterance processing is forced after a timeout, even during prolonged
silence. Update EouDetector to permit a silence-only EOU when the VAD speaking
state has been lost (e.g. VAD flicker) but an ASR capture is already active.
This commit is contained in:
@@ -72,6 +72,7 @@ class DuplexPipeline:
|
||||
_PCM_FRAME_BYTES = 640 # 16k mono pcm_s16le, 20ms
|
||||
_ASR_DELTA_THROTTLE_MS = 300
|
||||
_LLM_DELTA_THROTTLE_MS = 80
|
||||
_ASR_CAPTURE_MAX_MS = 15000
|
||||
_DEFAULT_TOOL_SCHEMAS: Dict[str, Dict[str, Any]] = {
|
||||
"current_time": {
|
||||
"name": "current_time",
|
||||
@@ -166,6 +167,7 @@ class DuplexPipeline:
|
||||
self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
|
||||
self._asr_start_min_speech_ms: int = settings.asr_start_min_speech_ms
|
||||
self._asr_capture_active: bool = False
|
||||
self._asr_capture_started_ms: float = 0.0
|
||||
self._pending_speech_audio: bytes = b""
|
||||
# Keep a short rolling pre-speech window so VAD transition latency
|
||||
# does not clip the first phoneme/character sent to ASR.
|
||||
@@ -1078,6 +1080,15 @@ class DuplexPipeline:
|
||||
# 4. Check for End of Utterance - this triggers LLM response
|
||||
if self.eou_detector.process(vad_status, force_eligible=self._asr_capture_active):
|
||||
await self._on_end_of_utterance()
|
||||
elif (
|
||||
self._asr_capture_active
|
||||
and self._asr_capture_started_ms > 0.0
|
||||
and (time.monotonic() * 1000.0 - self._asr_capture_started_ms) >= self._ASR_CAPTURE_MAX_MS
|
||||
):
|
||||
logger.warning(
|
||||
f"[EOU] Force finalize after ASR capture timeout: {self._ASR_CAPTURE_MAX_MS}ms"
|
||||
)
|
||||
await self._on_end_of_utterance()
|
||||
elif (
|
||||
vad_status == "Silence"
|
||||
and not self.eou_detector.is_speaking
|
||||
@@ -1180,6 +1191,7 @@ class DuplexPipeline:
|
||||
self._last_sent_transcript = ""
|
||||
self.eou_detector.reset()
|
||||
self._asr_capture_active = False
|
||||
self._asr_capture_started_ms = 0.0
|
||||
self._pending_speech_audio = b""
|
||||
|
||||
# Clear ASR buffer. Interim starts only after ASR capture is activated.
|
||||
@@ -1211,6 +1223,7 @@ class DuplexPipeline:
|
||||
self._audio_buffer = capture_audio[-self._max_audio_buffer_bytes:]
|
||||
|
||||
self._asr_capture_active = True
|
||||
self._asr_capture_started_ms = time.monotonic() * 1000.0
|
||||
logger.debug(
|
||||
f"ASR capture started after speech gate ({self._asr_start_min_speech_ms}ms), "
|
||||
f"capture={len(capture_audio)} bytes"
|
||||
@@ -1247,6 +1260,7 @@ class DuplexPipeline:
|
||||
self._audio_buffer = b""
|
||||
self._last_sent_transcript = ""
|
||||
self._asr_capture_active = False
|
||||
self._asr_capture_started_ms = 0.0
|
||||
self._pending_speech_audio = b""
|
||||
# Return to idle; don't force LISTENING which causes buffering on silence
|
||||
await self.conversation.set_state(ConversationState.IDLE)
|
||||
@@ -1272,6 +1286,7 @@ class DuplexPipeline:
|
||||
self._pending_transcript_delta = ""
|
||||
self._last_transcript_delta_emit_ms = 0.0
|
||||
self._asr_capture_active = False
|
||||
self._asr_capture_started_ms = 0.0
|
||||
self._pending_speech_audio = b""
|
||||
|
||||
# Process the turn - trigger LLM response
|
||||
@@ -2017,6 +2032,7 @@ class DuplexPipeline:
|
||||
self._audio_buffer = b""
|
||||
self.eou_detector.reset()
|
||||
self._asr_capture_active = False
|
||||
self._asr_capture_started_ms = 0.0
|
||||
self._pending_speech_audio = b""
|
||||
|
||||
async def _stop_current_speech(self) -> None:
|
||||
|
||||
@@ -53,6 +53,18 @@ class EouDetector:
|
||||
|
||||
if vad_status == "Silence":
|
||||
if not self.is_speaking:
|
||||
# Fallback path: ASR has already started but VAD speaking state
|
||||
# was lost (e.g. VAD flicker). Allow silence-only EOU so the
|
||||
# turn can still close instead of stalling forever.
|
||||
if not force_eligible:
|
||||
return False
|
||||
if self.silence_start_time is None:
|
||||
self.silence_start_time = now
|
||||
silence_duration = now - self.silence_start_time
|
||||
if silence_duration >= self.threshold and not self.triggered:
|
||||
self.triggered = True
|
||||
self.silence_start_time = None
|
||||
return True
|
||||
return False
|
||||
if self.silence_start_time is None:
|
||||
self.silence_start_time = now
|
||||
|
||||
Reference in New Issue
Block a user