Add ASR capture timeout handling in DuplexPipeline and enhance EOU detection logic. Introduce _ASR_CAPTURE_MAX_MS constant and manage capture state timing to ensure timely end of utterance processing, even during silence. Update EouDetector to allow silence-only EOU when VAD state is lost.
This commit is contained in:
@@ -72,6 +72,7 @@ class DuplexPipeline:
|
|||||||
_PCM_FRAME_BYTES = 640 # 16k mono pcm_s16le, 20ms
|
_PCM_FRAME_BYTES = 640 # 16k mono pcm_s16le, 20ms
|
||||||
_ASR_DELTA_THROTTLE_MS = 300
|
_ASR_DELTA_THROTTLE_MS = 300
|
||||||
_LLM_DELTA_THROTTLE_MS = 80
|
_LLM_DELTA_THROTTLE_MS = 80
|
||||||
|
_ASR_CAPTURE_MAX_MS = 15000
|
||||||
_DEFAULT_TOOL_SCHEMAS: Dict[str, Dict[str, Any]] = {
|
_DEFAULT_TOOL_SCHEMAS: Dict[str, Dict[str, Any]] = {
|
||||||
"current_time": {
|
"current_time": {
|
||||||
"name": "current_time",
|
"name": "current_time",
|
||||||
@@ -166,6 +167,7 @@ class DuplexPipeline:
|
|||||||
self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
|
self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
|
||||||
self._asr_start_min_speech_ms: int = settings.asr_start_min_speech_ms
|
self._asr_start_min_speech_ms: int = settings.asr_start_min_speech_ms
|
||||||
self._asr_capture_active: bool = False
|
self._asr_capture_active: bool = False
|
||||||
|
self._asr_capture_started_ms: float = 0.0
|
||||||
self._pending_speech_audio: bytes = b""
|
self._pending_speech_audio: bytes = b""
|
||||||
# Keep a short rolling pre-speech window so VAD transition latency
|
# Keep a short rolling pre-speech window so VAD transition latency
|
||||||
# does not clip the first phoneme/character sent to ASR.
|
# does not clip the first phoneme/character sent to ASR.
|
||||||
@@ -1078,6 +1080,15 @@ class DuplexPipeline:
|
|||||||
# 4. Check for End of Utterance - this triggers LLM response
|
# 4. Check for End of Utterance - this triggers LLM response
|
||||||
if self.eou_detector.process(vad_status, force_eligible=self._asr_capture_active):
|
if self.eou_detector.process(vad_status, force_eligible=self._asr_capture_active):
|
||||||
await self._on_end_of_utterance()
|
await self._on_end_of_utterance()
|
||||||
|
elif (
|
||||||
|
self._asr_capture_active
|
||||||
|
and self._asr_capture_started_ms > 0.0
|
||||||
|
and (time.monotonic() * 1000.0 - self._asr_capture_started_ms) >= self._ASR_CAPTURE_MAX_MS
|
||||||
|
):
|
||||||
|
logger.warning(
|
||||||
|
f"[EOU] Force finalize after ASR capture timeout: {self._ASR_CAPTURE_MAX_MS}ms"
|
||||||
|
)
|
||||||
|
await self._on_end_of_utterance()
|
||||||
elif (
|
elif (
|
||||||
vad_status == "Silence"
|
vad_status == "Silence"
|
||||||
and not self.eou_detector.is_speaking
|
and not self.eou_detector.is_speaking
|
||||||
@@ -1180,6 +1191,7 @@ class DuplexPipeline:
|
|||||||
self._last_sent_transcript = ""
|
self._last_sent_transcript = ""
|
||||||
self.eou_detector.reset()
|
self.eou_detector.reset()
|
||||||
self._asr_capture_active = False
|
self._asr_capture_active = False
|
||||||
|
self._asr_capture_started_ms = 0.0
|
||||||
self._pending_speech_audio = b""
|
self._pending_speech_audio = b""
|
||||||
|
|
||||||
# Clear ASR buffer. Interim starts only after ASR capture is activated.
|
# Clear ASR buffer. Interim starts only after ASR capture is activated.
|
||||||
@@ -1211,6 +1223,7 @@ class DuplexPipeline:
|
|||||||
self._audio_buffer = capture_audio[-self._max_audio_buffer_bytes:]
|
self._audio_buffer = capture_audio[-self._max_audio_buffer_bytes:]
|
||||||
|
|
||||||
self._asr_capture_active = True
|
self._asr_capture_active = True
|
||||||
|
self._asr_capture_started_ms = time.monotonic() * 1000.0
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"ASR capture started after speech gate ({self._asr_start_min_speech_ms}ms), "
|
f"ASR capture started after speech gate ({self._asr_start_min_speech_ms}ms), "
|
||||||
f"capture={len(capture_audio)} bytes"
|
f"capture={len(capture_audio)} bytes"
|
||||||
@@ -1247,6 +1260,7 @@ class DuplexPipeline:
|
|||||||
self._audio_buffer = b""
|
self._audio_buffer = b""
|
||||||
self._last_sent_transcript = ""
|
self._last_sent_transcript = ""
|
||||||
self._asr_capture_active = False
|
self._asr_capture_active = False
|
||||||
|
self._asr_capture_started_ms = 0.0
|
||||||
self._pending_speech_audio = b""
|
self._pending_speech_audio = b""
|
||||||
# Return to idle; don't force LISTENING which causes buffering on silence
|
# Return to idle; don't force LISTENING which causes buffering on silence
|
||||||
await self.conversation.set_state(ConversationState.IDLE)
|
await self.conversation.set_state(ConversationState.IDLE)
|
||||||
@@ -1272,6 +1286,7 @@ class DuplexPipeline:
|
|||||||
self._pending_transcript_delta = ""
|
self._pending_transcript_delta = ""
|
||||||
self._last_transcript_delta_emit_ms = 0.0
|
self._last_transcript_delta_emit_ms = 0.0
|
||||||
self._asr_capture_active = False
|
self._asr_capture_active = False
|
||||||
|
self._asr_capture_started_ms = 0.0
|
||||||
self._pending_speech_audio = b""
|
self._pending_speech_audio = b""
|
||||||
|
|
||||||
# Process the turn - trigger LLM response
|
# Process the turn - trigger LLM response
|
||||||
@@ -2017,6 +2032,7 @@ class DuplexPipeline:
|
|||||||
self._audio_buffer = b""
|
self._audio_buffer = b""
|
||||||
self.eou_detector.reset()
|
self.eou_detector.reset()
|
||||||
self._asr_capture_active = False
|
self._asr_capture_active = False
|
||||||
|
self._asr_capture_started_ms = 0.0
|
||||||
self._pending_speech_audio = b""
|
self._pending_speech_audio = b""
|
||||||
|
|
||||||
async def _stop_current_speech(self) -> None:
|
async def _stop_current_speech(self) -> None:
|
||||||
|
|||||||
@@ -53,6 +53,18 @@ class EouDetector:
|
|||||||
|
|
||||||
if vad_status == "Silence":
|
if vad_status == "Silence":
|
||||||
if not self.is_speaking:
|
if not self.is_speaking:
|
||||||
|
# Fallback path: ASR has already started but VAD speaking state
|
||||||
|
# was lost (e.g. VAD flicker). Allow silence-only EOU so the
|
||||||
|
# turn can still close instead of stalling forever.
|
||||||
|
if not force_eligible:
|
||||||
|
return False
|
||||||
|
if self.silence_start_time is None:
|
||||||
|
self.silence_start_time = now
|
||||||
|
silence_duration = now - self.silence_start_time
|
||||||
|
if silence_duration >= self.threshold and not self.triggered:
|
||||||
|
self.triggered = True
|
||||||
|
self.silence_start_time = None
|
||||||
|
return True
|
||||||
return False
|
return False
|
||||||
if self.silence_start_time is None:
|
if self.silence_start_time is None:
|
||||||
self.silence_start_time = now
|
self.silence_start_time = now
|
||||||
|
|||||||
Reference in New Issue
Block a user