diff --git a/engine/app/config.py b/engine/app/config.py index f1bb729..2eb4bbe 100644 --- a/engine/app/config.py +++ b/engine/app/config.py @@ -53,6 +53,10 @@ class Settings(BaseSettings): siliconflow_asr_model: str = Field(default="FunAudioLLM/SenseVoiceSmall", description="SiliconFlow ASR model") asr_interim_interval_ms: int = Field(default=500, description="Interval for interim ASR results in ms") asr_min_audio_ms: int = Field(default=300, description="Minimum audio duration before first ASR result") + asr_start_min_speech_ms: int = Field( + default=160, + description="Minimum continuous speech duration before ASR capture starts" + ) asr_pre_speech_ms: int = Field( default=240, description="Audio context (ms) prepended before detected speech to avoid clipping first phoneme" diff --git a/engine/core/duplex_pipeline.py b/engine/core/duplex_pipeline.py index 2f45387..a4fe91b 100644 --- a/engine/core/duplex_pipeline.py +++ b/engine/core/duplex_pipeline.py @@ -230,6 +230,11 @@ class DuplexPipeline: self._audio_buffer: bytes = b"" max_buffer_seconds = settings.max_audio_buffer_seconds if hasattr(settings, "max_audio_buffer_seconds") else 30 self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds) + self._asr_start_min_speech_ms: int = ( + settings.asr_start_min_speech_ms if hasattr(settings, "asr_start_min_speech_ms") else 160 + ) + self._asr_capture_active: bool = False + self._pending_speech_audio: bytes = b"" # Keep a short rolling pre-speech window so VAD transition latency # does not clip the first phoneme/character sent to ASR. pre_speech_ms = settings.asr_pre_speech_ms if hasattr(settings, "asr_pre_speech_ms") else 240 @@ -646,7 +651,7 @@ class DuplexPipeline: # Update state based on VAD if vad_status == "Speech" and self._last_vad_status != "Speech": - await self._on_speech_start(current_chunk=pcm_bytes) + await self._on_speech_start() self._last_vad_status = vad_status @@ -685,20 +690,44 @@ class DuplexPipeline: self._barge_in_speech_frames = 0 self._barge_in_silence_frames = 0 - # 3. Buffer audio for ASR - if vad_status == "Speech" or self.conversation.state == ConversationState.LISTENING: - self._audio_buffer += pcm_bytes - if len(self._audio_buffer) > self._max_audio_buffer_bytes: - # Keep only the most recent audio to cap memory usage - self._audio_buffer = self._audio_buffer[-self._max_audio_buffer_bytes:] - await self.asr_service.send_audio(pcm_bytes) + # 3. Buffer audio for ASR. + # Gate ASR startup by a short speech-duration threshold to reduce + # false positives from micro noises, then always close the turn + # by EOU once ASR has started. + just_started_asr = False + if vad_status == "Speech" and not self._asr_capture_active: + self._pending_speech_audio += pcm_bytes + pending_ms = (len(self._pending_speech_audio) / (settings.sample_rate * 2)) * 1000.0 + if pending_ms >= self._asr_start_min_speech_ms: + await self._start_asr_capture() + just_started_asr = True + + if self._asr_capture_active: + if not just_started_asr: + self._audio_buffer += pcm_bytes + if len(self._audio_buffer) > self._max_audio_buffer_bytes: + # Keep only the most recent audio to cap memory usage + self._audio_buffer = self._audio_buffer[-self._max_audio_buffer_bytes:] + await self.asr_service.send_audio(pcm_bytes) # For SiliconFlow ASR, trigger interim transcription periodically # The service handles timing internally via start_interim_transcription() # 4. Check for End of Utterance - this triggers LLM response - if self.eou_detector.process(vad_status): + if self.eou_detector.process(vad_status, force_eligible=self._asr_capture_active): await self._on_end_of_utterance() + elif ( + vad_status == "Silence" + and not self.eou_detector.is_speaking + and not self._asr_capture_active + and self.conversation.state == ConversationState.LISTENING + ): + # Speech was too short to pass ASR gate; reset turn so next + # utterance can start cleanly. + self._pending_speech_audio = b"" + self._audio_buffer = b"" + self._last_sent_transcript = "" + await self.conversation.set_state(ConversationState.IDLE) except Exception as e: logger.error(f"Pipeline audio processing error: {e}", exc_info=True) @@ -757,32 +786,44 @@ class DuplexPipeline: logger.info(f"[ASR] ASR interim: {text[:100]}") logger.debug(f"Sent transcript ({'final' if is_final else 'interim'}): {text[:50]}...") - async def _on_speech_start(self, current_chunk: bytes = b"") -> None: + async def _on_speech_start(self) -> None: """Handle user starting to speak.""" if self.conversation.state in (ConversationState.IDLE, ConversationState.INTERRUPTED): await self.conversation.start_user_turn() self._audio_buffer = b"" self._last_sent_transcript = "" self.eou_detector.reset() + self._asr_capture_active = False + self._pending_speech_audio = b"" - # Clear ASR buffer and start interim transcriptions + # Clear ASR buffer. Interim starts only after ASR capture is activated. if hasattr(self.asr_service, 'clear_buffer'): self.asr_service.clear_buffer() - if hasattr(self.asr_service, 'start_interim_transcription'): - await self.asr_service.start_interim_transcription() - # Prime ASR with a short pre-speech context window so the utterance - # start isn't lost while waiting for VAD to transition to Speech. - pre_roll = self._pre_speech_buffer - if current_chunk and len(pre_roll) > len(current_chunk): - pre_roll = pre_roll[:-len(current_chunk)] - elif current_chunk: - pre_roll = b"" - if pre_roll: - await self.asr_service.send_audio(pre_roll) - self._audio_buffer = pre_roll logger.debug("User speech started") + async def _start_asr_capture(self) -> None: + """Start ASR capture for the current turn after min speech gate passes.""" + if self._asr_capture_active: + return + + if hasattr(self.asr_service, 'start_interim_transcription'): + await self.asr_service.start_interim_transcription() + + # Prime ASR with a short pre-speech context window so the utterance + # start isn't lost while waiting for VAD to transition to Speech. + pre_roll = self._pre_speech_buffer + capture_audio = pre_roll + self._pending_speech_audio + if capture_audio: + await self.asr_service.send_audio(capture_audio) + self._audio_buffer = capture_audio[-self._max_audio_buffer_bytes:] + + self._asr_capture_active = True + logger.debug( + f"ASR capture started after speech gate ({self._asr_start_min_speech_ms}ms), " + f"capture={len(capture_audio)} bytes" + ) + async def _on_end_of_utterance(self) -> None: """Handle end of user utterance.""" if self.conversation.state not in (ConversationState.LISTENING, ConversationState.INTERRUPTED): @@ -813,6 +854,8 @@ class DuplexPipeline: # Reset for next utterance self._audio_buffer = b"" self._last_sent_transcript = "" + self._asr_capture_active = False + self._pending_speech_audio = b"" # Return to idle; don't force LISTENING which causes buffering on silence await self.conversation.set_state(ConversationState.IDLE) return @@ -833,6 +876,8 @@ class DuplexPipeline: # Clear buffers self._audio_buffer = b"" self._last_sent_transcript = "" + self._asr_capture_active = False + self._pending_speech_audio = b"" # Process the turn - trigger LLM response # Cancel any existing turn to avoid overlapping assistant responses @@ -1493,6 +1538,8 @@ class DuplexPipeline: await self.conversation.start_user_turn() self._audio_buffer = b"" self.eou_detector.reset() + self._asr_capture_active = False + self._pending_speech_audio = b"" async def _stop_current_speech(self) -> None: """Stop any current speech task.""" diff --git a/engine/processors/eou.py b/engine/processors/eou.py index baf6807..22d104f 100644 --- a/engine/processors/eou.py +++ b/engine/processors/eou.py @@ -30,7 +30,7 @@ class EouDetector: self.silence_start_time: Optional[float] = None self.triggered = False - def process(self, vad_status: str) -> bool: + def process(self, vad_status: str, force_eligible: bool = False) -> bool: """ Process VAD status and detect end of utterance. @@ -58,7 +58,7 @@ class EouDetector: self.silence_start_time = now speech_duration = self.silence_start_time - self.speech_start_time - if speech_duration < self.min_speech: + if speech_duration < self.min_speech and not force_eligible: self.is_speaking = False self.silence_start_time = None return False