From 0135f718f3b1b37b474aed277b58b8c5a11bfdf2 Mon Sep 17 00:00:00 2001
From: Xin Wang
Date: Mon, 9 Feb 2026 18:28:43 +0800
Subject: [PATCH] Fix asr boundary bug

---
 engine/app/config.py           |  8 ++++++++
 engine/core/duplex_pipeline.py | 33 +++++++++++++++++++++++++++++++--
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/engine/app/config.py b/engine/app/config.py
index 95b92e1..2888887 100644
--- a/engine/app/config.py
+++ b/engine/app/config.py
@@ -53,6 +53,14 @@ class Settings(BaseSettings):
     siliconflow_asr_model: str = Field(default="FunAudioLLM/SenseVoiceSmall", description="SiliconFlow ASR model")
     asr_interim_interval_ms: int = Field(default=500, description="Interval for interim ASR results in ms")
     asr_min_audio_ms: int = Field(default=300, description="Minimum audio duration before first ASR result")
+    asr_pre_speech_ms: int = Field(
+        default=240,
+        description="Audio context (ms) prepended before detected speech to avoid clipping first phoneme"
+    )
+    asr_final_tail_ms: int = Field(
+        default=120,
+        description="Silence tail (ms) appended before final ASR decode to protect utterance ending"
+    )
 
     # Duplex Pipeline Configuration
     duplex_enabled: bool = Field(default=True, description="Enable duplex voice pipeline")
diff --git a/engine/core/duplex_pipeline.py b/engine/core/duplex_pipeline.py
index aece39f..70f0016 100644
--- a/engine/core/duplex_pipeline.py
+++ b/engine/core/duplex_pipeline.py
@@ -119,6 +119,15 @@ class DuplexPipeline:
         self._audio_buffer: bytes = b""
         max_buffer_seconds = settings.max_audio_buffer_seconds if hasattr(settings, "max_audio_buffer_seconds") else 30
         self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
+        # Keep a short rolling pre-speech window so VAD transition latency
+        # does not clip the first phoneme/character sent to ASR.
+        pre_speech_ms = settings.asr_pre_speech_ms if hasattr(settings, "asr_pre_speech_ms") else 240
+        self._asr_pre_speech_bytes = int(settings.sample_rate * 2 * (pre_speech_ms / 1000.0))
+        self._pre_speech_buffer: bytes = b""
+        # Add a tiny trailing silence tail before final ASR to avoid
+        # clipping the last phoneme at utterance boundaries.
+        asr_final_tail_ms = settings.asr_final_tail_ms if hasattr(settings, "asr_final_tail_ms") else 120
+        self._asr_final_tail_bytes = int(settings.sample_rate * 2 * (asr_final_tail_ms / 1000.0))
         self._last_vad_status: str = "Silence"
         self._process_lock = asyncio.Lock()
@@ -285,6 +294,11 @@
 
         try:
             async with self._process_lock:
+                if pcm_bytes:
+                    self._pre_speech_buffer += pcm_bytes
+                    if len(self._pre_speech_buffer) > self._asr_pre_speech_bytes:
+                        self._pre_speech_buffer = self._pre_speech_buffer[-self._asr_pre_speech_bytes:]
+
                 # 1. Process through VAD
                 vad_result = self.vad_processor.process(pcm_bytes, settings.chunk_size_ms)
 
@@ -311,7 +325,7 @@
 
             # Update state based on VAD
             if vad_status == "Speech" and self._last_vad_status != "Speech":
-                await self._on_speech_start()
+                await self._on_speech_start(current_chunk=pcm_bytes)
 
             self._last_vad_status = vad_status
 
@@ -418,7 +432,7 @@
             logger.info(f"ASR interim: {text[:100]}")
         logger.debug(f"Sent transcript ({'final' if is_final else 'interim'}): {text[:50]}...")
 
-    async def _on_speech_start(self) -> None:
+    async def _on_speech_start(self, current_chunk: bytes = b"") -> None:
         """Handle user starting to speak."""
         if self.conversation.state in (ConversationState.IDLE, ConversationState.INTERRUPTED):
             await self.conversation.start_user_turn()
@@ -431,6 +445,16 @@
             self.asr_service.clear_buffer()
         if hasattr(self.asr_service, 'start_interim_transcription'):
             await self.asr_service.start_interim_transcription()
+        # Prime ASR with a short pre-speech context window so the utterance
+        # start isn't lost while waiting for VAD to transition to Speech.
+        pre_roll = self._pre_speech_buffer
+        if current_chunk and len(pre_roll) > len(current_chunk):
+            pre_roll = pre_roll[:-len(current_chunk)]
+        elif current_chunk:
+            pre_roll = b""
+        if pre_roll:
+            await self.asr_service.send_audio(pre_roll)
+            self._audio_buffer = pre_roll
 
         logger.debug("User speech started")
 
@@ -439,6 +463,11 @@
 
         if self.conversation.state not in (ConversationState.LISTENING, ConversationState.INTERRUPTED):
             return
+        # Add a tiny trailing silence tail to stabilize final-token decoding.
+        if self._asr_final_tail_bytes > 0:
+            final_tail = b"\x00" * self._asr_final_tail_bytes
+            await self.asr_service.send_audio(final_tail)
+
         # Stop interim transcriptions
         if hasattr(self.asr_service, 'stop_interim_transcription'):
             await self.asr_service.stop_interim_transcription()