From 0135f718f3b1b37b474aed277b58b8c5a11bfdf2 Mon Sep 17 00:00:00 2001
From: Xin Wang
Date: Mon, 9 Feb 2026 18:28:43 +0800
Subject: [PATCH] Fix asr boundary bug

---
 engine/app/config.py           |  8 ++++++++
 engine/core/duplex_pipeline.py | 33 +++++++++++++++++++++++++++++++--
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/engine/app/config.py b/engine/app/config.py
index 95b92e1..2888887 100644
--- a/engine/app/config.py
+++ b/engine/app/config.py
@@ -53,6 +53,14 @@ class Settings(BaseSettings):
     siliconflow_asr_model: str = Field(default="FunAudioLLM/SenseVoiceSmall", description="SiliconFlow ASR model")
     asr_interim_interval_ms: int = Field(default=500, description="Interval for interim ASR results in ms")
     asr_min_audio_ms: int = Field(default=300, description="Minimum audio duration before first ASR result")
+    asr_pre_speech_ms: int = Field(
+        default=240,
+        description="Audio context (ms) prepended before detected speech to avoid clipping first phoneme"
+    )
+    asr_final_tail_ms: int = Field(
+        default=120,
+        description="Silence tail (ms) appended before final ASR decode to protect utterance ending"
+    )
 
     # Duplex Pipeline Configuration
     duplex_enabled: bool = Field(default=True, description="Enable duplex voice pipeline")
diff --git a/engine/core/duplex_pipeline.py b/engine/core/duplex_pipeline.py
index aece39f..70f0016 100644
--- a/engine/core/duplex_pipeline.py
+++ b/engine/core/duplex_pipeline.py
@@ -119,6 +119,15 @@ class DuplexPipeline:
         self._audio_buffer: bytes = b""
         max_buffer_seconds = settings.max_audio_buffer_seconds if hasattr(settings, "max_audio_buffer_seconds") else 30
         self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
+        # Keep a short rolling pre-speech window so VAD transition latency
+        # does not clip the first phoneme/character sent to ASR.
+        pre_speech_ms = settings.asr_pre_speech_ms if hasattr(settings, "asr_pre_speech_ms") else 240
+        self._asr_pre_speech_bytes = int(settings.sample_rate * 2 * (pre_speech_ms / 1000.0))
+        self._pre_speech_buffer: bytes = b""
+        # Add a tiny trailing silence tail before final ASR to avoid
+        # clipping the last phoneme at utterance boundaries.
+        asr_final_tail_ms = settings.asr_final_tail_ms if hasattr(settings, "asr_final_tail_ms") else 120
+        self._asr_final_tail_bytes = int(settings.sample_rate * 2 * (asr_final_tail_ms / 1000.0))
         self._last_vad_status: str = "Silence"
         self._process_lock = asyncio.Lock()
@@ -285,6 +294,11 @@
 
         try:
             async with self._process_lock:
+                if pcm_bytes:
+                    self._pre_speech_buffer += pcm_bytes
+                    if len(self._pre_speech_buffer) > self._asr_pre_speech_bytes:
+                        self._pre_speech_buffer = self._pre_speech_buffer[-self._asr_pre_speech_bytes:]
+
                 # 1. Process through VAD
                 vad_result = self.vad_processor.process(pcm_bytes, settings.chunk_size_ms)
 
@@ -311,7 +325,7 @@
 
             # Update state based on VAD
             if vad_status == "Speech" and self._last_vad_status != "Speech":
-                await self._on_speech_start()
+                await self._on_speech_start(current_chunk=pcm_bytes)
 
             self._last_vad_status = vad_status
 
@@ -418,7 +432,7 @@
             logger.info(f"ASR interim: {text[:100]}")
         logger.debug(f"Sent transcript ({'final' if is_final else 'interim'}): {text[:50]}...")
 
-    async def _on_speech_start(self) -> None:
+    async def _on_speech_start(self, current_chunk: bytes = b"") -> None:
         """Handle user starting to speak."""
         if self.conversation.state in (ConversationState.IDLE, ConversationState.INTERRUPTED):
             await self.conversation.start_user_turn()
@@ -431,6 +445,16 @@
             self.asr_service.clear_buffer()
         if hasattr(self.asr_service, 'start_interim_transcription'):
             await self.asr_service.start_interim_transcription()
+        # Prime ASR with a short pre-speech context window so the utterance
+        # start isn't lost while waiting for VAD to transition to Speech.
+        pre_roll = self._pre_speech_buffer
+        if current_chunk and len(pre_roll) > len(current_chunk):
+            pre_roll = pre_roll[:-len(current_chunk)]
+        elif current_chunk:
+            pre_roll = b""
+        if pre_roll:
+            await self.asr_service.send_audio(pre_roll)
+            self._audio_buffer = pre_roll
 
         logger.debug("User speech started")
 
@@ -439,6 +463,11 @@
 
         if self.conversation.state not in (ConversationState.LISTENING, ConversationState.INTERRUPTED):
             return
+        # Add a tiny trailing silence tail to stabilize final-token decoding.
+        if self._asr_final_tail_bytes > 0:
+            final_tail = b"\x00" * self._asr_final_tail_bytes
+            await self.asr_service.send_audio(final_tail)
+
         # Stop interim transcriptions
         if hasattr(self.asr_service, 'stop_interim_transcription'):
             await self.asr_service.stop_interim_transcription()