voice barge-in works: filter false interruptions with a minimum speech duration

Author: Xin Wang
Date:   2026-01-29 17:47:15 +08:00
parent d6d0ade33e
commit aa4316de6f
3 changed files with 112 additions and 41 deletions


@@ -12,6 +12,7 @@ event-driven design.
"""
import asyncio
import time
from typing import Optional, Callable, Awaitable
from loguru import logger
@@ -112,6 +113,13 @@ class DuplexPipeline:
         # Interruption handling
         self._interrupt_event = asyncio.Event()
+        # Barge-in filtering - require minimum speech duration to interrupt
+        self._barge_in_speech_start_time: Optional[float] = None
+        self._barge_in_min_duration_ms: int = settings.barge_in_min_duration_ms if hasattr(settings, 'barge_in_min_duration_ms') else 50
+        self._barge_in_speech_frames: int = 0  # Count speech frames
+        self._barge_in_silence_frames: int = 0  # Count silence frames during potential barge-in
+        self._barge_in_silence_tolerance: int = 3  # Allow up to 3 silence frames (60ms at 20ms chunks)
         logger.info(f"DuplexPipeline initialized for session {session_id}")

     async def start(self) -> None:
@@ -218,8 +226,35 @@ class DuplexPipeline:
         self._last_vad_status = vad_status

         # 2. Check for barge-in (user speaking while bot speaking)
-        if self._is_bot_speaking and vad_status == "Speech":
-            await self._handle_barge_in()
+        # Filter false interruptions by requiring minimum speech duration
+        if self._is_bot_speaking:
+            if vad_status == "Speech":
+                # User is speaking while bot is speaking
+                self._barge_in_silence_frames = 0  # Reset silence counter
+                if self._barge_in_speech_start_time is None:
+                    # Start tracking speech duration
+                    self._barge_in_speech_start_time = time.time()
+                    self._barge_in_speech_frames = 1
+                    logger.debug("Potential barge-in detected, tracking duration...")
+                else:
+                    self._barge_in_speech_frames += 1
+                    # Check if speech duration exceeds threshold
+                    speech_duration_ms = (time.time() - self._barge_in_speech_start_time) * 1000
+                    if speech_duration_ms >= self._barge_in_min_duration_ms:
+                        logger.info(f"Barge-in confirmed after {speech_duration_ms:.0f}ms of speech ({self._barge_in_speech_frames} frames)")
+                        await self._handle_barge_in()
+            else:
+                # Silence frame during potential barge-in
+                if self._barge_in_speech_start_time is not None:
+                    self._barge_in_silence_frames += 1
+                    # Allow brief silence gaps (VAD flickering)
+                    if self._barge_in_silence_frames > self._barge_in_silence_tolerance:
+                        # Too much silence - reset barge-in tracking
+                        logger.debug(f"Barge-in cancelled after {self._barge_in_silence_frames} silence frames")
+                        self._barge_in_speech_start_time = None
+                        self._barge_in_speech_frames = 0
+                        self._barge_in_silence_frames = 0

         # 3. Buffer audio for ASR
         if vad_status == "Speech" or self.conversation.state == ConversationState.LISTENING:
@@ -334,6 +369,15 @@ class DuplexPipeline:
logger.info(f"EOU detected - user said: {user_text[:100]}...")
# Send final transcription to client
await self.transport.send_event({
"event": "transcript",
"trackId": self.session_id,
"text": user_text,
"isFinal": True,
"timestamp": self._get_timestamp_ms()
})
# Clear buffers
self._audio_buffer = b""
self._last_sent_transcript = ""
@@ -434,6 +478,10 @@ class DuplexPipeline:
             await self.conversation.end_assistant_turn(was_interrupted=True)
         finally:
             self._is_bot_speaking = False
+            # Reset barge-in tracking when bot finishes speaking
+            self._barge_in_speech_start_time = None
+            self._barge_in_speech_frames = 0
+            self._barge_in_silence_frames = 0

     async def _speak_sentence(self, text: str) -> None:
         """
@@ -508,6 +556,11 @@ class DuplexPipeline:
logger.info("Barge-in detected - interrupting bot speech")
# Reset barge-in tracking
self._barge_in_speech_start_time = None
self._barge_in_speech_frames = 0
self._barge_in_silence_frames = 0
# Signal interruption
self._interrupt_event.set()