Voice barge-in is OK: require a minimum speech duration before interrupting the bot
This commit is contained in:
@@ -12,6 +12,7 @@ event-driven design.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Optional, Callable, Awaitable
|
||||
from loguru import logger
|
||||
|
||||
@@ -112,6 +113,13 @@ class DuplexPipeline:
|
||||
# Interruption handling
|
||||
self._interrupt_event = asyncio.Event()
|
||||
|
||||
# Barge-in filtering - require minimum speech duration to interrupt
|
||||
self._barge_in_speech_start_time: Optional[float] = None
|
||||
self._barge_in_min_duration_ms: int = settings.barge_in_min_duration_ms if hasattr(settings, 'barge_in_min_duration_ms') else 50
|
||||
self._barge_in_speech_frames: int = 0 # Count speech frames
|
||||
self._barge_in_silence_frames: int = 0 # Count silence frames during potential barge-in
|
||||
self._barge_in_silence_tolerance: int = 3 # Allow up to 3 silence frames (60ms at 20ms chunks)
|
||||
|
||||
logger.info(f"DuplexPipeline initialized for session {session_id}")
|
||||
|
||||
async def start(self) -> None:
|
||||
@@ -218,8 +226,35 @@ class DuplexPipeline:
|
||||
self._last_vad_status = vad_status
|
||||
|
||||
# 2. Check for barge-in (user speaking while the bot is speaking)
|
||||
if self._is_bot_speaking and vad_status == "Speech":
|
||||
await self._handle_barge_in()
|
||||
# Filter false interruptions by requiring minimum speech duration
|
||||
if self._is_bot_speaking:
|
||||
if vad_status == "Speech":
|
||||
# User is speaking while bot is speaking
|
||||
self._barge_in_silence_frames = 0 # Reset silence counter
|
||||
|
||||
if self._barge_in_speech_start_time is None:
|
||||
# Start tracking speech duration
|
||||
self._barge_in_speech_start_time = time.time()
|
||||
self._barge_in_speech_frames = 1
|
||||
logger.debug("Potential barge-in detected, tracking duration...")
|
||||
else:
|
||||
self._barge_in_speech_frames += 1
|
||||
# Check if speech duration exceeds threshold
|
||||
speech_duration_ms = (time.time() - self._barge_in_speech_start_time) * 1000
|
||||
if speech_duration_ms >= self._barge_in_min_duration_ms:
|
||||
logger.info(f"Barge-in confirmed after {speech_duration_ms:.0f}ms of speech ({self._barge_in_speech_frames} frames)")
|
||||
await self._handle_barge_in()
|
||||
else:
|
||||
# Silence frame during potential barge-in
|
||||
if self._barge_in_speech_start_time is not None:
|
||||
self._barge_in_silence_frames += 1
|
||||
# Allow brief silence gaps (VAD flickering)
|
||||
if self._barge_in_silence_frames > self._barge_in_silence_tolerance:
|
||||
# Too much silence - reset barge-in tracking
|
||||
logger.debug(f"Barge-in cancelled after {self._barge_in_silence_frames} silence frames")
|
||||
self._barge_in_speech_start_time = None
|
||||
self._barge_in_speech_frames = 0
|
||||
self._barge_in_silence_frames = 0
|
||||
|
||||
# 3. Buffer audio for ASR
|
||||
if vad_status == "Speech" or self.conversation.state == ConversationState.LISTENING:
|
||||
@@ -334,6 +369,15 @@ class DuplexPipeline:
|
||||
|
||||
logger.info(f"EOU detected - user said: {user_text[:100]}...")
|
||||
|
||||
# Send final transcription to client
|
||||
await self.transport.send_event({
|
||||
"event": "transcript",
|
||||
"trackId": self.session_id,
|
||||
"text": user_text,
|
||||
"isFinal": True,
|
||||
"timestamp": self._get_timestamp_ms()
|
||||
})
|
||||
|
||||
# Clear buffers
|
||||
self._audio_buffer = b""
|
||||
self._last_sent_transcript = ""
|
||||
@@ -434,6 +478,10 @@ class DuplexPipeline:
|
||||
await self.conversation.end_assistant_turn(was_interrupted=True)
|
||||
finally:
|
||||
self._is_bot_speaking = False
|
||||
# Reset barge-in tracking once the bot stops speaking (finished or interrupted)
|
||||
self._barge_in_speech_start_time = None
|
||||
self._barge_in_speech_frames = 0
|
||||
self._barge_in_silence_frames = 0
|
||||
|
||||
async def _speak_sentence(self, text: str) -> None:
|
||||
"""
|
||||
@@ -508,6 +556,11 @@ class DuplexPipeline:
|
||||
|
||||
logger.info("Barge-in detected - interrupting bot speech")
|
||||
|
||||
# Reset barge-in tracking
|
||||
self._barge_in_speech_start_time = None
|
||||
self._barge_in_speech_frames = 0
|
||||
self._barge_in_silence_frames = 0
|
||||
|
||||
# Signal interruption
|
||||
self._interrupt_event.set()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user