Tune engine vad config
This commit is contained in:
@@ -53,6 +53,10 @@ class Settings(BaseSettings):
|
|||||||
siliconflow_asr_model: str = Field(default="FunAudioLLM/SenseVoiceSmall", description="SiliconFlow ASR model")
|
siliconflow_asr_model: str = Field(default="FunAudioLLM/SenseVoiceSmall", description="SiliconFlow ASR model")
|
||||||
asr_interim_interval_ms: int = Field(default=500, description="Interval for interim ASR results in ms")
|
asr_interim_interval_ms: int = Field(default=500, description="Interval for interim ASR results in ms")
|
||||||
asr_min_audio_ms: int = Field(default=300, description="Minimum audio duration before first ASR result")
|
asr_min_audio_ms: int = Field(default=300, description="Minimum audio duration before first ASR result")
|
||||||
|
asr_start_min_speech_ms: int = Field(
|
||||||
|
default=160,
|
||||||
|
description="Minimum continuous speech duration before ASR capture starts"
|
||||||
|
)
|
||||||
asr_pre_speech_ms: int = Field(
|
asr_pre_speech_ms: int = Field(
|
||||||
default=240,
|
default=240,
|
||||||
description="Audio context (ms) prepended before detected speech to avoid clipping first phoneme"
|
description="Audio context (ms) prepended before detected speech to avoid clipping first phoneme"
|
||||||
|
|||||||
@@ -230,6 +230,11 @@ class DuplexPipeline:
|
|||||||
self._audio_buffer: bytes = b""
|
self._audio_buffer: bytes = b""
|
||||||
max_buffer_seconds = settings.max_audio_buffer_seconds if hasattr(settings, "max_audio_buffer_seconds") else 30
|
max_buffer_seconds = settings.max_audio_buffer_seconds if hasattr(settings, "max_audio_buffer_seconds") else 30
|
||||||
self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
|
self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
|
||||||
|
self._asr_start_min_speech_ms: int = (
|
||||||
|
settings.asr_start_min_speech_ms if hasattr(settings, "asr_start_min_speech_ms") else 160
|
||||||
|
)
|
||||||
|
self._asr_capture_active: bool = False
|
||||||
|
self._pending_speech_audio: bytes = b""
|
||||||
# Keep a short rolling pre-speech window so VAD transition latency
|
# Keep a short rolling pre-speech window so VAD transition latency
|
||||||
# does not clip the first phoneme/character sent to ASR.
|
# does not clip the first phoneme/character sent to ASR.
|
||||||
pre_speech_ms = settings.asr_pre_speech_ms if hasattr(settings, "asr_pre_speech_ms") else 240
|
pre_speech_ms = settings.asr_pre_speech_ms if hasattr(settings, "asr_pre_speech_ms") else 240
|
||||||
@@ -646,7 +651,7 @@ class DuplexPipeline:
|
|||||||
|
|
||||||
# Update state based on VAD
|
# Update state based on VAD
|
||||||
if vad_status == "Speech" and self._last_vad_status != "Speech":
|
if vad_status == "Speech" and self._last_vad_status != "Speech":
|
||||||
await self._on_speech_start(current_chunk=pcm_bytes)
|
await self._on_speech_start()
|
||||||
|
|
||||||
self._last_vad_status = vad_status
|
self._last_vad_status = vad_status
|
||||||
|
|
||||||
@@ -685,20 +690,44 @@ class DuplexPipeline:
|
|||||||
self._barge_in_speech_frames = 0
|
self._barge_in_speech_frames = 0
|
||||||
self._barge_in_silence_frames = 0
|
self._barge_in_silence_frames = 0
|
||||||
|
|
||||||
# 3. Buffer audio for ASR
|
# 3. Buffer audio for ASR.
|
||||||
if vad_status == "Speech" or self.conversation.state == ConversationState.LISTENING:
|
# Gate ASR startup by a short speech-duration threshold to reduce
|
||||||
self._audio_buffer += pcm_bytes
|
# false positives from micro noises, then always close the turn
|
||||||
if len(self._audio_buffer) > self._max_audio_buffer_bytes:
|
# by EOU once ASR has started.
|
||||||
# Keep only the most recent audio to cap memory usage
|
just_started_asr = False
|
||||||
self._audio_buffer = self._audio_buffer[-self._max_audio_buffer_bytes:]
|
if vad_status == "Speech" and not self._asr_capture_active:
|
||||||
await self.asr_service.send_audio(pcm_bytes)
|
self._pending_speech_audio += pcm_bytes
|
||||||
|
pending_ms = (len(self._pending_speech_audio) / (settings.sample_rate * 2)) * 1000.0
|
||||||
|
if pending_ms >= self._asr_start_min_speech_ms:
|
||||||
|
await self._start_asr_capture()
|
||||||
|
just_started_asr = True
|
||||||
|
|
||||||
|
if self._asr_capture_active:
|
||||||
|
if not just_started_asr:
|
||||||
|
self._audio_buffer += pcm_bytes
|
||||||
|
if len(self._audio_buffer) > self._max_audio_buffer_bytes:
|
||||||
|
# Keep only the most recent audio to cap memory usage
|
||||||
|
self._audio_buffer = self._audio_buffer[-self._max_audio_buffer_bytes:]
|
||||||
|
await self.asr_service.send_audio(pcm_bytes)
|
||||||
|
|
||||||
# For SiliconFlow ASR, trigger interim transcription periodically
|
# For SiliconFlow ASR, trigger interim transcription periodically
|
||||||
# The service handles timing internally via start_interim_transcription()
|
# The service handles timing internally via start_interim_transcription()
|
||||||
|
|
||||||
# 4. Check for End of Utterance - this triggers LLM response
|
# 4. Check for End of Utterance - this triggers LLM response
|
||||||
if self.eou_detector.process(vad_status):
|
if self.eou_detector.process(vad_status, force_eligible=self._asr_capture_active):
|
||||||
await self._on_end_of_utterance()
|
await self._on_end_of_utterance()
|
||||||
|
elif (
|
||||||
|
vad_status == "Silence"
|
||||||
|
and not self.eou_detector.is_speaking
|
||||||
|
and not self._asr_capture_active
|
||||||
|
and self.conversation.state == ConversationState.LISTENING
|
||||||
|
):
|
||||||
|
# Speech was too short to pass ASR gate; reset turn so next
|
||||||
|
# utterance can start cleanly.
|
||||||
|
self._pending_speech_audio = b""
|
||||||
|
self._audio_buffer = b""
|
||||||
|
self._last_sent_transcript = ""
|
||||||
|
await self.conversation.set_state(ConversationState.IDLE)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Pipeline audio processing error: {e}", exc_info=True)
|
logger.error(f"Pipeline audio processing error: {e}", exc_info=True)
|
||||||
@@ -757,32 +786,44 @@ class DuplexPipeline:
|
|||||||
logger.info(f"[ASR] ASR interim: {text[:100]}")
|
logger.info(f"[ASR] ASR interim: {text[:100]}")
|
||||||
logger.debug(f"Sent transcript ({'final' if is_final else 'interim'}): {text[:50]}...")
|
logger.debug(f"Sent transcript ({'final' if is_final else 'interim'}): {text[:50]}...")
|
||||||
|
|
||||||
async def _on_speech_start(self, current_chunk: bytes = b"") -> None:
|
async def _on_speech_start(self) -> None:
|
||||||
"""Handle user starting to speak."""
|
"""Handle user starting to speak."""
|
||||||
if self.conversation.state in (ConversationState.IDLE, ConversationState.INTERRUPTED):
|
if self.conversation.state in (ConversationState.IDLE, ConversationState.INTERRUPTED):
|
||||||
await self.conversation.start_user_turn()
|
await self.conversation.start_user_turn()
|
||||||
self._audio_buffer = b""
|
self._audio_buffer = b""
|
||||||
self._last_sent_transcript = ""
|
self._last_sent_transcript = ""
|
||||||
self.eou_detector.reset()
|
self.eou_detector.reset()
|
||||||
|
self._asr_capture_active = False
|
||||||
|
self._pending_speech_audio = b""
|
||||||
|
|
||||||
# Clear ASR buffer and start interim transcriptions
|
# Clear ASR buffer. Interim starts only after ASR capture is activated.
|
||||||
if hasattr(self.asr_service, 'clear_buffer'):
|
if hasattr(self.asr_service, 'clear_buffer'):
|
||||||
self.asr_service.clear_buffer()
|
self.asr_service.clear_buffer()
|
||||||
if hasattr(self.asr_service, 'start_interim_transcription'):
|
|
||||||
await self.asr_service.start_interim_transcription()
|
|
||||||
# Prime ASR with a short pre-speech context window so the utterance
|
|
||||||
# start isn't lost while waiting for VAD to transition to Speech.
|
|
||||||
pre_roll = self._pre_speech_buffer
|
|
||||||
if current_chunk and len(pre_roll) > len(current_chunk):
|
|
||||||
pre_roll = pre_roll[:-len(current_chunk)]
|
|
||||||
elif current_chunk:
|
|
||||||
pre_roll = b""
|
|
||||||
if pre_roll:
|
|
||||||
await self.asr_service.send_audio(pre_roll)
|
|
||||||
self._audio_buffer = pre_roll
|
|
||||||
|
|
||||||
logger.debug("User speech started")
|
logger.debug("User speech started")
|
||||||
|
|
||||||
|
async def _start_asr_capture(self) -> None:
|
||||||
|
"""Start ASR capture for the current turn after min speech gate passes."""
|
||||||
|
if self._asr_capture_active:
|
||||||
|
return
|
||||||
|
|
||||||
|
if hasattr(self.asr_service, 'start_interim_transcription'):
|
||||||
|
await self.asr_service.start_interim_transcription()
|
||||||
|
|
||||||
|
# Prime ASR with a short pre-speech context window so the utterance
|
||||||
|
# start isn't lost while waiting for VAD to transition to Speech.
|
||||||
|
pre_roll = self._pre_speech_buffer
|
||||||
|
capture_audio = pre_roll + self._pending_speech_audio
|
||||||
|
if capture_audio:
|
||||||
|
await self.asr_service.send_audio(capture_audio)
|
||||||
|
self._audio_buffer = capture_audio[-self._max_audio_buffer_bytes:]
|
||||||
|
|
||||||
|
self._asr_capture_active = True
|
||||||
|
logger.debug(
|
||||||
|
f"ASR capture started after speech gate ({self._asr_start_min_speech_ms}ms), "
|
||||||
|
f"capture={len(capture_audio)} bytes"
|
||||||
|
)
|
||||||
|
|
||||||
async def _on_end_of_utterance(self) -> None:
|
async def _on_end_of_utterance(self) -> None:
|
||||||
"""Handle end of user utterance."""
|
"""Handle end of user utterance."""
|
||||||
if self.conversation.state not in (ConversationState.LISTENING, ConversationState.INTERRUPTED):
|
if self.conversation.state not in (ConversationState.LISTENING, ConversationState.INTERRUPTED):
|
||||||
@@ -813,6 +854,8 @@ class DuplexPipeline:
|
|||||||
# Reset for next utterance
|
# Reset for next utterance
|
||||||
self._audio_buffer = b""
|
self._audio_buffer = b""
|
||||||
self._last_sent_transcript = ""
|
self._last_sent_transcript = ""
|
||||||
|
self._asr_capture_active = False
|
||||||
|
self._pending_speech_audio = b""
|
||||||
# Return to idle; don't force LISTENING which causes buffering on silence
|
# Return to idle; don't force LISTENING which causes buffering on silence
|
||||||
await self.conversation.set_state(ConversationState.IDLE)
|
await self.conversation.set_state(ConversationState.IDLE)
|
||||||
return
|
return
|
||||||
@@ -833,6 +876,8 @@ class DuplexPipeline:
|
|||||||
# Clear buffers
|
# Clear buffers
|
||||||
self._audio_buffer = b""
|
self._audio_buffer = b""
|
||||||
self._last_sent_transcript = ""
|
self._last_sent_transcript = ""
|
||||||
|
self._asr_capture_active = False
|
||||||
|
self._pending_speech_audio = b""
|
||||||
|
|
||||||
# Process the turn - trigger LLM response
|
# Process the turn - trigger LLM response
|
||||||
# Cancel any existing turn to avoid overlapping assistant responses
|
# Cancel any existing turn to avoid overlapping assistant responses
|
||||||
@@ -1493,6 +1538,8 @@ class DuplexPipeline:
|
|||||||
await self.conversation.start_user_turn()
|
await self.conversation.start_user_turn()
|
||||||
self._audio_buffer = b""
|
self._audio_buffer = b""
|
||||||
self.eou_detector.reset()
|
self.eou_detector.reset()
|
||||||
|
self._asr_capture_active = False
|
||||||
|
self._pending_speech_audio = b""
|
||||||
|
|
||||||
async def _stop_current_speech(self) -> None:
|
async def _stop_current_speech(self) -> None:
|
||||||
"""Stop any current speech task."""
|
"""Stop any current speech task."""
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ class EouDetector:
|
|||||||
self.silence_start_time: Optional[float] = None
|
self.silence_start_time: Optional[float] = None
|
||||||
self.triggered = False
|
self.triggered = False
|
||||||
|
|
||||||
def process(self, vad_status: str) -> bool:
|
def process(self, vad_status: str, force_eligible: bool = False) -> bool:
|
||||||
"""
|
"""
|
||||||
Process VAD status and detect end of utterance.
|
Process VAD status and detect end of utterance.
|
||||||
|
|
||||||
@@ -58,7 +58,7 @@ class EouDetector:
|
|||||||
self.silence_start_time = now
|
self.silence_start_time = now
|
||||||
|
|
||||||
speech_duration = self.silence_start_time - self.speech_start_time
|
speech_duration = self.silence_start_time - self.speech_start_time
|
||||||
if speech_duration < self.min_speech:
|
if speech_duration < self.min_speech and not force_eligible:
|
||||||
self.is_speaking = False
|
self.is_speaking = False
|
||||||
self.silence_start_time = None
|
self.silence_start_time = None
|
||||||
return False
|
return False
|
||||||
|
|||||||
Reference in New Issue
Block a user