Fix ASR boundary bug
This commit is contained in:
@@ -53,6 +53,14 @@ class Settings(BaseSettings):
|
|||||||
siliconflow_asr_model: str = Field(default="FunAudioLLM/SenseVoiceSmall", description="SiliconFlow ASR model")
|
siliconflow_asr_model: str = Field(default="FunAudioLLM/SenseVoiceSmall", description="SiliconFlow ASR model")
|
||||||
asr_interim_interval_ms: int = Field(default=500, description="Interval for interim ASR results in ms")
|
asr_interim_interval_ms: int = Field(default=500, description="Interval for interim ASR results in ms")
|
||||||
asr_min_audio_ms: int = Field(default=300, description="Minimum audio duration before first ASR result")
|
asr_min_audio_ms: int = Field(default=300, description="Minimum audio duration before first ASR result")
|
||||||
|
asr_pre_speech_ms: int = Field(
|
||||||
|
default=240,
|
||||||
|
description="Audio context (ms) prepended before detected speech to avoid clipping first phoneme"
|
||||||
|
)
|
||||||
|
asr_final_tail_ms: int = Field(
|
||||||
|
default=120,
|
||||||
|
description="Silence tail (ms) appended before final ASR decode to protect utterance ending"
|
||||||
|
)
|
||||||
|
|
||||||
# Duplex Pipeline Configuration
|
# Duplex Pipeline Configuration
|
||||||
duplex_enabled: bool = Field(default=True, description="Enable duplex voice pipeline")
|
duplex_enabled: bool = Field(default=True, description="Enable duplex voice pipeline")
|
||||||
|
|||||||
@@ -119,6 +119,15 @@ class DuplexPipeline:
|
|||||||
self._audio_buffer: bytes = b""
|
self._audio_buffer: bytes = b""
|
||||||
max_buffer_seconds = settings.max_audio_buffer_seconds if hasattr(settings, "max_audio_buffer_seconds") else 30
|
max_buffer_seconds = settings.max_audio_buffer_seconds if hasattr(settings, "max_audio_buffer_seconds") else 30
|
||||||
self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
|
self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
|
||||||
|
# Keep a short rolling pre-speech window so VAD transition latency
|
||||||
|
# does not clip the first phoneme/character sent to ASR.
|
||||||
|
pre_speech_ms = settings.asr_pre_speech_ms if hasattr(settings, "asr_pre_speech_ms") else 240
|
||||||
|
self._asr_pre_speech_bytes = int(settings.sample_rate * 2 * (pre_speech_ms / 1000.0))
|
||||||
|
self._pre_speech_buffer: bytes = b""
|
||||||
|
# Add a tiny trailing silence tail before final ASR to avoid
|
||||||
|
# clipping the last phoneme at utterance boundaries.
|
||||||
|
asr_final_tail_ms = settings.asr_final_tail_ms if hasattr(settings, "asr_final_tail_ms") else 120
|
||||||
|
self._asr_final_tail_bytes = int(settings.sample_rate * 2 * (asr_final_tail_ms / 1000.0))
|
||||||
self._last_vad_status: str = "Silence"
|
self._last_vad_status: str = "Silence"
|
||||||
self._process_lock = asyncio.Lock()
|
self._process_lock = asyncio.Lock()
|
||||||
|
|
||||||
@@ -285,6 +294,11 @@ class DuplexPipeline:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
async with self._process_lock:
|
async with self._process_lock:
|
||||||
|
if pcm_bytes:
|
||||||
|
self._pre_speech_buffer += pcm_bytes
|
||||||
|
if len(self._pre_speech_buffer) > self._asr_pre_speech_bytes:
|
||||||
|
self._pre_speech_buffer = self._pre_speech_buffer[-self._asr_pre_speech_bytes:]
|
||||||
|
|
||||||
# 1. Process through VAD
|
# 1. Process through VAD
|
||||||
vad_result = self.vad_processor.process(pcm_bytes, settings.chunk_size_ms)
|
vad_result = self.vad_processor.process(pcm_bytes, settings.chunk_size_ms)
|
||||||
|
|
||||||
@@ -311,7 +325,7 @@ class DuplexPipeline:
|
|||||||
|
|
||||||
# Update state based on VAD
|
# Update state based on VAD
|
||||||
if vad_status == "Speech" and self._last_vad_status != "Speech":
|
if vad_status == "Speech" and self._last_vad_status != "Speech":
|
||||||
await self._on_speech_start()
|
await self._on_speech_start(current_chunk=pcm_bytes)
|
||||||
|
|
||||||
self._last_vad_status = vad_status
|
self._last_vad_status = vad_status
|
||||||
|
|
||||||
@@ -418,7 +432,7 @@ class DuplexPipeline:
|
|||||||
logger.info(f"ASR interim: {text[:100]}")
|
logger.info(f"ASR interim: {text[:100]}")
|
||||||
logger.debug(f"Sent transcript ({'final' if is_final else 'interim'}): {text[:50]}...")
|
logger.debug(f"Sent transcript ({'final' if is_final else 'interim'}): {text[:50]}...")
|
||||||
|
|
||||||
async def _on_speech_start(self) -> None:
|
async def _on_speech_start(self, current_chunk: bytes = b"") -> None:
|
||||||
"""Handle user starting to speak."""
|
"""Handle user starting to speak."""
|
||||||
if self.conversation.state in (ConversationState.IDLE, ConversationState.INTERRUPTED):
|
if self.conversation.state in (ConversationState.IDLE, ConversationState.INTERRUPTED):
|
||||||
await self.conversation.start_user_turn()
|
await self.conversation.start_user_turn()
|
||||||
@@ -431,6 +445,16 @@ class DuplexPipeline:
|
|||||||
self.asr_service.clear_buffer()
|
self.asr_service.clear_buffer()
|
||||||
if hasattr(self.asr_service, 'start_interim_transcription'):
|
if hasattr(self.asr_service, 'start_interim_transcription'):
|
||||||
await self.asr_service.start_interim_transcription()
|
await self.asr_service.start_interim_transcription()
|
||||||
|
# Prime ASR with a short pre-speech context window so the utterance
|
||||||
|
# start isn't lost while waiting for VAD to transition to Speech.
|
||||||
|
pre_roll = self._pre_speech_buffer
|
||||||
|
if current_chunk and len(pre_roll) > len(current_chunk):
|
||||||
|
pre_roll = pre_roll[:-len(current_chunk)]
|
||||||
|
elif current_chunk:
|
||||||
|
pre_roll = b""
|
||||||
|
if pre_roll:
|
||||||
|
await self.asr_service.send_audio(pre_roll)
|
||||||
|
self._audio_buffer = pre_roll
|
||||||
|
|
||||||
logger.debug("User speech started")
|
logger.debug("User speech started")
|
||||||
|
|
||||||
@@ -439,6 +463,11 @@ class DuplexPipeline:
|
|||||||
if self.conversation.state not in (ConversationState.LISTENING, ConversationState.INTERRUPTED):
|
if self.conversation.state not in (ConversationState.LISTENING, ConversationState.INTERRUPTED):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Add a tiny trailing silence tail to stabilize final-token decoding.
|
||||||
|
if self._asr_final_tail_bytes > 0:
|
||||||
|
final_tail = b"\x00" * self._asr_final_tail_bytes
|
||||||
|
await self.asr_service.send_audio(final_tail)
|
||||||
|
|
||||||
# Stop interim transcriptions
|
# Stop interim transcriptions
|
||||||
if hasattr(self.asr_service, 'stop_interim_transcription'):
|
if hasattr(self.asr_service, 'stop_interim_transcription'):
|
||||||
await self.asr_service.stop_interim_transcription()
|
await self.asr_service.stop_interim_transcription()
|
||||||
|
|||||||
Reference in New Issue
Block a user