Take into account VAD start_secs when passing audio data to Smart Turn, and add an extra 500ms of pre-speech audio for good measure

This commit is contained in:
marcus-daily
2026-01-08 15:42:03 +00:00
committed by Marcus
parent 16819a5caa
commit 35a99f92ab
3 changed files with 49 additions and 6 deletions

View File

@@ -36,6 +36,7 @@ from pipecat.audio.vad.silero import SileroVADAnalyzer
logger.info("✅ Silero VAD model loaded")
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import LLMRunFrame
@@ -44,7 +45,10 @@ from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.processors.aggregators.llm_response_universal import (
LLMContextAggregatorPair,
LLMUserAggregatorParams,
)
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
@@ -53,6 +57,10 @@ from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.turns.user_stop.turn_analyzer_user_turn_stop_strategy import (
TurnAnalyzerUserTurnStopStrategy,
)
from pipecat.turns.user_turn_strategies import UserTurnStrategies
logger.info("✅ All components loaded successfully!")
@@ -79,7 +87,18 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
]
context = LLMContext(messages)
context_aggregator = LLMContextAggregatorPair(context)
context_aggregator = LLMContextAggregatorPair(
context,
user_params=LLMUserAggregatorParams(
user_turn_strategies=UserTurnStrategies(
stop=[
TurnAnalyzerUserTurnStopStrategy(
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams())
)
],
)
),
)
rtvi = RTVIProcessor(config=RTVIConfig(config=[]))
@@ -130,13 +149,11 @@ async def bot(runner_args: RunnerArguments):
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(),
),
}

View File

@@ -25,7 +25,7 @@ from pipecat.metrics.metrics import MetricsData, SmartTurnMetricsData
# Default timing parameters
STOP_SECS = 3
PRE_SPEECH_MS = 0
PRE_SPEECH_MS = 500
MAX_DURATION_SECONDS = 8 # Max allowed segment duration
@@ -35,11 +35,15 @@ class SmartTurnParams(BaseTurnParams):
Parameters:
stop_secs: Maximum silence duration in seconds before ending turn.
pre_speech_ms: Milliseconds of audio to include before speech starts.
vad_start_secs: Seconds VAD waits before confirming speech start (e.g. VAD STARTING window).
This is added to `pre_speech_ms` at inference slicing time so Smart Turn can include
the initial audio that occurred while VAD was still confirming speech.
max_duration_secs: Maximum duration in seconds for audio segments.
"""
stop_secs: float = STOP_SECS
pre_speech_ms: float = PRE_SPEECH_MS
vad_start_secs: float = 0.0
max_duration_secs: float = MAX_DURATION_SECONDS
@@ -181,7 +185,8 @@ class BaseSmartTurn(BaseTurnAnalyzer):
return state, None
# Extract recent audio segment for prediction
start_time = self._speech_start_time - (self._params.pre_speech_ms / 1000)
effective_pre_speech_ms = self._params.pre_speech_ms + (self._params.vad_start_secs * 1000)
start_time = self._speech_start_time - (effective_pre_speech_ms / 1000)
start_index = 0
for i, (t, _) in enumerate(audio_buffer):
if t >= start_time:

View File

@@ -10,6 +10,7 @@ import asyncio
from typing import Optional
from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, EndOfTurnState
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.frames.frames import (
Frame,
InputAudioRawFrame,
@@ -86,6 +87,8 @@ class TurnAnalyzerUserTurnStopStrategy(BaseUserTurnStopStrategy):
if isinstance(frame, StartFrame):
await self._start(frame)
elif isinstance(frame, SpeechControlParamsFrame):
await self._handle_speech_control_params(frame)
elif isinstance(frame, VADUserStartedSpeakingFrame):
await self._handle_vad_user_started_speaking(frame)
elif isinstance(frame, VADUserStoppedSpeakingFrame):
@@ -102,6 +105,24 @@ class TurnAnalyzerUserTurnStopStrategy(BaseUserTurnStopStrategy):
self._turn_analyzer.set_sample_rate(frame.audio_in_sample_rate)
await self.broadcast_frame(SpeechControlParamsFrame, turn_params=self._turn_analyzer.params)
async def _handle_speech_control_params(self, frame: SpeechControlParamsFrame):
    """Record the VAD start delay on the Smart Turn parameters.

    In the user-turn-strategies pipeline, `VADUserStartedSpeakingFrame` is
    emitted only once VAD has *confirmed* speech (after
    `vad_params.start_secs`). Smart Turn should still see the audio that
    arrived during that confirmation window, so the delay is stored in
    `SmartTurnParams.vad_start_secs`; it is added on top of `pre_speech_ms`
    when the audio segment is sliced for inference, leaving the meaning of
    `pre_speech_ms` itself unchanged.

    Args:
        frame: Speech control parameters frame. Nothing happens when it
            carries no VAD parameters.
    """
    vad_params = frame.vad_params
    if vad_params:
        turn_params = self._turn_analyzer.params
        # Parameters of any other turn-analyzer type are left untouched.
        if isinstance(turn_params, SmartTurnParams):
            turn_params.vad_start_secs = vad_params.start_secs
async def _handle_input_audio(self, frame: InputAudioRawFrame):
"""Handle input audio to check if the turn is completed."""
state = self._turn_analyzer.append_audio(frame.audio, self._vad_user_speaking)