diff --git a/examples/quickstart/bot.py b/examples/quickstart/bot.py index d890b8b2c..5977131c7 100644 --- a/examples/quickstart/bot.py +++ b/examples/quickstart/bot.py @@ -36,6 +36,7 @@ from pipecat.audio.vad.silero import SileroVADAnalyzer logger.info("✅ Silero VAD model loaded") +from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams from pipecat.audio.vad.vad_analyzer import VADParams from pipecat.frames.frames import LLMRunFrame @@ -44,7 +45,10 @@ from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.llm_context import LLMContext -from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair +from pipecat.processors.aggregators.llm_response_universal import ( + LLMContextAggregatorPair, + LLMUserAggregatorParams, +) from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor from pipecat.runner.types import RunnerArguments from pipecat.runner.utils import create_transport @@ -53,6 +57,10 @@ from pipecat.services.deepgram.stt import DeepgramSTTService from pipecat.services.openai.llm import OpenAILLMService from pipecat.transports.base_transport import BaseTransport, TransportParams from pipecat.transports.daily.transport import DailyParams +from pipecat.turns.user_stop.turn_analyzer_user_turn_stop_strategy import ( + TurnAnalyzerUserTurnStopStrategy, +) +from pipecat.turns.user_turn_strategies import UserTurnStrategies logger.info("✅ All components loaded successfully!") @@ -79,7 +87,18 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): ] context = LLMContext(messages) - context_aggregator = LLMContextAggregatorPair(context) + context_aggregator = LLMContextAggregatorPair( + context, + user_params=LLMUserAggregatorParams( + user_turn_strategies=UserTurnStrategies( + stop=[ + TurnAnalyzerUserTurnStopStrategy( + turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()) + ) + ], + ) + ), + ) rtvi = RTVIProcessor(config=RTVIConfig(config=[])) @@ -130,13 +149,11 @@ async def bot(runner_args: RunnerArguments): audio_in_enabled=True, audio_out_enabled=True, vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)), - turn_analyzer=LocalSmartTurnAnalyzerV3(), ), "webrtc": lambda: TransportParams( audio_in_enabled=True, audio_out_enabled=True, vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)), - turn_analyzer=LocalSmartTurnAnalyzerV3(), ), } diff --git a/src/pipecat/audio/turn/smart_turn/base_smart_turn.py b/src/pipecat/audio/turn/smart_turn/base_smart_turn.py index 72c3d5c6e..11d3573bf 100644 --- a/src/pipecat/audio/turn/smart_turn/base_smart_turn.py +++ b/src/pipecat/audio/turn/smart_turn/base_smart_turn.py @@ -25,7 +25,7 @@ from pipecat.metrics.metrics import MetricsData, SmartTurnMetricsData # Default timing parameters STOP_SECS = 3 -PRE_SPEECH_MS = 0 +PRE_SPEECH_MS = 500 MAX_DURATION_SECONDS = 8 # Max allowed segment duration @@ -35,11 +35,15 @@ class SmartTurnParams(BaseTurnParams): Parameters: stop_secs: Maximum silence duration in seconds before ending turn. pre_speech_ms: Milliseconds of audio to include before speech starts. + vad_start_secs: Seconds VAD waits before confirming speech start (e.g. VAD STARTING window). + This is added to `pre_speech_ms` at inference slicing time so Smart Turn can include + the initial audio that occurred while VAD was still confirming speech. max_duration_secs: Maximum duration in seconds for audio segments. """ stop_secs: float = STOP_SECS pre_speech_ms: float = PRE_SPEECH_MS + vad_start_secs: float = 0.0 max_duration_secs: float = MAX_DURATION_SECONDS @@ -181,7 +185,8 @@ class BaseSmartTurn(BaseTurnAnalyzer): return state, None # Extract recent audio segment for prediction - start_time = self._speech_start_time - (self._params.pre_speech_ms / 1000) + effective_pre_speech_ms = self._params.pre_speech_ms + (self._params.vad_start_secs * 1000) + start_time = self._speech_start_time - (effective_pre_speech_ms / 1000) start_index = 0 for i, (t, _) in enumerate(audio_buffer): if t >= start_time: diff --git a/src/pipecat/turns/user_stop/turn_analyzer_user_turn_stop_strategy.py b/src/pipecat/turns/user_stop/turn_analyzer_user_turn_stop_strategy.py index 5212d832f..28fa9e761 100644 --- a/src/pipecat/turns/user_stop/turn_analyzer_user_turn_stop_strategy.py +++ b/src/pipecat/turns/user_stop/turn_analyzer_user_turn_stop_strategy.py @@ -10,6 +10,7 @@ import asyncio from typing import Optional from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, EndOfTurnState +from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams from pipecat.frames.frames import ( Frame, InputAudioRawFrame, @@ -86,6 +87,8 @@ class TurnAnalyzerUserTurnStopStrategy(BaseUserTurnStopStrategy): if isinstance(frame, StartFrame): await self._start(frame) + elif isinstance(frame, SpeechControlParamsFrame): + await self._handle_speech_control_params(frame) elif isinstance(frame, VADUserStartedSpeakingFrame): await self._handle_vad_user_started_speaking(frame) elif isinstance(frame, VADUserStoppedSpeakingFrame): @@ -102,6 +105,24 @@ class TurnAnalyzerUserTurnStopStrategy(BaseUserTurnStopStrategy): self._turn_analyzer.set_sample_rate(frame.audio_in_sample_rate) await self.broadcast_frame(SpeechControlParamsFrame, turn_params=self._turn_analyzer.params) + async def _handle_speech_control_params(self, frame: SpeechControlParamsFrame): + """Sync Smart Turn pre-speech buffering with VAD start delay. + + In the new user-turn-strategies pipeline, `VADUserStartedSpeakingFrame` + is emitted only once VAD has *confirmed* speech (after `vad_params.start_secs`). + Smart Turn should still include the initial audio collected during that + confirmation window, so we record it in `SmartTurnParams.vad_start_secs` and + add it at inference slicing time (preserving `pre_speech_ms` semantics). + """ + if not frame.vad_params: + return + + params = self._turn_analyzer.params + if not isinstance(params, SmartTurnParams): + return + + params.vad_start_secs = frame.vad_params.start_secs + async def _handle_input_audio(self, frame: InputAudioRawFrame): """Handle input audio to check if the turn is completed.""" state = self._turn_analyzer.append_audio(frame.audio, self._vad_user_speaking)