Take into account VAD start_secs when passing audio data to Smart Turn, and add an extra 500ms of pre-speech audio for good measure

2026-01-08 15:42:03 +00:00
parent 16819a5caa
commit 35a99f92ab
3 changed files with 49 additions and 6 deletions
--- a/examples/quickstart/bot.py
+++ b/examples/quickstart/bot.py
@@ -36,6 +36,7 @@ from pipecat.audio.vad.silero import SileroVADAnalyzer

 logger.info("✅ Silero VAD model loaded")

+from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
 from pipecat.audio.vad.vad_analyzer import VADParams
 from pipecat.frames.frames import LLMRunFrame

@@ -44,7 +45,10 @@ from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.llm_context import LLMContext
-from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
+from pipecat.processors.aggregators.llm_response_universal import (
+    LLMContextAggregatorPair,
+    LLMUserAggregatorParams,
+)
 from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
 from pipecat.runner.types import RunnerArguments
 from pipecat.runner.utils import create_transport
@@ -53,6 +57,10 @@ from pipecat.services.deepgram.stt import DeepgramSTTService
 from pipecat.services.openai.llm import OpenAILLMService
 from pipecat.transports.base_transport import BaseTransport, TransportParams
 from pipecat.transports.daily.transport import DailyParams
+from pipecat.turns.user_stop.turn_analyzer_user_turn_stop_strategy import (
+    TurnAnalyzerUserTurnStopStrategy,
+)
+from pipecat.turns.user_turn_strategies import UserTurnStrategies

 logger.info("✅ All components loaded successfully!")

@@ -79,7 +87,18 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    ]

    context = LLMContext(messages)
-    context_aggregator = LLMContextAggregatorPair(context)
+    context_aggregator = LLMContextAggregatorPair(
+        context,
+        user_params=LLMUserAggregatorParams(
+            user_turn_strategies=UserTurnStrategies(
+                stop=[
+                    TurnAnalyzerUserTurnStopStrategy(
+                        turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams())
+                    )
+                ],
+            )
+        ),
+    )

    rtvi = RTVIProcessor(config=RTVIConfig(config=[]))

@@ -130,13 +149,11 @@ async def bot(runner_args: RunnerArguments):
            audio_in_enabled=True,
            audio_out_enabled=True,
            vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
-            turn_analyzer=LocalSmartTurnAnalyzerV3(),
        ),
        "webrtc": lambda: TransportParams(
            audio_in_enabled=True,
            audio_out_enabled=True,
            vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
-            turn_analyzer=LocalSmartTurnAnalyzerV3(),
        ),
    }

--- a/src/pipecat/audio/turn/smart_turn/base_smart_turn.py
+++ b/src/pipecat/audio/turn/smart_turn/base_smart_turn.py
@@ -25,7 +25,7 @@ from pipecat.metrics.metrics import MetricsData, SmartTurnMetricsData

 # Default timing parameters
 STOP_SECS = 3
-PRE_SPEECH_MS = 0
+PRE_SPEECH_MS = 500
 MAX_DURATION_SECONDS = 8  # Max allowed segment duration


@@ -35,11 +35,15 @@ class SmartTurnParams(BaseTurnParams):
    Parameters:
        stop_secs: Maximum silence duration in seconds before ending turn.
        pre_speech_ms: Milliseconds of audio to include before speech starts.
+        vad_start_secs: Seconds VAD waits before confirming speech start (e.g. VAD STARTING window).
+            This is added to `pre_speech_ms` at inference slicing time so Smart Turn can include
+            the initial audio that occurred while VAD was still confirming speech.
        max_duration_secs: Maximum duration in seconds for audio segments.
    """

    stop_secs: float = STOP_SECS
    pre_speech_ms: float = PRE_SPEECH_MS
+    vad_start_secs: float = 0.0
    max_duration_secs: float = MAX_DURATION_SECONDS


@@ -181,7 +185,8 @@ class BaseSmartTurn(BaseTurnAnalyzer):
            return state, None

        # Extract recent audio segment for prediction
-        start_time = self._speech_start_time - (self._params.pre_speech_ms / 1000)
+        effective_pre_speech_ms = self._params.pre_speech_ms + (self._params.vad_start_secs * 1000)
+        start_time = self._speech_start_time - (effective_pre_speech_ms / 1000)
        start_index = 0
        for i, (t, _) in enumerate(audio_buffer):
            if t >= start_time:
--- a/src/pipecat/turns/user_stop/turn_analyzer_user_turn_stop_strategy.py
+++ b/src/pipecat/turns/user_stop/turn_analyzer_user_turn_stop_strategy.py
@@ -10,6 +10,7 @@ import asyncio
 from typing import Optional

 from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, EndOfTurnState
+from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
 from pipecat.frames.frames import (
    Frame,
    InputAudioRawFrame,
@@ -86,6 +87,8 @@ class TurnAnalyzerUserTurnStopStrategy(BaseUserTurnStopStrategy):

        if isinstance(frame, StartFrame):
            await self._start(frame)
+        elif isinstance(frame, SpeechControlParamsFrame):
+            await self._handle_speech_control_params(frame)
        elif isinstance(frame, VADUserStartedSpeakingFrame):
            await self._handle_vad_user_started_speaking(frame)
        elif isinstance(frame, VADUserStoppedSpeakingFrame):
@@ -102,6 +105,24 @@ class TurnAnalyzerUserTurnStopStrategy(BaseUserTurnStopStrategy):
        self._turn_analyzer.set_sample_rate(frame.audio_in_sample_rate)
        await self.broadcast_frame(SpeechControlParamsFrame, turn_params=self._turn_analyzer.params)

+    async def _handle_speech_control_params(self, frame: SpeechControlParamsFrame):
+        """Sync Smart Turn pre-speech buffering with VAD start delay.
+
+        `VADUserStartedSpeakingFrame` is emitted only once VAD has *confirmed* speech
+        (after `vad_params.start_secs`), so Smart Turn would otherwise miss the audio
+        collected during that confirmation window. The delay is stored in
+        `SmartTurnParams.vad_start_secs` and added to `pre_speech_ms` at inference
+        slicing time. No-op if the frame has no VAD params or params are not Smart Turn's.
+        """
+        if not frame.vad_params:  # e.g. a frame that carries only turn params
+            return
+
+        params = self._turn_analyzer.params
+        if not isinstance(params, SmartTurnParams):  # vad_start_secs is a SmartTurnParams field
+            return
+
+        params.vad_start_secs = frame.vad_params.start_secs
+
    async def _handle_input_audio(self, frame: InputAudioRawFrame):
        """Handle input audio to check if the turn is completed."""
        state = self._turn_analyzer.append_audio(frame.audio, self._vad_user_speaking)