Take into account VAD start_secs when passing audio data to Smart Turn, and add an extra 500ms of pre-speech audio for good measure
This commit is contained in:
@@ -36,6 +36,7 @@ from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
|
||||
logger.info("✅ Silero VAD model loaded")
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
|
||||
@@ -44,7 +45,10 @@ from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
@@ -53,6 +57,10 @@ from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.turns.user_stop.turn_analyzer_user_turn_stop_strategy import (
|
||||
TurnAnalyzerUserTurnStopStrategy,
|
||||
)
|
||||
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
|
||||
logger.info("✅ All components loaded successfully!")
|
||||
|
||||
@@ -79,7 +87,18 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
stop=[
|
||||
TurnAnalyzerUserTurnStopStrategy(
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams())
|
||||
)
|
||||
],
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
rtvi = RTVIProcessor(config=RTVIConfig(config=[]))
|
||||
|
||||
@@ -130,13 +149,11 @@ async def bot(runner_args: RunnerArguments):
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ from pipecat.metrics.metrics import MetricsData, SmartTurnMetricsData
|
||||
|
||||
# Default timing parameters
|
||||
STOP_SECS = 3
|
||||
PRE_SPEECH_MS = 0
|
||||
PRE_SPEECH_MS = 500
|
||||
MAX_DURATION_SECONDS = 8 # Max allowed segment duration
|
||||
|
||||
|
||||
@@ -35,11 +35,15 @@ class SmartTurnParams(BaseTurnParams):
|
||||
Parameters:
|
||||
stop_secs: Maximum silence duration in seconds before ending turn.
|
||||
pre_speech_ms: Milliseconds of audio to include before speech starts.
|
||||
vad_start_secs: Seconds VAD waits before confirming speech start (e.g. VAD STARTING window).
|
||||
This is added to `pre_speech_ms` at inference slicing time so Smart Turn can include
|
||||
the initial audio that occurred while VAD was still confirming speech.
|
||||
max_duration_secs: Maximum duration in seconds for audio segments.
|
||||
"""
|
||||
|
||||
stop_secs: float = STOP_SECS
|
||||
pre_speech_ms: float = PRE_SPEECH_MS
|
||||
vad_start_secs: float = 0.0
|
||||
max_duration_secs: float = MAX_DURATION_SECONDS
|
||||
|
||||
|
||||
@@ -181,7 +185,8 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
||||
return state, None
|
||||
|
||||
# Extract recent audio segment for prediction
|
||||
start_time = self._speech_start_time - (self._params.pre_speech_ms / 1000)
|
||||
effective_pre_speech_ms = self._params.pre_speech_ms + (self._params.vad_start_secs * 1000)
|
||||
start_time = self._speech_start_time - (effective_pre_speech_ms / 1000)
|
||||
start_index = 0
|
||||
for i, (t, _) in enumerate(audio_buffer):
|
||||
if t >= start_time:
|
||||
|
||||
@@ -10,6 +10,7 @@ import asyncio
|
||||
from typing import Optional
|
||||
|
||||
from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, EndOfTurnState
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
InputAudioRawFrame,
|
||||
@@ -86,6 +87,8 @@ class TurnAnalyzerUserTurnStopStrategy(BaseUserTurnStopStrategy):
|
||||
|
||||
if isinstance(frame, StartFrame):
|
||||
await self._start(frame)
|
||||
elif isinstance(frame, SpeechControlParamsFrame):
|
||||
await self._handle_speech_control_params(frame)
|
||||
elif isinstance(frame, VADUserStartedSpeakingFrame):
|
||||
await self._handle_vad_user_started_speaking(frame)
|
||||
elif isinstance(frame, VADUserStoppedSpeakingFrame):
|
||||
@@ -102,6 +105,24 @@ class TurnAnalyzerUserTurnStopStrategy(BaseUserTurnStopStrategy):
|
||||
self._turn_analyzer.set_sample_rate(frame.audio_in_sample_rate)
|
||||
await self.broadcast_frame(SpeechControlParamsFrame, turn_params=self._turn_analyzer.params)
|
||||
|
||||
async def _handle_speech_control_params(self, frame: SpeechControlParamsFrame):
|
||||
"""Sync Smart Turn pre-speech buffering with VAD start delay.
|
||||
|
||||
In the new user-turn-strategies pipeline, `VADUserStartedSpeakingFrame`
|
||||
is emitted only once VAD has *confirmed* speech (after `vad_params.start_secs`).
|
||||
Smart Turn should still include the initial audio collected during that
|
||||
confirmation window, so we record it in `SmartTurnParams.vad_start_secs` and
|
||||
add it at inference slicing time (preserving `pre_speech_ms` semantics).
|
||||
"""
|
||||
if not frame.vad_params:
|
||||
return
|
||||
|
||||
params = self._turn_analyzer.params
|
||||
if not isinstance(params, SmartTurnParams):
|
||||
return
|
||||
|
||||
params.vad_start_secs = frame.vad_params.start_secs
|
||||
|
||||
async def _handle_input_audio(self, frame: InputAudioRawFrame):
|
||||
"""Handle input audio to check if the turn is completed."""
|
||||
state = self._turn_analyzer.append_audio(frame.audio, self._vad_user_speaking)
|
||||
|
||||
Reference in New Issue
Block a user