From 86f9ad0c07c9927abaf3f9a33ecbef277d3d068e Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Thu, 21 May 2026 15:13:52 -0400 Subject: [PATCH] Show commented-out local-VAD opt-in in no-turn-frames examples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For services that don't emit UserStarted/StoppedSpeakingFrame (Nova Sonic, Gemini Live, Ultravox), the absence of those frames means downstream consumers — including the Pipecat Prebuilt UI — can't group user transcripts into discrete turns. The Tier 1 comment block already called this out, but the fix required users to know to add the SileroVADAnalyzer import + LLMUserAggregatorParams kwarg themselves. Make it a copy-paste: include the relevant imports and `user_params=` argument as commented-out code, with a comment explaining that they're not strictly necessary for context aggregation but enable RTVI / turn- dependent processors when needed. Mirror the wording used in the LLMService startup log. Also fix line wrapping in the llm_service.py startup log for the no- turn-frames case (manual edit to that message left the last line over- length). --- examples/realtime/realtime-aws-nova-sonic.py | 30 ++++++++++-------- examples/realtime/realtime-gemini-live.py | 32 +++++++++++++------- examples/realtime/realtime-ultravox.py | 30 ++++++++++-------- src/pipecat/services/llm_service.py | 6 ++-- 4 files changed, 60 insertions(+), 38 deletions(-) diff --git a/examples/realtime/realtime-aws-nova-sonic.py b/examples/realtime/realtime-aws-nova-sonic.py index f0a010a6e..83123e8dc 100644 --- a/examples/realtime/realtime-aws-nova-sonic.py +++ b/examples/realtime/realtime-aws-nova-sonic.py @@ -148,24 +148,30 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): # Set up context and context management. # - # AWS Nova Sonic drives the conversation server-side. 
It does NOT emit - # UserStartedSpeakingFrame / UserStoppedSpeakingFrame, so pipeline - # processors that depend on those frames — RTVI client speech events, + # AWS Nova Sonic drives the conversation server-side and does not emit + # UserStartedSpeakingFrame / UserStoppedSpeakingFrame. Context + # aggregation still works with realtime_service_mode, but pipeline + # processors that depend on those frames (RTVI client speech events, # TurnTrackingObserver, AudioBufferProcessor turn recording, - # UserIdleController, user mute strategies, voicemail detector — won't - # activate with the default server-VAD-only setup. Context aggregation - # still works with realtime_service_mode. + # UserIdleController, user mute strategies, voicemail detector) won't + # activate. The Pipecat Prebuilt UI is one such consumer — without + # these frames it can't group user transcripts into discrete turns + # visually. # - # To produce these frames locally, wire a VAD analyzer (e.g. - # SileroVADAnalyzer) into LLMUserAggregatorParams. Caveat: locally- - # generated turn boundaries are a heuristic and may not match Nova - # Sonic's server-side turn decisions, which is what drives the - # conversation; the two can drift apart in subtle ways especially - # around interruptions and overlapping speech. + # If you need those frames, uncomment the two imports below and + # the `user_params=` argument further down. Note: local turn + # detection may not match Nova Sonic's actual server-side turn + # decisions and can desynchronize in subtle ways. 
+ # + # from pipecat.audio.vad.silero import SileroVADAnalyzer + # from pipecat.processors.aggregators.llm_response_universal import ( + # LLMUserAggregatorParams, + # ) context = LLMContext(tools=tools) user_aggregator, assistant_aggregator = LLMContextAggregatorPair( context, realtime_service_mode=RealtimeServiceModeConfig(), + # user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()), ) # Build the pipeline diff --git a/examples/realtime/realtime-gemini-live.py b/examples/realtime/realtime-gemini-live.py index 426f12238..1d158e074 100644 --- a/examples/realtime/realtime-gemini-live.py +++ b/examples/realtime/realtime-gemini-live.py @@ -131,22 +131,32 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation) context = LLMContext() - # Gemini Live drives the conversation server-side. It does NOT emit - # UserStartedSpeakingFrame / UserStoppedSpeakingFrame, so pipeline - # processors that depend on those frames — RTVI client speech events, + # Gemini Live drives the conversation server-side and does not emit + # UserStartedSpeakingFrame / UserStoppedSpeakingFrame. Context + # aggregation still works with realtime_service_mode, but pipeline + # processors that depend on those frames (RTVI client speech events, # TurnTrackingObserver, AudioBufferProcessor turn recording, - # UserIdleController, user mute strategies, voicemail detector — won't - # activate with the default server-VAD-only setup. Context aggregation - # still works with realtime_service_mode. + # UserIdleController, user mute strategies, voicemail detector) won't + # activate. The Pipecat Prebuilt UI is one such consumer — without + # these frames it can't group user transcripts into discrete turns + # visually. # - # To produce these frames locally, see `realtime-gemini-live-local-vad.py`. 
- # Caveat: locally-generated turn boundaries are a heuristic and may not - # match Gemini Live's server-side turn decisions, which is what drives the - # conversation; the two can drift apart in subtle ways especially around - # interruptions and overlapping speech. + # If you need those frames, uncomment the two imports below and + # the `user_params=` argument further down. Note: local turn + # detection may not match Gemini Live's actual server-side turn + # decisions and can desynchronize in subtle ways. + # + # For local VAD driving the conversation (server VAD disabled), see + # `realtime-gemini-live-local-vad.py` instead. + # + # from pipecat.audio.vad.silero import SileroVADAnalyzer + # from pipecat.processors.aggregators.llm_response_universal import ( + # LLMUserAggregatorParams, + # ) user_aggregator, assistant_aggregator = LLMContextAggregatorPair( context, realtime_service_mode=RealtimeServiceModeConfig(), + # user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()), ) pipeline = Pipeline( diff --git a/examples/realtime/realtime-ultravox.py b/examples/realtime/realtime-ultravox.py index 95760d525..6b2c1ee31 100644 --- a/examples/realtime/realtime-ultravox.py +++ b/examples/realtime/realtime-ultravox.py @@ -175,23 +175,29 @@ There is also a secret menu that changes daily. If the user asks about it, use t context = LLMContext([]) - # Ultravox drives the conversation server-side. It does NOT emit - # UserStartedSpeakingFrame / UserStoppedSpeakingFrame, so pipeline - # processors that depend on those frames — RTVI client speech events, + # Ultravox drives the conversation server-side and does not emit + # UserStartedSpeakingFrame / UserStoppedSpeakingFrame. 
Context + # aggregation still works with realtime_service_mode, but pipeline + # processors that depend on those frames (RTVI client speech events, # TurnTrackingObserver, AudioBufferProcessor turn recording, - # UserIdleController, user mute strategies, voicemail detector — won't - # activate with this default setup. Context aggregation still works - # with realtime_service_mode. + # UserIdleController, user mute strategies, voicemail detector) won't + # activate. The Pipecat Prebuilt UI is one such consumer — without + # these frames it can't group user transcripts into discrete turns + # visually. # - # To produce these frames locally, wire a VAD analyzer (e.g. - # SileroVADAnalyzer) into LLMUserAggregatorParams. Caveat: locally- - # generated turn boundaries are a heuristic and may not match - # Ultravox's server-side turn decisions, which is what drives the - # conversation; the two can drift apart in subtle ways especially - # around interruptions and overlapping speech. + # If you need those frames, uncomment the two imports below and + # the `user_params=` argument further down. Note: local turn + # detection may not match Ultravox's actual server-side turn + # decisions and can desynchronize in subtle ways. 
+ # + # from pipecat.audio.vad.silero import SileroVADAnalyzer + # from pipecat.processors.aggregators.llm_response_universal import ( + # LLMUserAggregatorParams, + # ) user_aggregator, assistant_aggregator = LLMContextAggregatorPair( context, realtime_service_mode=RealtimeServiceModeConfig(), + # user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()), ) # Build the pipeline diff --git a/src/pipecat/services/llm_service.py b/src/pipecat/services/llm_service.py index eb01d26fc..9c3a2158e 100644 --- a/src/pipecat/services/llm_service.py +++ b/src/pipecat/services/llm_service.py @@ -410,9 +410,9 @@ class LLMService(UserTurnCompletionLLMServiceMixin, AIService, Generic[TAdapter] "AudioBufferProcessor turn recording, UserIdleController, user " "mute strategies, voicemail detector) will not activate. To " "produce them locally, add `vad_analyzer=` to " - "LLMUserAggregatorParams. Note: local turn detection is a " - "heuristic; its boundaries may not match the provider's actual " - "server-side turn decisions and can desynchronize in subtle ways." + "LLMUserAggregatorParams. Note: local turn detection may not " + "match the provider's actual server-side turn decisions and " + "can desynchronize in subtle ways." ) async def stop(self, frame: EndFrame):