From 86f9ad0c07c9927abaf3f9a33ecbef277d3d068e Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Thu, 21 May 2026 15:13:52 -0400
Subject: [PATCH] Show commented-out local-VAD opt-in in no-turn-frames
 examples
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For services that don't emit UserStarted/StoppedSpeakingFrame (Nova
Sonic, Gemini Live, Ultravox), the absence of those frames means
downstream consumers — including the Pipecat Prebuilt UI — can't group
user transcripts into discrete turns. The Tier 1 comment block already
called this out, but the fix required users to know to add the
SileroVADAnalyzer import + LLMUserAggregatorParams kwarg themselves.

Make it a copy-paste: include the relevant imports and `user_params=`
argument as commented-out code, with a comment explaining that they're
not strictly necessary for context aggregation but enable RTVI / turn-
dependent processors when needed. Mirror the wording used in the
LLMService startup log.

Also tighten the wording and fix line wrapping of the llm_service.py
startup log for the no-turn-frames case (a manual edit to that message
left the last line over-length).
---
 examples/realtime/realtime-aws-nova-sonic.py | 30 ++++++++++--------
 examples/realtime/realtime-gemini-live.py    | 32 +++++++++++++-------
 examples/realtime/realtime-ultravox.py       | 30 ++++++++++--------
 src/pipecat/services/llm_service.py          |  6 ++--
 4 files changed, 60 insertions(+), 38 deletions(-)

diff --git a/examples/realtime/realtime-aws-nova-sonic.py b/examples/realtime/realtime-aws-nova-sonic.py
index f0a010a6e..83123e8dc 100644
--- a/examples/realtime/realtime-aws-nova-sonic.py
+++ b/examples/realtime/realtime-aws-nova-sonic.py
@@ -148,24 +148,30 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
 
     # Set up context and context management.
     #
-    # AWS Nova Sonic drives the conversation server-side. It does NOT emit
-    # UserStartedSpeakingFrame / UserStoppedSpeakingFrame, so pipeline
-    # processors that depend on those frames — RTVI client speech events,
+    # AWS Nova Sonic drives the conversation server-side and does not emit
+    # UserStartedSpeakingFrame / UserStoppedSpeakingFrame. Context
+    # aggregation still works with realtime_service_mode, but pipeline
+    # processors that depend on those frames (RTVI client speech events,
     # TurnTrackingObserver, AudioBufferProcessor turn recording,
-    # UserIdleController, user mute strategies, voicemail detector — won't
-    # activate with the default server-VAD-only setup. Context aggregation
-    # still works with realtime_service_mode.
+    # UserIdleController, user mute strategies, voicemail detector) won't
+    # activate. The Pipecat Prebuilt UI is one such consumer — without
+    # these frames it can't group user transcripts into discrete turns
+    # visually.
     #
-    # To produce these frames locally, wire a VAD analyzer (e.g.
-    # SileroVADAnalyzer) into LLMUserAggregatorParams. Caveat: locally-
-    # generated turn boundaries are a heuristic and may not match Nova
-    # Sonic's server-side turn decisions, which is what drives the
-    # conversation; the two can drift apart in subtle ways especially
-    # around interruptions and overlapping speech.
+    # If you need those frames, uncomment the two imports and the
+    # `user_params=` argument below. Note: local turn detection may
+    # not match Nova Sonic's actual server-side turn decisions and
+    # can desynchronize in subtle ways.
+    #
+    # from pipecat.audio.vad.silero import SileroVADAnalyzer
+    # from pipecat.processors.aggregators.llm_response_universal import (
+    #     LLMUserAggregatorParams,
+    # )
     context = LLMContext(tools=tools)
     user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
         context,
         realtime_service_mode=RealtimeServiceModeConfig(),
+        # user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
     )
 
     # Build the pipeline
diff --git a/examples/realtime/realtime-gemini-live.py b/examples/realtime/realtime-gemini-live.py
index 426f12238..1d158e074 100644
--- a/examples/realtime/realtime-gemini-live.py
+++ b/examples/realtime/realtime-gemini-live.py
@@ -131,22 +131,32 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
     llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
 
     context = LLMContext()
-    # Gemini Live drives the conversation server-side. It does NOT emit
-    # UserStartedSpeakingFrame / UserStoppedSpeakingFrame, so pipeline
-    # processors that depend on those frames — RTVI client speech events,
+    # Gemini Live drives the conversation server-side and does not emit
+    # UserStartedSpeakingFrame / UserStoppedSpeakingFrame. Context
+    # aggregation still works with realtime_service_mode, but pipeline
+    # processors that depend on those frames (RTVI client speech events,
     # TurnTrackingObserver, AudioBufferProcessor turn recording,
-    # UserIdleController, user mute strategies, voicemail detector — won't
-    # activate with the default server-VAD-only setup. Context aggregation
-    # still works with realtime_service_mode.
+    # UserIdleController, user mute strategies, voicemail detector) won't
+    # activate. The Pipecat Prebuilt UI is one such consumer — without
+    # these frames it can't group user transcripts into discrete turns
+    # visually.
     #
-    # To produce these frames locally, see `realtime-gemini-live-local-vad.py`.
-    # Caveat: locally-generated turn boundaries are a heuristic and may not
-    # match Gemini Live's server-side turn decisions, which is what drives the
-    # conversation; the two can drift apart in subtle ways especially around
-    # interruptions and overlapping speech.
+    # If you need those frames, uncomment the two imports and the
+    # `user_params=` argument below. Note: local turn detection may
+    # not match Gemini Live's actual server-side turn decisions and
+    # can desynchronize in subtle ways.
+    #
+    # For local VAD driving the conversation (server VAD disabled), see
+    # `realtime-gemini-live-local-vad.py` instead.
+    #
+    # from pipecat.audio.vad.silero import SileroVADAnalyzer
+    # from pipecat.processors.aggregators.llm_response_universal import (
+    #     LLMUserAggregatorParams,
+    # )
     user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
         context,
         realtime_service_mode=RealtimeServiceModeConfig(),
+        # user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
     )
 
     pipeline = Pipeline(
diff --git a/examples/realtime/realtime-ultravox.py b/examples/realtime/realtime-ultravox.py
index 95760d525..6b2c1ee31 100644
--- a/examples/realtime/realtime-ultravox.py
+++ b/examples/realtime/realtime-ultravox.py
@@ -175,23 +175,29 @@ There is also a secret menu that changes daily. If the user asks about it, use t
 
     context = LLMContext([])
 
-    # Ultravox drives the conversation server-side. It does NOT emit
-    # UserStartedSpeakingFrame / UserStoppedSpeakingFrame, so pipeline
-    # processors that depend on those frames — RTVI client speech events,
+    # Ultravox drives the conversation server-side and does not emit
+    # UserStartedSpeakingFrame / UserStoppedSpeakingFrame. Context
+    # aggregation still works with realtime_service_mode, but pipeline
+    # processors that depend on those frames (RTVI client speech events,
     # TurnTrackingObserver, AudioBufferProcessor turn recording,
-    # UserIdleController, user mute strategies, voicemail detector — won't
-    # activate with this default setup. Context aggregation still works
-    # with realtime_service_mode.
+    # UserIdleController, user mute strategies, voicemail detector) won't
+    # activate. The Pipecat Prebuilt UI is one such consumer — without
+    # these frames it can't group user transcripts into discrete turns
+    # visually.
     #
-    # To produce these frames locally, wire a VAD analyzer (e.g.
-    # SileroVADAnalyzer) into LLMUserAggregatorParams. Caveat: locally-
-    # generated turn boundaries are a heuristic and may not match
-    # Ultravox's server-side turn decisions, which is what drives the
-    # conversation; the two can drift apart in subtle ways especially
-    # around interruptions and overlapping speech.
+    # If you need those frames, uncomment the two imports and the
+    # `user_params=` argument below. Note: local turn detection may
+    # not match Ultravox's actual server-side turn decisions and
+    # can desynchronize in subtle ways.
+    #
+    # from pipecat.audio.vad.silero import SileroVADAnalyzer
+    # from pipecat.processors.aggregators.llm_response_universal import (
+    #     LLMUserAggregatorParams,
+    # )
     user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
         context,
         realtime_service_mode=RealtimeServiceModeConfig(),
+        # user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
     )
 
     # Build the pipeline
diff --git a/src/pipecat/services/llm_service.py b/src/pipecat/services/llm_service.py
index eb01d26fc..9c3a2158e 100644
--- a/src/pipecat/services/llm_service.py
+++ b/src/pipecat/services/llm_service.py
@@ -410,9 +410,9 @@ class LLMService(UserTurnCompletionLLMServiceMixin, AIService, Generic[TAdapter]
                 "AudioBufferProcessor turn recording, UserIdleController, user "
                 "mute strategies, voicemail detector) will not activate. To "
                 "produce them locally, add `vad_analyzer=` to "
-                "LLMUserAggregatorParams. Note: local turn detection is a "
-                "heuristic; its boundaries may not match the provider's actual "
-                "server-side turn decisions and can desynchronize in subtle ways."
+                "LLMUserAggregatorParams. Note: local turn detection may not "
+                "match the provider's actual server-side turn decisions and "
+                "can desynchronize in subtle ways."
             )
 
     async def stop(self, frame: EndFrame):