Show commented-out local-VAD opt-in in no-turn-frames examples

For services that don't emit UserStarted/StoppedSpeakingFrame (Nova Sonic, Gemini Live, Ultravox), the absence of those frames means downstream consumers — including the Pipecat Prebuilt UI — can't group user transcripts into discrete turns. The Tier 1 comment block already called this out, but the fix required users to know to add the SileroVADAnalyzer import + LLMUserAggregatorParams kwarg themselves. Make it a copy-paste: include the relevant imports and `user_params=` argument as commented-out code, with a comment explaining that they're not strictly necessary for context aggregation but enable RTVI / turn-dependent processors when needed. Mirror the wording used in the LLMService startup log. Also fix line wrapping in the llm_service.py startup log for the no-turn-frames case (manual edit to that message left the last line over-length).
2026-05-21 15:13:52 -04:00
parent cb9fe04e0b
commit 86f9ad0c07
4 changed files with 60 additions and 38 deletions
--- a/examples/realtime/realtime-aws-nova-sonic.py
+++ b/examples/realtime/realtime-aws-nova-sonic.py
@@ -148,24 +148,30 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):

    # Set up context and context management.
    #
-    # AWS Nova Sonic drives the conversation server-side. It does NOT emit
-    # UserStartedSpeakingFrame / UserStoppedSpeakingFrame, so pipeline
-    # processors that depend on those frames — RTVI client speech events,
+    # AWS Nova Sonic drives the conversation server-side and does not emit
+    # UserStartedSpeakingFrame / UserStoppedSpeakingFrame. Context
+    # aggregation still works with realtime_service_mode, but pipeline
+    # processors that depend on those frames (RTVI client speech events,
    # TurnTrackingObserver, AudioBufferProcessor turn recording,
-    # UserIdleController, user mute strategies, voicemail detector — won't
-    # activate with the default server-VAD-only setup. Context aggregation
-    # still works with realtime_service_mode.
+    # UserIdleController, user mute strategies, voicemail detector) won't
+    # activate. The Pipecat Prebuilt UI is one such consumer — without
+    # these frames it can't group user transcripts into discrete turns
+    # visually.
    #
-    # To produce these frames locally, wire a VAD analyzer (e.g.
-    # SileroVADAnalyzer) into LLMUserAggregatorParams. Caveat: locally-
-    # generated turn boundaries are a heuristic and may not match Nova
-    # Sonic's server-side turn decisions, which is what drives the
-    # conversation; the two can drift apart in subtle ways especially
-    # around interruptions and overlapping speech.
+    # If you need those frames, uncomment the two imports below and the
+    # `user_params=` argument in the aggregator call. Note: local turn
+    # detection may not match Nova Sonic's actual server-side turn
+    # decisions and can desynchronize in subtle ways.
+    #
+    # from pipecat.audio.vad.silero import SileroVADAnalyzer
+    # from pipecat.processors.aggregators.llm_response_universal import (
+    #     LLMUserAggregatorParams,
+    # )
    context = LLMContext(tools=tools)
    user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
        context,
        realtime_service_mode=RealtimeServiceModeConfig(),
+        # user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
    )

    # Build the pipeline
--- a/examples/realtime/realtime-gemini-live.py
+++ b/examples/realtime/realtime-gemini-live.py
@@ -131,22 +131,32 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)

    context = LLMContext()
-    # Gemini Live drives the conversation server-side. It does NOT emit
-    # UserStartedSpeakingFrame / UserStoppedSpeakingFrame, so pipeline
-    # processors that depend on those frames — RTVI client speech events,
+    # Gemini Live drives the conversation server-side and does not emit
+    # UserStartedSpeakingFrame / UserStoppedSpeakingFrame. Context
+    # aggregation still works with realtime_service_mode, but pipeline
+    # processors that depend on those frames (RTVI client speech events,
    # TurnTrackingObserver, AudioBufferProcessor turn recording,
-    # UserIdleController, user mute strategies, voicemail detector — won't
-    # activate with the default server-VAD-only setup. Context aggregation
-    # still works with realtime_service_mode.
+    # UserIdleController, user mute strategies, voicemail detector) won't
+    # activate. The Pipecat Prebuilt UI is one such consumer — without
+    # these frames it can't group user transcripts into discrete turns
+    # visually.
    #
-    # To produce these frames locally, see `realtime-gemini-live-local-vad.py`.
-    # Caveat: locally-generated turn boundaries are a heuristic and may not
-    # match Gemini Live's server-side turn decisions, which is what drives the
-    # conversation; the two can drift apart in subtle ways especially around
-    # interruptions and overlapping speech.
+    # If you need those frames, uncomment the two imports below and the
+    # `user_params=` argument in the aggregator call. Note: local turn
+    # detection may not match Gemini Live's actual server-side turn
+    # decisions and can desynchronize in subtle ways.
+    #
+    # For local VAD driving the conversation (server VAD disabled), see
+    # `realtime-gemini-live-local-vad.py` instead.
+    #
+    # from pipecat.audio.vad.silero import SileroVADAnalyzer
+    # from pipecat.processors.aggregators.llm_response_universal import (
+    #     LLMUserAggregatorParams,
+    # )
    user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
        context,
        realtime_service_mode=RealtimeServiceModeConfig(),
+        # user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
    )

    pipeline = Pipeline(
--- a/examples/realtime/realtime-ultravox.py
+++ b/examples/realtime/realtime-ultravox.py
@@ -175,23 +175,29 @@ There is also a secret menu that changes daily. If the user asks about it, use t

    context = LLMContext([])

-    # Ultravox drives the conversation server-side. It does NOT emit
-    # UserStartedSpeakingFrame / UserStoppedSpeakingFrame, so pipeline
-    # processors that depend on those frames — RTVI client speech events,
+    # Ultravox drives the conversation server-side and does not emit
+    # UserStartedSpeakingFrame / UserStoppedSpeakingFrame. Context
+    # aggregation still works with realtime_service_mode, but pipeline
+    # processors that depend on those frames (RTVI client speech events,
    # TurnTrackingObserver, AudioBufferProcessor turn recording,
-    # UserIdleController, user mute strategies, voicemail detector — won't
-    # activate with this default setup. Context aggregation still works
-    # with realtime_service_mode.
+    # UserIdleController, user mute strategies, voicemail detector) won't
+    # activate. The Pipecat Prebuilt UI is one such consumer — without
+    # these frames it can't group user transcripts into discrete turns
+    # visually.
    #
-    # To produce these frames locally, wire a VAD analyzer (e.g.
-    # SileroVADAnalyzer) into LLMUserAggregatorParams. Caveat: locally-
-    # generated turn boundaries are a heuristic and may not match
-    # Ultravox's server-side turn decisions, which is what drives the
-    # conversation; the two can drift apart in subtle ways especially
-    # around interruptions and overlapping speech.
+    # If you need those frames, uncomment the two imports below and the
+    # `user_params=` argument in the aggregator call. Note: local turn
+    # detection may not match Ultravox's actual server-side turn
+    # decisions and can desynchronize in subtle ways.
+    #
+    # from pipecat.audio.vad.silero import SileroVADAnalyzer
+    # from pipecat.processors.aggregators.llm_response_universal import (
+    #     LLMUserAggregatorParams,
+    # )
    user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
        context,
        realtime_service_mode=RealtimeServiceModeConfig(),
+        # user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
    )

    # Build the pipeline
--- a/src/pipecat/services/llm_service.py
+++ b/src/pipecat/services/llm_service.py
@@ -410,9 +410,9 @@ class LLMService(UserTurnCompletionLLMServiceMixin, AIService, Generic[TAdapter]
                "AudioBufferProcessor turn recording, UserIdleController, user "
                "mute strategies, voicemail detector) will not activate. To "
                "produce them locally, add `vad_analyzer=` to "
-                "LLMUserAggregatorParams. Note: local turn detection is a "
-                "heuristic; its boundaries may not match the provider's actual "
-                "server-side turn decisions and can desynchronize in subtle ways."
+                "LLMUserAggregatorParams. Note: local turn detection may not "
+                "match the provider's actual server-side turn decisions and "
+                "can desynchronize in subtle ways."
            )

    async def stop(self, frame: EndFrame):