From be218e1941eb3cdb169c8cd78f89167faf247246 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Thu, 21 May 2026 12:19:24 -0400 Subject: [PATCH] Document the local-VAD-plus-server-VAD duplicate-frames caveat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Realtime services that emit their own UserStartedSpeakingFrame / UserStoppedSpeakingFrame (OpenAI Realtime, Azure Realtime, Inworld, Grok/xAI Realtime) also call broadcast_interruption() from server VAD events. Wiring local VAD on top — without first disabling the service's server-side turn detection — causes the aggregator's VAD-driven strategies to broadcast the same frames again, producing duplicates downstream (TurnTrackingObserver, RTVI, AudioBufferProcessor would see doubled events). This is pre-existing behavior on main, not introduced by this PR. But the realtime_service_mode "with local VAD" example invites the question, so call out the intended pattern explicitly. Update four docstrings: - RealtimeServiceModeConfig docstring: a Note section explaining that local VAD is intended for services without server-emitted turn frames OR services with server-side turn detection disabled, not for "both VADs on". - OpenAI Realtime, Inworld, Grok/xAI service docstrings: a one-sentence note that wiring local VAD requires disabling server-side turn detection first (with a pointer to the *-local-vad.py example for OpenAI Realtime). No code change — the duplicate behavior is documented as not-recommended rather than auto-suppressed. Auto-suppression via RealtimeServiceMetadataFrame.emits_user_turn_frames was considered but rejected for surprise-factor (users adding local VAD probably expect their VAD-driven frames to fire). 
--- .../processors/aggregators/llm_response_universal.py | 12 ++++++++++++ src/pipecat/services/inworld/realtime/llm.py | 5 ++++- src/pipecat/services/openai/realtime/llm.py | 6 ++++++ src/pipecat/services/xai/realtime/llm.py | 5 ++++- 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/pipecat/processors/aggregators/llm_response_universal.py b/src/pipecat/processors/aggregators/llm_response_universal.py index 646c7df66..8259a304d 100644 --- a/src/pipecat/processors/aggregators/llm_response_universal.py +++ b/src/pipecat/processors/aggregators/llm_response_universal.py @@ -282,6 +282,18 @@ class RealtimeServiceModeConfig: path when local turn detection drives a realtime conversation. When True, turn-end strategies wait for transcripts to arrive before signalling end-of-turn. + + Note: + Local VAD (via ``LLMUserAggregatorParams.vad_analyzer``) is intended + for use with realtime services that either don't emit + ``UserStartedSpeakingFrame`` / ``UserStoppedSpeakingFrame`` + themselves (Gemini Live, AWS Nova Sonic, Ultravox) or have their + server-side turn detection disabled (e.g. OpenAI Realtime with + ``turn_detection=False``). Wiring local VAD on top of a service + whose server-side turn detection is also active produces duplicate + user-turn frames from both sources — the service broadcasts them, + and the aggregator's local-VAD-driven strategies broadcast them + again. Pick one source. """ context_writes_await_turns: bool = False diff --git a/src/pipecat/services/inworld/realtime/llm.py b/src/pipecat/services/inworld/realtime/llm.py index c151ee2ee..0b6aa0359 100644 --- a/src/pipecat/services/inworld/realtime/llm.py +++ b/src/pipecat/services/inworld/realtime/llm.py @@ -204,7 +204,10 @@ class InworldRealtimeLLMService(LLMService[InworldRealtimeLLMAdapter]): Emits ``UserStartedSpeakingFrame`` / ``UserStoppedSpeakingFrame`` from Inworld's server-side VAD events. 
Pair with ``LLMContextAggregatorPair(..., realtime_service_mode=RealtimeServiceModeConfig())`` - so context writes are decoupled from those frames. + so context writes are decoupled from those frames. If you wire local + VAD (``LLMUserAggregatorParams.vad_analyzer``) on top of this + service, disable Inworld's server-side turn detection first; + otherwise both sources broadcast duplicate user-turn frames. Example:: diff --git a/src/pipecat/services/openai/realtime/llm.py b/src/pipecat/services/openai/realtime/llm.py index 4ef98aa35..fee235a27 100644 --- a/src/pipecat/services/openai/realtime/llm.py +++ b/src/pipecat/services/openai/realtime/llm.py @@ -213,6 +213,12 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]): ``LLMContextAggregatorPair(..., realtime_service_mode=RealtimeServiceModeConfig())`` so context writes are decoupled from those frames; see the ``examples/realtime/realtime-openai.py`` example. + + If you wire local VAD (``LLMUserAggregatorParams.vad_analyzer``) on + top of this service, disable OpenAI's server-side turn detection + first (``turn_detection=False``); otherwise both sources broadcast + duplicate user-turn frames. See + ``examples/realtime/realtime-openai-local-vad.py``. """ Settings = OpenAIRealtimeLLMSettings diff --git a/src/pipecat/services/xai/realtime/llm.py b/src/pipecat/services/xai/realtime/llm.py index 0b9562fa9..6f60e6c2e 100644 --- a/src/pipecat/services/xai/realtime/llm.py +++ b/src/pipecat/services/xai/realtime/llm.py @@ -199,7 +199,10 @@ class GrokRealtimeLLMService(LLMService[GrokRealtimeLLMAdapter]): Emits ``UserStartedSpeakingFrame`` / ``UserStoppedSpeakingFrame`` from Grok's server-side VAD events. Pair with ``LLMContextAggregatorPair(..., realtime_service_mode=RealtimeServiceModeConfig())`` - so context writes are decoupled from those frames. + so context writes are decoupled from those frames. 
If you wire local + VAD (``LLMUserAggregatorParams.vad_analyzer``) on top of this + service, disable Grok's server-side turn detection first; otherwise + both sources broadcast duplicate user-turn frames. """ Settings = GrokRealtimeLLMSettings