From be218e1941eb3cdb169c8cd78f89167faf247246 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Thu, 21 May 2026 12:19:24 -0400
Subject: [PATCH] Document the local-VAD-plus-server-VAD duplicate-frames
 caveat
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Realtime services that emit their own UserStartedSpeakingFrame /
UserStoppedSpeakingFrame (OpenAI Realtime, Azure Realtime, Inworld,
Grok/xAI Realtime) also call broadcast_interruption() from server VAD
events. Wiring local VAD on top — without first disabling the service's
server-side turn detection — causes the aggregator's VAD-driven
strategies to broadcast the same frames again, producing duplicates
downstream (TurnTrackingObserver, RTVI, AudioBufferProcessor would see
doubled events).

This is pre-existing behavior on main, not introduced by this PR. But
the realtime_service_mode "with local VAD" example invites the
question, so call out the intended pattern explicitly. Update four
places:

  - RealtimeServiceModeConfig docstring: a Note section explaining
    that local VAD is intended for services without server-emitted
    turn frames OR services with server-side turn detection disabled,
    not for "both VADs on".
  - OpenAI Realtime, Inworld, Grok/xAI service docstrings: a short
    note that wiring local VAD requires disabling server-side turn
    detection first (with a pointer to the *-local-vad.py example for
    OpenAI Realtime).

No behavior change (docstrings only) — the duplicate behavior is
documented as not recommended rather than auto-suppressed.
Auto-suppression via RealtimeServiceMetadataFrame.emits_user_turn_frames
was considered but rejected because it would be surprising: users who
add local VAD probably expect their VAD-driven frames to fire.
---
 .../processors/aggregators/llm_response_universal.py | 12 ++++++++++++
 src/pipecat/services/inworld/realtime/llm.py         |  5 ++++-
 src/pipecat/services/openai/realtime/llm.py          |  6 ++++++
 src/pipecat/services/xai/realtime/llm.py             |  5 ++++-
 4 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/pipecat/processors/aggregators/llm_response_universal.py b/src/pipecat/processors/aggregators/llm_response_universal.py
index 646c7df66..8259a304d 100644
--- a/src/pipecat/processors/aggregators/llm_response_universal.py
+++ b/src/pipecat/processors/aggregators/llm_response_universal.py
@@ -282,6 +282,18 @@ class RealtimeServiceModeConfig:
             path when local turn detection drives a realtime conversation.
             When True, turn-end strategies wait for transcripts to arrive
             before signalling end-of-turn.
+
+    Note:
+        Local VAD (via ``LLMUserAggregatorParams.vad_analyzer``) is intended
+        for use with realtime services that either don't emit
+        ``UserStartedSpeakingFrame`` / ``UserStoppedSpeakingFrame``
+        themselves (Gemini Live, AWS Nova Sonic, Ultravox) or have their
+        server-side turn detection disabled (e.g. OpenAI Realtime with
+        ``turn_detection=False``). Wiring local VAD on top of a service
+        whose server-side turn detection is also active produces duplicate
+        user-turn frames from both sources — the service broadcasts them,
+        and the aggregator's local-VAD-driven strategies broadcast them
+        again. Pick one source.
     """
 
     context_writes_await_turns: bool = False
diff --git a/src/pipecat/services/inworld/realtime/llm.py b/src/pipecat/services/inworld/realtime/llm.py
index c151ee2ee..0b6aa0359 100644
--- a/src/pipecat/services/inworld/realtime/llm.py
+++ b/src/pipecat/services/inworld/realtime/llm.py
@@ -204,7 +204,10 @@ class InworldRealtimeLLMService(LLMService[InworldRealtimeLLMAdapter]):
     Emits ``UserStartedSpeakingFrame`` / ``UserStoppedSpeakingFrame`` from
     Inworld's server-side VAD events. Pair with
     ``LLMContextAggregatorPair(..., realtime_service_mode=RealtimeServiceModeConfig())``
-    so context writes are decoupled from those frames.
+    so context writes are decoupled from those frames. If you wire local
+    VAD (``LLMUserAggregatorParams.vad_analyzer``) on top of this
+    service, disable Inworld's server-side turn detection first;
+    otherwise both sources broadcast duplicate user-turn frames.
 
     Example::
 
diff --git a/src/pipecat/services/openai/realtime/llm.py b/src/pipecat/services/openai/realtime/llm.py
index 4ef98aa35..fee235a27 100644
--- a/src/pipecat/services/openai/realtime/llm.py
+++ b/src/pipecat/services/openai/realtime/llm.py
@@ -213,6 +213,12 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
     ``LLMContextAggregatorPair(..., realtime_service_mode=RealtimeServiceModeConfig())``
     so context writes are decoupled from those frames; see the
     ``examples/realtime/realtime-openai.py`` example.
+
+    If you wire local VAD (``LLMUserAggregatorParams.vad_analyzer``) on
+    top of this service, disable OpenAI's server-side turn detection
+    first (``turn_detection=False``); otherwise both sources broadcast
+    duplicate user-turn frames. See
+    ``examples/realtime/realtime-openai-local-vad.py``.
     """
 
     Settings = OpenAIRealtimeLLMSettings
diff --git a/src/pipecat/services/xai/realtime/llm.py b/src/pipecat/services/xai/realtime/llm.py
index 0b9562fa9..6f60e6c2e 100644
--- a/src/pipecat/services/xai/realtime/llm.py
+++ b/src/pipecat/services/xai/realtime/llm.py
@@ -199,7 +199,10 @@ class GrokRealtimeLLMService(LLMService[GrokRealtimeLLMAdapter]):
     Emits ``UserStartedSpeakingFrame`` / ``UserStoppedSpeakingFrame`` from
     Grok's server-side VAD events. Pair with
     ``LLMContextAggregatorPair(..., realtime_service_mode=RealtimeServiceModeConfig())``
-    so context writes are decoupled from those frames.
+    so context writes are decoupled from those frames. If you wire local
+    VAD (``LLMUserAggregatorParams.vad_analyzer``) on top of this
+    service, disable Grok's server-side turn detection first; otherwise
+    both sources broadcast duplicate user-turn frames.
     """
 
     Settings = GrokRealtimeLLMSettings