diff --git a/src/pipecat/processors/aggregators/llm_response_universal.py b/src/pipecat/processors/aggregators/llm_response_universal.py index 646c7df66..8259a304d 100644 --- a/src/pipecat/processors/aggregators/llm_response_universal.py +++ b/src/pipecat/processors/aggregators/llm_response_universal.py @@ -282,6 +282,18 @@ class RealtimeServiceModeConfig: path when local turn detection drives a realtime conversation. When True, turn-end strategies wait for transcripts to arrive before signalling end-of-turn. + + Note: + Local VAD (via ``LLMUserAggregatorParams.vad_analyzer``) is intended + for use with realtime services that either don't emit + ``UserStartedSpeakingFrame`` / ``UserStoppedSpeakingFrame`` + themselves (Gemini Live, AWS Nova Sonic, Ultravox) or have their + server-side turn detection disabled (e.g. OpenAI Realtime with + ``turn_detection=False``). Wiring local VAD on top of a service + whose server-side turn detection is also active produces duplicate + user-turn frames from both sources — the service broadcasts them, + and the aggregator's local-VAD-driven strategies broadcast them + again. Pick one source. """ context_writes_await_turns: bool = False diff --git a/src/pipecat/services/inworld/realtime/llm.py b/src/pipecat/services/inworld/realtime/llm.py index c151ee2ee..0b6aa0359 100644 --- a/src/pipecat/services/inworld/realtime/llm.py +++ b/src/pipecat/services/inworld/realtime/llm.py @@ -204,7 +204,10 @@ class InworldRealtimeLLMService(LLMService[InworldRealtimeLLMAdapter]): Emits ``UserStartedSpeakingFrame`` / ``UserStoppedSpeakingFrame`` from Inworld's server-side VAD events. Pair with ``LLMContextAggregatorPair(..., realtime_service_mode=RealtimeServiceModeConfig())`` - so context writes are decoupled from those frames. + so context writes are decoupled from those frames. If you wire local + VAD (``LLMUserAggregatorParams.vad_analyzer``) on top of this + service, disable Inworld's server-side turn detection first; + otherwise both sources broadcast duplicate user-turn frames. Example:: diff --git a/src/pipecat/services/openai/realtime/llm.py b/src/pipecat/services/openai/realtime/llm.py index 4ef98aa35..fee235a27 100644 --- a/src/pipecat/services/openai/realtime/llm.py +++ b/src/pipecat/services/openai/realtime/llm.py @@ -213,6 +213,12 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]): ``LLMContextAggregatorPair(..., realtime_service_mode=RealtimeServiceModeConfig())`` so context writes are decoupled from those frames; see the ``examples/realtime/realtime-openai.py`` example. + + If you wire local VAD (``LLMUserAggregatorParams.vad_analyzer``) on + top of this service, disable OpenAI's server-side turn detection + first (``turn_detection=False``); otherwise both sources broadcast + duplicate user-turn frames. See + ``examples/realtime/realtime-openai-local-vad.py``. """ Settings = OpenAIRealtimeLLMSettings diff --git a/src/pipecat/services/xai/realtime/llm.py b/src/pipecat/services/xai/realtime/llm.py index 0b9562fa9..6f60e6c2e 100644 --- a/src/pipecat/services/xai/realtime/llm.py +++ b/src/pipecat/services/xai/realtime/llm.py @@ -199,7 +199,10 @@ class GrokRealtimeLLMService(LLMService[GrokRealtimeLLMAdapter]): Emits ``UserStartedSpeakingFrame`` / ``UserStoppedSpeakingFrame`` from Grok's server-side VAD events. Pair with ``LLMContextAggregatorPair(..., realtime_service_mode=RealtimeServiceModeConfig())`` - so context writes are decoupled from those frames. + so context writes are decoupled from those frames. If you wire local + VAD (``LLMUserAggregatorParams.vad_analyzer``) on top of this + service, disable Grok's server-side turn detection first; otherwise + both sources broadcast duplicate user-turn frames. """ Settings = GrokRealtimeLLMSettings