From 0f1ff16af12e26e9cf05e4c6b2add398ff6f96b5 Mon Sep 17 00:00:00 2001
From: Paul Kompfner
Date: Fri, 13 Mar 2026 15:15:45 -0400
Subject: [PATCH] Add sync_with_audio support for OutputImageRawFrame

Add a `sync_with_audio` field to `OutputImageRawFrame` that routes image
frames through the audio queue in the output transport, ensuring images
are only displayed after all preceding audio has been sent. This enables
proper audio/image synchronization in pipelines like the calendar month
narration example.

Update the 05-sync-speech-and-image example to use an `ImageAudioSync`
processor that sets this flag on image frames.
---
 .../foundational/05-sync-speech-and-image.py   | 18 +++++++++++++++++-
 src/pipecat/frames/frames.py                   |  8 ++++++++
 src/pipecat/transports/base_output.py          |  6 +++++-
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/examples/foundational/05-sync-speech-and-image.py b/examples/foundational/05-sync-speech-and-image.py
index 1e4cf34a4..35f8f411c 100644
--- a/examples/foundational/05-sync-speech-and-image.py
+++ b/examples/foundational/05-sync-speech-and-image.py
@@ -16,6 +16,7 @@ from pipecat.frames.frames import (
     Frame,
     LLMContextFrame,
     LLMFullResponseStartFrame,
+    OutputImageRawFrame,
     TextFrame,
 )
 from pipecat.pipeline.pipeline import Pipeline
@@ -44,6 +45,18 @@ class MonthFrame(DataFrame):
         return f"{self.name}(month: {self.month})"
 
 
+class ImageAudioSync(FrameProcessor):
+    """Marks output image frames to be synchronized with audio playback."""
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, OutputImageRawFrame):
+            frame.sync_with_audio = True
+
+        await self.push_frame(frame, direction)
+
+
 class MonthPrepender(FrameProcessor):
     def __init__(self):
         super().__init__()
@@ -129,7 +142,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
             sentence_aggregator,  # Aggregates LLM output into full sentences
             SyncParallelPipeline(  # Run pipelines in parallel aggregating the result
                 [month_prepender, tts],  # Create "Month: sentence" and output audio
-                [imagegen],  # Generate image
+                [
+                    imagegen,  # Generate image
+                    ImageAudioSync(),  # Mark image as needing sync output w/audio
+                ],
             ),
             transport.output(),  # Transport output
         ]
diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py
index fbc01c488..7107cfd97 100644
--- a/src/pipecat/frames/frames.py
+++ b/src/pipecat/frames/frames.py
@@ -274,8 +274,16 @@ class OutputImageRawFrame(DataFrame, ImageRawFrame):
     An image that will be shown by the transport. If the transport supports
     multiple video destinations (e.g. multiple video tracks) the destination
     name can be specified in transport_destination.
+
+    Parameters:
+        sync_with_audio: If True, the image is queued with audio frames so
+            it is only displayed after all preceding audio has been sent.
+            Defaults to False (image is displayed immediately when the output
+            transport receives it).
     """
 
+    sync_with_audio: bool = field(default=False, init=False)
+
     def __str__(self):
         pts = format_pts(self.pts)
         return f"{self.name}(pts: {pts}, destination: {self.transport_destination}, size: {self.size}, format: {self.format})"
diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py
index e14ae3828..01af97be8 100644
--- a/src/pipecat/transports/base_output.py
+++ b/src/pipecat/transports/base_output.py
@@ -569,7 +569,11 @@ class BaseOutputTransport(FrameProcessor):
             if not self._params.video_out_enabled:
                 return
 
-            if self._params.video_out_is_live and isinstance(frame, OutputImageRawFrame):
+            if isinstance(frame, OutputImageRawFrame) and frame.sync_with_audio:
+                # Route through the audio queue so the image is only
+                # displayed after all preceding audio has been sent.
+                await self._audio_queue.put(frame)
+            elif self._params.video_out_is_live and isinstance(frame, OutputImageRawFrame):
                 await self._video_queue.put(frame)
             elif isinstance(frame, OutputImageRawFrame):
                 await self._set_video_image(frame)