From 0f1ff16af12e26e9cf05e4c6b2add398ff6f96b5 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Fri, 13 Mar 2026 15:15:45 -0400
Subject: [PATCH] Add sync_with_audio support for OutputImageRawFrame

Add a `sync_with_audio` field to `OutputImageRawFrame` that routes image
frames through the audio queue in the output transport, ensuring images
are only displayed after all preceding audio has been sent. This enables
proper audio/image synchronization in pipelines like the calendar month
narration example.

Update the 05-sync-speech-and-image example to use an `ImageAudioSync`
processor that sets this flag on image frames.
---
 .../foundational/05-sync-speech-and-image.py   | 18 +++++++++++++++++-
 src/pipecat/frames/frames.py                   |  8 ++++++++
 src/pipecat/transports/base_output.py          |  6 +++++-
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/examples/foundational/05-sync-speech-and-image.py b/examples/foundational/05-sync-speech-and-image.py
index 1e4cf34a4..35f8f411c 100644
--- a/examples/foundational/05-sync-speech-and-image.py
+++ b/examples/foundational/05-sync-speech-and-image.py
@@ -16,6 +16,7 @@ from pipecat.frames.frames import (
     Frame,
     LLMContextFrame,
     LLMFullResponseStartFrame,
+    OutputImageRawFrame,
     TextFrame,
 )
 from pipecat.pipeline.pipeline import Pipeline
@@ -44,6 +45,18 @@ class MonthFrame(DataFrame):
         return f"{self.name}(month: {self.month})"
 
 
+class ImageAudioSync(FrameProcessor):
+    """Marks output image frames to be synchronized with audio playback."""
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, OutputImageRawFrame):
+            frame.sync_with_audio = True
+
+        await self.push_frame(frame, direction)
+
+
 class MonthPrepender(FrameProcessor):
     def __init__(self):
         super().__init__()
@@ -129,7 +142,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
                 sentence_aggregator,  # Aggregates LLM output into full sentences
                 SyncParallelPipeline(  # Run pipelines in parallel aggregating the result
                     [month_prepender, tts],  # Create "Month: sentence" and output audio
-                    [imagegen],  # Generate image
+                    [
+                        imagegen,  # Generate image
+                        ImageAudioSync(),  # Mark image to be output in sync with audio
+                    ],
                 ),
                 transport.output(),  # Transport output
             ]
diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py
index fbc01c488..7107cfd97 100644
--- a/src/pipecat/frames/frames.py
+++ b/src/pipecat/frames/frames.py
@@ -274,8 +274,16 @@ class OutputImageRawFrame(DataFrame, ImageRawFrame):
     An image that will be shown by the transport. If the transport supports
     multiple video destinations (e.g. multiple video tracks) the destination
     name can be specified in transport_destination.
+
+    Parameters:
+        sync_with_audio: If True, the image is queued with audio frames so
+            it is only displayed after all preceding audio has been sent.
+            Defaults to False (image bypasses the audio queue and goes
+            straight to the transport's regular video output path).
     """
 
+    sync_with_audio: bool = field(default=False, init=False)
+
     def __str__(self):
         pts = format_pts(self.pts)
         return f"{self.name}(pts: {pts}, destination: {self.transport_destination}, size: {self.size}, format: {self.format})"
diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py
index e14ae3828..01af97be8 100644
--- a/src/pipecat/transports/base_output.py
+++ b/src/pipecat/transports/base_output.py
@@ -569,7 +569,11 @@ class BaseOutputTransport(FrameProcessor):
             if not self._params.video_out_enabled:
                 return
 
-            if self._params.video_out_is_live and isinstance(frame, OutputImageRawFrame):
+            if isinstance(frame, OutputImageRawFrame) and frame.sync_with_audio:
+                # Route through the audio queue so the image is only
+                # displayed after all preceding audio has been sent.
+                await self._audio_queue.put(frame)
+            elif self._params.video_out_is_live and isinstance(frame, OutputImageRawFrame):
                 await self._video_queue.put(frame)
             elif isinstance(frame, OutputImageRawFrame):
                 await self._set_video_image(frame)