Add sync_with_audio support for OutputImageRawFrame

Add a `sync_with_audio` field to `OutputImageRawFrame`. When the field is set, the output transport routes the image frame through its audio queue, so the image is only displayed after all preceding audio has been sent. This enables proper audio/image synchronization in pipelines like the calendar month narration example. Update the 05-sync-speech-and-image example to use an `ImageAudioSync` processor that sets this flag on image frames.
2026-03-13 15:15:45 -04:00
parent 1ede8460a2
commit 0f1ff16af1
3 changed files with 30 additions and 2 deletions
--- a/examples/foundational/05-sync-speech-and-image.py
+++ b/examples/foundational/05-sync-speech-and-image.py
@@ -16,6 +16,7 @@ from pipecat.frames.frames import (
    Frame,
    LLMContextFrame,
    LLMFullResponseStartFrame,
+    OutputImageRawFrame,
    TextFrame,
 )
 from pipecat.pipeline.pipeline import Pipeline
@@ -44,6 +45,18 @@ class MonthFrame(DataFrame):
        return f"{self.name}(month: {self.month})"


+class ImageAudioSync(FrameProcessor):
+    """Sets sync_with_audio=True on each OutputImageRawFrame, then pushes every frame onward."""
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, OutputImageRawFrame):
+            frame.sync_with_audio = True  # output transport then routes it via its audio queue
+
+        await self.push_frame(frame, direction)
+
+
 class MonthPrepender(FrameProcessor):
    def __init__(self):
        super().__init__()
@@ -129,7 +142,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
                sentence_aggregator,  # Aggregates LLM output into full sentences
                SyncParallelPipeline(  # Run pipelines in parallel aggregating the result
                    [month_prepender, tts],  # Create "Month: sentence" and output audio
-                    [imagegen],  # Generate image
+                    [
+                        imagegen,  # Generate image
+                        ImageAudioSync(),  # Mark image as needing sync output w/audio
+                    ],
                ),
                transport.output(),  # Transport output
            ]
--- a/src/pipecat/frames/frames.py
+++ b/src/pipecat/frames/frames.py
@@ -274,8 +274,16 @@ class OutputImageRawFrame(DataFrame, ImageRawFrame):
    An image that will be shown by the transport. If the transport supports
    multiple video destinations (e.g. multiple video tracks) the destination
    name can be specified in transport_destination.
+
+    Parameters:
+        sync_with_audio: If True, the image is queued with audio frames so
+            it is only displayed after all preceding audio has been sent.
+            Defaults to False (image is displayed immediately when the output
+            transport receives it).
    """

+    sync_with_audio: bool = field(default=False, init=False)
+
    def __str__(self):
        pts = format_pts(self.pts)
        return f"{self.name}(pts: {pts}, destination: {self.transport_destination}, size: {self.size}, format: {self.format})"
--- a/src/pipecat/transports/base_output.py
+++ b/src/pipecat/transports/base_output.py
@@ -569,7 +569,11 @@ class BaseOutputTransport(FrameProcessor):
            if not self._params.video_out_enabled:
                return

-            if self._params.video_out_is_live and isinstance(frame, OutputImageRawFrame):
+            if isinstance(frame, OutputImageRawFrame) and frame.sync_with_audio:
+                # Route through the audio queue so the image is only
+                # displayed after all preceding audio has been sent.
+                await self._audio_queue.put(frame)
+            elif self._params.video_out_is_live and isinstance(frame, OutputImageRawFrame):
                await self._video_queue.put(frame)
            elif isinstance(frame, OutputImageRawFrame):
                await self._set_video_image(frame)