Add sync_with_audio support for OutputImageRawFrame

Add a `sync_with_audio` field to `OutputImageRawFrame` that routes image
frames through the audio queue in the output transport, ensuring images
are only displayed after all preceding audio has been sent. This enables
proper audio/image synchronization in pipelines like the calendar month
narration example.

Update the 05-sync-speech-and-image example to use an `ImageAudioSync`
processor that sets this flag on image frames.
This commit is contained in:
Paul Kompfner
2026-03-13 15:15:45 -04:00
parent 1ede8460a2
commit 0f1ff16af1
3 changed files with 30 additions and 2 deletions

View File

@@ -16,6 +16,7 @@ from pipecat.frames.frames import (
Frame,
LLMContextFrame,
LLMFullResponseStartFrame,
OutputImageRawFrame,
TextFrame,
)
from pipecat.pipeline.pipeline import Pipeline
@@ -44,6 +45,18 @@ class MonthFrame(DataFrame):
return f"{self.name}(month: {self.month})"
class ImageAudioSync(FrameProcessor):
    """Flags output image frames so they are synchronized with audio playback.

    Every frame is forwarded unchanged in the direction it arrived; the only
    side effect is setting ``sync_with_audio`` on ``OutputImageRawFrame``
    instances before they are pushed onward.
    """

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Tag image frames for audio sync, then forward the frame.

        Args:
            frame: The frame travelling through the pipeline.
            direction: The direction the frame is travelling; it is reused
                unchanged when the frame is pushed onward.
        """
        # Let the base processor do its own handling of the frame first.
        await super().process_frame(frame, direction)

        is_output_image = isinstance(frame, OutputImageRawFrame)
        if is_output_image:
            # The flag is set on the existing frame object (no copy is made).
            frame.sync_with_audio = True

        # All frames, image or not, continue in their original direction.
        await self.push_frame(frame, direction)
class MonthPrepender(FrameProcessor):
def __init__(self):
super().__init__()
@@ -129,7 +142,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
sentence_aggregator, # Aggregates LLM output into full sentences
SyncParallelPipeline( # Run pipelines in parallel aggregating the result
[month_prepender, tts], # Create "Month: sentence" and output audio
[imagegen], # Generate image
[
imagegen, # Generate image
ImageAudioSync(), # Mark image as needing sync output w/audio
],
),
transport.output(), # Transport output
]

View File

@@ -274,8 +274,16 @@ class OutputImageRawFrame(DataFrame, ImageRawFrame):
An image that will be shown by the transport. If the transport supports
multiple video destinations (e.g. multiple video tracks) the destination
name can be specified in transport_destination.
Parameters:
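sync_with_audio: If True, the image is queued with audio frames so
it is only displayed after all preceding audio has been sent.
Defaults to False (image is displayed immediately when the output
transport receives it). This field is declared with ``init=False``,
so it cannot be passed to the constructor; set it on the frame
after creation.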
"""
sync_with_audio: bool = field(default=False, init=False)
def __str__(self):
    """Return a one-line summary of the frame.

    The summary contains the frame name, the presentation timestamp as
    rendered by ``format_pts``, the transport destination, the image size
    and the image format.
    """
    formatted_pts = format_pts(self.pts)
    # Split across two adjacent f-strings for line length only; the
    # resulting text is a single string identical to the one-line form.
    return (
        f"{self.name}(pts: {formatted_pts}, destination: {self.transport_destination}, "
        f"size: {self.size}, format: {self.format})"
    )

View File

@@ -569,7 +569,11 @@ class BaseOutputTransport(FrameProcessor):
if not self._params.video_out_enabled:
return
if self._params.video_out_is_live and isinstance(frame, OutputImageRawFrame):
if isinstance(frame, OutputImageRawFrame) and frame.sync_with_audio:
# Route through the audio queue so the image is only
# displayed after all preceding audio has been sent.
await self._audio_queue.put(frame)
elif self._params.video_out_is_live and isinstance(frame, OutputImageRawFrame):
await self._video_queue.put(frame)
elif isinstance(frame, OutputImageRawFrame):
await self._set_video_image(frame)