explicit MetadataFrame

wip
2025-01-31 19:27:01 +00:00 · 2025-01-31 19:27:01 +00:00
4 changed files with 120 additions and 4 deletions
--- a/examples/storytelling-chatbot/src/bot.py
+++ b/examples/storytelling-chatbot/src/bot.py
@@ -12,19 +12,33 @@ import sys
 import aiohttp
 from dotenv import load_dotenv
 from loguru import logger
-from processors import StoryImageProcessor, StoryProcessor
+from processors import StoryImageFrame, StoryImageProcessor, StoryPageFrame, StoryProcessor
 from prompts import CUE_USER_TURN, LLM_BASE_PROMPT
 from utils.helpers import load_images, load_sounds
 from pipecat.audio.vad.silero import SileroVADAnalyzer
-from pipecat.frames.frames import EndFrame
+from pipecat.frames.frames import (
    AudioRawFrame,
    EndFrame,
    Frame,
    ImageRawFrame,
    MetadataFrame,
    SystemFrame,
    TextFrame,
    TTSStartedFrame,
    TTSStoppedFrame,
 )
 from pipecat.observers.base_observer import BaseObserver
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.sync_parallel_pipeline import SyncParallelPipeline
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.processors.logger import FrameLogger
 from pipecat.services.elevenlabs import ElevenLabsTTSService
 from pipecat.services.fal import FalImageGenService
-from pipecat.services.google import GoogleLLMService
+from pipecat.services.google import GoogleImageGenService, GoogleLLMService
 from pipecat.transports.services.daily import (
    DailyParams,
    DailyTransport,
@@ -40,6 +54,62 @@ sounds = load_sounds(["listening.wav"])
 images = load_images(["book1.png", "book2.png"])
 class DebugObserver(BaseObserver):
    """Observer that logs selected frame types to the console as they are pushed.

    Logs every pushed frame that is an instance of one of these types (checked
    in this order; the first match decides the label):

    - ImageRawFrame
    - StoryPageFrame
    - StoryImageFrame
    - TextFrame
    - TTSStartedFrame
    - TTSStoppedFrame
    - MetadataFrame

    Frames of any other type are ignored.

    This allows you to see the frame flow from processor to processor through the pipeline for these frames.

    Log format: ⚡ [LABEL]: [source processor] [arrow] [destination processor] at [timestamp]s, metadata: [frame metadata]
    """

    # (frame type, log label) pairs, tested top to bottom with isinstance().
    # The story frames are listed ahead of TextFrame so that they keep their
    # own label -- presumably they derive from TextFrame; confirm in
    # processors.py before reordering.
    _LOGGED_FRAME_TYPES = (
        (ImageRawFrame, "RAW IMAGE FRAME"),
        (StoryPageFrame, "STORY PAGE FRAME"),
        (StoryImageFrame, "STORY IMAGE FRAME"),
        (TextFrame, "TEXT FRAME"),
        (TTSStartedFrame, "TTS STARTED FRAME"),
        (TTSStoppedFrame, "TTS STOPPED FRAME"),
        (MetadataFrame, "METADATA FRAME"),
    )

    async def on_push_frame(
        self,
        src: FrameProcessor,
        dst: FrameProcessor,
        frame: Frame,
        direction: FrameDirection,
        timestamp: int,
    ):
        """Log one line for `frame` if it is one of the tracked frame types.

        Args:
            src: Processor that pushed the frame.
            dst: Processor the frame is being pushed to.
            frame: The frame being pushed.
            direction: Push direction; selects the arrow shown in the log line.
            timestamp: Push time; divided by 1e9 to display seconds.
        """
        # First matching entry wins, mirroring an if/elif chain.
        label = next(
            (
                name
                for frame_type, name in self._LOGGED_FRAME_TYPES
                if isinstance(frame, frame_type)
            ),
            None,
        )
        if label is None:
            return

        # Convert timestamp to seconds for readability
        time_sec = timestamp / 1_000_000_000
        # Create direction arrow
        arrow = "→" if direction == FrameDirection.DOWNSTREAM else "←"
        logger.info(
            f"⚡ {label}: {src} {arrow} {dst} at {time_sec:.2f}s, metadata: {frame.metadata}"
        )
 async def main(room_url, token=None):
    async with aiohttp.ClientSession() as session:
        # -------------- Transport --------------- #
@@ -90,13 +160,18 @@ async def main(room_url, token=None):
        runner = PipelineRunner()
        logger.debug("Waiting for participant...")
        after = FrameLogger("After", "red", ignored_frame_types=[SystemFrame, AudioRawFrame])
        before = FrameLogger("Before", "cyan", ignored_frame_types=[SystemFrame, AudioRawFrame])
        main_pipeline = Pipeline(
            [
                transport.input(),
                context_aggregator.user(),
                llm_service,
                story_processor,
                # SyncParallelPipeline([image_processor], [tts_service]),
                before,
                image_processor,
                after,
                tts_service,
                transport.output(),
                context_aggregator.assistant(),
@@ -109,6 +184,7 @@ async def main(room_url, token=None):
                allow_interruptions=True,
                enable_metrics=True,
                enable_usage_metrics=True,
                observers=[DebugObserver()],
            ),
        )
--- a/examples/storytelling-chatbot/src/processors.py
+++ b/examples/storytelling-chatbot/src/processors.py
@@ -17,6 +17,7 @@ from pipecat.frames.frames import (
    Frame,
    LLMFullResponseEndFrame,
    LLMMessagesFrame,
    MetadataFrame,
    TextFrame,
    UserStoppedSpeakingFrame,
 )
@@ -75,6 +76,7 @@ class StoryImageProcessor(FrameProcessor):
        if isinstance(frame, StoryPageFrame):
            # Special syntax for the first page
            print(f"!!! generating image for story page frame # {frame.metadata['story_page_id']}")
            if self.pages == []:
                prompt = FIRST_IMAGE_PROMPT % frame.text
            else:
@@ -98,10 +100,14 @@ class StoryImageProcessor(FrameProcessor):
                    async for i in self._image_gen_service.run_image_gen(
                        IMAGE_GEN_PROMPT % image_description
                    ):
                        print(
                            f"@@@ about to push a storyimageframe, input metadata is {self._input_frame_metadata}"
                        )
                        await self.push_frame(i)
            except TimeoutError:
                logger.debug("Image gen timeout")
                pass
            print(f"### past image gen try block, source frame is {frame.name}")
            await self.stop_ttfb_metrics()
            # Push the StoryPageFrame so it gets TTS
            await self.push_frame(frame)
@@ -128,6 +134,7 @@ class StoryProcessor(FrameProcessor):
        self._messages = messages
        self._text = ""
        self._story = story
        self._current_page = 0
    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)
@@ -148,6 +155,7 @@ class StoryProcessor(FrameProcessor):
        # Driven by the prompt, the LLM should have asked the user for input
        elif isinstance(frame, LLMFullResponseEndFrame):
            # We use a different frame type, as to avoid image generation ingest
            await self.push_frame(MetadataFrame())
            await self.push_frame(StoryPromptFrame(self._text))
            self._text = ""
            await self.push_frame(frame)
@@ -187,7 +195,14 @@ class StoryProcessor(FrameProcessor):
                if len(before_break) > 2:
                    self._story.append(before_break)
-                    await self.push_frame(StoryPageFrame(before_break))
+                    mf = MetadataFrame()
                    mf.metadata = {"story_page_id": self._current_page}
                    await self.push_frame(mf)
                    spf = StoryPageFrame(before_break)
                    spf.metadata["story_page_id"] = self._current_page
                    self._current_page += 1
                    await self.push_frame(spf)
                    # await self.push_frame(sounds["ding"])
                    await self.push_frame(DailyTransportMessageFrame(CUE_ASSISTANT_TURN))
--- a/src/pipecat/frames/frames.py
+++ b/src/pipecat/frames/frames.py
@@ -91,6 +91,13 @@ class ControlFrame(Frame):
    pass
@dataclass
 class MetadataFrame(ControlFrame):
    """Used to set default metadata for downstream processors to apply to newly
    created frames such as a frame_group_id.

    When a FrameProcessor processes a MetadataFrame it records the frame's
    ``metadata`` dict. That recorded dict is then merged into the ``metadata``
    of each non-system frame the processor subsequently queues for pushing;
    keys already present on the outgoing frame take precedence over the
    recorded defaults.
    """
 #
 # Mixins
 #
--- a/src/pipecat/processors/frame_processor.py
+++ b/src/pipecat/processors/frame_processor.py
@@ -16,6 +16,7 @@ from pipecat.frames.frames import (
    CancelFrame,
    ErrorFrame,
    Frame,
    MetadataFrame,
    StartFrame,
    StartInterruptionFrame,
    StopInterruptionFrame,
@@ -84,6 +85,9 @@ class FrameProcessor:
        # exception to this rule. This create this task.
        self.__push_frame_task: Optional[asyncio.Task] = None
        # This enables an input frame's metadata to be copied to output frames
        self._input_frame_metadata = {}
    @property
    def id(self) -> int:
        return self._id
@@ -224,6 +228,12 @@ class FrameProcessor:
        self.__input_event.set()
    async def process_frame(self, frame: Frame, direction: FrameDirection):
        # System frames skip the queue and blow up determinism
        if isinstance(frame, MetadataFrame):
            self._input_frame_metadata = frame.metadata
            # print(
            #     f"!!! PROCESS: I am {self._name}, input frame is a {frame.name}, metadata is {self._input_frame_metadata}"
            # )
        if isinstance(frame, StartFrame):
            self._clock = frame.clock
            self._task_manager = frame.task_manager
@@ -248,6 +258,14 @@ class FrameProcessor:
        if isinstance(frame, SystemFrame):
            await self.__internal_push_frame(frame, direction)
        else:
            # print(
            #     f"!!! PUSH: I am {self._name}, input frame is a {frame.name}, combining input frame metadata: {self._input_frame_metadata} with frame metadata: {frame.metadata}"
            # )
            new_metadata = self._input_frame_metadata | frame.metadata
            frame.metadata = new_metadata
            # print(
            #     f"!!! PUSH2: I am {self._name}, input frame is a {frame.name}, frame metadata is now {frame.metadata}"
            # )
            await self.__push_queue.put((frame, direction))
    def event_handler(self, event_name: str):
Author	SHA1	Message	Date
Chad Bailey	0369733f9c	explicit MetadataFrame	2025-01-31 19:27:01 +00:00
Chad Bailey	74b85a450f	wip	2025-01-31 19:27:01 +00:00