From d3d50ac5805d5adc4da9e0e17d7d9dd9d245eaf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Wed, 17 Dec 2025 10:34:54 -0800 Subject: [PATCH 1/2] frames: added vision response and text frames --- changelog/3252.added.md | 2 ++ src/pipecat/frames/frames.py | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 changelog/3252.added.md diff --git a/changelog/3252.added.md b/changelog/3252.added.md new file mode 100644 index 000000000..c6f85b713 --- /dev/null +++ b/changelog/3252.added.md @@ -0,0 +1,2 @@ +- Added `VisionFullResponseStartFrame`, `VisionFullResponseEndFrame` and + `VisionTextFrame`. These are used by vision services similar to LLM services. diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 8d2a51333..998ca3cf0 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -400,6 +400,13 @@ class AggregatedTextFrame(TextFrame): aggregated_by: AggregationType | str +@dataclass +class VisionTextFrame(LLMTextFrame): + """Text frame generated by vision services.""" + + pass + + @dataclass class TTSTextFrame(AggregatedTextFrame): """Text frame generated by Text-to-Speech services.""" @@ -1766,6 +1773,25 @@ class FunctionCallInProgressFrame(ControlFrame, UninterruptibleFrame): cancel_on_interruption: bool = False +@dataclass +class VisionFullResponseStartFrame(LLMFullResponseStartFrame): + """Frame indicating the beginning of a vision model response. + + Used to indicate the beginning of a vision model response. Followed by one + or more VisionTextFrames and a final VisionFullResponseEndFrame. + + """ + + pass + + +@dataclass +class VisionFullResponseEndFrame(LLMFullResponseEndFrame): + """Frame indicating the end of a Vision model response.""" + + pass + + @dataclass class TTSStartedFrame(ControlFrame): """Frame indicating the beginning of a TTS response. 
From 159e403ae4f9847adde8349664e173bfdb74a1c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Wed, 17 Dec 2025 10:36:04 -0800 Subject: [PATCH 2/2] MoondreamService: yield vision response and text frames --- changelog/3252.changed.md | 2 ++ src/pipecat/services/moondream/vision.py | 12 ++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) create mode 100644 changelog/3252.changed.md diff --git a/changelog/3252.changed.md b/changelog/3252.changed.md new file mode 100644 index 000000000..11cdca2ff --- /dev/null +++ b/changelog/3252.changed.md @@ -0,0 +1,2 @@ +- `MoondreamService` now pushes `VisionFullResponseStartFrame`, + `VisionFullResponseEndFrame` and `VisionTextFrame`. diff --git a/src/pipecat/services/moondream/vision.py b/src/pipecat/services/moondream/vision.py index e9ce86383..a5dc5af4c 100644 --- a/src/pipecat/services/moondream/vision.py +++ b/src/pipecat/services/moondream/vision.py @@ -19,8 +19,10 @@ from PIL import Image from pipecat.frames.frames import ( ErrorFrame, Frame, - TextFrame, UserImageRawFrame, + VisionFullResponseEndFrame, + VisionFullResponseStartFrame, + VisionTextFrame, ) from pipecat.services.vision_service import VisionService @@ -104,10 +106,6 @@ class MoondreamService(VisionService): Args: frame: The image frame to process. - - Yields: - Frame: TextFrame containing the generated image description, or ErrorFrame - if analysis fails. """ if not self._model: yield ErrorFrame("Moondream model not available") @@ -123,4 +121,6 @@ class MoondreamService(VisionService): description = await asyncio.to_thread(get_image_description, frame.image, frame.text) - yield TextFrame(text=description) + yield VisionFullResponseStartFrame() + yield VisionTextFrame(text=description) + yield VisionFullResponseEndFrame()