Merge pull request #3252 from pipecat-ai/aleix/vision-response-frames

add vision response and text frames
2025-12-17 11:01:06 -08:00
parent e03e5f3a59 159e403ae4
commit 7e4dbf42e8
4 changed files with 36 additions and 6 deletions
--- a/changelog/3252.added.md
+++ b/changelog/3252.added.md
@@ -0,0 +1,2 @@
+- Added `VisionFullResponseStartFrame`, `VisionFullResponseEndFrame` and
+  `VisionTextFrame`. These are used by vision services, similar to LLM services.
--- a/changelog/3252.changed.md
+++ b/changelog/3252.changed.md
@@ -0,0 +1,2 @@
+- `MoondreamService` now pushes `VisionFullResponseStartFrame`,
+  `VisionFullResponseEndFrame` and `VisionTextFrame`.
--- a/src/pipecat/frames/frames.py
+++ b/src/pipecat/frames/frames.py
@@ -400,6 +400,13 @@ class AggregatedTextFrame(TextFrame):
    aggregated_by: AggregationType | str


+@dataclass
+class VisionTextFrame(LLMTextFrame):
+    """Text frame generated by vision services."""
+
+    pass
+
+
@dataclass
 class TTSTextFrame(AggregatedTextFrame):
    """Text frame generated by Text-to-Speech services."""
@@ -1766,6 +1773,25 @@ class FunctionCallInProgressFrame(ControlFrame, UninterruptibleFrame):
    cancel_on_interruption: bool = False


+@dataclass
+class VisionFullResponseStartFrame(LLMFullResponseStartFrame):
+    """Frame indicating the beginning of a vision model response.
+
+    Used to indicate the beginning of a vision model response. Followed by one
+    or more VisionTextFrames and a final VisionFullResponseEndFrame.
+
+    """
+
+    pass
+
+
+@dataclass
+class VisionFullResponseEndFrame(LLMFullResponseEndFrame):
+    """Frame indicating the end of a vision model response."""
+
+    pass
+
+
@dataclass
 class TTSStartedFrame(ControlFrame):
    """Frame indicating the beginning of a TTS response.
--- a/src/pipecat/services/moondream/vision.py
+++ b/src/pipecat/services/moondream/vision.py
@@ -19,8 +19,10 @@ from PIL import Image
 from pipecat.frames.frames import (
    ErrorFrame,
    Frame,
-    TextFrame,
    UserImageRawFrame,
+    VisionFullResponseEndFrame,
+    VisionFullResponseStartFrame,
+    VisionTextFrame,
 )
 from pipecat.services.vision_service import VisionService

@@ -104,10 +106,6 @@ class MoondreamService(VisionService):

        Args:
            frame: The image frame to process.
-
-        Yields:
-            Frame: TextFrame containing the generated image description, or ErrorFrame
-                  if analysis fails.
        """
        if not self._model:
            yield ErrorFrame("Moondream model not available")
@@ -123,4 +121,6 @@ class MoondreamService(VisionService):

        description = await asyncio.to_thread(get_image_description, frame.image, frame.text)

-        yield TextFrame(text=description)
+        yield VisionFullResponseStartFrame()
+        yield VisionTextFrame(text=description)
+        yield VisionFullResponseEndFrame()