diff --git a/changelog/3252.added.md b/changelog/3252.added.md new file mode 100644 index 000000000..c6f85b713 --- /dev/null +++ b/changelog/3252.added.md @@ -0,0 +1,2 @@ +- Added `VisionFullResponseStartFrame`, `VisionFullResponseEndFrame` and + `VisionTextFrame`. These are used by vision services similar to LLM services. diff --git a/changelog/3252.changed.md b/changelog/3252.changed.md new file mode 100644 index 000000000..11cdca2ff --- /dev/null +++ b/changelog/3252.changed.md @@ -0,0 +1,2 @@ +- `MoondreamService` now pushes `VisionFullResponseStartFrame`, + `VisionFullResponseEndFrame` and `VisionTextFrame`. diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 8d2a51333..998ca3cf0 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -400,6 +400,13 @@ class AggregatedTextFrame(TextFrame): aggregated_by: AggregationType | str +@dataclass +class VisionTextFrame(LLMTextFrame): + """Text frame generated by vision services.""" + + pass + + @dataclass class TTSTextFrame(AggregatedTextFrame): """Text frame generated by Text-to-Speech services.""" @@ -1766,6 +1773,25 @@ class FunctionCallInProgressFrame(ControlFrame, UninterruptibleFrame): cancel_on_interruption: bool = False +@dataclass +class VisionFullResponseStartFrame(LLMFullResponseStartFrame): + """Frame indicating the beginning of a vision model response. + + Used to indicate the beginning of a vision model response. Followed by one + or more VisionTextFrames and a final VisionFullResponseEndFrame. + + """ + + pass + + +@dataclass +class VisionFullResponseEndFrame(LLMFullResponseEndFrame): + """Frame indicating the end of a Vision model response.""" + + pass + + @dataclass class TTSStartedFrame(ControlFrame): """Frame indicating the beginning of a TTS response. 
diff --git a/src/pipecat/services/moondream/vision.py b/src/pipecat/services/moondream/vision.py index e9ce86383..a5dc5af4c 100644 --- a/src/pipecat/services/moondream/vision.py +++ b/src/pipecat/services/moondream/vision.py @@ -19,8 +19,10 @@ from PIL import Image from pipecat.frames.frames import ( ErrorFrame, Frame, - TextFrame, UserImageRawFrame, + VisionFullResponseEndFrame, + VisionFullResponseStartFrame, + VisionTextFrame, ) from pipecat.services.vision_service import VisionService @@ -104,10 +106,6 @@ class MoondreamService(VisionService): Args: frame: The image frame to process. - - Yields: - Frame: TextFrame containing the generated image description, or ErrorFrame - if analysis fails. """ if not self._model: yield ErrorFrame("Moondream model not available") @@ -123,4 +121,6 @@ class MoondreamService(VisionService): description = await asyncio.to_thread(get_image_description, frame.image, frame.text) - yield TextFrame(text=description) + yield VisionFullResponseStartFrame() + yield VisionTextFrame(text=description) + yield VisionFullResponseEndFrame()