Merge pull request #3252 from pipecat-ai/aleix/vision-response-frames
add vision response and text frames
This commit is contained in:
2
changelog/3252.added.md
Normal file
2
changelog/3252.added.md
Normal file
@@ -0,0 +1,2 @@
|
||||
- Added `VisionFullResponseStartFrame`, `VisionFullResponseEndFrame` and
|
||||
`VisionTextFrame`. These are used by vision services, similar to LLM services.
|
||||
2
changelog/3252.changed.md
Normal file
2
changelog/3252.changed.md
Normal file
@@ -0,0 +1,2 @@
|
||||
- `MoondreamService` now pushes `VisionFullResponseStartFrame`,
|
||||
`VisionFullResponseEndFrame` and `VisionTextFrame`.
|
||||
@@ -400,6 +400,13 @@ class AggregatedTextFrame(TextFrame):
|
||||
aggregated_by: AggregationType | str
|
||||
|
||||
|
||||
@dataclass
|
||||
class VisionTextFrame(LLMTextFrame):
|
||||
"""Text frame generated by vision services."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class TTSTextFrame(AggregatedTextFrame):
|
||||
"""Text frame generated by Text-to-Speech services."""
|
||||
@@ -1766,6 +1773,25 @@ class FunctionCallInProgressFrame(ControlFrame, UninterruptibleFrame):
|
||||
cancel_on_interruption: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
class VisionFullResponseStartFrame(LLMFullResponseStartFrame):
|
||||
"""Frame indicating the beginning of a vision model response.
|
||||
|
||||
Used to indicate the beginning of a vision model response. Followed by one
|
||||
or more VisionTextFrames and a final VisionFullResponseEndFrame.
|
||||
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class VisionFullResponseEndFrame(LLMFullResponseEndFrame):
|
||||
"""Frame indicating the end of a Vision model response."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class TTSStartedFrame(ControlFrame):
|
||||
"""Frame indicating the beginning of a TTS response.
|
||||
|
||||
@@ -19,8 +19,10 @@ from PIL import Image
|
||||
from pipecat.frames.frames import (
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
TextFrame,
|
||||
UserImageRawFrame,
|
||||
VisionFullResponseEndFrame,
|
||||
VisionFullResponseStartFrame,
|
||||
VisionTextFrame,
|
||||
)
|
||||
from pipecat.services.vision_service import VisionService
|
||||
|
||||
@@ -104,10 +106,6 @@ class MoondreamService(VisionService):
|
||||
|
||||
Args:
|
||||
frame: The image frame to process.
|
||||
|
||||
Yields:
|
||||
Frame: TextFrame containing the generated image description, or ErrorFrame
|
||||
if analysis fails.
|
||||
"""
|
||||
if not self._model:
|
||||
yield ErrorFrame("Moondream model not available")
|
||||
@@ -123,4 +121,6 @@ class MoondreamService(VisionService):
|
||||
|
||||
description = await asyncio.to_thread(get_image_description, frame.image, frame.text)
|
||||
|
||||
yield TextFrame(text=description)
|
||||
yield VisionFullResponseStartFrame()
|
||||
yield VisionTextFrame(text=description)
|
||||
yield VisionFullResponseEndFrame()
|
||||
|
||||
Reference in New Issue
Block a user