From 58f3965cdce77fd50f5e8fb6e2e887f89005347f Mon Sep 17 00:00:00 2001
From: Kwindla Hultman Kramer
Date: Thu, 29 Aug 2024 11:19:00 -0700
Subject: [PATCH] first working llama-vision

---
 src/pipecat/services/moondream.py |  2 +-
 src/pipecat/services/together.py  | 31 ++++++++++++++++++++++++++++---
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/src/pipecat/services/moondream.py b/src/pipecat/services/moondream.py
index cff8d3172..10ac3353e 100644
--- a/src/pipecat/services/moondream.py
+++ b/src/pipecat/services/moondream.py
@@ -48,7 +48,7 @@ class MoondreamService(VisionService):
         self,
         *,
         model="vikhyatk/moondream2",
-        revision="2024-04-02",
+        revision="2024-08-26",
         use_cpu=False
     ):
         super().__init__()
diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py
index 49759cb01..1c841bc8a 100644
--- a/src/pipecat/services/together.py
+++ b/src/pipecat/services/together.py
@@ -181,7 +181,11 @@ class TogetherLLMService(LLMService):
                 # We get here if the LLM returns a function call with invalid JSON arguments. This could happen
                 # because of LLM non-determinism, or maybe more often because of user error in the prompt.
                 # Should we do anything more than log a warning?
-                logger.debug(f"Error parsing function arguments: {error}")
+                logger.debug(
+                    f"Error parsing function arguments: {error} - {function_call_accumulator}")
+
+# Error parsing function arguments: Extra data: line 1 column 23 (char 22)
+# - {"location": "London"}"
 
 
 class TogetherLLMContext(OpenAILLMContext):
@@ -190,6 +194,7 @@ class TogetherLLMContext(OpenAILLMContext):
         messages: list[dict] | None = None,
     ):
         super().__init__(messages=messages)
+        self._user_image_request_context = {}
 
     @classmethod
     def from_openai_context(cls, openai_context: OpenAILLMContext):
@@ -219,9 +224,17 @@ class TogetherUserContextAggregator(LLMUserContextAggregator):
         if isinstance(context, OpenAILLMContext):
             self._context = TogetherLLMContext.from_openai_context(context)
 
+    def get_messages_frame(self):
+        return OpenAILLMContextFrame(self._context)
+
     async def push_messages_frame(self):
-        frame = OpenAILLMContextFrame(self._context)
-        await self.push_frame(frame)
+        await self.push_frame(self.get_messages_frame())
+
+    def append_image_description_tool_message(self, description):
+        self._context.add_message({
+            "role": "tool",
+            "content": json.dumps({"image_description": description})
+        })
 
     async def process_frame(self, frame, direction):
         await super().process_frame(frame, direction)
@@ -244,6 +257,18 @@ class TogetherUserContextAggregator(LLMUserContextAggregator):
                 else:
                     if frame.user_id in self._context._user_image_request_context:
                         del self._context._user_image_request_context[frame.user_id]
+            elif isinstance(frame, UserImageRawFrame):
+                text = self._context._user_image_request_context.get(frame.user_id) or ""
+                if text:
+                    del self._context._user_image_request_context[frame.user_id]
+                frame = VisionImageRawFrame(
+                    image=frame.image,
+                    size=frame.size,
+                    format=frame.format,
+                    text=text,
+                )
+                await self.push_frame(frame)
+
         except Exception as e:
             logger.error(f"Error processing frame: {e}")
 