From 58f3965cdce77fd50f5e8fb6e2e887f89005347f Mon Sep 17 00:00:00 2001
From: Kwindla Hultman Kramer <kwindla@gmail.com>
Date: Thu, 29 Aug 2024 11:19:00 -0700
Subject: [PATCH] first working llama-vision

---
 src/pipecat/services/moondream.py |  2 +-
 src/pipecat/services/together.py  | 31 ++++++++++++++++++++++++++++---
 2 files changed, 29 insertions(+), 4 deletions(-)
diff --git a/src/pipecat/services/moondream.py b/src/pipecat/services/moondream.py
index cff8d3172..10ac3353e 100644
--- a/src/pipecat/services/moondream.py
+++ b/src/pipecat/services/moondream.py
@@ -48,7 +48,7 @@ class MoondreamService(VisionService):
         self,
             *,
         model="vikhyatk/moondream2",
-        revision="2024-04-02",
+        revision="2024-08-26",
         use_cpu=False
     ):
         super().__init__()
diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py
index 49759cb01..1c841bc8a 100644
--- a/src/pipecat/services/together.py
+++ b/src/pipecat/services/together.py
@@ -181,7 +181,11 @@ class TogetherLLMService(LLMService):
                 # We get here if the LLM returns a function call with invalid JSON arguments. This could happen
                 # because of LLM non-determinism, or maybe more often because of user error in the prompt.
                 # Should we do anything more than log a warning?
-                logger.debug(f"Error parsing function arguments: {error}")
+                logger.debug(
+                    f"Error parsing function arguments: {error} - {function_call_accumulator}")
+
+# Example of the message logged above (note the stray '"' after the JSON arguments):
+# Error parsing function arguments: Extra data: line 1 column 23 (char 22) - <function=get_current_weather>{"location": "London"}"</function>
 
 
 class TogetherLLMContext(OpenAILLMContext):
@@ -190,6 +194,7 @@ class TogetherLLMContext(OpenAILLMContext):
         messages: list[dict] | None = None,
     ):
         super().__init__(messages=messages)
+        self._user_image_request_context = {}
 
     @classmethod
     def from_openai_context(cls, openai_context: OpenAILLMContext):
@@ -219,9 +224,17 @@ class TogetherUserContextAggregator(LLMUserContextAggregator):
         if isinstance(context, OpenAILLMContext):
             self._context = TogetherLLMContext.from_openai_context(context)
 
+    def get_messages_frame(self):
+        return OpenAILLMContextFrame(self._context)
+
     async def push_messages_frame(self):
-        frame = OpenAILLMContextFrame(self._context)
-        await self.push_frame(frame)
+        await self.push_frame(self.get_messages_frame())
+
+    def append_image_description_tool_message(self, description):
+        self._context.add_message({
+            "role": "tool",
+            "content": json.dumps({"image_description": description})
+        })
 
     async def process_frame(self, frame, direction):
         await super().process_frame(frame, direction)
@@ -244,6 +257,18 @@ class TogetherUserContextAggregator(LLMUserContextAggregator):
                 else:
                     if frame.user_id in self._context._user_image_request_context:
                         del self._context._user_image_request_context[frame.user_id]
+            elif isinstance(frame, UserImageRawFrame):
+                text = self._context._user_image_request_context.get(frame.user_id) or ""
+                if text:
+                    del self._context._user_image_request_context[frame.user_id]
+                    frame = VisionImageRawFrame(
+                        image=frame.image,
+                        size=frame.size,
+                        format=frame.format,
+                        text=text,
+                    )
+                    await self.push_frame(frame)
+
         except Exception as e:
             logger.error(f"Error processing frame: {e}")