openai: remove from_image_frame() and use add_image_frame_message()

2024-12-10 17:08:12 -08:00
parent 246c825a82
commit a618bd3fa6
2 changed files with 8 additions and 28 deletions
--- a/src/pipecat/processors/aggregators/openai_llm_context.py
+++ b/src/pipecat/processors/aggregators/openai_llm_context.py
@@ -19,7 +19,6 @@ from pipecat.frames.frames import (
    Frame,
    FunctionCallInProgressFrame,
    FunctionCallResultFrame,
-    VisionImageRawFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor

@@ -71,28 +70,6 @@ class OpenAILLMContext:
            context.add_message(message)
        return context

-    # todo: deprecate from_image_frame. It's only used to create a single-use
-    # context, which isn't useful for most real-world applications.
-    @staticmethod
-    def from_image_frame(frame: VisionImageRawFrame) -> "OpenAILLMContext":
-        """
-        For images, we are deviating from the OpenAI messages shape. OpenAI
-        expects images to be base64 encoded, but other vision models may not.
-        So we'll store the image as bytes and do the base64 encoding as needed
-        in the LLM service.
-
-        NOTE: the above only applies to the deprecated use of this method. The
-        add_image_frame_message() below does the base64 encoding as expected
-        in the OpenAI format.
-        """
-        context = OpenAILLMContext()
-        buffer = io.BytesIO()
-        Image.frombytes(frame.format, frame.size, frame.image).save(buffer, format="JPEG")
-        context.add_message(
-            {"content": frame.text, "role": "user", "data": buffer, "mime_type": "image/jpeg"}
-        )
-        return context
-
    @property
    def messages(self) -> List[ChatCompletionMessageParam]:
        return self._messages
@@ -167,12 +144,12 @@ class OpenAILLMContext:
        Image.frombytes(format, size, image).save(buffer, format="JPEG")
        encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")

-        content = [
-            {"type": "text", "text": text},
-            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}},
-        ]
+        content = []
        if text:
            content.append({"type": "text", "text": text})
+        content.append(
+            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}},
+        )
        self.add_message({"role": "user", "content": content})

    def add_audio_frames_message(self, *, audio_frames: list[AudioRawFrame], text: str = None):
--- a/src/pipecat/services/openai.py
+++ b/src/pipecat/services/openai.py
@@ -294,7 +294,10 @@ class BaseOpenAILLMService(LLMService):
        elif isinstance(frame, LLMMessagesFrame):
            context = OpenAILLMContext.from_messages(frame.messages)
        elif isinstance(frame, VisionImageRawFrame):
-            context = OpenAILLMContext.from_image_frame(frame)
+            context = OpenAILLMContext()
+            context.add_image_frame_message(
+                format=frame.format, size=frame.size, image=frame.image, text=frame.text
+            )
        elif isinstance(frame, LLMUpdateSettingsFrame):
            await self._update_settings(frame.settings)
        else: