diff --git a/src/pipecat/processors/aggregators/openai_llm_context.py b/src/pipecat/processors/aggregators/openai_llm_context.py index f647936c9..6e7474c17 100644 --- a/src/pipecat/processors/aggregators/openai_llm_context.py +++ b/src/pipecat/processors/aggregators/openai_llm_context.py @@ -19,7 +19,6 @@ from pipecat.frames.frames import ( Frame, FunctionCallInProgressFrame, FunctionCallResultFrame, - VisionImageRawFrame, ) from pipecat.processors.frame_processor import FrameDirection, FrameProcessor @@ -71,28 +70,6 @@ class OpenAILLMContext: context.add_message(message) return context - # todo: deprecate from_image_frame. It's only used to create a single-use - # context, which isn't useful for most real-world applications. - @staticmethod - def from_image_frame(frame: VisionImageRawFrame) -> "OpenAILLMContext": - """ - For images, we are deviating from the OpenAI messages shape. OpenAI - expects images to be base64 encoded, but other vision models may not. - So we'll store the image as bytes and do the base64 encoding as needed - in the LLM service. - - NOTE: the above only applies to the deprecated use of this method. The - add_image_frame_message() below does the base64 encoding as expected - in the OpenAI format. - """ - context = OpenAILLMContext() - buffer = io.BytesIO() - Image.frombytes(frame.format, frame.size, frame.image).save(buffer, format="JPEG") - context.add_message( - {"content": frame.text, "role": "user", "data": buffer, "mime_type": "image/jpeg"} - ) - return context - @property def messages(self) -> List[ChatCompletionMessageParam]: return self._messages @@ -167,12 +144,12 @@ class OpenAILLMContext: Image.frombytes(format, size, image).save(buffer, format="JPEG") encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8") - content = [ - {"type": "text", "text": text}, - {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}}, - ] + content = [] if text: content.append({"type": "text", "text": text}) + content.append( + {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}}, + ) self.add_message({"role": "user", "content": content}) def add_audio_frames_message(self, *, audio_frames: list[AudioRawFrame], text: str = None): diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py index 4e11edc22..43ad16536 100644 --- a/src/pipecat/services/openai.py +++ b/src/pipecat/services/openai.py @@ -294,7 +294,10 @@ class BaseOpenAILLMService(LLMService): elif isinstance(frame, LLMMessagesFrame): context = OpenAILLMContext.from_messages(frame.messages) elif isinstance(frame, VisionImageRawFrame): - context = OpenAILLMContext.from_image_frame(frame) + context = OpenAILLMContext() + context.add_image_frame_message( + format=frame.format, size=frame.size, image=frame.image, text=frame.text + ) elif isinstance(frame, LLMUpdateSettingsFrame): await self._update_settings(frame.settings) else: