From b094418d1e65185e6d00a357aa7e783275d5a61e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 28 Oct 2025 15:00:13 -0700 Subject: [PATCH] LLMContext: add create_image_message and create_audio_message --- CHANGELOG.md | 9 ++ .../processors/aggregators/llm_context.py | 149 +++++++++--------- 2 files changed, 86 insertions(+), 72 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c0ba85a6e..0aab3986b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added support for including images or audio to LLM context messages using + `LLMContext.create_image_message()` and `LLMContext.create_audio_message()`. + For example, when creating `LLMMessagesAppendFrame`: + + ```python + message = LLMContext.create_image_message(image=..., size= ...) + await self.push_frame(LLMMessagesAppendFrame(messages=[message], run_llm=True)) + ``` + - New event handlers for the `DeepgramFluxSTTService`: `on_start_of_turn`, `on_turn_resumed`, `on_end_of_turn`, `on_eager_end_of_turn`, `on_update`. diff --git a/src/pipecat/processors/aggregators/llm_context.py b/src/pipecat/processors/aggregators/llm_context.py index 8dc79fb50..768df0e5e 100644 --- a/src/pipecat/processors/aggregators/llm_context.py +++ b/src/pipecat/processors/aggregators/llm_context.py @@ -16,6 +16,7 @@ service-specific adapter. import base64 import io +import wave from dataclasses import dataclass from typing import TYPE_CHECKING, Any, List, Optional, TypeAlias, Union @@ -113,6 +114,77 @@ class LLMContext: self._tools: ToolsSchema | NotGiven = LLMContext._normalize_and_validate_tools(tools) self._tool_choice: LLMContextToolChoice | NotGiven = tool_choice + @staticmethod + def create_image_message( + *, + role: str = "user", + format: str, + size: tuple[int, int], + image: bytes, + text: Optional[str] = None, + ) -> LLMContextMessage: + """Create a context message containing an image. + + Args: + role: The role of this message (defaults to "user"). + format: Image format (e.g., 'RGB', 'RGBA'). + size: Image dimensions as (width, height) tuple. + image: Raw image bytes. + text: Optional text to include with the image. + """ + buffer = io.BytesIO() + Image.frombytes(format, size, image).save(buffer, format="JPEG") + encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8") + + content = [] + if text: + content.append({"type": "text", "text": text}) + + content.append( + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}, + }, + ) + + return {"role": role, "content": content} + + @staticmethod + def create_audio_message( + *, role: str = "user", audio_frames: list[AudioRawFrame], text: str = "Audio follows" + ) -> LLMContextMessage: + """Create a context message containing audio. + + Args: + role: The role of this message (defaults to "user"). + audio_frames: List of audio frame objects to include. + text: Optional text to include with the audio. + """ + sample_rate = audio_frames[0].sample_rate + num_channels = audio_frames[0].num_channels + + content = [] + content.append({"type": "text", "text": text}) + data = b"".join(frame.audio for frame in audio_frames) + + with io.BytesIO() as buffer: + with wave.open(buffer, "wb") as wf: + wf.setsampwidth(2) + wf.setnchannels(num_channels) + wf.setframerate(sample_rate) + wf.writeframes(data) + + encoded_audio = base64.b64encode(buffer.getvalue()).decode("utf-8") + + content.append( + { + "type": "input_audio", + "input_audio": {"data": encoded_audio, "format": "wav"}, + } + ) + + return {"role": role, "content": content} + @property def messages(self) -> List[LLMContextMessage]: """Get the current messages list. @@ -238,7 +310,7 @@ class LLMContext: self._tool_choice = tool_choice def add_image_frame_message( - self, *, format: str, size: tuple[int, int], image: bytes, text: str = None + self, *, format: str, size: tuple[int, int], image: bytes, text: Optional[str] = None ): """Add a message containing an image frame. @@ -248,17 +320,8 @@ class LLMContext: image: Raw image bytes. text: Optional text to include with the image. """ - buffer = io.BytesIO() - Image.frombytes(format, size, image).save(buffer, format="JPEG") - encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8") - - content = [] - if text: - content.append({"type": "text", "text": text}) - content.append( - {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}}, - ) - self.add_message({"role": "user", "content": content}) + message = LLMContext.create_image_message(format=format, size=size, image=image, text=text) + self.add_message(message) def add_audio_frames_message( self, *, audio_frames: list[AudioRawFrame], text: str = "Audio follows" @@ -269,66 +332,8 @@ class LLMContext: audio_frames: List of audio frame objects to include. text: Optional text to include with the audio. """ - if not audio_frames: - return - - sample_rate = audio_frames[0].sample_rate - num_channels = audio_frames[0].num_channels - - content = [] - content.append({"type": "text", "text": text}) - data = b"".join(frame.audio for frame in audio_frames) - data = bytes( - self._create_wav_header( - sample_rate, - num_channels, - 16, - len(data), - ) - + data - ) - encoded_audio = base64.b64encode(data).decode("utf-8") - content.append( - { - "type": "input_audio", - "input_audio": {"data": encoded_audio, "format": "wav"}, - } - ) - self.add_message({"role": "user", "content": content}) - - def _create_wav_header(self, sample_rate, num_channels, bits_per_sample, data_size): - """Create a WAV file header for audio data. - - Args: - sample_rate: Audio sample rate in Hz. - num_channels: Number of audio channels. - bits_per_sample: Bits per audio sample. - data_size: Size of audio data in bytes. - - Returns: - WAV header as a bytearray. - """ - # RIFF chunk descriptor - header = bytearray() - header.extend(b"RIFF") # ChunkID - header.extend((data_size + 36).to_bytes(4, "little")) # ChunkSize: total size - 8 - header.extend(b"WAVE") # Format - # "fmt " sub-chunk - header.extend(b"fmt ") # Subchunk1ID - header.extend((16).to_bytes(4, "little")) # Subchunk1Size (16 for PCM) - header.extend((1).to_bytes(2, "little")) # AudioFormat (1 for PCM) - header.extend(num_channels.to_bytes(2, "little")) # NumChannels - header.extend(sample_rate.to_bytes(4, "little")) # SampleRate - # Calculate byte rate and block align - byte_rate = sample_rate * num_channels * (bits_per_sample // 8) - block_align = num_channels * (bits_per_sample // 8) - header.extend(byte_rate.to_bytes(4, "little")) # ByteRate - header.extend(block_align.to_bytes(2, "little")) # BlockAlign - header.extend(bits_per_sample.to_bytes(2, "little")) # BitsPerSample - # "data" sub-chunk - header.extend(b"data") # Subchunk2ID - header.extend(data_size.to_bytes(4, "little")) # Subchunk2Size - return header + message = LLMContext.create_audio_message(audio_frames=audio_frames, text=text) + self.add_message(message) @staticmethod def _normalize_and_validate_tools(tools: ToolsSchema | NotGiven) -> ToolsSchema | NotGiven: