LLMContext: add create_image_message and create_audio_message

This commit is contained in:
Aleix Conchillo Flaqué
2025-10-28 15:00:13 -07:00
parent 08a1e09020
commit b094418d1e
2 changed files with 86 additions and 72 deletions

View File

@@ -9,6 +9,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Added support for including images or audio to LLM context messages using
`LLMContext.create_image_message()` and `LLMContext.create_audio_message()`.
For example, when creating `LLMMessagesAppendFrame`:
```python
message = LLMContext.create_image_message(image=..., size= ...)
await self.push_frame(LLMMessagesAppendFrame(messages=[message], run_llm=True))
```
- New event handlers for the `DeepgramFluxSTTService`: `on_start_of_turn`,
`on_turn_resumed`, `on_end_of_turn`, `on_eager_end_of_turn`, `on_update`.

View File

@@ -16,6 +16,7 @@ service-specific adapter.
import base64
import io
import wave
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, List, Optional, TypeAlias, Union
@@ -113,6 +114,77 @@ class LLMContext:
self._tools: ToolsSchema | NotGiven = LLMContext._normalize_and_validate_tools(tools)
self._tool_choice: LLMContextToolChoice | NotGiven = tool_choice
@staticmethod
def create_image_message(
*,
role: str = "user",
format: str,
size: tuple[int, int],
image: bytes,
text: Optional[str] = None,
) -> LLMContextMessage:
"""Create a context message containing an image.
Args:
role: The role of this message (defaults to "user").
format: Image format (e.g., 'RGB', 'RGBA').
size: Image dimensions as (width, height) tuple.
image: Raw image bytes.
text: Optional text to include with the image.
"""
buffer = io.BytesIO()
Image.frombytes(format, size, image).save(buffer, format="JPEG")
encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
content = []
if text:
content.append({"type": "text", "text": text})
content.append(
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
},
)
return {"role": role, "content": content}
@staticmethod
def create_audio_message(
*, role: str = "user", audio_frames: list[AudioRawFrame], text: str = "Audio follows"
) -> LLMContextMessage:
"""Create a context message containing audio.
Args:
role: The role of this message (defaults to "user").
audio_frames: List of audio frame objects to include.
text: Optional text to include with the audio.
"""
sample_rate = audio_frames[0].sample_rate
num_channels = audio_frames[0].num_channels
content = []
content.append({"type": "text", "text": text})
data = b"".join(frame.audio for frame in audio_frames)
with io.BytesIO() as buffer:
with wave.open(buffer, "wb") as wf:
wf.setsampwidth(2)
wf.setnchannels(num_channels)
wf.setframerate(sample_rate)
wf.writeframes(data)
encoded_audio = base64.b64encode(buffer.getvalue()).decode("utf-8")
content.append(
{
"type": "input_audio",
"input_audio": {"data": encoded_audio, "format": "wav"},
}
)
return {"role": role, "content": content}
@property
def messages(self) -> List[LLMContextMessage]:
"""Get the current messages list.
@@ -238,7 +310,7 @@ class LLMContext:
self._tool_choice = tool_choice
def add_image_frame_message(
self, *, format: str, size: tuple[int, int], image: bytes, text: str = None
self, *, format: str, size: tuple[int, int], image: bytes, text: Optional[str] = None
):
"""Add a message containing an image frame.
@@ -248,17 +320,8 @@ class LLMContext:
image: Raw image bytes.
text: Optional text to include with the image.
"""
buffer = io.BytesIO()
Image.frombytes(format, size, image).save(buffer, format="JPEG")
encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
content = []
if text:
content.append({"type": "text", "text": text})
content.append(
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}},
)
self.add_message({"role": "user", "content": content})
message = LLMContext.create_image_message(format=format, size=size, image=image, text=text)
self.add_message(message)
def add_audio_frames_message(
self, *, audio_frames: list[AudioRawFrame], text: str = "Audio follows"
@@ -269,66 +332,8 @@ class LLMContext:
audio_frames: List of audio frame objects to include.
text: Optional text to include with the audio.
"""
if not audio_frames:
return
sample_rate = audio_frames[0].sample_rate
num_channels = audio_frames[0].num_channels
content = []
content.append({"type": "text", "text": text})
data = b"".join(frame.audio for frame in audio_frames)
data = bytes(
self._create_wav_header(
sample_rate,
num_channels,
16,
len(data),
)
+ data
)
encoded_audio = base64.b64encode(data).decode("utf-8")
content.append(
{
"type": "input_audio",
"input_audio": {"data": encoded_audio, "format": "wav"},
}
)
self.add_message({"role": "user", "content": content})
def _create_wav_header(self, sample_rate, num_channels, bits_per_sample, data_size):
"""Create a WAV file header for audio data.
Args:
sample_rate: Audio sample rate in Hz.
num_channels: Number of audio channels.
bits_per_sample: Bits per audio sample.
data_size: Size of audio data in bytes.
Returns:
WAV header as a bytearray.
"""
# RIFF chunk descriptor
header = bytearray()
header.extend(b"RIFF") # ChunkID
header.extend((data_size + 36).to_bytes(4, "little")) # ChunkSize: total size - 8
header.extend(b"WAVE") # Format
# "fmt " sub-chunk
header.extend(b"fmt ") # Subchunk1ID
header.extend((16).to_bytes(4, "little")) # Subchunk1Size (16 for PCM)
header.extend((1).to_bytes(2, "little")) # AudioFormat (1 for PCM)
header.extend(num_channels.to_bytes(2, "little")) # NumChannels
header.extend(sample_rate.to_bytes(4, "little")) # SampleRate
# Calculate byte rate and block align
byte_rate = sample_rate * num_channels * (bits_per_sample // 8)
block_align = num_channels * (bits_per_sample // 8)
header.extend(byte_rate.to_bytes(4, "little")) # ByteRate
header.extend(block_align.to_bytes(2, "little")) # BlockAlign
header.extend(bits_per_sample.to_bytes(2, "little")) # BitsPerSample
# "data" sub-chunk
header.extend(b"data") # Subchunk2ID
header.extend(data_size.to_bytes(4, "little")) # Subchunk2Size
return header
message = LLMContext.create_audio_message(audio_frames=audio_frames, text=text)
self.add_message(message)
@staticmethod
def _normalize_and_validate_tools(tools: ToolsSchema | NotGiven) -> ToolsSchema | NotGiven: