google: always send text part when sending inline audio

This commit is contained in:
Aleix Conchillo Flaqué
2025-02-25 22:26:30 -08:00
parent 96c6aeaada
commit bb89a036e5
2 changed files with 9 additions and 3 deletions

View File

@@ -78,6 +78,9 @@ stt = DeepgramSTTService(..., live_options=LiveOptions(model="nova-2-general"))
### Fixed
- Fixed a `GoogleLLMService` issue that was causing an exception when sending
inline audio in some cases.
- Fixed an `AudioContextWordTTSService` issue that would cause an `EndFrame` to
disconnect from the TTS service before audio from all the contexts was
received. This affected services like Cartesia and Rime.

View File

@@ -722,7 +722,9 @@ class GoogleLLMContext(OpenAILLMContext):
self.add_message(glm.Content(role="user", parts=parts))
def add_audio_frames_message(self, *, audio_frames: list[AudioRawFrame], text: str = None):
def add_audio_frames_message(
self, *, audio_frames: list[AudioRawFrame], text: str = "Audio follows"
):
if not audio_frames:
return
@@ -731,8 +733,9 @@ class GoogleLLMContext(OpenAILLMContext):
parts = []
data = b"".join(frame.audio for frame in audio_frames)
if text:
parts.append(glm.Part(text=text))
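# NOTE(aleix): According to the docs only one of text or inline_data should be
# needed, but sending inline audio without a text part caused an exception in
# some cases, so we always send a text part as well.
# (see https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference)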
parts.append(glm.Part(text=text))
parts.append(
glm.Part(
inline_data=glm.Blob(