google: always send text part when sending inline audio

2025-02-25 22:26:30 -08:00
parent 96c6aeaada
commit bb89a036e5
2 changed files with 9 additions and 3 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -78,6 +78,9 @@ stt = DeepgramSTTService(..., live_options=LiveOptions(model="nova-2-general"))

 ### Fixed

+- Fixed a `GoogleLLMService` issue that was causing an exception when sending
+  inline audio in some cases.
+
 - Fixed an `AudioContextWordTTSService` issue that would cause an `EndFrame` to
  disconnect from the TTS service before audio from all the contexts was
  received. This affected services like Cartesia and Rime.
--- a/src/pipecat/services/google/google.py
+++ b/src/pipecat/services/google/google.py
@@ -722,7 +722,9 @@ class GoogleLLMContext(OpenAILLMContext):

        self.add_message(glm.Content(role="user", parts=parts))

-    def add_audio_frames_message(self, *, audio_frames: list[AudioRawFrame], text: str = None):
+    def add_audio_frames_message(
+        self, *, audio_frames: list[AudioRawFrame], text: str = "Audio follows"
+    ):
        if not audio_frames:
            return

@@ -731,8 +733,9 @@ class GoogleLLMContext(OpenAILLMContext):

        parts = []
        data = b"".join(frame.audio for frame in audio_frames)
-        if text:
-            parts.append(glm.Part(text=text))
+        # NOTE(aleix): According to the docs only text or inline_data should be needed.
+        # (see https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference)
+        parts.append(glm.Part(text=text))
        parts.append(
            glm.Part(
                inline_data=glm.Blob(