diff --git a/CHANGELOG.md b/CHANGELOG.md index afba77a64..3d41cb420 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -78,6 +78,9 @@ stt = DeepgramSTTService(..., live_options=LiveOptions(model="nova-2-general")) ### Fixed +- Fixed a `GoogleLLMService` issue that was causing an exception when sending inline + audio in some cases. + - Fixed an `AudioContextWordTTSService` issue that would cause an `EndFrame` to disconnect from the TTS service before audio from all the contexts was received. This affected services like Cartesia and Rime. diff --git a/src/pipecat/services/google/google.py b/src/pipecat/services/google/google.py index 1dafc92bc..c0941ee33 100644 --- a/src/pipecat/services/google/google.py +++ b/src/pipecat/services/google/google.py @@ -722,7 +722,9 @@ class GoogleLLMContext(OpenAILLMContext): self.add_message(glm.Content(role="user", parts=parts)) - def add_audio_frames_message(self, *, audio_frames: list[AudioRawFrame], text: str = None): + def add_audio_frames_message( + self, *, audio_frames: list[AudioRawFrame], text: str = "Audio follows" + ): if not audio_frames: return @@ -731,8 +733,9 @@ class GoogleLLMContext(OpenAILLMContext): parts = [] data = b"".join(frame.audio for frame in audio_frames) - if text: - parts.append(glm.Part(text=text)) + # NOTE(aleix): According to the docs only text or inline_data should be needed. + # (see https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference) + parts.append(glm.Part(text=text)) parts.append( glm.Part( inline_data=glm.Blob(