From 38e2d3767425bed5647c36f698564f65ad209b5d Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Fri, 2 May 2025 07:36:12 -0400 Subject: [PATCH] Restore TEXT modalities support to GeminiMultimodalLiveLLMService --- .../services/gemini_multimodal_live/gemini.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/pipecat/services/gemini_multimodal_live/gemini.py b/src/pipecat/services/gemini_multimodal_live/gemini.py index 791888993..b337c50e2 100644 --- a/src/pipecat/services/gemini_multimodal_live/gemini.py +++ b/src/pipecat/services/gemini_multimodal_live/gemini.py @@ -354,6 +354,7 @@ class GeminiMultimodalLiveLLMService(LLMService): self._bot_is_speaking = False self._user_audio_buffer = bytearray() self._bot_audio_buffer = bytearray() + self._bot_text_buffer = "" self._sample_rate = 24000 @@ -852,6 +853,15 @@ class GeminiMultimodalLiveLLMService(LLMService): if not part: return + # part.text is added when `modalities` is set to TEXT; otherwise, it's None + text = part.text + if text: + if not self._bot_text_buffer: + await self.push_frame(LLMFullResponseStartFrame()) + + self._bot_text_buffer += text + await self.push_frame(LLMTextFrame(text=text)) + inline_data = part.inlineData if not inline_data: return @@ -892,6 +902,13 @@ class GeminiMultimodalLiveLLMService(LLMService): async def _handle_evt_turn_complete(self, evt): self._bot_is_speaking = False + text = self._bot_text_buffer + self._bot_text_buffer = "" + + # Pertains to modalities set to TEXT only + if text: + await self.push_frame(LLMFullResponseEndFrame()) + await self.push_frame(TTSStoppedFrame()) await self.push_frame(LLMFullResponseEndFrame()) @@ -899,6 +916,9 @@ class GeminiMultimodalLiveLLMService(LLMService): if not evt.serverContent.outputTranscription: return + # This is the output transcription text when modalities is set to AUDIO. + # In this case, we push LLMTextFrame and TTSTextFrame to be handled by the + # downstream assistant context aggregator. text = evt.serverContent.outputTranscription.text if not text: