Fix: GeminiMultimodalLiveLLMService was appending tokens to the context

This commit is contained in:
Mark Backman
2025-04-29 15:40:18 -04:00
parent 27d4c927a8
commit 685f951ae2
3 changed files with 18 additions and 24 deletions

View File

@@ -80,6 +80,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- Fixed an issue with `GeminiMultimodalLiveLLMService` where the context
contained tokens instead of words.
- Fixed an issue with HTTP Smart Turn handling when the service returns a 500
error. Previously, this caused an unhandled exception; now, a 500 error is
treated as an incomplete response.

View File

@@ -36,6 +36,7 @@ from pipecat.frames.frames import (
StartInterruptionFrame,
TextFrame,
TranscriptionFrame,
TTSTextFrame,
UserImageRawFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
@@ -493,7 +494,7 @@ class LLMAssistantContextAggregator(LLMContextResponseAggregator):
await self._handle_llm_start(frame)
elif isinstance(frame, LLMFullResponseEndFrame):
await self._handle_llm_end(frame)
elif isinstance(frame, TextFrame):
elif isinstance(frame, TTSTextFrame):
await self._handle_text(frame)
elif isinstance(frame, LLMMessagesAppendFrame):
self.add_messages(frame.messages)
@@ -620,7 +621,7 @@ class LLMAssistantContextAggregator(LLMContextResponseAggregator):
self._started -= 1
await self.push_aggregation()
async def _handle_text(self, frame: TextFrame):
async def _handle_text(self, frame: TTSTextFrame):
if not self._started:
return

View File

@@ -344,7 +344,6 @@ class GeminiMultimodalLiveLLMService(LLMService):
self._bot_is_speaking = False
self._user_audio_buffer = bytearray()
self._bot_audio_buffer = bytearray()
self._bot_text_buffer = ""
self._sample_rate = 24000
@@ -427,7 +426,9 @@ class GeminiMultimodalLiveLLMService(LLMService):
#
async def _handle_interruption(self):
pass
self._bot_is_speaking = False
await self.push_frame(TTSStoppedFrame())
await self.push_frame(LLMFullResponseEndFrame())
async def _handle_user_started_speaking(self, frame):
self._user_is_speaking = True
@@ -839,14 +840,6 @@ class GeminiMultimodalLiveLLMService(LLMService):
if not part:
return
text = part.text
if text:
if not self._bot_text_buffer:
await self.push_frame(LLMFullResponseStartFrame())
self._bot_text_buffer += text
await self.push_frame(LLMTextFrame(text=text))
inline_data = part.inlineData
if not inline_data:
return
@@ -861,6 +854,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
if not self._bot_is_speaking:
self._bot_is_speaking = True
await self.push_frame(TTSStartedFrame())
await self.push_frame(LLMFullResponseStartFrame())
self._bot_audio_buffer.extend(audio)
frame = TTSAudioRawFrame(
@@ -886,24 +880,20 @@ class GeminiMultimodalLiveLLMService(LLMService):
async def _handle_evt_turn_complete(self, evt):
self._bot_is_speaking = False
text = self._bot_text_buffer
self._bot_text_buffer = ""
if text:
await self.push_frame(LLMFullResponseEndFrame())
await self.push_frame(TTSStoppedFrame())
await self.push_frame(LLMFullResponseEndFrame())
async def _handle_evt_output_transcription(self, evt):
if not evt.serverContent.outputTranscription:
return
text = evt.serverContent.outputTranscription.text
if text:
await self.push_frame(LLMFullResponseStartFrame())
await self.push_frame(LLMTextFrame(text=text))
await self.push_frame(TTSTextFrame(text=text))
await self.push_frame(LLMFullResponseEndFrame())
if not text:
return
await self.push_frame(LLMTextFrame(text=text))
await self.push_frame(TTSTextFrame(text=text))
def create_context_aggregator(
self,
@@ -934,6 +924,6 @@ class GeminiMultimodalLiveLLMService(LLMService):
GeminiMultimodalLiveContext.upgrade(context)
user = GeminiMultimodalLiveUserContextAggregator(context, params=user_params)
assistant_params.expect_stripped_words = True
assistant_params.expect_stripped_words = False
assistant = GeminiMultimodalLiveAssistantContextAggregator(context, params=assistant_params)
return GeminiMultimodalLiveContextAggregatorPair(_user=user, _assistant=assistant)