diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c6ff6143..2689d1f3f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -80,6 +80,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed +- Improved `concatenate_aggregated_text()` to handle one-word outputs from OpenAI + Realtime and Gemini Live. Text fragments are now correctly concatenated + without spaces when these patterns are detected. + - `STTMuteFilter` no longer sends `STTMuteFrame` to the STT service. The filter now blocks frames locally without instructing the STT service to stop processing audio. This prevents inactivity-related errors (such as 409 errors diff --git a/tests/test_transcript_processor.py b/tests/test_transcript_processor.py index b433951ce..d45d5ba3b 100644 --- a/tests/test_transcript_processor.py +++ b/tests/test_transcript_processor.py @@ -479,3 +479,103 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase): self.assertEqual(message.role, "assistant") # Should be properly joined without extra spaces self.assertEqual(message.content, "Hello there! How's it going?") + + async def test_openai_realtime_syllable_fragments(self): + """Test OpenAI Realtime syllable-by-syllable output with standalone punctuation + + OpenAI Realtime can output single words as syllable fragments with punctuation + as a separate fragment. Example: ["Met", "am", "orph", "osis", "."] + This should be concatenated without spaces to form "Metamorphosis." 
+ """ + processor = AssistantTranscriptProcessor() + + received_updates = [] + + @processor.event_handler("on_transcript_update") + async def handle_update(proc, frame: TranscriptionUpdateFrame): + received_updates.append(frame) + + # Simulate OpenAI Realtime syllable-by-syllable output + frames_to_send = [ + BotStartedSpeakingFrame(), + SleepFrame(), + TTSTextFrame(text="Met"), + TTSTextFrame(text="am"), + TTSTextFrame(text="orph"), + TTSTextFrame(text="osis"), + TTSTextFrame(text="."), # Standalone punctuation fragment + BotStoppedSpeakingFrame(), + ] + + expected_down_frames = [ + BotStartedSpeakingFrame, + BotStoppedSpeakingFrame, + TTSTextFrame, + TTSTextFrame, + TTSTextFrame, + TTSTextFrame, + TTSTextFrame, + TranscriptionUpdateFrame, + ] + + await run_test( + processor, + frames_to_send=frames_to_send, + expected_down_frames=expected_down_frames, + ) + + # Verify syllables are concatenated without spaces + self.assertEqual(len(received_updates), 1) + message = received_updates[0].messages[0] + self.assertEqual(message.role, "assistant") + self.assertEqual(message.content, "Metamorphosis.") + + async def test_gemini_live_syllable_fragments_with_newline(self): + """Test Gemini Live syllable-by-syllable output with trailing newline + + Gemini Live can output syllable fragments where the last fragment contains + trailing whitespace like newlines. Example: ["Met", "amo", "rph", "osi", "s.\\n"] + This should be concatenated without spaces to form "Metamorphosis." 
+ """ + processor = AssistantTranscriptProcessor() + + received_updates = [] + + @processor.event_handler("on_transcript_update") + async def handle_update(proc, frame: TranscriptionUpdateFrame): + received_updates.append(frame) + + # Simulate Gemini Live syllable-by-syllable output with trailing newline + frames_to_send = [ + BotStartedSpeakingFrame(), + SleepFrame(), + TTSTextFrame(text="Met"), + TTSTextFrame(text="amo"), + TTSTextFrame(text="rph"), + TTSTextFrame(text="osi"), + TTSTextFrame(text="s.\n"), # Last fragment with trailing newline + BotStoppedSpeakingFrame(), + ] + + expected_down_frames = [ + BotStartedSpeakingFrame, + BotStoppedSpeakingFrame, + TTSTextFrame, + TTSTextFrame, + TTSTextFrame, + TTSTextFrame, + TTSTextFrame, + TranscriptionUpdateFrame, + ] + + await run_test( + processor, + frames_to_send=frames_to_send, + expected_down_frames=expected_down_frames, + ) + + # Verify syllables are concatenated without spaces and newline is stripped + self.assertEqual(len(received_updates), 1) + message = received_updates[0].messages[0] + self.assertEqual(message.role, "assistant") + self.assertEqual(message.content, "Metamorphosis.")