From 2300941bb880d4af1bbe484848da01de7e45522c Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Mon, 10 Nov 2025 09:55:46 -0500 Subject: [PATCH] Revert "Merge pull request #3004 from pipecat-ai/mb/improve-concat-aggregated-text" This reverts commit 5e7f59a0b03c840e05dc342b2988037578cb0f8f, reversing changes made to 2ad4122b77e2cb27145281c28991dc7af613b15b. --- CHANGELOG.md | 4 -- src/pipecat/utils/string.py | 40 ++---------- tests/test_transcript_processor.py | 100 ----------------------------- 3 files changed, 6 insertions(+), 138 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6394f744a..8443b2348 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -90,10 +90,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Updated `simli-ai` to 0.1.25. -- Improved `concatenate_aggregated_text()` to one word outputs from OpenAI - Realtime and Gemini Live. Text fragments are now correctly concatenated - without spaces when these patterns are detected. - - `STTMuteFilter` no longer sends `STTMuteFrame` to the STT service. The filter now blocks frames locally without instructing the STT service to stop processing audio. This prevents inactivity-related errors (such as 409 errors diff --git a/src/pipecat/utils/string.py b/src/pipecat/utils/string.py index 298a09472..25ce6afd5 100644 --- a/src/pipecat/utils/string.py +++ b/src/pipecat/utils/string.py @@ -218,43 +218,15 @@ def concatenate_aggregated_text(text_parts: List[str]) -> str: has_leading_spaces = any(part and part[0] == " " for part in text_parts[1:]) has_trailing_spaces = any(part and part[-1] == " " for part in text_parts[:-1]) - # Check for trailing non-space whitespace (e.g., \n, \r, \t) which indicates - # syllable-by-syllable output with line breaks. - # Example: Gemini Live: ["Met", "amo", "rph", "osi", "s.\n"] - has_trailing_whitespace = any( - part and part[-1] != " " and part[-1].isspace() for part in text_parts - ) + # If there are embedded spaces in the fragments, use direct concatenation + contains_spacing_between_fragments = has_leading_spaces or has_trailing_spaces - # Check if we have punctuation-only fragments, which indicates syllable-by-syllable - # output where punctuation arrives as a separate fragment. - # Example: OpenAI Realtime single word: ["Met", "am", "orph", "osis", "."] - punctuation_chars = ".,!?;:—-'\"…" - has_punctuation_only = any( - part and len(part.strip()) == 0 or all(c in punctuation_chars for c in part) - for part in text_parts - ) - - # If there are embedded spaces or other whitespace in the fragments, use direct concatenation - contains_spacing_between_fragments = ( - has_leading_spaces or has_trailing_spaces or has_trailing_whitespace - ) - - # Apply corresponding joining method based on detected spacing patterns: - - if has_punctuation_only and not contains_spacing_between_fragments: - # Syllable-by-syllable output with standalone punctuation fragment. Examples: - # - OpenAI Realtime: ["Met", "am", "orph", "osis", "."] → "Metamorphosis." - result = "".join(text_parts) - elif contains_spacing_between_fragments: - # Fragments already have embedded spacing or trailing whitespace - concatenate directly. Examples: - # - OpenAI Realtime: ['Hey', ' there', '!', ' Great', ' to', ' meet', ' you', '!'] - # - Gemini Live (spaces): ['Hel', 'lo.', ' Wo', 'u', 'ld ', 'you', ' li', 'ke ', 'to ', 'he', 'ar a joke?\n'] - # - Gemini Live (newline): ["Met", "amo", "rph", "osi", "s.\n"] → "Metamorphosis." - # - Sentence level TTS services: ['Hello!', ' How can I assist you today?'] + # Apply corresponding joining method + if contains_spacing_between_fragments: + # Fragments already have spacing - just concatenate result = "".join(text_parts) else: - # Word-by-word fragments without spacing - join with spaces. Examples: - # - Word level TTS services: ["Hello", "there.", "How", "are", "you?"] → "Hello there. How are you?" + # Word-by-word fragments - join with spaces result = " ".join(text_parts) # Clean up any excessive whitespace diff --git a/tests/test_transcript_processor.py b/tests/test_transcript_processor.py index d45d5ba3b..b433951ce 100644 --- a/tests/test_transcript_processor.py +++ b/tests/test_transcript_processor.py @@ -479,103 +479,3 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase): self.assertEqual(message.role, "assistant") # Should be properly joined without extra spaces self.assertEqual(message.content, "Hello there! How's it going?") - - async def test_openai_realtime_syllable_fragments(self): - """Test OpenAI Realtime syllable-by-syllable output with standalone punctuation - - OpenAI Realtime can output single words as syllable fragments with punctuation - as a separate fragment. Example: ["Met", "am", "orph", "osis", "."] - This should be concatenated without spaces to form "Metamorphosis." - """ - processor = AssistantTranscriptProcessor() - - received_updates = [] - - @processor.event_handler("on_transcript_update") - async def handle_update(proc, frame: TranscriptionUpdateFrame): - received_updates.append(frame) - - # Simulate OpenAI Realtime syllable-by-syllable output - frames_to_send = [ - BotStartedSpeakingFrame(), - SleepFrame(), - TTSTextFrame(text="Met"), - TTSTextFrame(text="am"), - TTSTextFrame(text="orph"), - TTSTextFrame(text="osis"), - TTSTextFrame(text="."), # Standalone punctuation fragment - BotStoppedSpeakingFrame(), - ] - - expected_down_frames = [ - BotStartedSpeakingFrame, - BotStoppedSpeakingFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TranscriptionUpdateFrame, - ] - - await run_test( - processor, - frames_to_send=frames_to_send, - expected_down_frames=expected_down_frames, - ) - - # Verify syllables are concatenated without spaces - self.assertEqual(len(received_updates), 1) - message = received_updates[0].messages[0] - self.assertEqual(message.role, "assistant") - self.assertEqual(message.content, "Metamorphosis.") - - async def test_gemini_live_syllable_fragments_with_newline(self): - """Test Gemini Live syllable-by-syllable output with trailing newline - - Gemini Live can output syllable fragments where the last fragment contains - trailing whitespace like newlines. Example: ["Met", "amo", "rph", "osi", "s.\\n"] - This should be concatenated without spaces to form "Metamorphosis." - """ - processor = AssistantTranscriptProcessor() - - received_updates = [] - - @processor.event_handler("on_transcript_update") - async def handle_update(proc, frame: TranscriptionUpdateFrame): - received_updates.append(frame) - - # Simulate Gemini Live syllable-by-syllable output with trailing newline - frames_to_send = [ - BotStartedSpeakingFrame(), - SleepFrame(), - TTSTextFrame(text="Met"), - TTSTextFrame(text="amo"), - TTSTextFrame(text="rph"), - TTSTextFrame(text="osi"), - TTSTextFrame(text="s.\n"), # Last fragment with trailing newline - BotStoppedSpeakingFrame(), - ] - - expected_down_frames = [ - BotStartedSpeakingFrame, - BotStoppedSpeakingFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TranscriptionUpdateFrame, - ] - - await run_test( - processor, - frames_to_send=frames_to_send, - expected_down_frames=expected_down_frames, - ) - - # Verify syllables are concatenated without spaces and newline is stripped - self.assertEqual(len(received_updates), 1) - message = received_updates[0].messages[0] - self.assertEqual(message.role, "assistant") - self.assertEqual(message.content, "Metamorphosis.")