From 6885d07e880341d1a5ae46054ae8d64609c3e9eb Mon Sep 17 00:00:00 2001
From: Mark Backman <mark@daily.co>
Date: Mon, 17 Mar 2025 16:30:46 -0400
Subject: [PATCH] Simplify the TranscriptProcessor _emit_aggregated_text logic

---
 .../processors/transcript_processor.py        |  86 +++---
 tests/test_transcript_processor.py            | 248 +-----------------
 2 files changed, 50 insertions(+), 284 deletions(-)

diff --git a/src/pipecat/processors/transcript_processor.py b/src/pipecat/processors/transcript_processor.py
index 6a7793335..3eaff66ca 100644
--- a/src/pipecat/processors/transcript_processor.py
+++ b/src/pipecat/processors/transcript_processor.py
@@ -90,52 +90,62 @@ class AssistantTranscriptProcessor(BaseTranscriptProcessor):
         self._aggregation_start_time: Optional[str] = None
 
     async def _emit_aggregated_text(self):
-        """Emit aggregated text as a transcript message.
+        """Aggregates and emits text fragments as a transcript message.
 
-        This method intelligently joins text fragments to create natural spacing,
-        handling both word-by-word and pre-spaced text fragments appropriately.
+        This method uses a heuristic to detect pre-spacing: if any fragment after
+        the first one starts with whitespace, all fragments are concatenated
+        as-is; otherwise they are joined with single spaces. This handles TTS
+        services that format their text fragments differently.
 
-        The implementation handles two common patterns from TTS services:
+        Examples:
+            Pre-spaced fragments (concatenated):
+                ```
+                TTSTextFrame: ["Hello"]
+                TTSTextFrame: [" there"]
+                TTSTextFrame: ["!"]
+                TTSTextFrame: [" How"]
+                TTSTextFrame: ["'s"]
+                TTSTextFrame: [" it"]
+                TTSTextFrame: [" going"]
+                TTSTextFrame: ["?"]
+                ```
+                Result: "Hello there! How's it going?"
 
-        1. Word-by-word fragments without spacing:
-        ```
-        TTSTextFrame: ['Hello.']
-        TTSTextFrame: ['How']
-        TTSTextFrame: ['can']
-        TTSTextFrame: ['I']
-        TTSTextFrame: ['assist']
-        TTSTextFrame: ['you']
-        TTSTextFrame: ['today?']
-        ```
-        Result: "Hello. How can I assist you today?"
-
-        2. Pre-spaced fragments:
-        ```
-        TTSTextFrame: ['Hello']
-        TTSTextFrame: [' there']
-        TTSTextFrame: ['!']
-        TTSTextFrame: [' How']
-        TTSTextFrame: ["'s"]
-        TTSTextFrame: [' it']
-        TTSTextFrame: [' going']
-        TTSTextFrame: ['?']
-        ```
-        Result: "Hello there! How's it going?"
+            Word-by-word fragments (joined with spaces):
+                ```
+                TTSTextFrame: ["Hello"]
+                TTSTextFrame: ["there!"]
+                TTSTextFrame: ["How"]
+                TTSTextFrame: ["is"]
+                TTSTextFrame: ["it"]
+                TTSTextFrame: ["going?"]
+                ```
+                Result: "Hello there! How is it going?"
         """
         if self._current_text_parts and self._aggregation_start_time:
-            # Build content with intelligent spacing
-            content = ""
-            for i, part in enumerate(self._current_text_parts):
-                # Add a space only when the current part doesn't start with
-                # whitespace or punctuation/special characters
-                if i > 0 and not part.startswith((" ", ".", ",", "!", "?", ";", ":", "'", '"')):
-                    content += " "
-                content += part
+            # Heuristic to detect pre-spaced fragments
+            uses_prespacing = False
+            if len(self._current_text_parts) > 1:
+                # Check if any fragment after the first one starts with whitespace
+                has_spaced_parts = any(
+                    part and part[0].isspace() for part in self._current_text_parts[1:]
+                )
+                if has_spaced_parts:
+                    uses_prespacing = True
 
+            # Apply appropriate joining method
+            if uses_prespacing:
+                # Pre-spaced fragments - just concatenate
+                content = "".join(self._current_text_parts)
+            else:
+                # Word-by-word fragments - join with spaces
+                content = " ".join(self._current_text_parts)
+
+            # Trim leading/trailing whitespace (interior spacing is left as-is)
             content = content.strip()
 
             if content:
-                logger.debug(f"Emitting aggregated assistant message: {content}")
+                logger.trace(f"Emitting aggregated assistant message: {content}")
                 message = TranscriptionMessage(
                     role="assistant",
                     content=content,
@@ -143,7 +153,7 @@ class AssistantTranscriptProcessor(BaseTranscriptProcessor):
                 )
                 await self._emit_update([message])
             else:
-                logger.debug("No content to emit after stripping whitespace")
+                logger.trace("No content to emit after stripping whitespace")
 
             # Reset aggregation state
             self._current_text_parts = []
diff --git a/tests/test_transcript_processor.py b/tests/test_transcript_processor.py
index 5f80b3ca6..d13246b2c 100644
--- a/tests/test_transcript_processor.py
+++ b/tests/test_transcript_processor.py
@@ -235,8 +235,7 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
             BotStartedSpeakingFrame(),
             SleepFrame(sleep=0.1),
             TTSTextFrame(text="Hello"),
-            TTSTextFrame(text="world"),
-            TTSTextFrame(text="!"),
+            TTSTextFrame(text="world!"),
             SleepFrame(sleep=0.1),
             StartInterruptionFrame(),  # User interrupts here
             BotStartedSpeakingFrame(),
@@ -251,8 +250,7 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
         expected_down_frames = [
             BotStartedSpeakingFrame,
             TTSTextFrame,  # "Hello"
-            TTSTextFrame,  # "world"
-            TTSTextFrame,  # "!"
+            TTSTextFrame,  # "world!"
             TranscriptionUpdateFrame,  # First message (emitted due to interruption)
             StartInterruptionFrame,  # Interruption frame comes after the update
             BotStartedSpeakingFrame,
@@ -480,245 +478,3 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
         self.assertEqual(message.role, "assistant")
         # Should be properly joined without extra spaces
         self.assertEqual(message.content, "Hello there! How's it going?")
-
-    async def test_mixed_spacing_styles(self):
-        """Test handling mixed word-by-word and pre-spaced fragments"""
-        processor = AssistantTranscriptProcessor()
-
-        received_updates = []
-
-        @processor.event_handler("on_transcript_update")
-        async def handle_update(proc, frame: TranscriptionUpdateFrame):
-            received_updates.append(frame)
-
-        # Mix of spacing styles within the same utterance
-        frames_to_send = [
-            BotStartedSpeakingFrame(),
-            SleepFrame(sleep=0.1),
-            # Word-by-word style
-            TTSTextFrame(text="First"),
-            TTSTextFrame(text="style."),
-            # Pre-spaced style
-            TTSTextFrame(text=" Second"),
-            TTSTextFrame(text=" style"),
-            TTSTextFrame(text="!"),
-            BotStoppedSpeakingFrame(),
-        ]
-
-        expected_down_frames = [
-            BotStartedSpeakingFrame,
-            BotStoppedSpeakingFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TranscriptionUpdateFrame,
-        ]
-
-        await run_test(
-            processor,
-            frames_to_send=frames_to_send,
-            expected_down_frames=expected_down_frames,
-        )
-
-        self.assertEqual(len(received_updates), 1)
-        message = received_updates[0].messages[0]
-        self.assertEqual(message.content, "First style. Second style!")
-
-    async def test_punctuation_handling(self):
-        """Test handling of various punctuation patterns"""
-        processor = AssistantTranscriptProcessor()
-
-        received_updates = []
-
-        @processor.event_handler("on_transcript_update")
-        async def handle_update(proc, frame: TranscriptionUpdateFrame):
-            received_updates.append(frame)
-
-        # Test various punctuation types
-        frames_to_send = [
-            BotStartedSpeakingFrame(),
-            SleepFrame(sleep=0.1),
-            TTSTextFrame(text="Commas"),
-            TTSTextFrame(text=","),
-            TTSTextFrame(text="colons"),
-            TTSTextFrame(text=":"),
-            TTSTextFrame(text="semicolons"),
-            TTSTextFrame(text=";"),
-            TTSTextFrame(text="quotes"),
-            TTSTextFrame(text="'"),
-            TTSTextFrame(text="and"),
-            TTSTextFrame(text='"'),
-            TTSTextFrame(text="double quotes"),
-            TTSTextFrame(text="!"),
-            BotStoppedSpeakingFrame(),
-        ]
-
-        expected_down_frames = [
-            BotStartedSpeakingFrame,
-            BotStoppedSpeakingFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TranscriptionUpdateFrame,
-        ]
-
-        await run_test(
-            processor,
-            frames_to_send=frames_to_send,
-            expected_down_frames=expected_down_frames,
-        )
-
-        self.assertEqual(len(received_updates), 1)
-        message = received_updates[0].messages[0]
-        self.assertEqual(
-            message.content, "Commas, colons: semicolons; quotes' and\" double quotes!"
-        )
-
-    async def test_complex_mixed_case(self):
-        """Test a complex mix of patterns to ensure robustness"""
-        processor = AssistantTranscriptProcessor()
-
-        received_updates = []
-
-        @processor.event_handler("on_transcript_update")
-        async def handle_update(proc, frame: TranscriptionUpdateFrame):
-            received_updates.append(frame)
-
-        # Complex mixed case with various patterns
-        frames_to_send = [
-            BotStartedSpeakingFrame(),
-            SleepFrame(sleep=0.1),
-            # Pre-spaced fragments
-            TTSTextFrame(text="Hello"),
-            TTSTextFrame(text=" there"),
-            TTSTextFrame(text="!"),
-            # Sentence boundary
-            TTSTextFrame(text=" I'm"),
-            TTSTextFrame(text=" testing"),
-            TTSTextFrame(text=" spacing"),
-            TTSTextFrame(text="."),
-            # Word-by-word fragments
-            TTSTextFrame(text="Does"),
-            TTSTextFrame(text="this"),
-            TTSTextFrame(text="work"),
-            TTSTextFrame(text="correctly"),
-            TTSTextFrame(text="?"),
-            # Mixed punctuation and spacing
-            TTSTextFrame(text=" Let's"),
-            TTSTextFrame(text=" see:"),
-            TTSTextFrame(text="commas"),
-            TTSTextFrame(text=","),
-            TTSTextFrame(text=" semicolons"),
-            TTSTextFrame(text=";"),
-            TTSTextFrame(text=" and"),
-            TTSTextFrame(text=" quotes"),
-            TTSTextFrame(text="'"),
-            TTSTextFrame(text="!"),
-            BotStoppedSpeakingFrame(),
-        ]
-
-        expected_down_frames = [
-            BotStartedSpeakingFrame,
-            BotStoppedSpeakingFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TranscriptionUpdateFrame,
-        ]
-
-        await run_test(
-            processor,
-            frames_to_send=frames_to_send,
-            expected_down_frames=expected_down_frames,
-        )
-
-        self.assertEqual(len(received_updates), 1)
-        message = received_updates[0].messages[0]
-        expected = "Hello there! I'm testing spacing. Does this work correctly? Let's see: commas, semicolons; and quotes'!"
-        self.assertEqual(message.content, expected)
-
-    async def test_multiple_consecutive_punctuation(self):
-        """Test handling of multiple consecutive punctuation marks"""
-        processor = AssistantTranscriptProcessor()
-
-        received_updates = []
-
-        @processor.event_handler("on_transcript_update")
-        async def handle_update(proc, frame: TranscriptionUpdateFrame):
-            received_updates.append(frame)
-
-        frames_to_send = [
-            BotStartedSpeakingFrame(),
-            SleepFrame(sleep=0.1),
-            TTSTextFrame(text="Wow"),
-            TTSTextFrame(text="!"),
-            TTSTextFrame(text="!"),
-            TTSTextFrame(text="!"),
-            TTSTextFrame(text=" That's"),
-            TTSTextFrame(text=" amazing"),
-            TTSTextFrame(text="..."),
-            TTSTextFrame(text=" Don't"),
-            TTSTextFrame(text=" you"),
-            TTSTextFrame(text=" think"),
-            TTSTextFrame(text="?"),
-            TTSTextFrame(text="?"),
-            BotStoppedSpeakingFrame(),
-        ]
-
-        expected_down_frames = [
-            BotStartedSpeakingFrame,
-            BotStoppedSpeakingFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TTSTextFrame,
-            TranscriptionUpdateFrame,
-        ]
-
-        await run_test(
-            processor,
-            frames_to_send=frames_to_send,
-            expected_down_frames=expected_down_frames,
-        )
-
-        self.assertEqual(len(received_updates), 1)
-        message = received_updates[0].messages[0]
-        self.assertEqual(message.content, "Wow!!! That's amazing... Don't you think??")