From 6885d07e880341d1a5ae46054ae8d64609c3e9eb Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Mon, 17 Mar 2025 16:30:46 -0400 Subject: [PATCH] Simplify the TranscriptProcessor _emit_aggregated_text logic --- .../processors/transcript_processor.py | 86 +++--- tests/test_transcript_processor.py | 248 +----------------- 2 files changed, 50 insertions(+), 284 deletions(-) diff --git a/src/pipecat/processors/transcript_processor.py b/src/pipecat/processors/transcript_processor.py index 6a7793335..3eaff66ca 100644 --- a/src/pipecat/processors/transcript_processor.py +++ b/src/pipecat/processors/transcript_processor.py @@ -90,52 +90,62 @@ class AssistantTranscriptProcessor(BaseTranscriptProcessor): self._aggregation_start_time: Optional[str] = None async def _emit_aggregated_text(self): - """Emit aggregated text as a transcript message. + """Aggregates and emits text fragments as a transcript message. - This method intelligently joins text fragments to create natural spacing, - handling both word-by-word and pre-spaced text fragments appropriately. + This method uses a heuristic to automatically detect whether text fragments + use pre-spacing (spaces at the beginning of fragments) or not, and applies + the appropriate joining strategy. It handles fragments from different TTS + services with different formatting patterns. - The implementation handles two common patterns from TTS services: + Examples: + Pre-spaced fragments (concatenated): + ``` + TTSTextFrame: ["Hello"] + TTSTextFrame: [" there"] + TTSTextFrame: ["!"] + TTSTextFrame: [" How"] + TTSTextFrame: ["'s"] + TTSTextFrame: [" it"] + TTSTextFrame: [" going"] + TTSTextFrame: ["?"] + ``` + Result: "Hello there! How's it going?" - 1. Word-by-word fragments without spacing: - ``` - TTSTextFrame: ['Hello.'] - TTSTextFrame: ['How'] - TTSTextFrame: ['can'] - TTSTextFrame: ['I'] - TTSTextFrame: ['assist'] - TTSTextFrame: ['you'] - TTSTextFrame: ['today?'] - ``` - Result: "Hello. How can I assist you today?" - - 2. 
Pre-spaced fragments: - ``` - TTSTextFrame: ['Hello'] - TTSTextFrame: [' there'] - TTSTextFrame: ['!'] - TTSTextFrame: [' How'] - TTSTextFrame: ["'s"] - TTSTextFrame: [' it'] - TTSTextFrame: [' going'] - TTSTextFrame: ['?'] - ``` - Result: "Hello there! How's it going?" + Word-by-word fragments (joined with spaces): + ``` + TTSTextFrame: ["Hello"] + TTSTextFrame: ["there!"] + TTSTextFrame: ["How"] + TTSTextFrame: ["is"] + TTSTextFrame: ["it"] + TTSTextFrame: ["going?"] + ``` + Result: "Hello there! How is it going?" """ if self._current_text_parts and self._aggregation_start_time: - # Build content with intelligent spacing - content = "" - for i, part in enumerate(self._current_text_parts): - # Add a space only when the current part doesn't start with - # whitespace or punctuation/special characters - if i > 0 and not part.startswith((" ", ".", ",", "!", "?", ";", ":", "'", '"')): - content += " " - content += part + # Heuristic to detect pre-spaced fragments + uses_prespacing = False + if len(self._current_text_parts) > 1: + # Check if any fragment after the first one starts with whitespace + has_spaced_parts = any( + part and part[0].isspace() for part in self._current_text_parts[1:] + ) + if has_spaced_parts: + uses_prespacing = True + # Apply appropriate joining method + if uses_prespacing: + # Pre-spaced fragments - just concatenate + content = "".join(self._current_text_parts) + else: + # Word-by-word fragments - join with spaces + content = " ".join(self._current_text_parts) + + # Strip leading and trailing whitespace content = content.strip() if content: - logger.debug(f"Emitting aggregated assistant message: {content}") + logger.trace(f"Emitting aggregated assistant message: {content}") message = TranscriptionMessage( role="assistant", content=content, @@ -143,7 +153,7 @@ class AssistantTranscriptProcessor(BaseTranscriptProcessor): ) await self._emit_update([message]) else: - logger.debug("No 
content to emit after stripping whitespace") # Reset aggregation state self._current_text_parts = [] diff --git a/tests/test_transcript_processor.py b/tests/test_transcript_processor.py index 5f80b3ca6..d13246b2c 100644 --- a/tests/test_transcript_processor.py +++ b/tests/test_transcript_processor.py @@ -235,8 +235,7 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase): BotStartedSpeakingFrame(), SleepFrame(sleep=0.1), TTSTextFrame(text="Hello"), - TTSTextFrame(text="world"), - TTSTextFrame(text="!"), + TTSTextFrame(text="world!"), SleepFrame(sleep=0.1), StartInterruptionFrame(), # User interrupts here BotStartedSpeakingFrame(), @@ -251,8 +250,7 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase): expected_down_frames = [ BotStartedSpeakingFrame, TTSTextFrame, # "Hello" - TTSTextFrame, # "world" - TTSTextFrame, # "!" + TTSTextFrame, # "world!" TranscriptionUpdateFrame, # First message (emitted due to interruption) StartInterruptionFrame, # Interruption frame comes after the update BotStartedSpeakingFrame, @@ -480,245 +478,3 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase): self.assertEqual(message.role, "assistant") # Should be properly joined without extra spaces self.assertEqual(message.content, "Hello there! 
How's it going?") - - async def test_mixed_spacing_styles(self): - """Test handling mixed word-by-word and pre-spaced fragments""" - processor = AssistantTranscriptProcessor() - - received_updates = [] - - @processor.event_handler("on_transcript_update") - async def handle_update(proc, frame: TranscriptionUpdateFrame): - received_updates.append(frame) - - # Mix of spacing styles within the same utterance - frames_to_send = [ - BotStartedSpeakingFrame(), - SleepFrame(sleep=0.1), - # Word-by-word style - TTSTextFrame(text="First"), - TTSTextFrame(text="style."), - # Pre-spaced style - TTSTextFrame(text=" Second"), - TTSTextFrame(text=" style"), - TTSTextFrame(text="!"), - BotStoppedSpeakingFrame(), - ] - - expected_down_frames = [ - BotStartedSpeakingFrame, - BotStoppedSpeakingFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TranscriptionUpdateFrame, - ] - - await run_test( - processor, - frames_to_send=frames_to_send, - expected_down_frames=expected_down_frames, - ) - - self.assertEqual(len(received_updates), 1) - message = received_updates[0].messages[0] - self.assertEqual(message.content, "First style. 
Second style!") - - async def test_punctuation_handling(self): - """Test handling of various punctuation patterns""" - processor = AssistantTranscriptProcessor() - - received_updates = [] - - @processor.event_handler("on_transcript_update") - async def handle_update(proc, frame: TranscriptionUpdateFrame): - received_updates.append(frame) - - # Test various punctuation types - frames_to_send = [ - BotStartedSpeakingFrame(), - SleepFrame(sleep=0.1), - TTSTextFrame(text="Commas"), - TTSTextFrame(text=","), - TTSTextFrame(text="colons"), - TTSTextFrame(text=":"), - TTSTextFrame(text="semicolons"), - TTSTextFrame(text=";"), - TTSTextFrame(text="quotes"), - TTSTextFrame(text="'"), - TTSTextFrame(text="and"), - TTSTextFrame(text='"'), - TTSTextFrame(text="double quotes"), - TTSTextFrame(text="!"), - BotStoppedSpeakingFrame(), - ] - - expected_down_frames = [ - BotStartedSpeakingFrame, - BotStoppedSpeakingFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TranscriptionUpdateFrame, - ] - - await run_test( - processor, - frames_to_send=frames_to_send, - expected_down_frames=expected_down_frames, - ) - - self.assertEqual(len(received_updates), 1) - message = received_updates[0].messages[0] - self.assertEqual( - message.content, "Commas, colons: semicolons; quotes' and\" double quotes!" 
- ) - - async def test_complex_mixed_case(self): - """Test a complex mix of patterns to ensure robustness""" - processor = AssistantTranscriptProcessor() - - received_updates = [] - - @processor.event_handler("on_transcript_update") - async def handle_update(proc, frame: TranscriptionUpdateFrame): - received_updates.append(frame) - - # Complex mixed case with various patterns - frames_to_send = [ - BotStartedSpeakingFrame(), - SleepFrame(sleep=0.1), - # Pre-spaced fragments - TTSTextFrame(text="Hello"), - TTSTextFrame(text=" there"), - TTSTextFrame(text="!"), - # Sentence boundary - TTSTextFrame(text=" I'm"), - TTSTextFrame(text=" testing"), - TTSTextFrame(text=" spacing"), - TTSTextFrame(text="."), - # Word-by-word fragments - TTSTextFrame(text="Does"), - TTSTextFrame(text="this"), - TTSTextFrame(text="work"), - TTSTextFrame(text="correctly"), - TTSTextFrame(text="?"), - # Mixed punctuation and spacing - TTSTextFrame(text=" Let's"), - TTSTextFrame(text=" see:"), - TTSTextFrame(text="commas"), - TTSTextFrame(text=","), - TTSTextFrame(text=" semicolons"), - TTSTextFrame(text=";"), - TTSTextFrame(text=" and"), - TTSTextFrame(text=" quotes"), - TTSTextFrame(text="'"), - TTSTextFrame(text="!"), - BotStoppedSpeakingFrame(), - ] - - expected_down_frames = [ - BotStartedSpeakingFrame, - BotStoppedSpeakingFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TranscriptionUpdateFrame, - ] - - await run_test( - processor, - frames_to_send=frames_to_send, - expected_down_frames=expected_down_frames, - ) - - self.assertEqual(len(received_updates), 1) - message = received_updates[0].messages[0] - expected = "Hello there! I'm testing spacing. Does this work correctly? 
Let's see: commas, semicolons; and quotes'!" - self.assertEqual(message.content, expected) - - async def test_multiple_consecutive_punctuation(self): - """Test handling of multiple consecutive punctuation marks""" - processor = AssistantTranscriptProcessor() - - received_updates = [] - - @processor.event_handler("on_transcript_update") - async def handle_update(proc, frame: TranscriptionUpdateFrame): - received_updates.append(frame) - - frames_to_send = [ - BotStartedSpeakingFrame(), - SleepFrame(sleep=0.1), - TTSTextFrame(text="Wow"), - TTSTextFrame(text="!"), - TTSTextFrame(text="!"), - TTSTextFrame(text="!"), - TTSTextFrame(text=" That's"), - TTSTextFrame(text=" amazing"), - TTSTextFrame(text="..."), - TTSTextFrame(text=" Don't"), - TTSTextFrame(text=" you"), - TTSTextFrame(text=" think"), - TTSTextFrame(text="?"), - TTSTextFrame(text="?"), - BotStoppedSpeakingFrame(), - ] - - expected_down_frames = [ - BotStartedSpeakingFrame, - BotStoppedSpeakingFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TTSTextFrame, - TranscriptionUpdateFrame, - ] - - await run_test( - processor, - frames_to_send=frames_to_send, - expected_down_frames=expected_down_frames, - ) - - self.assertEqual(len(received_updates), 1) - message = received_updates[0].messages[0] - self.assertEqual(message.content, "Wow!!! That's amazing... Don't you think??")