Simplify the TranscriptProcessor _emit_aggregated_text logic

This commit is contained in:
Mark Backman
2025-03-17 16:30:46 -04:00
parent acd0660f66
commit 6885d07e88
2 changed files with 50 additions and 284 deletions

View File

@@ -90,52 +90,62 @@ class AssistantTranscriptProcessor(BaseTranscriptProcessor):
self._aggregation_start_time: Optional[str] = None
async def _emit_aggregated_text(self):
"""Emit aggregated text as a transcript message.
"""Aggregates and emits text fragments as a transcript message.
This method intelligently joins text fragments to create natural spacing,
handling both word-by-word and pre-spaced text fragments appropriately.
This method uses a heuristic to automatically detect whether text fragments
use pre-spacing (spaces at the beginning of fragments) or not, and applies
the appropriate joining strategy. It handles fragments from different TTS
services with different formatting patterns.
The implementation handles two common patterns from TTS services:
Examples:
Pre-spaced fragments (concatenated):
```
TTSTextFrame: ["Hello"]
TTSTextFrame: [" there"]
TTSTextFrame: ["!"]
TTSTextFrame: [" How"]
TTSTextFrame: ["'s"]
TTSTextFrame: [" it"]
TTSTextFrame: [" going"]
TTSTextFrame: ["?"]
```
Result: "Hello there! How's it going?"
1. Word-by-word fragments without spacing:
```
TTSTextFrame: ['Hello.']
TTSTextFrame: ['How']
TTSTextFrame: ['can']
TTSTextFrame: ['I']
TTSTextFrame: ['assist']
TTSTextFrame: ['you']
TTSTextFrame: ['today?']
```
Result: "Hello. How can I assist you today?"
2. Pre-spaced fragments:
```
TTSTextFrame: ['Hello']
TTSTextFrame: [' there']
TTSTextFrame: ['!']
TTSTextFrame: [' How']
TTSTextFrame: ["'s"]
TTSTextFrame: [' it']
TTSTextFrame: [' going']
TTSTextFrame: ['?']
```
Result: "Hello there! How's it going?"
Word-by-word fragments (joined with spaces):
```
TTSTextFrame: ["Hello"]
TTSTextFrame: ["there!"]
TTSTextFrame: ["How"]
TTSTextFrame: ["is"]
TTSTextFrame: ["it"]
TTSTextFrame: ["going?"]
```
Result: "Hello there! How is it going?"
"""
if self._current_text_parts and self._aggregation_start_time:
# Build content with intelligent spacing
content = ""
for i, part in enumerate(self._current_text_parts):
# Add a space only when the current part doesn't start with
# whitespace or punctuation/special characters
if i > 0 and not part.startswith((" ", ".", ",", "!", "?", ";", ":", "'", '"')):
content += " "
content += part
# Heuristic to detect pre-spaced fragments
uses_prespacing = False
if len(self._current_text_parts) > 1:
# Check if any fragment after the first one starts with whitespace
has_spaced_parts = any(
part and part[0].isspace() for part in self._current_text_parts[1:]
)
if has_spaced_parts:
uses_prespacing = True
# Apply appropriate joining method
if uses_prespacing:
# Pre-spaced fragments - just concatenate
content = "".join(self._current_text_parts)
else:
# Word-by-word fragments - join with spaces
content = " ".join(self._current_text_parts)
# Clean up any excessive whitespace
content = content.strip()
if content:
logger.debug(f"Emitting aggregated assistant message: {content}")
logger.trace(f"Emitting aggregated assistant message: {content}")
message = TranscriptionMessage(
role="assistant",
content=content,
@@ -143,7 +153,7 @@ class AssistantTranscriptProcessor(BaseTranscriptProcessor):
)
await self._emit_update([message])
else:
logger.debug("No content to emit after stripping whitespace")
logger.trace("No content to emit after stripping whitespace")
# Reset aggregation state
self._current_text_parts = []

View File

@@ -235,8 +235,7 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
BotStartedSpeakingFrame(),
SleepFrame(sleep=0.1),
TTSTextFrame(text="Hello"),
TTSTextFrame(text="world"),
TTSTextFrame(text="!"),
TTSTextFrame(text="world!"),
SleepFrame(sleep=0.1),
StartInterruptionFrame(), # User interrupts here
BotStartedSpeakingFrame(),
@@ -251,8 +250,7 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
expected_down_frames = [
BotStartedSpeakingFrame,
TTSTextFrame, # "Hello"
TTSTextFrame, # "world"
TTSTextFrame, # "!"
TTSTextFrame, # "world!"
TranscriptionUpdateFrame, # First message (emitted due to interruption)
StartInterruptionFrame, # Interruption frame comes after the update
BotStartedSpeakingFrame,
@@ -480,245 +478,3 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
self.assertEqual(message.role, "assistant")
# Should be properly joined without extra spaces
self.assertEqual(message.content, "Hello there! How's it going?")
async def test_mixed_spacing_styles(self):
    """Check one utterance that mixes word-by-word and pre-spaced fragments.

    The first two fragments have no leading space, the next two do, and the
    last is a bare "!". A single transcript update is expected whose first
    message reads "First style. Second style!".
    """
    transcript_processor = AssistantTranscriptProcessor()
    updates = []

    @transcript_processor.event_handler("on_transcript_update")
    async def handle_update(proc, frame: TranscriptionUpdateFrame):
        updates.append(frame)

    # Fragment texts in send order: two word-by-word pieces, then two
    # pre-spaced pieces, then trailing punctuation.
    fragments = ["First", "style.", " Second", " style", "!"]

    outgoing = [
        BotStartedSpeakingFrame(),
        SleepFrame(sleep=0.1),
        *(TTSTextFrame(text=fragment) for fragment in fragments),
        BotStoppedSpeakingFrame(),
    ]

    # One TTSTextFrame is expected downstream per fragment; the stop frame
    # is listed ahead of them, followed by the final transcript update.
    downstream_types = [
        BotStartedSpeakingFrame,
        BotStoppedSpeakingFrame,
        *([TTSTextFrame] * len(fragments)),
        TranscriptionUpdateFrame,
    ]

    await run_test(
        transcript_processor,
        frames_to_send=outgoing,
        expected_down_frames=downstream_types,
    )

    self.assertEqual(len(updates), 1)
    first_message = updates[0].messages[0]
    self.assertEqual(first_message.content, "First style. Second style!")
async def test_punctuation_handling(self):
    """Check aggregation when punctuation arrives as its own fragment.

    Words alternate with standalone punctuation fragments (comma, colon,
    semicolon, single quote, double quote, exclamation mark). A single
    transcript update is expected with each punctuation mark attached to
    the preceding word.
    """
    transcript_processor = AssistantTranscriptProcessor()
    updates = []

    @transcript_processor.event_handler("on_transcript_update")
    async def handle_update(proc, frame: TranscriptionUpdateFrame):
        updates.append(frame)

    # Each word is followed by a punctuation-only fragment.
    fragments = [
        "Commas",
        ",",
        "colons",
        ":",
        "semicolons",
        ";",
        "quotes",
        "'",
        "and",
        '"',
        "double quotes",
        "!",
    ]

    outgoing = [
        BotStartedSpeakingFrame(),
        SleepFrame(sleep=0.1),
        *(TTSTextFrame(text=fragment) for fragment in fragments),
        BotStoppedSpeakingFrame(),
    ]

    # One TTSTextFrame is expected downstream per fragment; the stop frame
    # is listed ahead of them, followed by the final transcript update.
    downstream_types = [
        BotStartedSpeakingFrame,
        BotStoppedSpeakingFrame,
        *([TTSTextFrame] * len(fragments)),
        TranscriptionUpdateFrame,
    ]

    await run_test(
        transcript_processor,
        frames_to_send=outgoing,
        expected_down_frames=downstream_types,
    )

    self.assertEqual(len(updates), 1)
    first_message = updates[0].messages[0]
    self.assertEqual(
        first_message.content,
        "Commas, colons: semicolons; quotes' and\" double quotes!",
    )
async def test_complex_mixed_case(self):
    """Check a long utterance that combines several fragment patterns.

    The fragments are sent in four groups: pre-spaced words, a pre-spaced
    sentence ending in a bare ".", unspaced word-by-word fragments, and a
    mix of punctuation-only and pre-spaced fragments. A single transcript
    update with naturally spaced text is expected.
    """
    transcript_processor = AssistantTranscriptProcessor()
    updates = []

    @transcript_processor.event_handler("on_transcript_update")
    async def handle_update(proc, frame: TranscriptionUpdateFrame):
        updates.append(frame)

    pre_spaced = ["Hello", " there", "!"]
    sentence_boundary = [" I'm", " testing", " spacing", "."]
    word_by_word = ["Does", "this", "work", "correctly", "?"]
    mixed_punctuation = [
        " Let's",
        " see:",
        "commas",
        ",",
        " semicolons",
        ";",
        " and",
        " quotes",
        "'",
        "!",
    ]
    # Groups are concatenated in the order they are sent.
    fragments = pre_spaced + sentence_boundary + word_by_word + mixed_punctuation

    outgoing = [
        BotStartedSpeakingFrame(),
        SleepFrame(sleep=0.1),
        *(TTSTextFrame(text=fragment) for fragment in fragments),
        BotStoppedSpeakingFrame(),
    ]

    # One TTSTextFrame is expected downstream per fragment; the stop frame
    # is listed ahead of them, followed by the final transcript update.
    downstream_types = [
        BotStartedSpeakingFrame,
        BotStoppedSpeakingFrame,
        *([TTSTextFrame] * len(fragments)),
        TranscriptionUpdateFrame,
    ]

    await run_test(
        transcript_processor,
        frames_to_send=outgoing,
        expected_down_frames=downstream_types,
    )

    self.assertEqual(len(updates), 1)
    first_message = updates[0].messages[0]
    expected = "Hello there! I'm testing spacing. Does this work correctly? Let's see: commas, semicolons; and quotes'!"
    self.assertEqual(first_message.content, expected)
async def test_multiple_consecutive_punctuation(self):
    """Check aggregation of runs of punctuation-only fragments.

    Sends "Wow" followed by three separate "!" fragments, pre-spaced words,
    an "..." fragment, and two trailing "?" fragments. A single transcript
    update is expected with the punctuation runs kept together and no
    spaces inserted between them.
    """
    transcript_processor = AssistantTranscriptProcessor()
    updates = []

    @transcript_processor.event_handler("on_transcript_update")
    async def handle_update(proc, frame: TranscriptionUpdateFrame):
        updates.append(frame)

    fragments = [
        "Wow",
        "!",
        "!",
        "!",
        " That's",
        " amazing",
        "...",
        " Don't",
        " you",
        " think",
        "?",
        "?",
    ]

    outgoing = [
        BotStartedSpeakingFrame(),
        SleepFrame(sleep=0.1),
        *(TTSTextFrame(text=fragment) for fragment in fragments),
        BotStoppedSpeakingFrame(),
    ]

    # One TTSTextFrame is expected downstream per fragment; the stop frame
    # is listed ahead of them, followed by the final transcript update.
    downstream_types = [
        BotStartedSpeakingFrame,
        BotStoppedSpeakingFrame,
        *([TTSTextFrame] * len(fragments)),
        TranscriptionUpdateFrame,
    ]

    await run_test(
        transcript_processor,
        frames_to_send=outgoing,
        expected_down_frames=downstream_types,
    )

    self.assertEqual(len(updates), 1)
    first_message = updates[0].messages[0]
    self.assertEqual(first_message.content, "Wow!!! That's amazing... Don't you think??")