Simplify the TranscriptProcessor _emit_aggregated_text logic
This commit is contained in:
@@ -90,52 +90,62 @@ class AssistantTranscriptProcessor(BaseTranscriptProcessor):
|
||||
self._aggregation_start_time: Optional[str] = None
|
||||
|
||||
async def _emit_aggregated_text(self):
|
||||
"""Emit aggregated text as a transcript message.
|
||||
"""Aggregates and emits text fragments as a transcript message.
|
||||
|
||||
This method intelligently joins text fragments to create natural spacing,
|
||||
handling both word-by-word and pre-spaced text fragments appropriately.
|
||||
This method uses a heuristic to automatically detect whether text fragments
|
||||
use pre-spacing (spaces at the beginning of fragments) or not, and applies
|
||||
the appropriate joining strategy. It handles fragments from different TTS
|
||||
services with different formatting patterns.
|
||||
|
||||
The implementation handles two common patterns from TTS services:
|
||||
Examples:
|
||||
Pre-spaced fragments (concatenated):
|
||||
```
|
||||
TTSTextFrame: ["Hello"]
|
||||
TTSTextFrame: [" there"]
|
||||
TTSTextFrame: ["!"]
|
||||
TTSTextFrame: [" How"]
|
||||
TTSTextFrame: ["'s"]
|
||||
TTSTextFrame: [" it"]
|
||||
TTSTextFrame: [" going"]
|
||||
TTSTextFrame: ["?"]
|
||||
```
|
||||
Result: "Hello there! How's it going?"
|
||||
|
||||
1. Word-by-word fragments without spacing:
|
||||
```
|
||||
TTSTextFrame: ['Hello.']
|
||||
TTSTextFrame: ['How']
|
||||
TTSTextFrame: ['can']
|
||||
TTSTextFrame: ['I']
|
||||
TTSTextFrame: ['assist']
|
||||
TTSTextFrame: ['you']
|
||||
TTSTextFrame: ['today?']
|
||||
```
|
||||
Result: "Hello. How can I assist you today?"
|
||||
|
||||
2. Pre-spaced fragments:
|
||||
```
|
||||
TTSTextFrame: ['Hello']
|
||||
TTSTextFrame: [' there']
|
||||
TTSTextFrame: ['!']
|
||||
TTSTextFrame: [' How']
|
||||
TTSTextFrame: ["'s"]
|
||||
TTSTextFrame: [' it']
|
||||
TTSTextFrame: [' going']
|
||||
TTSTextFrame: ['?']
|
||||
```
|
||||
Result: "Hello there! How's it going?"
|
||||
Word-by-word fragments (joined with spaces):
|
||||
```
|
||||
TTSTextFrame: ["Hello"]
|
||||
TTSTextFrame: ["there!"]
|
||||
TTSTextFrame: ["How"]
|
||||
TTSTextFrame: ["is"]
|
||||
TTSTextFrame: ["it"]
|
||||
TTSTextFrame: ["going?"]
|
||||
```
|
||||
Result: "Hello there! How is it going?"
|
||||
"""
|
||||
if self._current_text_parts and self._aggregation_start_time:
|
||||
# Build content with intelligent spacing
|
||||
content = ""
|
||||
for i, part in enumerate(self._current_text_parts):
|
||||
# Add a space only when the current part doesn't start with
|
||||
# whitespace or punctuation/special characters
|
||||
if i > 0 and not part.startswith((" ", ".", ",", "!", "?", ";", ":", "'", '"')):
|
||||
content += " "
|
||||
content += part
|
||||
# Heuristic to detect pre-spaced fragments
|
||||
uses_prespacing = False
|
||||
if len(self._current_text_parts) > 1:
|
||||
# Check if any fragment after the first one starts with whitespace
|
||||
has_spaced_parts = any(
|
||||
part and part[0].isspace() for part in self._current_text_parts[1:]
|
||||
)
|
||||
if has_spaced_parts:
|
||||
uses_prespacing = True
|
||||
|
||||
# Apply appropriate joining method
|
||||
if uses_prespacing:
|
||||
# Pre-spaced fragments - just concatenate
|
||||
content = "".join(self._current_text_parts)
|
||||
else:
|
||||
# Word-by-word fragments - join with spaces
|
||||
content = " ".join(self._current_text_parts)
|
||||
|
||||
# Clean up any excessive whitespace
|
||||
content = content.strip()
|
||||
|
||||
if content:
|
||||
logger.debug(f"Emitting aggregated assistant message: {content}")
|
||||
logger.trace(f"Emitting aggregated assistant message: {content}")
|
||||
message = TranscriptionMessage(
|
||||
role="assistant",
|
||||
content=content,
|
||||
@@ -143,7 +153,7 @@ class AssistantTranscriptProcessor(BaseTranscriptProcessor):
|
||||
)
|
||||
await self._emit_update([message])
|
||||
else:
|
||||
logger.debug("No content to emit after stripping whitespace")
|
||||
logger.trace("No content to emit after stripping whitespace")
|
||||
|
||||
# Reset aggregation state
|
||||
self._current_text_parts = []
|
||||
|
||||
@@ -235,8 +235,7 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
|
||||
BotStartedSpeakingFrame(),
|
||||
SleepFrame(sleep=0.1),
|
||||
TTSTextFrame(text="Hello"),
|
||||
TTSTextFrame(text="world"),
|
||||
TTSTextFrame(text="!"),
|
||||
TTSTextFrame(text="world!"),
|
||||
SleepFrame(sleep=0.1),
|
||||
StartInterruptionFrame(), # User interrupts here
|
||||
BotStartedSpeakingFrame(),
|
||||
@@ -251,8 +250,7 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
|
||||
expected_down_frames = [
|
||||
BotStartedSpeakingFrame,
|
||||
TTSTextFrame, # "Hello"
|
||||
TTSTextFrame, # "world"
|
||||
TTSTextFrame, # "!"
|
||||
TTSTextFrame, # "world!"
|
||||
TranscriptionUpdateFrame, # First message (emitted due to interruption)
|
||||
StartInterruptionFrame, # Interruption frame comes after the update
|
||||
BotStartedSpeakingFrame,
|
||||
@@ -480,245 +478,3 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
|
||||
self.assertEqual(message.role, "assistant")
|
||||
# Should be properly joined without extra spaces
|
||||
self.assertEqual(message.content, "Hello there! How's it going?")
|
||||
|
||||
async def test_mixed_spacing_styles(self):
    """Check that one utterance mixing bare and pre-spaced fragments is joined cleanly."""
    processor = AssistantTranscriptProcessor()

    captured = []

    @processor.event_handler("on_transcript_update")
    async def _collect(proc, frame: TranscriptionUpdateFrame):
        captured.append(frame)

    # The first two fragments are word-by-word; the rest carry their own
    # leading space (or are bare punctuation).
    fragments = ("First", "style.", " Second", " style", "!")

    frames_to_send = [
        BotStartedSpeakingFrame(),
        SleepFrame(sleep=0.1),
        *(TTSTextFrame(text=fragment) for fragment in fragments),
        BotStoppedSpeakingFrame(),
    ]

    # One TTSTextFrame is expected downstream per fragment sent.
    expected_down_frames = (
        [BotStartedSpeakingFrame, BotStoppedSpeakingFrame]
        + [TTSTextFrame] * len(fragments)
        + [TranscriptionUpdateFrame]
    )

    await run_test(
        processor,
        frames_to_send=frames_to_send,
        expected_down_frames=expected_down_frames,
    )

    self.assertEqual(len(captured), 1)
    first_message = captured[0].messages[0]
    self.assertEqual(first_message.content, "First style. Second style!")
|
||||
|
||||
async def test_punctuation_handling(self):
    """Check that standalone punctuation fragments attach to the preceding word."""
    processor = AssistantTranscriptProcessor()

    captured = []

    @processor.event_handler("on_transcript_update")
    async def _collect(proc, frame: TranscriptionUpdateFrame):
        captured.append(frame)

    # Words alternating with a different punctuation mark each time.
    fragments = (
        "Commas",
        ",",
        "colons",
        ":",
        "semicolons",
        ";",
        "quotes",
        "'",
        "and",
        '"',
        "double quotes",
        "!",
    )

    frames_to_send = [
        BotStartedSpeakingFrame(),
        SleepFrame(sleep=0.1),
        *(TTSTextFrame(text=fragment) for fragment in fragments),
        BotStoppedSpeakingFrame(),
    ]

    # One TTSTextFrame is expected downstream per fragment sent.
    expected_down_frames = (
        [BotStartedSpeakingFrame, BotStoppedSpeakingFrame]
        + [TTSTextFrame] * len(fragments)
        + [TranscriptionUpdateFrame]
    )

    await run_test(
        processor,
        frames_to_send=frames_to_send,
        expected_down_frames=expected_down_frames,
    )

    self.assertEqual(len(captured), 1)
    first_message = captured[0].messages[0]
    self.assertEqual(
        first_message.content, "Commas, colons: semicolons; quotes' and\" double quotes!"
    )
|
||||
|
||||
async def test_complex_mixed_case(self):
    """Check a long utterance combining pre-spaced, word-by-word and punctuation fragments."""
    processor = AssistantTranscriptProcessor()

    captured = []

    @processor.event_handler("on_transcript_update")
    async def _collect(proc, frame: TranscriptionUpdateFrame):
        captured.append(frame)

    fragments = (
        # Pre-spaced fragments
        "Hello",
        " there",
        "!",
        # Sentence boundary
        " I'm",
        " testing",
        " spacing",
        ".",
        # Word-by-word fragments
        "Does",
        "this",
        "work",
        "correctly",
        "?",
        # Mixed punctuation and spacing
        " Let's",
        " see:",
        "commas",
        ",",
        " semicolons",
        ";",
        " and",
        " quotes",
        "'",
        "!",
    )

    frames_to_send = [
        BotStartedSpeakingFrame(),
        SleepFrame(sleep=0.1),
        *(TTSTextFrame(text=fragment) for fragment in fragments),
        BotStoppedSpeakingFrame(),
    ]

    # One TTSTextFrame is expected downstream per fragment sent.
    expected_down_frames = (
        [BotStartedSpeakingFrame, BotStoppedSpeakingFrame]
        + [TTSTextFrame] * len(fragments)
        + [TranscriptionUpdateFrame]
    )

    await run_test(
        processor,
        frames_to_send=frames_to_send,
        expected_down_frames=expected_down_frames,
    )

    self.assertEqual(len(captured), 1)
    first_message = captured[0].messages[0]
    expected = "Hello there! I'm testing spacing. Does this work correctly? Let's see: commas, semicolons; and quotes'!"
    self.assertEqual(first_message.content, expected)
|
||||
|
||||
async def test_multiple_consecutive_punctuation(self):
    """Check that runs of repeated punctuation fragments are joined without gaps."""
    processor = AssistantTranscriptProcessor()

    captured = []

    @processor.event_handler("on_transcript_update")
    async def _collect(proc, frame: TranscriptionUpdateFrame):
        captured.append(frame)

    fragments = (
        "Wow",
        "!",
        "!",
        "!",
        " That's",
        " amazing",
        "...",
        " Don't",
        " you",
        " think",
        "?",
        "?",
    )

    frames_to_send = [
        BotStartedSpeakingFrame(),
        SleepFrame(sleep=0.1),
        *(TTSTextFrame(text=fragment) for fragment in fragments),
        BotStoppedSpeakingFrame(),
    ]

    # One TTSTextFrame is expected downstream per fragment sent.
    expected_down_frames = (
        [BotStartedSpeakingFrame, BotStoppedSpeakingFrame]
        + [TTSTextFrame] * len(fragments)
        + [TranscriptionUpdateFrame]
    )

    await run_test(
        processor,
        frames_to_send=frames_to_send,
        expected_down_frames=expected_down_frames,
    )

    self.assertEqual(len(captured), 1)
    first_message = captured[0].messages[0]
    self.assertEqual(first_message.content, "Wow!!! That's amazing... Don't you think??")
|
||||
|
||||
Reference in New Issue
Block a user