Adding support for the new bot-output RTVI message:

1. TTSTextFrames now include metadata about whether the text was spoken, along with a type string describing what the text represents (e.g., "sentence", "word", "custom aggregation"). 2. Expanded how aggregators work so that the aggregate method returns the aggregated text along with the type of aggregation used to create it. 3. Deprecated the RTVI bot-transcription event in favor of the new bot-output event (see item 4). 4. Introduced support for a new bot-output event. This event is meant to be the one-stop shop for communicating what the bot actually "says". It is based on TTSTextFrames to communicate both sentence by sentence (or whatever aggregation is used) and word by word. In addition, it will include LLMTextFrames, aggregated by sentence, when TTS is turned off (i.e., skip_tts is true). Resolves pipecat-ai/pipecat-client-web#158
2025-10-21 12:16:01 -04:00
parent d1116d149e
commit fe9aa3383e
12 changed files with 259 additions and 101 deletions
--- a/tests/test_transcript_processor.py
+++ b/tests/test_transcript_processor.py
@@ -130,11 +130,11 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
        frames_to_send = [
            BotStartedSpeakingFrame(),
            SleepFrame(),  # Wait for StartedSpeaking to process
-            TTSTextFrame(text="Hello"),
-            TTSTextFrame(text="world!"),
-            TTSTextFrame(text="How"),
-            TTSTextFrame(text="are"),
-            TTSTextFrame(text="you?"),
+            TTSTextFrame(text="Hello", aggregated_by="word"),
+            TTSTextFrame(text="world!", aggregated_by="word"),
+            TTSTextFrame(text="How", aggregated_by="word"),
+            TTSTextFrame(text="are", aggregated_by="word"),
+            TTSTextFrame(text="you?", aggregated_by="word"),
            SleepFrame(),  # Wait for text frames to queue
            BotStoppedSpeakingFrame(),
        ]
@@ -195,9 +195,9 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
        frames_to_send = [
            BotStartedSpeakingFrame(),
            SleepFrame(),
-            TTSTextFrame(text=""),  # Empty text
-            TTSTextFrame(text="   "),  # Just whitespace
-            TTSTextFrame(text="\n"),  # Just newline
+            TTSTextFrame(text="", aggregated_by="word"),  # Empty text
+            TTSTextFrame(text="   ", aggregated_by="word"),  # Just whitespace
+            TTSTextFrame(text="\n", aggregated_by="word"),  # Just newline
            BotStoppedSpeakingFrame(),
            # Pipeline ends here; run_test will automatically send EndFrame
        ]
@@ -235,14 +235,14 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
        frames_to_send = [
            BotStartedSpeakingFrame(),
            SleepFrame(),
-            TTSTextFrame(text="Hello"),
-            TTSTextFrame(text="world!"),
+            TTSTextFrame(text="Hello", aggregated_by="word"),
+            TTSTextFrame(text="world!", aggregated_by="word"),
            SleepFrame(),
            InterruptionFrame(),  # User interrupts here
            SleepFrame(),
            BotStartedSpeakingFrame(),
-            TTSTextFrame(text="New"),
-            TTSTextFrame(text="response"),
+            TTSTextFrame(text="New", aggregated_by="word"),
+            TTSTextFrame(text="response", aggregated_by="word"),
            SleepFrame(),
            BotStoppedSpeakingFrame(),
        ]
@@ -299,8 +299,8 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
        frames_to_send = [
            BotStartedSpeakingFrame(),
            SleepFrame(),
-            TTSTextFrame(text="Hello"),
-            TTSTextFrame(text="world"),
+            TTSTextFrame(text="Hello", aggregated_by="word"),
+            TTSTextFrame(text="world", aggregated_by="word"),
            # Pipeline ends here; run_test will automatically send EndFrame
        ]

@@ -338,8 +338,8 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
        frames_to_send = [
            BotStartedSpeakingFrame(),
            SleepFrame(),
-            TTSTextFrame(text="Hello"),
-            TTSTextFrame(text="world"),
+            TTSTextFrame(text="Hello", aggregated_by="word"),
+            TTSTextFrame(text="world", aggregated_by="word"),
            SleepFrame(),  # Ensure messages are processed
            CancelFrame(),
        ]
@@ -401,8 +401,8 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
        frames_to_send = [
            BotStartedSpeakingFrame(),
            SleepFrame(),
-            TTSTextFrame(text="Assistant"),
-            TTSTextFrame(text="message"),
+            TTSTextFrame(text="Assistant", aggregated_by="word"),
+            TTSTextFrame(text="message", aggregated_by="word"),
            BotStoppedSpeakingFrame(),
        ]

@@ -439,7 +439,7 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):

        # Test the specific pattern shared
        def make_tts_text_frame(text: str) -> TTSTextFrame:
-            frame = TTSTextFrame(text=text)
+            frame = TTSTextFrame(text=text, aggregated_by="word")
            frame.includes_inter_frame_spaces = True
            return frame