Adding support for new bot-output RTVI Message:

1. TTSTextFrames now include metadata indicating whether the text was
   spoken, along with a type string describing what the text represents,
   e.g. "sentence", "word", "custom aggregation"
2. Expanded how aggregators work so that the aggregate method returns
   aggregated text along with the type of aggregation used to create it
3. Deprecated the RTVI bot-transcription event in favor of the new
   bot-output event (see 4).
4. Introduced support for a new bot-output event. This event is meant
   to be the one-stop shop for communicating what the bot actually "says".
   It is based on TTSTextFrames to communicate both sentence by sentence
   (or whatever aggregation is used) as well as word by word. In addition,
   it will include LLMTextFrames, aggregated by sentence, when TTS is
   turned off (i.e. skip_tts is true).

Resolves pipecat-ai/pipecat-client-web#158
This commit is contained in:
mattie ruth backman
2025-10-21 12:16:01 -04:00
parent d1116d149e
commit fe9aa3383e
12 changed files with 259 additions and 101 deletions

View File

@@ -130,11 +130,11 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
frames_to_send = [
BotStartedSpeakingFrame(),
SleepFrame(), # Wait for StartedSpeaking to process
TTSTextFrame(text="Hello"),
TTSTextFrame(text="world!"),
TTSTextFrame(text="How"),
TTSTextFrame(text="are"),
TTSTextFrame(text="you?"),
TTSTextFrame(text="Hello", aggregated_by="word"),
TTSTextFrame(text="world!", aggregated_by="word"),
TTSTextFrame(text="How", aggregated_by="word"),
TTSTextFrame(text="are", aggregated_by="word"),
TTSTextFrame(text="you?", aggregated_by="word"),
SleepFrame(), # Wait for text frames to queue
BotStoppedSpeakingFrame(),
]
@@ -195,9 +195,9 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
frames_to_send = [
BotStartedSpeakingFrame(),
SleepFrame(),
TTSTextFrame(text=""), # Empty text
TTSTextFrame(text=" "), # Just whitespace
TTSTextFrame(text="\n"), # Just newline
TTSTextFrame(text="", aggregated_by="word"), # Empty text
TTSTextFrame(text=" ", aggregated_by="word"), # Just whitespace
TTSTextFrame(text="\n", aggregated_by="word"), # Just newline
BotStoppedSpeakingFrame(),
# Pipeline ends here; run_test will automatically send EndFrame
]
@@ -235,14 +235,14 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
frames_to_send = [
BotStartedSpeakingFrame(),
SleepFrame(),
TTSTextFrame(text="Hello"),
TTSTextFrame(text="world!"),
TTSTextFrame(text="Hello", aggregated_by="word"),
TTSTextFrame(text="world!", aggregated_by="word"),
SleepFrame(),
InterruptionFrame(), # User interrupts here
SleepFrame(),
BotStartedSpeakingFrame(),
TTSTextFrame(text="New"),
TTSTextFrame(text="response"),
TTSTextFrame(text="New", aggregated_by="word"),
TTSTextFrame(text="response", aggregated_by="word"),
SleepFrame(),
BotStoppedSpeakingFrame(),
]
@@ -299,8 +299,8 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
frames_to_send = [
BotStartedSpeakingFrame(),
SleepFrame(),
TTSTextFrame(text="Hello"),
TTSTextFrame(text="world"),
TTSTextFrame(text="Hello", aggregated_by="word"),
TTSTextFrame(text="world", aggregated_by="word"),
# Pipeline ends here; run_test will automatically send EndFrame
]
@@ -338,8 +338,8 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
frames_to_send = [
BotStartedSpeakingFrame(),
SleepFrame(),
TTSTextFrame(text="Hello"),
TTSTextFrame(text="world"),
TTSTextFrame(text="Hello", aggregated_by="word"),
TTSTextFrame(text="world", aggregated_by="word"),
SleepFrame(), # Ensure messages are processed
CancelFrame(),
]
@@ -401,8 +401,8 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
frames_to_send = [
BotStartedSpeakingFrame(),
SleepFrame(),
TTSTextFrame(text="Assistant"),
TTSTextFrame(text="message"),
TTSTextFrame(text="Assistant", aggregated_by="word"),
TTSTextFrame(text="message", aggregated_by="word"),
BotStoppedSpeakingFrame(),
]
@@ -439,7 +439,7 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
# Test the specific pattern shared
def make_tts_text_frame(text: str) -> TTSTextFrame:
frame = TTSTextFrame(text=text)
frame = TTSTextFrame(text=text, aggregated_by="word")
frame.includes_inter_frame_spaces = True
return frame