Adding support for new bot-output RTVI Message:
1. TTSTextFrames now include metadata indicating whether the text was spoken, along with a type string describing what the text represents, e.g. "sentence", "word", or "custom aggregation".
2. Expanded how aggregators work so that the aggregate method returns the aggregated text along with the type of aggregation used to create it.
3. Deprecated the RTVI bot-transcription event in favor of the new bot-output event (see item 4).
4. Introduced support for a new bot-output event. This event is meant to be the one-stop shop for communicating what the bot actually "says". It is based on TTSTextFrames, so it communicates both sentence by sentence (or by whatever aggregation is used) and word by word. In addition, it includes LLMTextFrames, aggregated by sentence, when TTS is turned off (i.e. skip_tts is true).

Resolves pipecat-ai/pipecat-client-web#158
This commit is contained in:
@@ -130,11 +130,11 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
|
||||
frames_to_send = [
|
||||
BotStartedSpeakingFrame(),
|
||||
SleepFrame(), # Wait for StartedSpeaking to process
|
||||
TTSTextFrame(text="Hello"),
|
||||
TTSTextFrame(text="world!"),
|
||||
TTSTextFrame(text="How"),
|
||||
TTSTextFrame(text="are"),
|
||||
TTSTextFrame(text="you?"),
|
||||
TTSTextFrame(text="Hello", aggregated_by="word"),
|
||||
TTSTextFrame(text="world!", aggregated_by="word"),
|
||||
TTSTextFrame(text="How", aggregated_by="word"),
|
||||
TTSTextFrame(text="are", aggregated_by="word"),
|
||||
TTSTextFrame(text="you?", aggregated_by="word"),
|
||||
SleepFrame(), # Wait for text frames to queue
|
||||
BotStoppedSpeakingFrame(),
|
||||
]
|
||||
@@ -195,9 +195,9 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
|
||||
frames_to_send = [
|
||||
BotStartedSpeakingFrame(),
|
||||
SleepFrame(),
|
||||
TTSTextFrame(text=""), # Empty text
|
||||
TTSTextFrame(text=" "), # Just whitespace
|
||||
TTSTextFrame(text="\n"), # Just newline
|
||||
TTSTextFrame(text="", aggregated_by="word"), # Empty text
|
||||
TTSTextFrame(text=" ", aggregated_by="word"), # Just whitespace
|
||||
TTSTextFrame(text="\n", aggregated_by="word"), # Just newline
|
||||
BotStoppedSpeakingFrame(),
|
||||
# Pipeline ends here; run_test will automatically send EndFrame
|
||||
]
|
||||
@@ -235,14 +235,14 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
|
||||
frames_to_send = [
|
||||
BotStartedSpeakingFrame(),
|
||||
SleepFrame(),
|
||||
TTSTextFrame(text="Hello"),
|
||||
TTSTextFrame(text="world!"),
|
||||
TTSTextFrame(text="Hello", aggregated_by="word"),
|
||||
TTSTextFrame(text="world!", aggregated_by="word"),
|
||||
SleepFrame(),
|
||||
InterruptionFrame(), # User interrupts here
|
||||
SleepFrame(),
|
||||
BotStartedSpeakingFrame(),
|
||||
TTSTextFrame(text="New"),
|
||||
TTSTextFrame(text="response"),
|
||||
TTSTextFrame(text="New", aggregated_by="word"),
|
||||
TTSTextFrame(text="response", aggregated_by="word"),
|
||||
SleepFrame(),
|
||||
BotStoppedSpeakingFrame(),
|
||||
]
|
||||
@@ -299,8 +299,8 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
|
||||
frames_to_send = [
|
||||
BotStartedSpeakingFrame(),
|
||||
SleepFrame(),
|
||||
TTSTextFrame(text="Hello"),
|
||||
TTSTextFrame(text="world"),
|
||||
TTSTextFrame(text="Hello", aggregated_by="word"),
|
||||
TTSTextFrame(text="world", aggregated_by="word"),
|
||||
# Pipeline ends here; run_test will automatically send EndFrame
|
||||
]
|
||||
|
||||
@@ -338,8 +338,8 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
|
||||
frames_to_send = [
|
||||
BotStartedSpeakingFrame(),
|
||||
SleepFrame(),
|
||||
TTSTextFrame(text="Hello"),
|
||||
TTSTextFrame(text="world"),
|
||||
TTSTextFrame(text="Hello", aggregated_by="word"),
|
||||
TTSTextFrame(text="world", aggregated_by="word"),
|
||||
SleepFrame(), # Ensure messages are processed
|
||||
CancelFrame(),
|
||||
]
|
||||
@@ -401,8 +401,8 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
|
||||
frames_to_send = [
|
||||
BotStartedSpeakingFrame(),
|
||||
SleepFrame(),
|
||||
TTSTextFrame(text="Assistant"),
|
||||
TTSTextFrame(text="message"),
|
||||
TTSTextFrame(text="Assistant", aggregated_by="word"),
|
||||
TTSTextFrame(text="message", aggregated_by="word"),
|
||||
BotStoppedSpeakingFrame(),
|
||||
]
|
||||
|
||||
@@ -439,7 +439,7 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
|
||||
|
||||
# Test the specific pattern shared
|
||||
def make_tts_text_frame(text: str) -> TTSTextFrame:
|
||||
frame = TTSTextFrame(text=text)
|
||||
frame = TTSTextFrame(text=text, aggregated_by="word")
|
||||
frame.includes_inter_frame_spaces = True
|
||||
return frame
|
||||
|
||||
|
||||
Reference in New Issue
Block a user