#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
|
|
import asyncio
|
|
import unittest
|
|
|
|
from pipecat.frames.frames import (
|
|
InterimTranscriptionFrame,
|
|
TranscriptionFrame,
|
|
UserStartedSpeakingFrame,
|
|
UserStoppedSpeakingFrame,
|
|
VADUserStartedSpeakingFrame,
|
|
VADUserStoppedSpeakingFrame,
|
|
)
|
|
from pipecat.turns.user_stop import ExternalUserTurnStopStrategy, TranscriptionUserTurnStopStrategy
|
|
from pipecat.utils.asyncio.task_manager import TaskManager, TaskManagerParams
|
|
|
|
# Value passed as ``timeout=`` to ``TranscriptionUserTurnStopStrategy`` in the
# tests below. The tests then ``asyncio.sleep(AGGREGATION_TIMEOUT + 0.1)`` before
# expecting the turn stop, so it is used as seconds there.
# NOTE(review): presumably the strategy also interprets it as seconds -- confirm.
AGGREGATION_TIMEOUT = 0.1
|
|
|
|
|
|
class TestTranscriptionUserTurnStopStrategy(unittest.IsolatedAsyncioTestCase):
    """Feed ``TranscriptionUserTurnStopStrategy`` different frame orderings.

    Each test name spells out the sequence of frames it sends:

    * ``s`` -- ``VADUserStartedSpeakingFrame``
    * ``e`` -- ``VADUserStoppedSpeakingFrame``
    * ``i`` -- ``InterimTranscriptionFrame``
    * ``t`` / ``t1`` / ``t2`` -- ``TranscriptionFrame``
    * ``delay`` -- a sleep longer than ``AGGREGATION_TIMEOUT``

    ``self.turn_stopped`` stays ``None`` until the strategy emits
    ``on_user_turn_stopped``; the registered handler then sets it to ``True``.
    """

    async def asyncSetUp(self) -> None:
        """Create a task manager bound to the running loop and reset the flag."""
        self.task_manager = TaskManager()
        self.task_manager.setup(TaskManagerParams(loop=asyncio.get_running_loop()))
        self.turn_stopped = None

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _track_turn_stopped(self, strategy) -> None:
        """Register a handler that records ``on_user_turn_stopped`` in ``self.turn_stopped``."""

        @strategy.event_handler("on_user_turn_stopped")
        async def on_user_turn_stopped(strategy, params):
            self.turn_stopped = True

    def _default_strategy(self):
        """Return a tracked strategy built with default arguments, without calling ``setup()``."""
        strategy = TranscriptionUserTurnStopStrategy()
        self._track_turn_stopped(strategy)
        return strategy

    async def _timed_strategy(self):
        """Return a tracked strategy using ``AGGREGATION_TIMEOUT``, set up with the task manager."""
        strategy = TranscriptionUserTurnStopStrategy(timeout=AGGREGATION_TIMEOUT)
        await strategy.setup(self.task_manager)
        self._track_turn_stopped(strategy)
        return strategy

    @staticmethod
    def _transcription(text: str) -> TranscriptionFrame:
        """Build a final transcription frame for the test user."""
        return TranscriptionFrame(text=text, user_id="cat", timestamp="")

    @staticmethod
    def _interim(text: str) -> InterimTranscriptionFrame:
        """Build an interim transcription frame for the test user."""
        return InterimTranscriptionFrame(text=text, user_id="cat", timestamp="")

    @staticmethod
    async def _wait_past_timeout() -> None:
        """Sleep slightly longer than the aggregation timeout."""
        await asyncio.sleep(AGGREGATION_TIMEOUT + 0.1)

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    async def test_ste(self):
        strategy = self._default_strategy()

        # S
        await strategy.process_frame(VADUserStartedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # T
        await strategy.process_frame(self._transcription("Hello!"))
        self.assertIsNone(self.turn_stopped)

        # E
        await strategy.process_frame(VADUserStoppedSpeakingFrame())

        # The transcription came in between user started/stopped and there
        # were no interims, so the user turn stops right away.
        self.assertTrue(self.turn_stopped)

    async def test_site(self):
        strategy = self._default_strategy()

        # S
        await strategy.process_frame(VADUserStartedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # I
        await strategy.process_frame(self._interim("Hello!"))
        self.assertIsNone(self.turn_stopped)

        # T
        await strategy.process_frame(self._transcription("Hello!"))
        self.assertIsNone(self.turn_stopped)

        # E
        await strategy.process_frame(VADUserStoppedSpeakingFrame())

        # The transcription came in between user started/stopped, so the user
        # turn stops right away.
        self.assertTrue(self.turn_stopped)

    async def test_st1iest2e(self):
        strategy = self._default_strategy()

        # S
        await strategy.process_frame(VADUserStartedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # T1
        await strategy.process_frame(self._transcription("Hello!"))
        self.assertIsNone(self.turn_stopped)

        # I
        await strategy.process_frame(self._interim("Hello!"))
        self.assertIsNone(self.turn_stopped)

        # E
        await strategy.process_frame(VADUserStoppedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # S
        await strategy.process_frame(VADUserStartedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # T2
        await strategy.process_frame(self._transcription("How are you?"))
        self.assertIsNone(self.turn_stopped)

        # E
        await strategy.process_frame(VADUserStoppedSpeakingFrame())

        # There was an interim before the first user stopped speaking; then a
        # transcription came in between user started/stopped, so the user turn
        # stops right away.
        self.assertTrue(self.turn_stopped)

    async def test_siet(self):
        strategy = await self._timed_strategy()

        # S
        await strategy.process_frame(VADUserStartedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # I
        await strategy.process_frame(self._interim("How"))
        self.assertIsNone(self.turn_stopped)

        # E
        await strategy.process_frame(VADUserStoppedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # T
        await strategy.process_frame(self._transcription("Hello!"))
        self.assertIsNone(self.turn_stopped)

        # The transcription comes after user stopped speaking, so we need to
        # wait for at least the aggregation timeout.
        await self._wait_past_timeout()
        self.assertTrue(self.turn_stopped)

    async def test_sieit(self):
        strategy = await self._timed_strategy()

        # S
        await strategy.process_frame(VADUserStartedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # I
        await strategy.process_frame(self._interim("How"))
        self.assertIsNone(self.turn_stopped)

        # E
        await strategy.process_frame(VADUserStoppedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # I
        await strategy.process_frame(self._interim("are you?"))
        self.assertIsNone(self.turn_stopped)

        # T
        await strategy.process_frame(self._transcription("How are you?"))
        self.assertIsNone(self.turn_stopped)

        # The transcription comes after user stopped speaking, so we need to
        # wait for at least the aggregation timeout.
        await self._wait_past_timeout()
        self.assertTrue(self.turn_stopped)

    async def test_set(self):
        strategy = await self._timed_strategy()

        # S
        await strategy.process_frame(VADUserStartedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # E
        await strategy.process_frame(VADUserStoppedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # T
        await strategy.process_frame(self._transcription("How are you?"))
        self.assertIsNone(self.turn_stopped)

        # The transcription comes after user stopped speaking, so we need to
        # wait for at least the aggregation timeout.
        await self._wait_past_timeout()
        self.assertTrue(self.turn_stopped)

    async def test_seit(self):
        strategy = await self._timed_strategy()

        # S
        await strategy.process_frame(VADUserStartedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # E
        await strategy.process_frame(VADUserStoppedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # I
        await strategy.process_frame(self._interim("How"))
        self.assertIsNone(self.turn_stopped)

        # T
        await strategy.process_frame(self._transcription("How are you?"))
        self.assertIsNone(self.turn_stopped)

        # The transcription comes after user stopped speaking, so we need to
        # wait for at least the aggregation timeout.
        await self._wait_past_timeout()
        self.assertTrue(self.turn_stopped)

    async def test_st1et2(self):
        strategy = await self._timed_strategy()

        # S
        await strategy.process_frame(VADUserStartedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # T1
        await strategy.process_frame(self._transcription("Hello!"))
        self.assertIsNone(self.turn_stopped)

        # E
        await strategy.process_frame(VADUserStoppedSpeakingFrame())

        # The transcription came in between user started/stopped speaking, so
        # the user turn stops right away.
        self.assertTrue(self.turn_stopped)
        # Reset the flag so the second stop below can be observed on its own.
        self.turn_stopped = None

        # T2
        await strategy.process_frame(self._transcription("How are you?"))
        self.assertIsNone(self.turn_stopped)

        # The transcription comes after user stopped speaking, so we need to
        # wait for at least the aggregation timeout.
        await self._wait_past_timeout()
        self.assertTrue(self.turn_stopped)

    async def test_set1t2(self):
        strategy = await self._timed_strategy()

        # S
        await strategy.process_frame(VADUserStartedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # E
        await strategy.process_frame(VADUserStoppedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # T1
        await strategy.process_frame(self._transcription("Hello!"))
        self.assertIsNone(self.turn_stopped)

        # T2
        await strategy.process_frame(self._transcription("How are you?"))
        self.assertIsNone(self.turn_stopped)

        # The transcriptions come after user stopped speaking, so we need to
        # wait for at least the aggregation timeout.
        await self._wait_past_timeout()
        self.assertTrue(self.turn_stopped)

    async def test_siet1it2(self):
        strategy = await self._timed_strategy()

        # S
        await strategy.process_frame(VADUserStartedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # I
        await strategy.process_frame(self._interim("Hello!"))
        self.assertIsNone(self.turn_stopped)

        # E
        await strategy.process_frame(VADUserStoppedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # T1
        await strategy.process_frame(self._transcription("Hello!"))
        self.assertIsNone(self.turn_stopped)

        # I
        await strategy.process_frame(self._interim("How"))
        self.assertIsNone(self.turn_stopped)

        # T2
        await strategy.process_frame(self._transcription("How are you?"))
        self.assertIsNone(self.turn_stopped)

        # The transcriptions come after user stopped speaking, so we need to
        # wait for at least the aggregation timeout.
        await self._wait_past_timeout()
        self.assertTrue(self.turn_stopped)

    async def test_t(self):
        strategy = await self._timed_strategy()

        # T
        await strategy.process_frame(self._transcription("Hello!"))
        self.assertIsNone(self.turn_stopped)

        # The transcription arrives with no VAD start/stop around it, so we
        # need to wait for at least the aggregation timeout.
        await self._wait_past_timeout()
        self.assertTrue(self.turn_stopped)

    async def test_it(self):
        strategy = await self._timed_strategy()

        # I
        await strategy.process_frame(self._interim("Hello!"))
        self.assertIsNone(self.turn_stopped)

        # T
        await strategy.process_frame(self._transcription("Hello!"))
        self.assertIsNone(self.turn_stopped)

        # The transcription arrives with no VAD start/stop around it, so we
        # need to wait for at least the aggregation timeout.
        await self._wait_past_timeout()
        self.assertTrue(self.turn_stopped)

    async def test_sie_delay_it(self):
        strategy = await self._timed_strategy()

        # S
        await strategy.process_frame(VADUserStartedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # I
        await strategy.process_frame(self._interim("Hello!"))
        self.assertIsNone(self.turn_stopped)

        # E
        await strategy.process_frame(VADUserStoppedSpeakingFrame())
        self.assertIsNone(self.turn_stopped)

        # Delay: only an interim has been received so far, and the turn must
        # still be open once the timeout has elapsed.
        await self._wait_past_timeout()
        self.assertIsNone(self.turn_stopped)

        # I
        await strategy.process_frame(self._interim("How"))
        self.assertIsNone(self.turn_stopped)

        # T
        await strategy.process_frame(self._transcription("How are you?"))
        self.assertIsNone(self.turn_stopped)

        # The transcription comes after user stopped speaking, so we need to
        # wait for at least the aggregation timeout.
        await self._wait_past_timeout()
        self.assertTrue(self.turn_stopped)
|
|
|
|
|
|
class TestExternalUserTurnStopStrategy(unittest.IsolatedAsyncioTestCase):
    """Check when ``ExternalUserTurnStopStrategy`` emits ``on_user_turn_stopped``."""

    async def test_external_strategy(self):
        """Expect no stop until a transcription is followed by ``UserStoppedSpeakingFrame``.

        The first ``UserStoppedSpeakingFrame`` arrives before any transcription
        and must not stop the turn; the second one, sent after a
        ``TranscriptionFrame``, must.
        """
        strategy = ExternalUserTurnStopStrategy()

        # None until the handler below runs, then True.
        turn_stopped = None

        @strategy.event_handler("on_user_turn_stopped")
        async def on_user_turn_stopped(strategy, params):
            nonlocal turn_stopped
            turn_stopped = True

        await strategy.process_frame(VADUserStartedSpeakingFrame())
        self.assertIsNone(turn_stopped)

        await strategy.process_frame(UserStartedSpeakingFrame())
        self.assertIsNone(turn_stopped)

        # No transcription has been received yet.
        await strategy.process_frame(UserStoppedSpeakingFrame())
        self.assertIsNone(turn_stopped)

        await strategy.process_frame(UserStartedSpeakingFrame())
        self.assertIsNone(turn_stopped)

        await strategy.process_frame(
            TranscriptionFrame(text="How are you?", user_id="cat", timestamp="")
        )
        self.assertIsNone(turn_stopped)

        await strategy.process_frame(UserStoppedSpeakingFrame())
        self.assertTrue(turn_stopped)
|
|
|
|
|
|
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()