Fixed BotStoppedSpeakingFrame emission: now emitted as soon as TTSStoppedFrame is received, with a fallback silence-based timeout increased to reduce false positives
This commit is contained in:
@@ -44,12 +44,15 @@ from pipecat.frames.frames import (
|
||||
StartFrame,
|
||||
SystemFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.transports.base_transport import TransportParams
|
||||
from pipecat.utils.time import nanoseconds_to_seconds
|
||||
|
||||
BOT_VAD_STOP_SECS = 0.35
|
||||
# Only used as a fallback
|
||||
BOT_VAD_STOP_FALLBACK_SECS = 3
|
||||
|
||||
|
||||
class BaseOutputTransport(FrameProcessor):
|
||||
@@ -354,6 +357,8 @@ class BaseOutputTransport(FrameProcessor):
|
||||
await sender.handle_sync_frame(frame)
|
||||
elif isinstance(frame, MixerControlFrame):
|
||||
await sender.handle_mixer_control_frame(frame)
|
||||
elif isinstance(frame, TTSStoppedFrame):
|
||||
await sender.handle_sync_frame(frame)
|
||||
elif frame.pts:
|
||||
await sender.handle_timed_frame(frame)
|
||||
else:
|
||||
@@ -412,6 +417,8 @@ class BaseOutputTransport(FrameProcessor):
|
||||
|
||||
# Indicates if the bot is currently speaking.
|
||||
self._bot_speaking = False
|
||||
# Indicates if TTS audio has been received since the last stop.
|
||||
self._tts_audio_received = False
|
||||
# Last time a BotSpeakingFrame was pushed.
|
||||
self._bot_speaking_frame_time = 0
|
||||
# How often a BotSpeakingFrame should be pushed (value should be
|
||||
@@ -639,6 +646,7 @@ class BaseOutputTransport(FrameProcessor):
|
||||
return
|
||||
|
||||
self._bot_speaking = False
|
||||
self._tts_audio_received = False
|
||||
|
||||
# Clean audio buffer (there could be tiny left overs if not multiple
|
||||
# to our output chunk size).
|
||||
@@ -682,6 +690,9 @@ class BaseOutputTransport(FrameProcessor):
|
||||
async def _handle_bot_speech(self, frame: Frame):
|
||||
# TTS case.
|
||||
if isinstance(frame, TTSAudioRawFrame):
|
||||
# We will only trigger bot stopped speaking based on the TTSStoppedFrame,
|
||||
# if we have received audio from TTS
|
||||
self._tts_audio_received = True
|
||||
await self._bot_currently_speaking()
|
||||
# Speech stream case.
|
||||
elif isinstance(frame, SpeechOutputAudioRawFrame):
|
||||
@@ -703,6 +714,12 @@ class BaseOutputTransport(FrameProcessor):
|
||||
await self._transport.send_message(frame)
|
||||
elif isinstance(frame, OutputDTMFFrame):
|
||||
await self._transport.write_dtmf(frame)
|
||||
elif isinstance(frame, TTSStoppedFrame):
|
||||
# We will only trigger bot stopped speaking based on the TTSStoppedFrame,
|
||||
# if we have received audio from TTS
|
||||
if self._tts_audio_received:
|
||||
logger.debug("Bot stopped speaking based on TTSStoppedFrame")
|
||||
await self._bot_stopped_speaking()
|
||||
else:
|
||||
await self._transport.write_transport_frame(frame)
|
||||
|
||||
@@ -722,7 +739,7 @@ class BaseOutputTransport(FrameProcessor):
|
||||
yield frame
|
||||
self._audio_queue.task_done()
|
||||
except asyncio.TimeoutError:
|
||||
# Notify the bot stopped speaking upstream if necessary.
|
||||
# Fallback: notify the bot stopped speaking upstream if necessary based on timeout.
|
||||
await self._bot_stopped_speaking()
|
||||
|
||||
async def with_mixer(vad_stop_secs: float) -> AsyncGenerator[Frame, None]:
|
||||
@@ -737,7 +754,7 @@ class BaseOutputTransport(FrameProcessor):
|
||||
yield frame
|
||||
self._audio_queue.task_done()
|
||||
except asyncio.QueueEmpty:
|
||||
# Notify the bot stopped speaking upstream if necessary.
|
||||
# Fallback: notify the bot stopped speaking upstream if necessary based on timeout.
|
||||
diff_time = time.time() - last_frame_time
|
||||
if diff_time > vad_stop_secs:
|
||||
await self._bot_stopped_speaking()
|
||||
@@ -755,9 +772,9 @@ class BaseOutputTransport(FrameProcessor):
|
||||
await asyncio.sleep(0)
|
||||
|
||||
if self._mixer:
|
||||
return with_mixer(BOT_VAD_STOP_SECS)
|
||||
return with_mixer(BOT_VAD_STOP_FALLBACK_SECS)
|
||||
else:
|
||||
return without_mixer(BOT_VAD_STOP_SECS)
|
||||
return without_mixer(BOT_VAD_STOP_FALLBACK_SECS)
|
||||
|
||||
async def _send_silence(self, secs: int):
|
||||
if secs <= 0:
|
||||
|
||||
Reference in New Issue
Block a user