Fixed BotStoppedSpeakingFrame emission: now emitted as soon as TTSStoppedFrame is received, with a fallback silence-based timeout increased to reduce false positives

This commit is contained in:
filipi87
2026-03-06 16:16:11 -03:00
parent 88ff7c451b
commit 07abd3d60f

View File

@@ -44,12 +44,15 @@ from pipecat.frames.frames import (
StartFrame,
SystemFrame,
TTSAudioRawFrame,
TTSStoppedFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.transports.base_transport import TransportParams
from pipecat.utils.time import nanoseconds_to_seconds
BOT_VAD_STOP_SECS = 0.35
# Only used as a fallback
BOT_VAD_STOP_FALLBACK_SECS = 3
class BaseOutputTransport(FrameProcessor):
@@ -354,6 +357,8 @@ class BaseOutputTransport(FrameProcessor):
await sender.handle_sync_frame(frame)
elif isinstance(frame, MixerControlFrame):
await sender.handle_mixer_control_frame(frame)
elif isinstance(frame, TTSStoppedFrame):
await sender.handle_sync_frame(frame)
elif frame.pts:
await sender.handle_timed_frame(frame)
else:
@@ -412,6 +417,8 @@ class BaseOutputTransport(FrameProcessor):
# Indicates if the bot is currently speaking.
self._bot_speaking = False
# Indicates if TTS audio has been received since the last stop.
self._tts_audio_received = False
# Last time a BotSpeakingFrame was pushed.
self._bot_speaking_frame_time = 0
# How often a BotSpeakingFrame should be pushed (value should be
@@ -639,6 +646,7 @@ class BaseOutputTransport(FrameProcessor):
return
self._bot_speaking = False
self._tts_audio_received = False
# Clean audio buffer (there could be tiny left overs if not multiple
# to our output chunk size).
@@ -682,6 +690,9 @@ class BaseOutputTransport(FrameProcessor):
async def _handle_bot_speech(self, frame: Frame):
# TTS case.
if isinstance(frame, TTSAudioRawFrame):
# We will only trigger bot stopped speaking based on the TTSStoppedFrame,
# if we have received audio from TTS
self._tts_audio_received = True
await self._bot_currently_speaking()
# Speech stream case.
elif isinstance(frame, SpeechOutputAudioRawFrame):
@@ -703,6 +714,12 @@ class BaseOutputTransport(FrameProcessor):
await self._transport.send_message(frame)
elif isinstance(frame, OutputDTMFFrame):
await self._transport.write_dtmf(frame)
elif isinstance(frame, TTSStoppedFrame):
# We will only trigger bot stopped speaking based on the TTSStoppedFrame,
# if we have received audio from TTS
if self._tts_audio_received:
logger.debug("Bot stopped speaking based on TTSStoppedFrame")
await self._bot_stopped_speaking()
else:
await self._transport.write_transport_frame(frame)
@@ -722,7 +739,7 @@ class BaseOutputTransport(FrameProcessor):
yield frame
self._audio_queue.task_done()
except asyncio.TimeoutError:
# Notify the bot stopped speaking upstream if necessary.
# Fallback: notify the bot stopped speaking upstream if necessary based on timeout.
await self._bot_stopped_speaking()
async def with_mixer(vad_stop_secs: float) -> AsyncGenerator[Frame, None]:
@@ -737,7 +754,7 @@ class BaseOutputTransport(FrameProcessor):
yield frame
self._audio_queue.task_done()
except asyncio.QueueEmpty:
# Notify the bot stopped speaking upstream if necessary.
# Fallback: notify the bot stopped speaking upstream if necessary based on timeout.
diff_time = time.time() - last_frame_time
if diff_time > vad_stop_secs:
await self._bot_stopped_speaking()
@@ -755,9 +772,9 @@ class BaseOutputTransport(FrameProcessor):
await asyncio.sleep(0)
if self._mixer:
return with_mixer(BOT_VAD_STOP_SECS)
return with_mixer(BOT_VAD_STOP_FALLBACK_SECS)
else:
return without_mixer(BOT_VAD_STOP_SECS)
return without_mixer(BOT_VAD_STOP_FALLBACK_SECS)
async def _send_silence(self, secs: int):
if secs <= 0: