diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py index 15eb99f3a..e14ae3828 100644 --- a/src/pipecat/transports/base_output.py +++ b/src/pipecat/transports/base_output.py @@ -44,12 +44,15 @@ from pipecat.frames.frames import ( StartFrame, SystemFrame, TTSAudioRawFrame, + TTSStoppedFrame, ) from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.transports.base_transport import TransportParams from pipecat.utils.time import nanoseconds_to_seconds BOT_VAD_STOP_SECS = 0.35 +# Only used as a fallback +BOT_VAD_STOP_FALLBACK_SECS = 3 class BaseOutputTransport(FrameProcessor): @@ -354,6 +357,8 @@ class BaseOutputTransport(FrameProcessor): await sender.handle_sync_frame(frame) elif isinstance(frame, MixerControlFrame): await sender.handle_mixer_control_frame(frame) + elif isinstance(frame, TTSStoppedFrame): + await sender.handle_sync_frame(frame) elif frame.pts: await sender.handle_timed_frame(frame) else: @@ -412,6 +417,8 @@ class BaseOutputTransport(FrameProcessor): # Indicates if the bot is currently speaking. self._bot_speaking = False + # Indicates if TTS audio has been received since the last stop. + self._tts_audio_received = False # Last time a BotSpeakingFrame was pushed. self._bot_speaking_frame_time = 0 # How often a BotSpeakingFrame should be pushed (value should be @@ -639,6 +646,7 @@ class BaseOutputTransport(FrameProcessor): return self._bot_speaking = False + self._tts_audio_received = False # Clean audio buffer (there could be tiny left overs if not multiple # to our output chunk size). @@ -682,6 +690,9 @@ class BaseOutputTransport(FrameProcessor): async def _handle_bot_speech(self, frame: Frame): # TTS case. if isinstance(frame, TTSAudioRawFrame): + # We will only trigger bot stopped speaking based on the TTSStoppedFrame, + # if we have received audio from TTS + self._tts_audio_received = True await self._bot_currently_speaking() # Speech stream case. elif isinstance(frame, SpeechOutputAudioRawFrame): @@ -703,6 +714,12 @@ class BaseOutputTransport(FrameProcessor): await self._transport.send_message(frame) elif isinstance(frame, OutputDTMFFrame): await self._transport.write_dtmf(frame) + elif isinstance(frame, TTSStoppedFrame): + # We will only trigger bot stopped speaking based on the TTSStoppedFrame, + # if we have received audio from TTS + if self._tts_audio_received: + logger.debug("Bot stopped speaking based on TTSStoppedFrame") + await self._bot_stopped_speaking() else: await self._transport.write_transport_frame(frame) @@ -722,7 +739,7 @@ class BaseOutputTransport(FrameProcessor): yield frame self._audio_queue.task_done() except asyncio.TimeoutError: - # Notify the bot stopped speaking upstream if necessary. + # Fallback: notify the bot stopped speaking upstream if necessary based on timeout. await self._bot_stopped_speaking() async def with_mixer(vad_stop_secs: float) -> AsyncGenerator[Frame, None]: @@ -737,7 +754,7 @@ class BaseOutputTransport(FrameProcessor): yield frame self._audio_queue.task_done() except asyncio.QueueEmpty: - # Notify the bot stopped speaking upstream if necessary. + # Fallback: notify the bot stopped speaking upstream if necessary based on timeout. diff_time = time.time() - last_frame_time if diff_time > vad_stop_secs: await self._bot_stopped_speaking() @@ -755,9 +772,9 @@ class BaseOutputTransport(FrameProcessor): await asyncio.sleep(0) if self._mixer: - return with_mixer(BOT_VAD_STOP_SECS) + return with_mixer(BOT_VAD_STOP_FALLBACK_SECS) else: - return without_mixer(BOT_VAD_STOP_SECS) + return without_mixer(BOT_VAD_STOP_FALLBACK_SECS) async def _send_silence(self, secs: int): if secs <= 0: