Fixed BotStoppedSpeakingFrame emission: now emitted as soon as TTSStoppedFrame is received, with a fallback silence-based timeout increased to reduce false positives

2026-03-06 16:16:11 -03:00
parent 88ff7c451b
commit 07abd3d60f
1 changed files with 21 additions and 4 deletions
--- a/src/pipecat/transports/base_output.py
+++ b/src/pipecat/transports/base_output.py
@@ -44,12 +44,15 @@ from pipecat.frames.frames import (
    StartFrame,
    SystemFrame,
    TTSAudioRawFrame,
+    TTSStoppedFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.transports.base_transport import TransportParams
 from pipecat.utils.time import nanoseconds_to_seconds

 BOT_VAD_STOP_SECS = 0.35
+# Only used as a fallback
+BOT_VAD_STOP_FALLBACK_SECS = 3


 class BaseOutputTransport(FrameProcessor):
@@ -354,6 +357,8 @@ class BaseOutputTransport(FrameProcessor):
                await sender.handle_sync_frame(frame)
        elif isinstance(frame, MixerControlFrame):
            await sender.handle_mixer_control_frame(frame)
+        elif isinstance(frame, TTSStoppedFrame):
+            await sender.handle_sync_frame(frame)
        elif frame.pts:
            await sender.handle_timed_frame(frame)
        else:
@@ -412,6 +417,8 @@ class BaseOutputTransport(FrameProcessor):

            # Indicates if the bot is currently speaking.
            self._bot_speaking = False
+            # Indicates if TTS audio has been received since the last stop.
+            self._tts_audio_received = False
            # Last time a BotSpeakingFrame was pushed.
            self._bot_speaking_frame_time = 0
            # How often a BotSpeakingFrame should be pushed (value should be
@@ -639,6 +646,7 @@ class BaseOutputTransport(FrameProcessor):
                return

            self._bot_speaking = False
+            self._tts_audio_received = False

            # Clean audio buffer (there could be tiny left overs if not multiple
            # to our output chunk size).
@@ -682,6 +690,9 @@ class BaseOutputTransport(FrameProcessor):
        async def _handle_bot_speech(self, frame: Frame):
            # TTS case.
            if isinstance(frame, TTSAudioRawFrame):
+                # We will only trigger bot stopped speaking based on the TTSStoppedFrame,
+                # if we have received audio from TTS
+                self._tts_audio_received = True
                await self._bot_currently_speaking()
            # Speech stream case.
            elif isinstance(frame, SpeechOutputAudioRawFrame):
@@ -703,6 +714,12 @@ class BaseOutputTransport(FrameProcessor):
                await self._transport.send_message(frame)
            elif isinstance(frame, OutputDTMFFrame):
                await self._transport.write_dtmf(frame)
+            elif isinstance(frame, TTSStoppedFrame):
+                # We will only trigger bot stopped speaking based on the TTSStoppedFrame,
+                # if we have received audio from TTS
+                if self._tts_audio_received:
+                    logger.debug("Bot stopped speaking based on TTSStoppedFrame")
+                    await self._bot_stopped_speaking()
            else:
                await self._transport.write_transport_frame(frame)

@@ -722,7 +739,7 @@ class BaseOutputTransport(FrameProcessor):
                        yield frame
                        self._audio_queue.task_done()
                    except asyncio.TimeoutError:
-                        # Notify the bot stopped speaking upstream if necessary.
+                        # Fallback: notify the bot stopped speaking upstream if necessary based on timeout.
                        await self._bot_stopped_speaking()

            async def with_mixer(vad_stop_secs: float) -> AsyncGenerator[Frame, None]:
@@ -737,7 +754,7 @@ class BaseOutputTransport(FrameProcessor):
                        yield frame
                        self._audio_queue.task_done()
                    except asyncio.QueueEmpty:
-                        # Notify the bot stopped speaking upstream if necessary.
+                        # Fallback: notify the bot stopped speaking upstream if necessary based on timeout.
                        diff_time = time.time() - last_frame_time
                        if diff_time > vad_stop_secs:
                            await self._bot_stopped_speaking()
@@ -755,9 +772,9 @@ class BaseOutputTransport(FrameProcessor):
                        await asyncio.sleep(0)

            if self._mixer:
-                return with_mixer(BOT_VAD_STOP_SECS)
+                return with_mixer(BOT_VAD_STOP_FALLBACK_SECS)
            else:
-                return without_mixer(BOT_VAD_STOP_SECS)
+                return without_mixer(BOT_VAD_STOP_FALLBACK_SECS)

        async def _send_silence(self, secs: int):
            if secs <= 0: