diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py
index 15eb99f3a..e14ae3828 100644
--- a/src/pipecat/transports/base_output.py
+++ b/src/pipecat/transports/base_output.py
@@ -44,12 +44,15 @@ from pipecat.frames.frames import (
     StartFrame,
     SystemFrame,
     TTSAudioRawFrame,
+    TTSStoppedFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.transports.base_transport import TransportParams
 from pipecat.utils.time import nanoseconds_to_seconds
 
 BOT_VAD_STOP_SECS = 0.35
+# Only used as a fallback
+BOT_VAD_STOP_FALLBACK_SECS = 3
 
 
 class BaseOutputTransport(FrameProcessor):
@@ -354,6 +357,8 @@ class BaseOutputTransport(FrameProcessor):
                 await sender.handle_sync_frame(frame)
         elif isinstance(frame, MixerControlFrame):
             await sender.handle_mixer_control_frame(frame)
+        elif isinstance(frame, TTSStoppedFrame):
+            await sender.handle_sync_frame(frame)
         elif frame.pts:
             await sender.handle_timed_frame(frame)
         else:
@@ -412,6 +417,8 @@ class BaseOutputTransport(FrameProcessor):
 
             # Indicates if the bot is currently speaking.
             self._bot_speaking = False
+            # Indicates if TTS audio has been received since the last stop.
+            self._tts_audio_received = False
             # Last time a BotSpeakingFrame was pushed.
             self._bot_speaking_frame_time = 0
             # How often a BotSpeakingFrame should be pushed (value should be
@@ -639,6 +646,7 @@ class BaseOutputTransport(FrameProcessor):
                 return
 
             self._bot_speaking = False
+            self._tts_audio_received = False
 
             # Clean audio buffer (there could be tiny left overs if not multiple
             # to our output chunk size).
@@ -682,6 +690,9 @@ class BaseOutputTransport(FrameProcessor):
         async def _handle_bot_speech(self, frame: Frame):
             # TTS case.
             if isinstance(frame, TTSAudioRawFrame):
+                # We will only trigger bot stopped speaking based on the TTSStoppedFrame,
+                # if we have received audio from TTS
+                self._tts_audio_received = True
                 await self._bot_currently_speaking()
             # Speech stream case.
             elif isinstance(frame, SpeechOutputAudioRawFrame):
@@ -703,6 +714,12 @@ class BaseOutputTransport(FrameProcessor):
                 await self._transport.send_message(frame)
             elif isinstance(frame, OutputDTMFFrame):
                 await self._transport.write_dtmf(frame)
+            elif isinstance(frame, TTSStoppedFrame):
+                # We will only trigger bot stopped speaking based on the TTSStoppedFrame,
+                # if we have received audio from TTS
+                if self._tts_audio_received:
+                    logger.debug("Bot stopped speaking based on TTSStoppedFrame")
+                    await self._bot_stopped_speaking()
             else:
                 await self._transport.write_transport_frame(frame)
 
@@ -722,7 +739,7 @@ class BaseOutputTransport(FrameProcessor):
                         yield frame
                         self._audio_queue.task_done()
                     except asyncio.TimeoutError:
-                        # Notify the bot stopped speaking upstream if necessary.
+                        # Fallback: notify the bot stopped speaking upstream if necessary based on timeout.
                         await self._bot_stopped_speaking()
 
             async def with_mixer(vad_stop_secs: float) -> AsyncGenerator[Frame, None]:
@@ -737,7 +754,7 @@ class BaseOutputTransport(FrameProcessor):
                         yield frame
                         self._audio_queue.task_done()
                     except asyncio.QueueEmpty:
-                        # Notify the bot stopped speaking upstream if necessary.
+                        # Fallback: notify the bot stopped speaking upstream if necessary based on timeout.
                         diff_time = time.time() - last_frame_time
                         if diff_time > vad_stop_secs:
                             await self._bot_stopped_speaking()
@@ -755,9 +772,9 @@ class BaseOutputTransport(FrameProcessor):
                         await asyncio.sleep(0)
 
             if self._mixer:
-                return with_mixer(BOT_VAD_STOP_SECS)
+                return with_mixer(BOT_VAD_STOP_FALLBACK_SECS)
             else:
-                return without_mixer(BOT_VAD_STOP_SECS)
+                return without_mixer(BOT_VAD_STOP_FALLBACK_SECS)
 
         async def _send_silence(self, secs: int):
             if secs <= 0: