diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2eb958215..f285bc014 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- `AudioRawFrame`s are now pushed downstream from the base output
+  transport. This allows capturing the exact words the bot says by adding an STT
+  service at the end of the pipeline.
+
 - Added new `GStreamerPipelineSource`. This processor can generate image or
   audio frames from a GStreamer pipeline (e.g. reading an MP4 file, and RTP
   stream or anything supported by GStreamer).
diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py
index e4b5c6007..e6256d038 100644
--- a/src/pipecat/transports/base_output.py
+++ b/src/pipecat/transports/base_output.py
@@ -203,6 +203,7 @@ class BaseOutputTransport(FrameProcessor):
                 frame = await self._sink_queue.get()
                 if isinstance(frame, AudioRawFrame):
                     await self.write_raw_audio_frames(frame.audio)
+                    await self._internal_push_frame(frame)
                     await self.push_frame(BotSpeakingFrame(), FrameDirection.UPSTREAM)
                 elif isinstance(frame, ImageRawFrame):
                     await self._set_camera_image(frame)
@@ -329,6 +330,7 @@ class BaseOutputTransport(FrameProcessor):
             try:
                 frame = await self._audio_out_queue.get()
                 await self.write_raw_audio_frames(frame.audio)
+                await self._internal_push_frame(frame)
                 await self.push_frame(BotSpeakingFrame(), FrameDirection.UPSTREAM)
             except asyncio.CancelledError:
                 break