diff --git a/CHANGELOG.md b/CHANGELOG.md index 2eb958215..f285bc014 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- `AudioRawFrame`s are now pushed downstream from the base output + transport. This allows capturing the exact words the bot says by adding an STT + service at the end of the pipeline. + - Added new `GStreamerPipelineSource`. This processor can generate image or audio frames from a GStreamer pipeline (e.g. reading an MP4 file, and RTP stream or anything supported by GStreamer). diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py index e4b5c6007..e6256d038 100644 --- a/src/pipecat/transports/base_output.py +++ b/src/pipecat/transports/base_output.py @@ -203,6 +203,7 @@ class BaseOutputTransport(FrameProcessor): frame = await self._sink_queue.get() if isinstance(frame, AudioRawFrame): await self.write_raw_audio_frames(frame.audio) + await self._internal_push_frame(frame) await self.push_frame(BotSpeakingFrame(), FrameDirection.UPSTREAM) elif isinstance(frame, ImageRawFrame): await self._set_camera_image(frame) @@ -329,6 +330,7 @@ class BaseOutputTransport(FrameProcessor): try: frame = await self._audio_out_queue.get() await self.write_raw_audio_frames(frame.audio) + await self._internal_push_frame(frame) await self.push_frame(BotSpeakingFrame(), FrameDirection.UPSTREAM) except asyncio.CancelledError: break