From 970b8044a05e6e6f27e7b46f8ac82a44198c6fd2 Mon Sep 17 00:00:00 2001
From: Filipi Fuchter <filipi87@gmail.com>
Date: Thu, 24 Jul 2025 13:25:48 -0300
Subject: [PATCH] Fixed an issue in `AudioBufferProcessor` that caused garbled
 audio when `enable_turn_audio` was enabled and audio resampling was required.

---
 CHANGELOG.md                                  |  3 +++
 .../audio/audio_buffer_processor.py           | 19 ++++++++++---------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index eeb3dde3e..6174fea5b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -67,6 +67,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- Fixed an issue in `AudioBufferProcessor` that caused garbled audio when
+  `enable_turn_audio` was enabled and audio resampling was required.
+
 - Fixed a dependency issue for uv users where an `llvmlite` version required python 3.9.
 
 - Fixed an issue in `MiniMaxHttpTTSService` where the `pitch` param was the
diff --git a/src/pipecat/processors/audio/audio_buffer_processor.py b/src/pipecat/processors/audio/audio_buffer_processor.py
index 643e6e93f..b6432dd64 100644
--- a/src/pipecat/processors/audio/audio_buffer_processor.py
+++ b/src/pipecat/processors/audio/audio_buffer_processor.py
@@ -195,8 +195,6 @@ class AudioBufferProcessor(FrameProcessor):
 
         if self._recording:
             await self._process_recording(frame)
-            if self._enable_turn_audio:
-                await self._process_turn_recording(frame)
 
         if isinstance(frame, (CancelFrame, EndFrame)):
             await self.stop_recording()
@@ -210,6 +208,7 @@ class AudioBufferProcessor(FrameProcessor):
 
     async def _process_recording(self, frame: Frame):
         """Process audio frames for recording."""
+        resampled = None
         if isinstance(frame, InputAudioRawFrame):
             # Add silence if we need to.
             silence = self._compute_silence(self._last_user_frame_at)
@@ -232,7 +231,11 @@ class AudioBufferProcessor(FrameProcessor):
         if self._buffer_size > 0 and len(self._user_audio_buffer) > self._buffer_size:
             await self._call_on_audio_data_handler()
 
-    async def _process_turn_recording(self, frame: Frame):
+        # Process turn recording, reusing the already-resampled audio (if any).
+        if self._enable_turn_audio:
+            await self._process_turn_recording(frame, resampled)
+
+    async def _process_turn_recording(self, frame: Frame, resampled_audio: Optional[bytes] = None):
         """Process frames for turn-based audio recording."""
         if isinstance(frame, UserStartedSpeakingFrame):
             self._user_speaking = True
@@ -251,9 +254,8 @@ class AudioBufferProcessor(FrameProcessor):
             self._bot_speaking = False
             self._bot_turn_audio_buffer = bytearray()
 
-        if isinstance(frame, InputAudioRawFrame):
-            resampled = await self._resample_input_audio(frame)
-            self._user_turn_audio_buffer += resampled
+        if isinstance(frame, InputAudioRawFrame) and resampled_audio:
+            self._user_turn_audio_buffer.extend(resampled_audio)
             # In the case of the user, we need to keep a short buffer of audio
             # since VAD notification of when the user starts speaking comes
             # later.
@@ -263,9 +265,8 @@ class AudioBufferProcessor(FrameProcessor):
             ):
                 discarded = len(self._user_turn_audio_buffer) - self._audio_buffer_size_1s
                 self._user_turn_audio_buffer = self._user_turn_audio_buffer[discarded:]
-        elif self._bot_speaking and isinstance(frame, OutputAudioRawFrame):
-            resampled = await self._resample_output_audio(frame)
-            self._bot_turn_audio_buffer += resampled
+        elif self._bot_speaking and isinstance(frame, OutputAudioRawFrame) and resampled_audio:
+            self._bot_turn_audio_buffer.extend(resampled_audio)
 
     async def _call_on_audio_data_handler(self):
         """Call the audio data event handlers with buffered audio."""