Merge pull request #2260 from pipecat-ai/filipi/audio_resampler

Fixed an issue in `AudioBufferProcessor` that caused garbled audio
This commit is contained in:
Filipi da Silva Fuchter
2025-07-25 09:27:42 -03:00
committed by GitHub
2 changed files with 13 additions and 9 deletions

View File

@@ -71,6 +71,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- Fixed an issue in `AudioBufferProcessor` that caused garbled audio when
`enable_turn_audio` was enabled and audio resampling was required.
- Fixed a dependency issue for uv users where an `llvmlite` version required Python 3.9.
- Fixed an issue in `MiniMaxHttpTTSService` where the `pitch` param was the

View File

@@ -195,8 +195,6 @@ class AudioBufferProcessor(FrameProcessor):
if self._recording:
await self._process_recording(frame)
if self._enable_turn_audio:
await self._process_turn_recording(frame)
if isinstance(frame, (CancelFrame, EndFrame)):
await self.stop_recording()
@@ -210,6 +208,7 @@ class AudioBufferProcessor(FrameProcessor):
async def _process_recording(self, frame: Frame):
"""Process audio frames for recording."""
resampled = None
if isinstance(frame, InputAudioRawFrame):
# Add silence if we need to.
silence = self._compute_silence(self._last_user_frame_at)
@@ -232,7 +231,11 @@ class AudioBufferProcessor(FrameProcessor):
if self._buffer_size > 0 and len(self._user_audio_buffer) > self._buffer_size:
await self._call_on_audio_data_handler()
async def _process_turn_recording(self, frame: Frame):
# Process turn recording with preprocessed data.
if self._enable_turn_audio:
await self._process_turn_recording(frame, resampled)
async def _process_turn_recording(self, frame: Frame, resampled_audio: Optional[bytes] = None):
"""Process frames for turn-based audio recording."""
if isinstance(frame, UserStartedSpeakingFrame):
self._user_speaking = True
@@ -251,9 +254,8 @@ class AudioBufferProcessor(FrameProcessor):
self._bot_speaking = False
self._bot_turn_audio_buffer = bytearray()
if isinstance(frame, InputAudioRawFrame):
resampled = await self._resample_input_audio(frame)
self._user_turn_audio_buffer += resampled
if isinstance(frame, InputAudioRawFrame) and resampled_audio:
self._user_turn_audio_buffer.extend(resampled_audio)
# In the case of the user, we need to keep a short buffer of audio
# since VAD notification of when the user starts speaking comes
# later.
@@ -263,9 +265,8 @@ class AudioBufferProcessor(FrameProcessor):
):
discarded = len(self._user_turn_audio_buffer) - self._audio_buffer_size_1s
self._user_turn_audio_buffer = self._user_turn_audio_buffer[discarded:]
elif self._bot_speaking and isinstance(frame, OutputAudioRawFrame):
resampled = await self._resample_output_audio(frame)
self._bot_turn_audio_buffer += resampled
elif self._bot_speaking and isinstance(frame, OutputAudioRawFrame) and resampled_audio:
self._bot_turn_audio_buffer.extend(resampled_audio)
async def _call_on_audio_data_handler(self):
"""Call the audio data event handlers with buffered audio."""