Merge pull request #2024 from pipecat-ai/aleix/audio-buffer-processor-sync-issues

AudioBufferProcessor: treat all streams as intermittent
2025-06-18 18:26:38 -07:00
parent dc78e874af b118082984
commit ebb23a5a8c
4 changed files with 38 additions and 48 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -37,6 +37,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Fixed

+- Fixed an issue that was causing user and bot speech to not be synchronized
+  during recordings.
+
 - Fixed an issue where voice settings weren't applied to ElevenLabsTTSService.

 - Fixed an issue with `GroqTTSService` where it was not properly parsing the
@@ -47,6 +50,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 - Fixed an issue where `GoogleLLMService`'s TTFB value was incorrect.

+### Deprecated
+
+- `AudioBufferProcessor` parameter `user_continuous_stream` is deprecated.
+
 ### Other

 - Rename `14e-function-calling-gemini.py` to `14e-function-calling-google.py`.
--- a/examples/twilio-chatbot/bot.py
+++ b/examples/twilio-chatbot/bot.py
@@ -95,7 +95,7 @@ async def run_bot(websocket_client: WebSocket, stream_sid: str, call_sid: str, t

    # NOTE: Watch out! This will save all the conversation in memory. You can
    # pass `buffer_size` to get periodic callbacks.
-    audiobuffer = AudioBufferProcessor(user_continuous_stream=not testing)
+    audiobuffer = AudioBufferProcessor()

    pipeline = Pipeline(
        [
--- a/examples/twilio-chatbot/client/python/client.py
+++ b/examples/twilio-chatbot/client/python/client.py
@@ -119,7 +119,7 @@ async def run_client(client_name: str, server_url: str, duration_secs: int):

    # NOTE: Watch out! This will save all the conversation in memory. You can
    # pass `buffer_size` to get periodic callbacks.
-    audiobuffer = AudioBufferProcessor(user_continuous_stream=False)
+    audiobuffer = AudioBufferProcessor()

    pipeline = Pipeline(
        [
--- a/src/pipecat/processors/audio/audio_buffer_processor.py
+++ b/src/pipecat/processors/audio/audio_buffer_processor.py
@@ -41,7 +41,6 @@ class AudioBufferProcessor(FrameProcessor):
        sample_rate (Optional[int]): Desired output sample rate. If None, uses source rate
        num_channels (int): Number of channels (1 for mono, 2 for stereo). Defaults to 1
        buffer_size (int): Size of buffer before triggering events. 0 for no buffering
-        user_continuous_stream (bool): Whether user audio is continuous or speech-only
        enable_turn_audio (bool): Whether turn audio event handlers should be triggered

    Audio handling:
@@ -50,10 +49,6 @@ class AudioBufferProcessor(FrameProcessor):
        - Automatic resampling of incoming audio to match desired sample_rate
        - Silence insertion for non-continuous audio streams
        - Buffer synchronization between user and bot audio
-
-    Note:
-        When user_continuous_stream is False, the processor expects only speech
-        segments and will handle silence insertion between segments automatically.
    """

    def __init__(
@@ -62,7 +57,7 @@ class AudioBufferProcessor(FrameProcessor):
        sample_rate: Optional[int] = None,
        num_channels: int = 1,
        buffer_size: int = 0,
-        user_continuous_stream: bool = True,
+        user_continuous_stream: Optional[bool] = None,
        enable_turn_audio: bool = False,
        **kwargs,
    ):
@@ -72,9 +67,18 @@ class AudioBufferProcessor(FrameProcessor):
        self._audio_buffer_size_1s = 0
        self._num_channels = num_channels
        self._buffer_size = buffer_size
-        self._user_continuous_stream = user_continuous_stream
        self._enable_turn_audio = enable_turn_audio

+        if user_continuous_stream is not None:
+            import warnings
+
+            with warnings.catch_warnings():
+                warnings.simplefilter("always")
+                warnings.warn(
+                    "Parameter `user_continuous_stream` is deprecated.",
+                    DeprecationWarning,
+                )
+
        self._user_audio_buffer = bytearray()
        self._bot_audio_buffer = bytearray()

@@ -181,10 +185,24 @@ class AudioBufferProcessor(FrameProcessor):
        self._audio_buffer_size_1s = self._sample_rate * 2

    async def _process_recording(self, frame: Frame):
-        if self._user_continuous_stream:
-            await self._handle_continuous_stream(frame)
-        else:
-            await self._handle_intermittent_stream(frame)
+        if isinstance(frame, InputAudioRawFrame):
+            # Add silence if we need to.
+            silence = self._compute_silence(self._last_user_frame_at)
+            self._user_audio_buffer.extend(silence)
+            # Add user audio.
+            resampled = await self._resample_audio(frame)
+            self._user_audio_buffer.extend(resampled)
+            # Save time of frame so we can compute silence.
+            self._last_user_frame_at = time.time()
+        elif self._recording and isinstance(frame, OutputAudioRawFrame):
+            # Add silence if we need to.
+            silence = self._compute_silence(self._last_bot_frame_at)
+            self._bot_audio_buffer.extend(silence)
+            # Add bot audio.
+            resampled = await self._resample_audio(frame)
+            self._bot_audio_buffer.extend(resampled)
+            # Save time of frame so we can compute silence.
+            self._last_bot_frame_at = time.time()

        if self._buffer_size > 0 and len(self._user_audio_buffer) > self._buffer_size:
            await self._call_on_audio_data_handler()
@@ -223,41 +241,6 @@ class AudioBufferProcessor(FrameProcessor):
            resampled = await self._resample_audio(frame)
            self._bot_turn_audio_buffer += resampled

-    async def _handle_continuous_stream(self, frame: Frame):
-        if isinstance(frame, InputAudioRawFrame):
-            # Add user audio.
-            resampled = await self._resample_audio(frame)
-            self._user_audio_buffer.extend(resampled)
-            # Sync the bot's buffer to the user's buffer by adding silence if needed
-            if len(self._user_audio_buffer) > len(self._bot_audio_buffer):
-                silence_size = len(self._user_audio_buffer) - len(self._bot_audio_buffer)
-                silence = b"\x00" * silence_size
-                self._bot_audio_buffer.extend(silence)
-        elif self._recording and isinstance(frame, OutputAudioRawFrame):
-            # Add bot audio.
-            resampled = await self._resample_audio(frame)
-            self._bot_audio_buffer.extend(resampled)
-
-    async def _handle_intermittent_stream(self, frame: Frame):
-        if isinstance(frame, InputAudioRawFrame):
-            # Add silence if we need to.
-            silence = self._compute_silence(self._last_user_frame_at)
-            self._user_audio_buffer.extend(silence)
-            # Add user audio.
-            resampled = await self._resample_audio(frame)
-            self._user_audio_buffer.extend(resampled)
-            # Save time of frame so we can compute silence.
-            self._last_user_frame_at = time.time()
-        elif self._recording and isinstance(frame, OutputAudioRawFrame):
-            # Add silence if we need to.
-            silence = self._compute_silence(self._last_bot_frame_at)
-            self._bot_audio_buffer.extend(silence)
-            # Add bot audio.
-            resampled = await self._resample_audio(frame)
-            self._bot_audio_buffer.extend(resampled)
-            # Save time of frame so we can compute silence.
-            self._last_bot_frame_at = time.time()
-
    async def _call_on_audio_data_handler(self):
        if not self.has_audio() or not self._recording:
            return