diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ae46e81b..e8e93ab8f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- Fixed an issue that was causing user and bot speech to not be synchronized + during recordings. + - Fixed an issue where voice settings weren't applied to ElevenLabsTTSService. - Fixed an issue with `GroqTTSService` where it was not properly parsing the @@ -47,6 +50,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed an issue where `GoogleLLMService`'s TTFB value was incorrect. +### Deprecated + +- `AudioBufferProcessor` parameter `user_continuous_stream` is deprecated. + ### Other - Rename `14e-function-calling-gemini.py` to `14e-function-calling-google.py`. diff --git a/examples/twilio-chatbot/bot.py b/examples/twilio-chatbot/bot.py index 8aa73a2be..8a05a5d17 100644 --- a/examples/twilio-chatbot/bot.py +++ b/examples/twilio-chatbot/bot.py @@ -95,7 +95,7 @@ async def run_bot(websocket_client: WebSocket, stream_sid: str, call_sid: str, t # NOTE: Watch out! This will save all the conversation in memory. You can # pass `buffer_size` to get periodic callbacks. - audiobuffer = AudioBufferProcessor(user_continuous_stream=not testing) + audiobuffer = AudioBufferProcessor() pipeline = Pipeline( [ diff --git a/examples/twilio-chatbot/client/python/client.py b/examples/twilio-chatbot/client/python/client.py index 33592da0a..d066a6a7e 100644 --- a/examples/twilio-chatbot/client/python/client.py +++ b/examples/twilio-chatbot/client/python/client.py @@ -119,7 +119,7 @@ async def run_client(client_name: str, server_url: str, duration_secs: int): # NOTE: Watch out! This will save all the conversation in memory. You can # pass `buffer_size` to get periodic callbacks. 
- audiobuffer = AudioBufferProcessor(user_continuous_stream=False) + audiobuffer = AudioBufferProcessor() pipeline = Pipeline( [ diff --git a/src/pipecat/processors/audio/audio_buffer_processor.py b/src/pipecat/processors/audio/audio_buffer_processor.py index c1b2eb810..13d5a84bc 100644 --- a/src/pipecat/processors/audio/audio_buffer_processor.py +++ b/src/pipecat/processors/audio/audio_buffer_processor.py @@ -41,7 +41,6 @@ class AudioBufferProcessor(FrameProcessor): sample_rate (Optional[int]): Desired output sample rate. If None, uses source rate num_channels (int): Number of channels (1 for mono, 2 for stereo). Defaults to 1 buffer_size (int): Size of buffer before triggering events. 0 for no buffering - user_continuous_stream (bool): Whether user audio is continuous or speech-only enable_turn_audio (bool): Whether turn audio event handlers should be triggered Audio handling: @@ -50,10 +49,6 @@ class AudioBufferProcessor(FrameProcessor): - Automatic resampling of incoming audio to match desired sample_rate - Silence insertion for non-continuous audio streams - Buffer synchronization between user and bot audio - - Note: - When user_continuous_stream is False, the processor expects only speech - segments and will handle silence insertion between segments automatically. 
""" def __init__( @@ -62,7 +57,7 @@ class AudioBufferProcessor(FrameProcessor): sample_rate: Optional[int] = None, num_channels: int = 1, buffer_size: int = 0, - user_continuous_stream: bool = True, + user_continuous_stream: Optional[bool] = None, enable_turn_audio: bool = False, **kwargs, ): @@ -72,9 +67,18 @@ class AudioBufferProcessor(FrameProcessor): self._audio_buffer_size_1s = 0 self._num_channels = num_channels self._buffer_size = buffer_size - self._user_continuous_stream = user_continuous_stream self._enable_turn_audio = enable_turn_audio + if user_continuous_stream is not None: + import warnings + + with warnings.catch_warnings(): + warnings.simplefilter("always") + warnings.warn( + "Parameter `user_continuous_stream` is deprecated.", + DeprecationWarning, + ) + self._user_audio_buffer = bytearray() self._bot_audio_buffer = bytearray() @@ -181,10 +185,24 @@ class AudioBufferProcessor(FrameProcessor): self._audio_buffer_size_1s = self._sample_rate * 2 async def _process_recording(self, frame: Frame): - if self._user_continuous_stream: - await self._handle_continuous_stream(frame) - else: - await self._handle_intermittent_stream(frame) + if isinstance(frame, InputAudioRawFrame): + # Add silence if we need to. + silence = self._compute_silence(self._last_user_frame_at) + self._user_audio_buffer.extend(silence) + # Add user audio. + resampled = await self._resample_audio(frame) + self._user_audio_buffer.extend(resampled) + # Save time of frame so we can compute silence. + self._last_user_frame_at = time.time() + elif self._recording and isinstance(frame, OutputAudioRawFrame): + # Add silence if we need to. + silence = self._compute_silence(self._last_bot_frame_at) + self._bot_audio_buffer.extend(silence) + # Add bot audio. + resampled = await self._resample_audio(frame) + self._bot_audio_buffer.extend(resampled) + # Save time of frame so we can compute silence. 
+ self._last_bot_frame_at = time.time() if self._buffer_size > 0 and len(self._user_audio_buffer) > self._buffer_size: await self._call_on_audio_data_handler() @@ -223,41 +241,6 @@ class AudioBufferProcessor(FrameProcessor): resampled = await self._resample_audio(frame) self._bot_turn_audio_buffer += resampled - async def _handle_continuous_stream(self, frame: Frame): - if isinstance(frame, InputAudioRawFrame): - # Add user audio. - resampled = await self._resample_audio(frame) - self._user_audio_buffer.extend(resampled) - # Sync the bot's buffer to the user's buffer by adding silence if needed - if len(self._user_audio_buffer) > len(self._bot_audio_buffer): - silence_size = len(self._user_audio_buffer) - len(self._bot_audio_buffer) - silence = b"\x00" * silence_size - self._bot_audio_buffer.extend(silence) - elif self._recording and isinstance(frame, OutputAudioRawFrame): - # Add bot audio. - resampled = await self._resample_audio(frame) - self._bot_audio_buffer.extend(resampled) - - async def _handle_intermittent_stream(self, frame: Frame): - if isinstance(frame, InputAudioRawFrame): - # Add silence if we need to. - silence = self._compute_silence(self._last_user_frame_at) - self._user_audio_buffer.extend(silence) - # Add user audio. - resampled = await self._resample_audio(frame) - self._user_audio_buffer.extend(resampled) - # Save time of frame so we can compute silence. - self._last_user_frame_at = time.time() - elif self._recording and isinstance(frame, OutputAudioRawFrame): - # Add silence if we need to. - silence = self._compute_silence(self._last_bot_frame_at) - self._bot_audio_buffer.extend(silence) - # Add bot audio. - resampled = await self._resample_audio(frame) - self._bot_audio_buffer.extend(resampled) - # Save time of frame so we can compute silence. - self._last_bot_frame_at = time.time() - async def _call_on_audio_data_handler(self): if not self.has_audio() or not self._recording: return