diff --git a/CHANGELOG.md b/CHANGELOG.md index 83dd4b49a..73e188ced 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -67,6 +67,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `resample_audio()` is now deprecated, use `create_default_resampler()` instead. +### Removed + +- `AudioBufferProcessor.reset_audio_buffers()` has been removed, use + `AudioBufferProcessor.start_recording()` and + ``AudioBufferProcessor.stop_recording()` instead. + ### Fixed - Fixed a `AudioBufferProcessor` that would cause crackling in some recordings. diff --git a/examples/canonical-metrics/bot.py b/examples/canonical-metrics/bot.py index 945792b25..e2673bb3e 100644 --- a/examples/canonical-metrics/bot.py +++ b/examples/canonical-metrics/bot.py @@ -124,6 +124,7 @@ async def main(): @transport.event_handler("on_first_participant_joined") async def on_first_participant_joined(transport, participant): + await audio_buffer_processor.start_recording() await transport.capture_participant_transcription(participant["id"]) await task.queue_frames([context_aggregator.user().get_context_frame()]) diff --git a/examples/chatbot-audio-recording/bot.py b/examples/chatbot-audio-recording/bot.py index 7cf750c62..62b7b29c8 100644 --- a/examples/chatbot-audio-recording/bot.py +++ b/examples/chatbot-audio-recording/bot.py @@ -109,8 +109,9 @@ async def main(): context = OpenAILLMContext(messages) context_aggregator = llm.create_context_aggregator(context) - # Save audio every 10 seconds. - audiobuffer = AudioBufferProcessor(buffer_size=480000) + # NOTE: Watch out! This will save all the conversation in memory. You + # can pass `buffer_size` to get periodic callbacks. + audiobuffer = AudioBufferProcessor() pipeline = Pipeline( [ @@ -132,6 +133,7 @@ async def main(): @transport.event_handler("on_first_participant_joined") async def on_first_participant_joined(transport, participant): + await audiobuffer.start_recording() await transport.capture_participant_transcription(participant["id"]) await task.queue_frames([context_aggregator.user().get_context_frame()]) diff --git a/src/pipecat/processors/audio/audio_buffer_processor.py b/src/pipecat/processors/audio/audio_buffer_processor.py index 86a41cbd4..d56604330 100644 --- a/src/pipecat/processors/audio/audio_buffer_processor.py +++ b/src/pipecat/processors/audio/audio_buffer_processor.py @@ -14,7 +14,6 @@ from pipecat.frames.frames import ( Frame, InputAudioRawFrame, OutputAudioRawFrame, - StartFrame, ) from pipecat.processors.frame_processor import FrameDirection, FrameProcessor @@ -47,6 +46,8 @@ class AudioBufferProcessor(FrameProcessor): self._last_user_frame_at = 0 self._last_bot_frame_at = 0 + self._recording = False + self._resampler = create_default_resampler() self._register_event_handler("on_audio_data") @@ -74,21 +75,18 @@ class AudioBufferProcessor(FrameProcessor): else: return b"" - def reset_audio_buffers(self): - self._user_audio_buffer = bytearray() - self._bot_audio_buffer = bytearray() + async def start_recording(self): + self._recording = True + self._reset_recording() - self._last_user_frame_at = time.time() - self._last_bot_frame_at = time.time() + async def stop_recording(self): + await self._call_on_audio_data_handler() + self._recording = False async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) - if isinstance(frame, StartFrame): - self._last_user_frame_at = time.time() - self._last_bot_frame_at = time.time() - - if isinstance(frame, InputAudioRawFrame): + if self._recording and isinstance(frame, InputAudioRawFrame): # Add silence if we need to. silence = self._compute_silence(self._last_user_frame_at) self._user_audio_buffer.extend(silence) @@ -97,7 +95,7 @@ class AudioBufferProcessor(FrameProcessor): self._user_audio_buffer.extend(resampled) # Save time of frame so we can compute silence. self._last_user_frame_at = time.time() - elif isinstance(frame, OutputAudioRawFrame): + elif self._recording and isinstance(frame, OutputAudioRawFrame): # Add silence if we need to. silence = self._compute_silence(self._last_bot_frame_at) self._bot_audio_buffer.extend(silence) @@ -111,23 +109,32 @@ class AudioBufferProcessor(FrameProcessor): await self._call_on_audio_data_handler() if isinstance(frame, (CancelFrame, EndFrame)): - await self._call_on_audio_data_handler() + await self.stop_recording() await self.push_frame(frame, direction) async def _call_on_audio_data_handler(self): - if not self.has_audio(): + if not self.has_audio() or not self._recording: return merged_audio = self.merge_audio_buffers() await self._call_event_handler( "on_audio_data", merged_audio, self._sample_rate, self._num_channels ) - self.reset_audio_buffers() + self._reset_audio_buffers() def _buffer_has_audio(self, buffer: bytearray) -> bool: return buffer is not None and len(buffer) > 0 + def _reset_recording(self): + self._reset_audio_buffers() + self._last_user_frame_at = time.time() + self._last_bot_frame_at = time.time() + + def _reset_audio_buffers(self): + self._user_audio_buffer = bytearray() + self._bot_audio_buffer = bytearray() + async def _resample_audio(self, frame: AudioRawFrame) -> bytes: return await self._resampler.resample(frame.audio, frame.sample_rate, self._sample_rate)