AudioBufferProcessor: add start_recording()/stop_recording()

This commit is contained in:
Aleix Conchillo Flaqué
2025-01-31 16:23:54 -08:00
parent 371c2f3704
commit f3f22cf61c
4 changed files with 33 additions and 17 deletions

View File

@@ -67,6 +67,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `resample_audio()` is now deprecated, use `create_default_resampler()`
instead.
### Removed
- `AudioBufferProcessor.reset_audio_buffers()` has been removed, use
`AudioBufferProcessor.start_recording()` and
``AudioBufferProcessor.stop_recording()` instead.
### Fixed
- Fixed a `AudioBufferProcessor` that would cause crackling in some recordings.

View File

@@ -124,6 +124,7 @@ async def main():
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await audio_buffer_processor.start_recording()
await transport.capture_participant_transcription(participant["id"])
await task.queue_frames([context_aggregator.user().get_context_frame()])

View File

@@ -109,8 +109,9 @@ async def main():
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
# Save audio every 10 seconds.
audiobuffer = AudioBufferProcessor(buffer_size=480000)
# NOTE: Watch out! This will save all the conversation in memory. You
# can pass `buffer_size` to get periodic callbacks.
audiobuffer = AudioBufferProcessor()
pipeline = Pipeline(
[
@@ -132,6 +133,7 @@ async def main():
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await audiobuffer.start_recording()
await transport.capture_participant_transcription(participant["id"])
await task.queue_frames([context_aggregator.user().get_context_frame()])

View File

@@ -14,7 +14,6 @@ from pipecat.frames.frames import (
Frame,
InputAudioRawFrame,
OutputAudioRawFrame,
StartFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
@@ -47,6 +46,8 @@ class AudioBufferProcessor(FrameProcessor):
self._last_user_frame_at = 0
self._last_bot_frame_at = 0
self._recording = False
self._resampler = create_default_resampler()
self._register_event_handler("on_audio_data")
@@ -74,21 +75,18 @@ class AudioBufferProcessor(FrameProcessor):
else:
return b""
def reset_audio_buffers(self):
self._user_audio_buffer = bytearray()
self._bot_audio_buffer = bytearray()
async def start_recording(self):
self._recording = True
self._reset_recording()
self._last_user_frame_at = time.time()
self._last_bot_frame_at = time.time()
async def stop_recording(self):
await self._call_on_audio_data_handler()
self._recording = False
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
if isinstance(frame, StartFrame):
self._last_user_frame_at = time.time()
self._last_bot_frame_at = time.time()
if isinstance(frame, InputAudioRawFrame):
if self._recording and isinstance(frame, InputAudioRawFrame):
# Add silence if we need to.
silence = self._compute_silence(self._last_user_frame_at)
self._user_audio_buffer.extend(silence)
@@ -97,7 +95,7 @@ class AudioBufferProcessor(FrameProcessor):
self._user_audio_buffer.extend(resampled)
# Save time of frame so we can compute silence.
self._last_user_frame_at = time.time()
elif isinstance(frame, OutputAudioRawFrame):
elif self._recording and isinstance(frame, OutputAudioRawFrame):
# Add silence if we need to.
silence = self._compute_silence(self._last_bot_frame_at)
self._bot_audio_buffer.extend(silence)
@@ -111,23 +109,32 @@ class AudioBufferProcessor(FrameProcessor):
await self._call_on_audio_data_handler()
if isinstance(frame, (CancelFrame, EndFrame)):
await self._call_on_audio_data_handler()
await self.stop_recording()
await self.push_frame(frame, direction)
async def _call_on_audio_data_handler(self):
if not self.has_audio():
if not self.has_audio() or not self._recording:
return
merged_audio = self.merge_audio_buffers()
await self._call_event_handler(
"on_audio_data", merged_audio, self._sample_rate, self._num_channels
)
self.reset_audio_buffers()
self._reset_audio_buffers()
def _buffer_has_audio(self, buffer: bytearray) -> bool:
return buffer is not None and len(buffer) > 0
def _reset_recording(self):
self._reset_audio_buffers()
self._last_user_frame_at = time.time()
self._last_bot_frame_at = time.time()
def _reset_audio_buffers(self):
self._user_audio_buffer = bytearray()
self._bot_audio_buffer = bytearray()
async def _resample_audio(self, frame: AudioRawFrame) -> bytes:
return await self._resampler.resample(frame.audio, frame.sample_rate, self._sample_rate)