diff --git a/src/pipecat/audio/resamplers/base_audio_resampler.py b/src/pipecat/audio/resamplers/base_audio_resampler.py index 0d7c3afef..6afbbcbfe 100644 --- a/src/pipecat/audio/resamplers/base_audio_resampler.py +++ b/src/pipecat/audio/resamplers/base_audio_resampler.py @@ -13,7 +13,7 @@ class BaseAudioResampler(ABC): """ @abstractmethod - def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes: + async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes: """ Resamples the given audio data to a different sample rate. diff --git a/src/pipecat/audio/resamplers/resampy_resampler.py b/src/pipecat/audio/resamplers/resampy_resampler.py index be194b767..8c053fc3b 100644 --- a/src/pipecat/audio/resamplers/resampy_resampler.py +++ b/src/pipecat/audio/resamplers/resampy_resampler.py @@ -16,9 +16,10 @@ class ResampyResampler(BaseAudioResampler): def __init__(self, **kwargs): pass - def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes: + async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes: if in_rate == out_rate: return audio audio_data = np.frombuffer(audio, dtype=np.int16) resampled_audio = resampy.resample(audio_data, in_rate, out_rate, filter="kaiser_fast") - return resampled_audio.astype(np.int16).tobytes() + result = resampled_audio.astype(np.int16).tobytes() + return result diff --git a/src/pipecat/audio/resamplers/soxr_resampler.py b/src/pipecat/audio/resamplers/soxr_resampler.py index eaa06ad4e..88edb84eb 100644 --- a/src/pipecat/audio/resamplers/soxr_resampler.py +++ b/src/pipecat/audio/resamplers/soxr_resampler.py @@ -16,9 +16,10 @@ class SOXRAudioResampler(BaseAudioResampler): def __init__(self, **kwargs): pass - def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes: + async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes: if in_rate == out_rate: return audio audio_data = np.frombuffer(audio, dtype=np.int16) resampled_audio = soxr.resample(audio_data, in_rate, out_rate, quality="VHQ") - return resampled_audio.astype(np.int16).tobytes() + result = resampled_audio.astype(np.int16).tobytes() + return result diff --git a/src/pipecat/audio/utils.py b/src/pipecat/audio/utils.py index 9a9d442dd..0cb5d4f06 100644 --- a/src/pipecat/audio/utils.py +++ b/src/pipecat/audio/utils.py @@ -91,19 +91,21 @@ def exp_smoothing(value: float, prev_value: float, factor: float) -> float: return prev_value + factor * (value - prev_value) -def ulaw_to_pcm(ulaw_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler): +async def ulaw_to_pcm( + ulaw_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler +): # Convert μ-law to PCM in_pcm_bytes = audioop.ulaw2lin(ulaw_bytes, 2) # Resample - out_pcm_bytes = resampler.resample(in_pcm_bytes, in_rate, out_rate) + out_pcm_bytes = await resampler.resample(in_pcm_bytes, in_rate, out_rate) return out_pcm_bytes -def pcm_to_ulaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler): +async def pcm_to_ulaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler): # Resample - in_pcm_bytes = resampler.resample(pcm_bytes, in_rate, out_rate) + in_pcm_bytes = await resampler.resample(pcm_bytes, in_rate, out_rate) # Convert PCM to μ-law ulaw_bytes = audioop.lin2ulaw(in_pcm_bytes, 2) @@ -111,21 +113,21 @@ def pcm_to_ulaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAu return ulaw_bytes -def alaw_to_pcm( +async def alaw_to_pcm( alaw_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler ) -> bytes: # Convert a-law to PCM in_pcm_bytes = audioop.alaw2lin(alaw_bytes, 2) # Resample - out_pcm_bytes = resampler.resample(in_pcm_bytes, in_rate, out_rate) + out_pcm_bytes = await resampler.resample(in_pcm_bytes, in_rate, out_rate) return out_pcm_bytes -def pcm_to_alaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler): +async def pcm_to_alaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler): # Resample - in_pcm_bytes = resampler.resample(pcm_bytes, in_rate, out_rate) + in_pcm_bytes = await resampler.resample(pcm_bytes, in_rate, out_rate) # Convert PCM to μ-law alaw_bytes = audioop.lin2alaw(in_pcm_bytes, 2) diff --git a/src/pipecat/processors/audio/audio_buffer_processor.py b/src/pipecat/processors/audio/audio_buffer_processor.py index 376f6bf74..f00da6eb8 100644 --- a/src/pipecat/processors/audio/audio_buffer_processor.py +++ b/src/pipecat/processors/audio/audio_buffer_processor.py @@ -77,7 +77,7 @@ class AudioBufferProcessor(FrameProcessor): # Include all audio from the user. if isinstance(frame, InputAudioRawFrame): - resampled = self._resample_audio(frame) + resampled = await self._resample_audio(frame) self._user_audio_buffer.extend(resampled) # Sync the bot's buffer to the user's buffer by adding silence if needed. if len(self._user_audio_buffer) > len(self._bot_audio_buffer): @@ -86,7 +86,7 @@ class AudioBufferProcessor(FrameProcessor): self._bot_audio_buffer.extend(silence) # If the bot is speaking, include all audio from the bot. elif isinstance(frame, OutputAudioRawFrame): - resampled = self._resample_audio(frame) + resampled = await self._resample_audio(frame) self._bot_audio_buffer.extend(resampled) # Sync the user's buffer to the bot's buffer by adding silence if needed. if len(self._bot_audio_buffer) > len(self._user_audio_buffer): @@ -115,5 +115,5 @@ class AudioBufferProcessor(FrameProcessor): def _buffer_has_audio(self, buffer: bytearray) -> bool: return buffer is not None and len(buffer) > 0 - def _resample_audio(self, frame: AudioRawFrame) -> bytes: - return self._resampler.resample(frame.audio, frame.sample_rate, self._sample_rate) + async def _resample_audio(self, frame: AudioRawFrame) -> bytes: + return await self._resampler.resample(frame.audio, frame.sample_rate, self._sample_rate) diff --git a/src/pipecat/serializers/telnyx.py b/src/pipecat/serializers/telnyx.py index 2e0246ea7..79c6f42ff 100644 --- a/src/pipecat/serializers/telnyx.py +++ b/src/pipecat/serializers/telnyx.py @@ -6,7 +6,6 @@ import base64 import json -from typing import Optional from pydantic import BaseModel @@ -58,11 +57,11 @@ class TelnyxFrameSerializer(FrameSerializer): data = frame.audio if self._params.inbound_encoding == "PCMU": - serialized_data = pcm_to_ulaw( + serialized_data = await pcm_to_ulaw( data, frame.sample_rate, self._params.telnyx_sample_rate, self._resampler ) elif self._params.inbound_encoding == "PCMA": - serialized_data = pcm_to_alaw( + serialized_data = await pcm_to_alaw( data, frame.sample_rate, self._params.telnyx_sample_rate, self._resampler ) else: @@ -88,14 +87,14 @@ class TelnyxFrameSerializer(FrameSerializer): payload = base64.b64decode(payload_base64) if self._params.outbound_encoding == "PCMU": - deserialized_data = ulaw_to_pcm( + deserialized_data = await ulaw_to_pcm( payload, self._params.telnyx_sample_rate, self._params.sample_rate, self._resampler, ) elif self._params.outbound_encoding == "PCMA": - deserialized_data = alaw_to_pcm( + deserialized_data = await alaw_to_pcm( payload, self._params.telnyx_sample_rate, self._params.sample_rate, diff --git a/src/pipecat/serializers/twilio.py b/src/pipecat/serializers/twilio.py index c6d4e1e13..7862fa0db 100644 --- a/src/pipecat/serializers/twilio.py +++ b/src/pipecat/serializers/twilio.py @@ -45,7 +45,7 @@ class TwilioFrameSerializer(FrameSerializer): elif isinstance(frame, AudioRawFrame): data = frame.audio - serialized_data = pcm_to_ulaw( + serialized_data = await pcm_to_ulaw( data, frame.sample_rate, self._params.twilio_sample_rate, self._resampler ) payload = base64.b64encode(serialized_data).decode("utf-8") @@ -66,7 +66,7 @@ class TwilioFrameSerializer(FrameSerializer): payload_base64 = message["media"]["payload"] payload = base64.b64decode(payload_base64) - deserialized_data = ulaw_to_pcm( + deserialized_data = await ulaw_to_pcm( payload, self._params.twilio_sample_rate, self._params.sample_rate, self._resampler ) audio_frame = InputAudioRawFrame( diff --git a/src/pipecat/services/aws.py b/src/pipecat/services/aws.py index eb03bbc59..4444f07b7 100644 --- a/src/pipecat/services/aws.py +++ b/src/pipecat/services/aws.py @@ -195,10 +195,7 @@ class PollyTTSService(TTSService): response = self._polly_client.synthesize_speech(**args) if "AudioStream" in response: audio_data = response["AudioStream"].read() - resampled = self._resampler.resample( - audio_data, 16000, self._settings["sample_rate"] - ) - return resampled + return audio_data return None logger.debug(f"Generating TTS: [{text}]") @@ -229,6 +226,10 @@ class PollyTTSService(TTSService): yield None return + audio_data = await self._resampler.resample( + audio_data, 16000, self._settings["sample_rate"] + ) + await self.start_tts_usage_metrics(text) yield TTSStartedFrame() diff --git a/src/pipecat/services/tavus.py b/src/pipecat/services/tavus.py index e6693ecc7..b0ca699cb 100644 --- a/src/pipecat/services/tavus.py +++ b/src/pipecat/services/tavus.py @@ -94,7 +94,7 @@ class TavusVideoService(AIService): async def _encode_audio_and_send(self, audio: bytes, in_rate: int, done: bool) -> None: """Encodes audio to base64 and sends it to Tavus""" if not done: - audio = self._resampler.resample(audio, in_rate, 16000) + audio = await self._resampler.resample(audio, in_rate, 16000) audio_base64 = base64.b64encode(audio).decode("utf-8") logger.trace(f"{self}: sending {len(audio)} bytes") await self._send_audio_message(audio_base64, done=done) diff --git a/src/pipecat/services/xtts.py b/src/pipecat/services/xtts.py index b26660175..a10247acd 100644 --- a/src/pipecat/services/xtts.py +++ b/src/pipecat/services/xtts.py @@ -163,7 +163,7 @@ class XTTSService(TTSService): buffer = buffer[48000:] # XTTS uses 24000 so we need to resample to our desired rate. - resampled_audio = self._resampler.resample( + resampled_audio = await self._resampler.resample( bytes(process_data), 24000, self._sample_rate ) # Create the frame with the resampled audio @@ -172,7 +172,9 @@ class XTTSService(TTSService): # Process any remaining data in the buffer. if len(buffer) > 0: - resampled_audio = self._resampler.resample(bytes(buffer), 24000, self._sample_rate) + resampled_audio = await self._resampler.resample( + bytes(buffer), 24000, self._sample_rate + ) frame = TTSAudioRawFrame(resampled_audio, self._sample_rate, 1) yield frame