BaseAudioResampler: make resample() async

This commit is contained in:
Aleix Conchillo Flaqué
2025-01-31 12:58:55 -08:00
parent dcfb86583d
commit 06449eff2c
10 changed files with 37 additions and 31 deletions

View File

@@ -13,7 +13,7 @@ class BaseAudioResampler(ABC):
"""
@abstractmethod
def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
"""
Resamples the given audio data to a different sample rate.

View File

@@ -16,9 +16,10 @@ class ResampyResampler(BaseAudioResampler):
def __init__(self, **kwargs):
pass
def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
if in_rate == out_rate:
return audio
audio_data = np.frombuffer(audio, dtype=np.int16)
resampled_audio = resampy.resample(audio_data, in_rate, out_rate, filter="kaiser_fast")
return resampled_audio.astype(np.int16).tobytes()
result = resampled_audio.astype(np.int16).tobytes()
return result

View File

@@ -16,9 +16,10 @@ class SOXRAudioResampler(BaseAudioResampler):
def __init__(self, **kwargs):
pass
def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
if in_rate == out_rate:
return audio
audio_data = np.frombuffer(audio, dtype=np.int16)
resampled_audio = soxr.resample(audio_data, in_rate, out_rate, quality="VHQ")
return resampled_audio.astype(np.int16).tobytes()
result = resampled_audio.astype(np.int16).tobytes()
return result

View File

@@ -91,19 +91,21 @@ def exp_smoothing(value: float, prev_value: float, factor: float) -> float:
return prev_value + factor * (value - prev_value)
def ulaw_to_pcm(ulaw_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler):
async def ulaw_to_pcm(
ulaw_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler
):
# Convert μ-law to PCM
in_pcm_bytes = audioop.ulaw2lin(ulaw_bytes, 2)
# Resample
out_pcm_bytes = resampler.resample(in_pcm_bytes, in_rate, out_rate)
out_pcm_bytes = await resampler.resample(in_pcm_bytes, in_rate, out_rate)
return out_pcm_bytes
def pcm_to_ulaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler):
async def pcm_to_ulaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler):
# Resample
in_pcm_bytes = resampler.resample(pcm_bytes, in_rate, out_rate)
in_pcm_bytes = await resampler.resample(pcm_bytes, in_rate, out_rate)
# Convert PCM to μ-law
ulaw_bytes = audioop.lin2ulaw(in_pcm_bytes, 2)
@@ -111,21 +113,21 @@ def pcm_to_ulaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAu
return ulaw_bytes
def alaw_to_pcm(
async def alaw_to_pcm(
alaw_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler
) -> bytes:
# Convert a-law to PCM
in_pcm_bytes = audioop.alaw2lin(alaw_bytes, 2)
# Resample
out_pcm_bytes = resampler.resample(in_pcm_bytes, in_rate, out_rate)
out_pcm_bytes = await resampler.resample(in_pcm_bytes, in_rate, out_rate)
return out_pcm_bytes
def pcm_to_alaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler):
async def pcm_to_alaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler):
# Resample
in_pcm_bytes = resampler.resample(pcm_bytes, in_rate, out_rate)
in_pcm_bytes = await resampler.resample(pcm_bytes, in_rate, out_rate)
# Convert PCM to μ-law
alaw_bytes = audioop.lin2alaw(in_pcm_bytes, 2)

View File

@@ -77,7 +77,7 @@ class AudioBufferProcessor(FrameProcessor):
# Include all audio from the user.
if isinstance(frame, InputAudioRawFrame):
resampled = self._resample_audio(frame)
resampled = await self._resample_audio(frame)
self._user_audio_buffer.extend(resampled)
# Sync the bot's buffer to the user's buffer by adding silence if needed.
if len(self._user_audio_buffer) > len(self._bot_audio_buffer):
@@ -86,7 +86,7 @@ class AudioBufferProcessor(FrameProcessor):
self._bot_audio_buffer.extend(silence)
# If the bot is speaking, include all audio from the bot.
elif isinstance(frame, OutputAudioRawFrame):
resampled = self._resample_audio(frame)
resampled = await self._resample_audio(frame)
self._bot_audio_buffer.extend(resampled)
# Sync the user's buffer to the bot's buffer by adding silence if needed.
if len(self._bot_audio_buffer) > len(self._user_audio_buffer):
@@ -115,5 +115,5 @@ class AudioBufferProcessor(FrameProcessor):
def _buffer_has_audio(self, buffer: bytearray) -> bool:
return buffer is not None and len(buffer) > 0
def _resample_audio(self, frame: AudioRawFrame) -> bytes:
return self._resampler.resample(frame.audio, frame.sample_rate, self._sample_rate)
async def _resample_audio(self, frame: AudioRawFrame) -> bytes:
return await self._resampler.resample(frame.audio, frame.sample_rate, self._sample_rate)

View File

@@ -6,7 +6,6 @@
import base64
import json
from typing import Optional
from pydantic import BaseModel
@@ -58,11 +57,11 @@ class TelnyxFrameSerializer(FrameSerializer):
data = frame.audio
if self._params.inbound_encoding == "PCMU":
serialized_data = pcm_to_ulaw(
serialized_data = await pcm_to_ulaw(
data, frame.sample_rate, self._params.telnyx_sample_rate, self._resampler
)
elif self._params.inbound_encoding == "PCMA":
serialized_data = pcm_to_alaw(
serialized_data = await pcm_to_alaw(
data, frame.sample_rate, self._params.telnyx_sample_rate, self._resampler
)
else:
@@ -88,14 +87,14 @@ class TelnyxFrameSerializer(FrameSerializer):
payload = base64.b64decode(payload_base64)
if self._params.outbound_encoding == "PCMU":
deserialized_data = ulaw_to_pcm(
deserialized_data = await ulaw_to_pcm(
payload,
self._params.telnyx_sample_rate,
self._params.sample_rate,
self._resampler,
)
elif self._params.outbound_encoding == "PCMA":
deserialized_data = alaw_to_pcm(
deserialized_data = await alaw_to_pcm(
payload,
self._params.telnyx_sample_rate,
self._params.sample_rate,

View File

@@ -45,7 +45,7 @@ class TwilioFrameSerializer(FrameSerializer):
elif isinstance(frame, AudioRawFrame):
data = frame.audio
serialized_data = pcm_to_ulaw(
serialized_data = await pcm_to_ulaw(
data, frame.sample_rate, self._params.twilio_sample_rate, self._resampler
)
payload = base64.b64encode(serialized_data).decode("utf-8")
@@ -66,7 +66,7 @@ class TwilioFrameSerializer(FrameSerializer):
payload_base64 = message["media"]["payload"]
payload = base64.b64decode(payload_base64)
deserialized_data = ulaw_to_pcm(
deserialized_data = await ulaw_to_pcm(
payload, self._params.twilio_sample_rate, self._params.sample_rate, self._resampler
)
audio_frame = InputAudioRawFrame(

View File

@@ -195,10 +195,7 @@ class PollyTTSService(TTSService):
response = self._polly_client.synthesize_speech(**args)
if "AudioStream" in response:
audio_data = response["AudioStream"].read()
resampled = self._resampler.resample(
audio_data, 16000, self._settings["sample_rate"]
)
return resampled
return audio_data
return None
logger.debug(f"Generating TTS: [{text}]")
@@ -229,6 +226,10 @@ class PollyTTSService(TTSService):
yield None
return
audio_data = await self._resampler.resample(
audio_data, 16000, self._settings["sample_rate"]
)
await self.start_tts_usage_metrics(text)
yield TTSStartedFrame()

View File

@@ -94,7 +94,7 @@ class TavusVideoService(AIService):
async def _encode_audio_and_send(self, audio: bytes, in_rate: int, done: bool) -> None:
"""Encodes audio to base64 and sends it to Tavus"""
if not done:
audio = self._resampler.resample(audio, in_rate, 16000)
audio = await self._resampler.resample(audio, in_rate, 16000)
audio_base64 = base64.b64encode(audio).decode("utf-8")
logger.trace(f"{self}: sending {len(audio)} bytes")
await self._send_audio_message(audio_base64, done=done)

View File

@@ -163,7 +163,7 @@ class XTTSService(TTSService):
buffer = buffer[48000:]
# XTTS uses 24000 so we need to resample to our desired rate.
resampled_audio = self._resampler.resample(
resampled_audio = await self._resampler.resample(
bytes(process_data), 24000, self._sample_rate
)
# Create the frame with the resampled audio
@@ -172,7 +172,9 @@ class XTTSService(TTSService):
# Process any remaining data in the buffer.
if len(buffer) > 0:
resampled_audio = self._resampler.resample(bytes(buffer), 24000, self._sample_rate)
resampled_audio = await self._resampler.resample(
bytes(buffer), 24000, self._sample_rate
)
frame = TTSAudioRawFrame(resampled_audio, self._sample_rate, 1)
yield frame