BaseAudioResampler: make resample() async
This commit is contained in:
@@ -13,7 +13,7 @@ class BaseAudioResampler(ABC):
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
|
||||
async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
|
||||
"""
|
||||
Resamples the given audio data to a different sample rate.
|
||||
|
||||
|
||||
@@ -16,9 +16,10 @@ class ResampyResampler(BaseAudioResampler):
|
||||
def __init__(self, **kwargs):
|
||||
pass
|
||||
|
||||
def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
|
||||
async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
|
||||
if in_rate == out_rate:
|
||||
return audio
|
||||
audio_data = np.frombuffer(audio, dtype=np.int16)
|
||||
resampled_audio = resampy.resample(audio_data, in_rate, out_rate, filter="kaiser_fast")
|
||||
return resampled_audio.astype(np.int16).tobytes()
|
||||
result = resampled_audio.astype(np.int16).tobytes()
|
||||
return result
|
||||
|
||||
@@ -16,9 +16,10 @@ class SOXRAudioResampler(BaseAudioResampler):
|
||||
def __init__(self, **kwargs):
|
||||
pass
|
||||
|
||||
def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
|
||||
async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
|
||||
if in_rate == out_rate:
|
||||
return audio
|
||||
audio_data = np.frombuffer(audio, dtype=np.int16)
|
||||
resampled_audio = soxr.resample(audio_data, in_rate, out_rate, quality="VHQ")
|
||||
return resampled_audio.astype(np.int16).tobytes()
|
||||
result = resampled_audio.astype(np.int16).tobytes()
|
||||
return result
|
||||
|
||||
@@ -91,19 +91,21 @@ def exp_smoothing(value: float, prev_value: float, factor: float) -> float:
|
||||
return prev_value + factor * (value - prev_value)
|
||||
|
||||
|
||||
def ulaw_to_pcm(ulaw_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler):
|
||||
async def ulaw_to_pcm(
|
||||
ulaw_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler
|
||||
):
|
||||
# Convert μ-law to PCM
|
||||
in_pcm_bytes = audioop.ulaw2lin(ulaw_bytes, 2)
|
||||
|
||||
# Resample
|
||||
out_pcm_bytes = resampler.resample(in_pcm_bytes, in_rate, out_rate)
|
||||
out_pcm_bytes = await resampler.resample(in_pcm_bytes, in_rate, out_rate)
|
||||
|
||||
return out_pcm_bytes
|
||||
|
||||
|
||||
def pcm_to_ulaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler):
|
||||
async def pcm_to_ulaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler):
|
||||
# Resample
|
||||
in_pcm_bytes = resampler.resample(pcm_bytes, in_rate, out_rate)
|
||||
in_pcm_bytes = await resampler.resample(pcm_bytes, in_rate, out_rate)
|
||||
|
||||
# Convert PCM to μ-law
|
||||
ulaw_bytes = audioop.lin2ulaw(in_pcm_bytes, 2)
|
||||
@@ -111,21 +113,21 @@ def pcm_to_ulaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAu
|
||||
return ulaw_bytes
|
||||
|
||||
|
||||
def alaw_to_pcm(
|
||||
async def alaw_to_pcm(
|
||||
alaw_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler
|
||||
) -> bytes:
|
||||
# Convert a-law to PCM
|
||||
in_pcm_bytes = audioop.alaw2lin(alaw_bytes, 2)
|
||||
|
||||
# Resample
|
||||
out_pcm_bytes = resampler.resample(in_pcm_bytes, in_rate, out_rate)
|
||||
out_pcm_bytes = await resampler.resample(in_pcm_bytes, in_rate, out_rate)
|
||||
|
||||
return out_pcm_bytes
|
||||
|
||||
|
||||
def pcm_to_alaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler):
|
||||
async def pcm_to_alaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler):
|
||||
# Resample
|
||||
in_pcm_bytes = resampler.resample(pcm_bytes, in_rate, out_rate)
|
||||
in_pcm_bytes = await resampler.resample(pcm_bytes, in_rate, out_rate)
|
||||
|
||||
# Convert PCM to a-law
|
||||
alaw_bytes = audioop.lin2alaw(in_pcm_bytes, 2)
|
||||
|
||||
@@ -77,7 +77,7 @@ class AudioBufferProcessor(FrameProcessor):
|
||||
|
||||
# Include all audio from the user.
|
||||
if isinstance(frame, InputAudioRawFrame):
|
||||
resampled = self._resample_audio(frame)
|
||||
resampled = await self._resample_audio(frame)
|
||||
self._user_audio_buffer.extend(resampled)
|
||||
# Sync the bot's buffer to the user's buffer by adding silence if needed.
|
||||
if len(self._user_audio_buffer) > len(self._bot_audio_buffer):
|
||||
@@ -86,7 +86,7 @@ class AudioBufferProcessor(FrameProcessor):
|
||||
self._bot_audio_buffer.extend(silence)
|
||||
# If the bot is speaking, include all audio from the bot.
|
||||
elif isinstance(frame, OutputAudioRawFrame):
|
||||
resampled = self._resample_audio(frame)
|
||||
resampled = await self._resample_audio(frame)
|
||||
self._bot_audio_buffer.extend(resampled)
|
||||
# Sync the user's buffer to the bot's buffer by adding silence if needed.
|
||||
if len(self._bot_audio_buffer) > len(self._user_audio_buffer):
|
||||
@@ -115,5 +115,5 @@ class AudioBufferProcessor(FrameProcessor):
|
||||
def _buffer_has_audio(self, buffer: bytearray) -> bool:
|
||||
return buffer is not None and len(buffer) > 0
|
||||
|
||||
def _resample_audio(self, frame: AudioRawFrame) -> bytes:
|
||||
return self._resampler.resample(frame.audio, frame.sample_rate, self._sample_rate)
|
||||
async def _resample_audio(self, frame: AudioRawFrame) -> bytes:
|
||||
return await self._resampler.resample(frame.audio, frame.sample_rate, self._sample_rate)
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
|
||||
import base64
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
@@ -58,11 +57,11 @@ class TelnyxFrameSerializer(FrameSerializer):
|
||||
data = frame.audio
|
||||
|
||||
if self._params.inbound_encoding == "PCMU":
|
||||
serialized_data = pcm_to_ulaw(
|
||||
serialized_data = await pcm_to_ulaw(
|
||||
data, frame.sample_rate, self._params.telnyx_sample_rate, self._resampler
|
||||
)
|
||||
elif self._params.inbound_encoding == "PCMA":
|
||||
serialized_data = pcm_to_alaw(
|
||||
serialized_data = await pcm_to_alaw(
|
||||
data, frame.sample_rate, self._params.telnyx_sample_rate, self._resampler
|
||||
)
|
||||
else:
|
||||
@@ -88,14 +87,14 @@ class TelnyxFrameSerializer(FrameSerializer):
|
||||
payload = base64.b64decode(payload_base64)
|
||||
|
||||
if self._params.outbound_encoding == "PCMU":
|
||||
deserialized_data = ulaw_to_pcm(
|
||||
deserialized_data = await ulaw_to_pcm(
|
||||
payload,
|
||||
self._params.telnyx_sample_rate,
|
||||
self._params.sample_rate,
|
||||
self._resampler,
|
||||
)
|
||||
elif self._params.outbound_encoding == "PCMA":
|
||||
deserialized_data = alaw_to_pcm(
|
||||
deserialized_data = await alaw_to_pcm(
|
||||
payload,
|
||||
self._params.telnyx_sample_rate,
|
||||
self._params.sample_rate,
|
||||
|
||||
@@ -45,7 +45,7 @@ class TwilioFrameSerializer(FrameSerializer):
|
||||
elif isinstance(frame, AudioRawFrame):
|
||||
data = frame.audio
|
||||
|
||||
serialized_data = pcm_to_ulaw(
|
||||
serialized_data = await pcm_to_ulaw(
|
||||
data, frame.sample_rate, self._params.twilio_sample_rate, self._resampler
|
||||
)
|
||||
payload = base64.b64encode(serialized_data).decode("utf-8")
|
||||
@@ -66,7 +66,7 @@ class TwilioFrameSerializer(FrameSerializer):
|
||||
payload_base64 = message["media"]["payload"]
|
||||
payload = base64.b64decode(payload_base64)
|
||||
|
||||
deserialized_data = ulaw_to_pcm(
|
||||
deserialized_data = await ulaw_to_pcm(
|
||||
payload, self._params.twilio_sample_rate, self._params.sample_rate, self._resampler
|
||||
)
|
||||
audio_frame = InputAudioRawFrame(
|
||||
|
||||
@@ -195,10 +195,7 @@ class PollyTTSService(TTSService):
|
||||
response = self._polly_client.synthesize_speech(**args)
|
||||
if "AudioStream" in response:
|
||||
audio_data = response["AudioStream"].read()
|
||||
resampled = self._resampler.resample(
|
||||
audio_data, 16000, self._settings["sample_rate"]
|
||||
)
|
||||
return resampled
|
||||
return audio_data
|
||||
return None
|
||||
|
||||
logger.debug(f"Generating TTS: [{text}]")
|
||||
@@ -229,6 +226,10 @@ class PollyTTSService(TTSService):
|
||||
yield None
|
||||
return
|
||||
|
||||
audio_data = await self._resampler.resample(
|
||||
audio_data, 16000, self._settings["sample_rate"]
|
||||
)
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
|
||||
yield TTSStartedFrame()
|
||||
|
||||
@@ -94,7 +94,7 @@ class TavusVideoService(AIService):
|
||||
async def _encode_audio_and_send(self, audio: bytes, in_rate: int, done: bool) -> None:
|
||||
"""Encodes audio to base64 and sends it to Tavus"""
|
||||
if not done:
|
||||
audio = self._resampler.resample(audio, in_rate, 16000)
|
||||
audio = await self._resampler.resample(audio, in_rate, 16000)
|
||||
audio_base64 = base64.b64encode(audio).decode("utf-8")
|
||||
logger.trace(f"{self}: sending {len(audio)} bytes")
|
||||
await self._send_audio_message(audio_base64, done=done)
|
||||
|
||||
@@ -163,7 +163,7 @@ class XTTSService(TTSService):
|
||||
buffer = buffer[48000:]
|
||||
|
||||
# XTTS uses 24000 so we need to resample to our desired rate.
|
||||
resampled_audio = self._resampler.resample(
|
||||
resampled_audio = await self._resampler.resample(
|
||||
bytes(process_data), 24000, self._sample_rate
|
||||
)
|
||||
# Create the frame with the resampled audio
|
||||
@@ -172,7 +172,9 @@ class XTTSService(TTSService):
|
||||
|
||||
# Process any remaining data in the buffer.
|
||||
if len(buffer) > 0:
|
||||
resampled_audio = self._resampler.resample(bytes(buffer), 24000, self._sample_rate)
|
||||
resampled_audio = await self._resampler.resample(
|
||||
bytes(buffer), 24000, self._sample_rate
|
||||
)
|
||||
frame = TTSAudioRawFrame(resampled_audio, self._sample_rate, 1)
|
||||
yield frame
|
||||
|
||||
|
||||
Reference in New Issue
Block a user