Configured the services to use create_stream_resampler instead of create_default_resampler

This commit is contained in:
Filipi Fuchter
2025-07-02 16:20:34 -03:00
parent 3de271161c
commit 5af563cd91
11 changed files with 49 additions and 40 deletions

View File

@@ -14,9 +14,8 @@ configurations and event-driven processing.
import time
from typing import Optional
from pipecat.audio.utils import create_default_resampler, interleave_stereo_audio, mix_audio
from pipecat.audio.utils import create_stream_resampler, interleave_stereo_audio, mix_audio
from pipecat.frames.frames import (
AudioRawFrame,
BotStartedSpeakingFrame,
BotStoppedSpeakingFrame,
CancelFrame,
@@ -106,7 +105,8 @@ class AudioBufferProcessor(FrameProcessor):
self._recording = False
self._resampler = create_default_resampler()
self._input_resampler = create_stream_resampler()
self._output_resampler = create_stream_resampler()
self._register_event_handler("on_audio_data")
self._register_event_handler("on_track_audio_data")
@@ -210,7 +210,7 @@ class AudioBufferProcessor(FrameProcessor):
silence = self._compute_silence(self._last_user_frame_at)
self._user_audio_buffer.extend(silence)
# Add user audio.
resampled = await self._resample_audio(frame)
resampled = await self._resample_input_audio(frame)
self._user_audio_buffer.extend(resampled)
# Save time of frame so we can compute silence.
self._last_user_frame_at = time.time()
@@ -219,7 +219,7 @@ class AudioBufferProcessor(FrameProcessor):
silence = self._compute_silence(self._last_bot_frame_at)
self._bot_audio_buffer.extend(silence)
# Add bot audio.
resampled = await self._resample_audio(frame)
resampled = await self._resample_output_audio(frame)
self._bot_audio_buffer.extend(resampled)
# Save time of frame so we can compute silence.
self._last_bot_frame_at = time.time()
@@ -247,7 +247,7 @@ class AudioBufferProcessor(FrameProcessor):
self._bot_turn_audio_buffer = bytearray()
if isinstance(frame, InputAudioRawFrame):
resampled = await self._resample_audio(frame)
resampled = await self._resample_input_audio(frame)
self._user_turn_audio_buffer += resampled
# In the case of the user, we need to keep a short buffer of audio
# since VAD notification of when the user starts speaking comes
@@ -259,7 +259,7 @@ class AudioBufferProcessor(FrameProcessor):
discarded = len(self._user_turn_audio_buffer) - self._audio_buffer_size_1s
self._user_turn_audio_buffer = self._user_turn_audio_buffer[discarded:]
elif self._bot_speaking and isinstance(frame, OutputAudioRawFrame):
resampled = await self._resample_audio(frame)
resampled = await self._resample_output_audio(frame)
self._bot_turn_audio_buffer += resampled
async def _call_on_audio_data_handler(self):
@@ -301,9 +301,17 @@ class AudioBufferProcessor(FrameProcessor):
self._user_turn_audio_buffer = bytearray()
self._bot_turn_audio_buffer = bytearray()
async def _resample_audio(self, frame: AudioRawFrame) -> bytes:
async def _resample_input_audio(self, frame: InputAudioRawFrame) -> bytes:
"""Resample audio frame to the target sample rate."""
return await self._resampler.resample(frame.audio, frame.sample_rate, self._sample_rate)
return await self._input_resampler.resample(
frame.audio, frame.sample_rate, self._sample_rate
)
async def _resample_output_audio(self, frame: OutputAudioRawFrame) -> bytes:
"""Resample audio frame to the target sample rate."""
return await self._output_resampler.resample(
frame.audio, frame.sample_rate, self._sample_rate
)
def _compute_silence(self, from_time: float) -> bytes:
"""Compute silence to insert based on time gap."""

View File

@@ -13,7 +13,7 @@ from typing import Optional
from loguru import logger
from pydantic import BaseModel
from pipecat.audio.utils import create_default_resampler
from pipecat.audio.utils import create_stream_resampler
from pipecat.frames.frames import (
AudioRawFrame,
Frame,
@@ -67,7 +67,8 @@ class ExotelFrameSerializer(FrameSerializer):
self._exotel_sample_rate = self._params.exotel_sample_rate
self._sample_rate = 0 # Pipeline input rate
self._resampler = create_default_resampler()
self._input_resampler = create_stream_resampler()
self._output_resampler = create_stream_resampler()
@property
def type(self) -> FrameSerializerType:
@@ -104,7 +105,7 @@ class ExotelFrameSerializer(FrameSerializer):
data = frame.audio
# Output: Exotel outputs PCM audio, but we need to resample to match requested sample_rate
serialized_data = await self._resampler.resample(
serialized_data = await self._output_resampler.resample(
data, frame.sample_rate, self._exotel_sample_rate
)
payload = base64.b64encode(serialized_data).decode("ascii")
@@ -138,7 +139,7 @@ class ExotelFrameSerializer(FrameSerializer):
payload_base64 = message["media"]["payload"]
payload = base64.b64decode(payload_base64)
deserialized_data = await self._resampler.resample(
deserialized_data = await self._input_resampler.resample(
payload,
self._exotel_sample_rate,
self._sample_rate,

View File

@@ -13,7 +13,7 @@ from typing import Optional
from loguru import logger
from pydantic import BaseModel
from pipecat.audio.utils import create_default_resampler, pcm_to_ulaw, ulaw_to_pcm
from pipecat.audio.utils import create_stream_resampler, pcm_to_ulaw, ulaw_to_pcm
from pipecat.frames.frames import (
AudioRawFrame,
CancelFrame,
@@ -81,7 +81,8 @@ class PlivoFrameSerializer(FrameSerializer):
self._plivo_sample_rate = self._params.plivo_sample_rate
self._sample_rate = 0 # Pipeline input rate
self._resampler = create_default_resampler()
self._input_resampler = create_stream_resampler()
self._output_resampler = create_stream_resampler()
self._hangup_attempted = False
@property
@@ -129,7 +130,7 @@ class PlivoFrameSerializer(FrameSerializer):
# Output: Convert PCM at frame's rate to 8kHz μ-law for Plivo
serialized_data = await pcm_to_ulaw(
data, frame.sample_rate, self._plivo_sample_rate, self._resampler
data, frame.sample_rate, self._plivo_sample_rate, self._output_resampler
)
payload = base64.b64encode(serialized_data).decode("utf-8")
answer = {
@@ -224,7 +225,7 @@ class PlivoFrameSerializer(FrameSerializer):
# Input: Convert Plivo's 8kHz μ-law to PCM at pipeline input rate
deserialized_data = await ulaw_to_pcm(
payload, self._plivo_sample_rate, self._sample_rate, self._resampler
payload, self._plivo_sample_rate, self._sample_rate, self._input_resampler
)
audio_frame = InputAudioRawFrame(
audio=deserialized_data, num_channels=1, sample_rate=self._sample_rate

View File

@@ -16,7 +16,7 @@ from pydantic import BaseModel
from pipecat.audio.utils import (
alaw_to_pcm,
create_default_resampler,
create_stream_resampler,
pcm_to_alaw,
pcm_to_ulaw,
ulaw_to_pcm,
@@ -93,7 +93,8 @@ class TelnyxFrameSerializer(FrameSerializer):
self._telnyx_sample_rate = self._params.telnyx_sample_rate
self._sample_rate = 0 # Pipeline input rate
self._resampler = create_default_resampler()
self._input_resampler = create_stream_resampler()
self._output_resampler = create_stream_resampler()
self._hangup_attempted = False
@property
@@ -145,11 +146,11 @@ class TelnyxFrameSerializer(FrameSerializer):
# Output: Convert PCM at frame's rate to 8kHz encoded for Telnyx
if self._params.inbound_encoding == "PCMU":
serialized_data = await pcm_to_ulaw(
data, frame.sample_rate, self._telnyx_sample_rate, self._resampler
data, frame.sample_rate, self._telnyx_sample_rate, self._output_resampler
)
elif self._params.inbound_encoding == "PCMA":
serialized_data = await pcm_to_alaw(
data, frame.sample_rate, self._telnyx_sample_rate, self._resampler
data, frame.sample_rate, self._telnyx_sample_rate, self._output_resampler
)
else:
raise ValueError(f"Unsupported encoding: {self._params.inbound_encoding}")
@@ -249,14 +250,14 @@ class TelnyxFrameSerializer(FrameSerializer):
payload,
self._telnyx_sample_rate,
self._sample_rate,
self._resampler,
self._input_resampler,
)
elif self._params.outbound_encoding == "PCMA":
deserialized_data = await alaw_to_pcm(
payload,
self._telnyx_sample_rate,
self._sample_rate,
self._resampler,
self._input_resampler,
)
else:
raise ValueError(f"Unsupported encoding: {self._params.outbound_encoding}")

View File

@@ -13,7 +13,7 @@ from typing import Optional
from loguru import logger
from pydantic import BaseModel
from pipecat.audio.utils import create_default_resampler, pcm_to_ulaw, ulaw_to_pcm
from pipecat.audio.utils import create_stream_resampler, pcm_to_ulaw, ulaw_to_pcm
from pipecat.frames.frames import (
AudioRawFrame,
CancelFrame,
@@ -81,7 +81,8 @@ class TwilioFrameSerializer(FrameSerializer):
self._twilio_sample_rate = self._params.twilio_sample_rate
self._sample_rate = 0 # Pipeline input rate
self._resampler = create_default_resampler()
self._input_resampler = create_stream_resampler()
self._output_resampler = create_stream_resampler()
self._hangup_attempted = False
@property
@@ -129,7 +130,7 @@ class TwilioFrameSerializer(FrameSerializer):
# Output: Convert PCM at frame's rate to 8kHz μ-law for Twilio
serialized_data = await pcm_to_ulaw(
data, frame.sample_rate, self._twilio_sample_rate, self._resampler
data, frame.sample_rate, self._twilio_sample_rate, self._output_resampler
)
payload = base64.b64encode(serialized_data).decode("utf-8")
answer = {
@@ -214,7 +215,7 @@ class TwilioFrameSerializer(FrameSerializer):
# Input: Convert Twilio's 8kHz μ-law to PCM at pipeline input rate
deserialized_data = await ulaw_to_pcm(
payload, self._twilio_sample_rate, self._sample_rate, self._resampler
payload, self._twilio_sample_rate, self._sample_rate, self._input_resampler
)
audio_frame = InputAudioRawFrame(
audio=deserialized_data, num_channels=1, sample_rate=self._sample_rate

View File

@@ -17,7 +17,7 @@ from typing import AsyncGenerator, List, Optional
from loguru import logger
from pydantic import BaseModel
from pipecat.audio.utils import create_default_resampler
from pipecat.audio.utils import create_stream_resampler
from pipecat.frames.frames import (
ErrorFrame,
Frame,
@@ -195,7 +195,7 @@ class AWSPollyTTSService(TTSService):
"lexicon_names": params.lexicon_names,
}
self._resampler = create_default_resampler()
self._resampler = create_stream_resampler()
self.set_voice(voice_id)

View File

@@ -17,7 +17,7 @@ import aiohttp
from daily.daily import AudioData, VideoFrame
from loguru import logger
from pipecat.audio.utils import create_default_resampler
from pipecat.audio.utils import create_stream_resampler
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
@@ -75,7 +75,7 @@ class TavusVideoService(AIService):
self._client: Optional[TavusTransportClient] = None
self._conversation_id: str
self._resampler = create_default_resampler()
self._resampler = create_stream_resampler()
self._audio_buffer = bytearray()
self._send_task: Optional[asyncio.Task] = None

View File

@@ -15,7 +15,7 @@ from typing import Any, AsyncGenerator, Dict, Optional
import aiohttp
from loguru import logger
from pipecat.audio.utils import create_default_resampler
from pipecat.audio.utils import create_stream_resampler
from pipecat.frames.frames import (
ErrorFrame,
Frame,
@@ -121,7 +121,7 @@ class XTTSService(TTSService):
self._studio_speakers: Optional[Dict[str, Any]] = None
self._aiohttp_session = aiohttp_session
self._resampler = create_default_resampler()
self._resampler = create_stream_resampler()
def can_generate_metrics(self) -> bool:
"""Check if this service can generate processing metrics.

View File

@@ -21,7 +21,7 @@ from loguru import logger
from PIL import Image
from pipecat.audio.mixers.base_audio_mixer import BaseAudioMixer
from pipecat.audio.utils import create_default_resampler
from pipecat.audio.utils import create_stream_resampler
from pipecat.frames.frames import (
BotSpeakingFrame,
BotStartedSpeakingFrame,
@@ -358,7 +358,7 @@ class BaseOutputTransport(FrameProcessor):
self._audio_buffer = bytearray()
# This will be used to resample incoming audio to the output sample rate.
self._resampler = create_default_resampler()
self._resampler = create_stream_resampler()
# The user can provide a single mixer, to be used by the default
# destination, or a destination/mixer mapping.

View File

@@ -18,7 +18,7 @@ from typing import Any, Awaitable, Callable, List, Optional
from loguru import logger
from pydantic import BaseModel
from pipecat.audio.utils import create_default_resampler
from pipecat.audio.utils import create_stream_resampler
from pipecat.audio.vad.vad_analyzer import VADAnalyzer
from pipecat.frames.frames import (
AudioRawFrame,
@@ -513,7 +513,7 @@ class LiveKitInputTransport(BaseInputTransport):
self._audio_in_task = None
self._vad_analyzer: Optional[VADAnalyzer] = params.vad_analyzer
self._resampler = create_default_resampler()
self._resampler = create_stream_resampler()
# Whether we have seen a StartFrame already.
self._initialized = False

View File

@@ -20,7 +20,6 @@ from daily.daily import AudioData
from loguru import logger
from pydantic import BaseModel
from pipecat.audio.utils import create_default_resampler
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
@@ -437,8 +436,6 @@ class TavusInputTransport(BaseInputTransport):
super().__init__(params, **kwargs)
self._client = client
self._params = params
self._resampler = create_default_resampler()
# Whether we have seen a StartFrame already.
self._initialized = False