diff --git a/src/pipecat/processors/audio/audio_buffer_processor.py b/src/pipecat/processors/audio/audio_buffer_processor.py index 78561b2f9..9cfc08daa 100644 --- a/src/pipecat/processors/audio/audio_buffer_processor.py +++ b/src/pipecat/processors/audio/audio_buffer_processor.py @@ -14,9 +14,8 @@ configurations and event-driven processing. import time from typing import Optional -from pipecat.audio.utils import create_default_resampler, interleave_stereo_audio, mix_audio +from pipecat.audio.utils import create_stream_resampler, interleave_stereo_audio, mix_audio from pipecat.frames.frames import ( - AudioRawFrame, BotStartedSpeakingFrame, BotStoppedSpeakingFrame, CancelFrame, @@ -106,7 +105,8 @@ class AudioBufferProcessor(FrameProcessor): self._recording = False - self._resampler = create_default_resampler() + self._input_resampler = create_stream_resampler() + self._output_resampler = create_stream_resampler() self._register_event_handler("on_audio_data") self._register_event_handler("on_track_audio_data") @@ -210,7 +210,7 @@ class AudioBufferProcessor(FrameProcessor): silence = self._compute_silence(self._last_user_frame_at) self._user_audio_buffer.extend(silence) # Add user audio. - resampled = await self._resample_audio(frame) + resampled = await self._resample_input_audio(frame) self._user_audio_buffer.extend(resampled) # Save time of frame so we can compute silence. self._last_user_frame_at = time.time() @@ -219,7 +219,7 @@ class AudioBufferProcessor(FrameProcessor): silence = self._compute_silence(self._last_bot_frame_at) self._bot_audio_buffer.extend(silence) # Add bot audio. - resampled = await self._resample_audio(frame) + resampled = await self._resample_output_audio(frame) self._bot_audio_buffer.extend(resampled) # Save time of frame so we can compute silence. self._last_bot_frame_at = time.time() @@ -247,7 +247,7 @@ class AudioBufferProcessor(FrameProcessor): self._bot_turn_audio_buffer = bytearray() if isinstance(frame, InputAudioRawFrame): - resampled = await self._resample_audio(frame) + resampled = await self._resample_input_audio(frame) self._user_turn_audio_buffer += resampled # In the case of the user, we need to keep a short buffer of audio # since VAD notification of when the user starts speaking comes @@ -259,7 +259,7 @@ class AudioBufferProcessor(FrameProcessor): discarded = len(self._user_turn_audio_buffer) - self._audio_buffer_size_1s self._user_turn_audio_buffer = self._user_turn_audio_buffer[discarded:] elif self._bot_speaking and isinstance(frame, OutputAudioRawFrame): - resampled = await self._resample_audio(frame) + resampled = await self._resample_output_audio(frame) self._bot_turn_audio_buffer += resampled async def _call_on_audio_data_handler(self): @@ -301,9 +301,17 @@ class AudioBufferProcessor(FrameProcessor): self._user_turn_audio_buffer = bytearray() self._bot_turn_audio_buffer = bytearray() - async def _resample_audio(self, frame: AudioRawFrame) -> bytes: + async def _resample_input_audio(self, frame: InputAudioRawFrame) -> bytes: """Resample audio frame to the target sample rate.""" - return await self._resampler.resample(frame.audio, frame.sample_rate, self._sample_rate) + return await self._input_resampler.resample( + frame.audio, frame.sample_rate, self._sample_rate + ) + + async def _resample_output_audio(self, frame: OutputAudioRawFrame) -> bytes: + """Resample audio frame to the target sample rate.""" + return await self._output_resampler.resample( + frame.audio, frame.sample_rate, self._sample_rate + ) def _compute_silence(self, from_time: float) -> bytes: """Compute silence to insert based on time gap.""" diff --git a/src/pipecat/serializers/exotel.py b/src/pipecat/serializers/exotel.py index 960cbe74a..81f0daf39 100644 --- a/src/pipecat/serializers/exotel.py +++ b/src/pipecat/serializers/exotel.py @@ -13,7 +13,7 @@ from typing import Optional from loguru import logger from pydantic import BaseModel -from pipecat.audio.utils import create_default_resampler +from pipecat.audio.utils import create_stream_resampler from pipecat.frames.frames import ( AudioRawFrame, Frame, @@ -67,7 +67,8 @@ class ExotelFrameSerializer(FrameSerializer): self._exotel_sample_rate = self._params.exotel_sample_rate self._sample_rate = 0 # Pipeline input rate - self._resampler = create_default_resampler() + self._input_resampler = create_stream_resampler() + self._output_resampler = create_stream_resampler() @property def type(self) -> FrameSerializerType: @@ -104,7 +105,7 @@ class ExotelFrameSerializer(FrameSerializer): data = frame.audio # Output: Exotel outputs PCM audio, but we need to resample to match requested sample_rate - serialized_data = await self._resampler.resample( + serialized_data = await self._output_resampler.resample( data, frame.sample_rate, self._exotel_sample_rate ) payload = base64.b64encode(serialized_data).decode("ascii") @@ -138,7 +139,7 @@ class ExotelFrameSerializer(FrameSerializer): payload_base64 = message["media"]["payload"] payload = base64.b64decode(payload_base64) - deserialized_data = await self._resampler.resample( + deserialized_data = await self._input_resampler.resample( payload, self._exotel_sample_rate, self._sample_rate, diff --git a/src/pipecat/serializers/plivo.py b/src/pipecat/serializers/plivo.py index d0c19be0a..1b2ebb57f 100644 --- a/src/pipecat/serializers/plivo.py +++ b/src/pipecat/serializers/plivo.py @@ -13,7 +13,7 @@ from typing import Optional from loguru import logger from pydantic import BaseModel -from pipecat.audio.utils import create_default_resampler, pcm_to_ulaw, ulaw_to_pcm +from pipecat.audio.utils import create_stream_resampler, pcm_to_ulaw, ulaw_to_pcm from pipecat.frames.frames import ( AudioRawFrame, CancelFrame, @@ -81,7 +81,8 @@ class PlivoFrameSerializer(FrameSerializer): self._plivo_sample_rate = self._params.plivo_sample_rate self._sample_rate = 0 # Pipeline input rate - self._resampler = create_default_resampler() + self._input_resampler = create_stream_resampler() + self._output_resampler = create_stream_resampler() self._hangup_attempted = False @property @@ -129,7 +130,7 @@ class PlivoFrameSerializer(FrameSerializer): # Output: Convert PCM at frame's rate to 8kHz μ-law for Plivo serialized_data = await pcm_to_ulaw( - data, frame.sample_rate, self._plivo_sample_rate, self._resampler + data, frame.sample_rate, self._plivo_sample_rate, self._output_resampler ) payload = base64.b64encode(serialized_data).decode("utf-8") answer = { @@ -224,7 +225,7 @@ class PlivoFrameSerializer(FrameSerializer): # Input: Convert Plivo's 8kHz μ-law to PCM at pipeline input rate deserialized_data = await ulaw_to_pcm( - payload, self._plivo_sample_rate, self._sample_rate, self._resampler + payload, self._plivo_sample_rate, self._sample_rate, self._input_resampler ) audio_frame = InputAudioRawFrame( audio=deserialized_data, num_channels=1, sample_rate=self._sample_rate diff --git a/src/pipecat/serializers/telnyx.py b/src/pipecat/serializers/telnyx.py index adc235555..133c50dc9 100644 --- a/src/pipecat/serializers/telnyx.py +++ b/src/pipecat/serializers/telnyx.py @@ -16,7 +16,7 @@ from pydantic import BaseModel from pipecat.audio.utils import ( alaw_to_pcm, - create_default_resampler, + create_stream_resampler, pcm_to_alaw, pcm_to_ulaw, ulaw_to_pcm, @@ -93,7 +93,8 @@ class TelnyxFrameSerializer(FrameSerializer): self._telnyx_sample_rate = self._params.telnyx_sample_rate self._sample_rate = 0 # Pipeline input rate - self._resampler = create_default_resampler() + self._input_resampler = create_stream_resampler() + self._output_resampler = create_stream_resampler() self._hangup_attempted = False @property @@ -145,11 +146,11 @@ class TelnyxFrameSerializer(FrameSerializer): # Output: Convert PCM at frame's rate to 8kHz encoded for Telnyx if self._params.inbound_encoding == "PCMU": serialized_data = await pcm_to_ulaw( - data, frame.sample_rate, self._telnyx_sample_rate, self._resampler + data, frame.sample_rate, self._telnyx_sample_rate, self._output_resampler ) elif self._params.inbound_encoding == "PCMA": serialized_data = await pcm_to_alaw( - data, frame.sample_rate, self._telnyx_sample_rate, self._resampler + data, frame.sample_rate, self._telnyx_sample_rate, self._output_resampler ) else: raise ValueError(f"Unsupported encoding: {self._params.inbound_encoding}") @@ -249,14 +250,14 @@ class TelnyxFrameSerializer(FrameSerializer): payload, self._telnyx_sample_rate, self._sample_rate, - self._resampler, + self._input_resampler, ) elif self._params.outbound_encoding == "PCMA": deserialized_data = await alaw_to_pcm( payload, self._telnyx_sample_rate, self._sample_rate, - self._resampler, + self._input_resampler, ) else: raise ValueError(f"Unsupported encoding: {self._params.outbound_encoding}") diff --git a/src/pipecat/serializers/twilio.py b/src/pipecat/serializers/twilio.py index ae4d54e4d..50ca420fa 100644 --- a/src/pipecat/serializers/twilio.py +++ b/src/pipecat/serializers/twilio.py @@ -13,7 +13,7 @@ from typing import Optional from loguru import logger from pydantic import BaseModel -from pipecat.audio.utils import create_default_resampler, pcm_to_ulaw, ulaw_to_pcm +from pipecat.audio.utils import create_stream_resampler, pcm_to_ulaw, ulaw_to_pcm from pipecat.frames.frames import ( AudioRawFrame, CancelFrame, @@ -81,7 +81,8 @@ class TwilioFrameSerializer(FrameSerializer): self._twilio_sample_rate = self._params.twilio_sample_rate self._sample_rate = 0 # Pipeline input rate - self._resampler = create_default_resampler() + self._input_resampler = create_stream_resampler() + self._output_resampler = create_stream_resampler() self._hangup_attempted = False @property @@ -129,7 +130,7 @@ class TwilioFrameSerializer(FrameSerializer): # Output: Convert PCM at frame's rate to 8kHz μ-law for Twilio serialized_data = await pcm_to_ulaw( - data, frame.sample_rate, self._twilio_sample_rate, self._resampler + data, frame.sample_rate, self._twilio_sample_rate, self._output_resampler ) payload = base64.b64encode(serialized_data).decode("utf-8") answer = { @@ -214,7 +215,7 @@ class TwilioFrameSerializer(FrameSerializer): # Input: Convert Twilio's 8kHz μ-law to PCM at pipeline input rate deserialized_data = await ulaw_to_pcm( - payload, self._twilio_sample_rate, self._sample_rate, self._resampler + payload, self._twilio_sample_rate, self._sample_rate, self._input_resampler ) audio_frame = InputAudioRawFrame( audio=deserialized_data, num_channels=1, sample_rate=self._sample_rate diff --git a/src/pipecat/services/aws/tts.py b/src/pipecat/services/aws/tts.py index ce89dea9e..7f4ccd079 100644 --- a/src/pipecat/services/aws/tts.py +++ b/src/pipecat/services/aws/tts.py @@ -17,7 +17,7 @@ from typing import AsyncGenerator, List, Optional from loguru import logger from pydantic import BaseModel -from pipecat.audio.utils import create_default_resampler +from pipecat.audio.utils import create_stream_resampler from pipecat.frames.frames import ( ErrorFrame, Frame, @@ -195,7 +195,7 @@ class AWSPollyTTSService(TTSService): "lexicon_names": params.lexicon_names, } - self._resampler = create_default_resampler() + self._resampler = create_stream_resampler() self.set_voice(voice_id) diff --git a/src/pipecat/services/tavus/video.py b/src/pipecat/services/tavus/video.py index d633329bb..f9ba2833b 100644 --- a/src/pipecat/services/tavus/video.py +++ b/src/pipecat/services/tavus/video.py @@ -17,7 +17,7 @@ import aiohttp from daily.daily import AudioData, VideoFrame from loguru import logger -from pipecat.audio.utils import create_default_resampler +from pipecat.audio.utils import create_stream_resampler from pipecat.frames.frames import ( CancelFrame, EndFrame, @@ -75,7 +75,7 @@ class TavusVideoService(AIService): self._client: Optional[TavusTransportClient] = None self._conversation_id: str - self._resampler = create_default_resampler() + self._resampler = create_stream_resampler() self._audio_buffer = bytearray() self._send_task: Optional[asyncio.Task] = None diff --git a/src/pipecat/services/xtts/tts.py b/src/pipecat/services/xtts/tts.py index 6332e26ef..844e0fbaf 100644 --- a/src/pipecat/services/xtts/tts.py +++ b/src/pipecat/services/xtts/tts.py @@ -15,7 +15,7 @@ from typing import Any, AsyncGenerator, Dict, Optional import aiohttp from loguru import logger -from pipecat.audio.utils import create_default_resampler +from pipecat.audio.utils import create_stream_resampler from pipecat.frames.frames import ( ErrorFrame, Frame, @@ -121,7 +121,7 @@ class XTTSService(TTSService): self._studio_speakers: Optional[Dict[str, Any]] = None self._aiohttp_session = aiohttp_session - self._resampler = create_default_resampler() + self._resampler = create_stream_resampler() def can_generate_metrics(self) -> bool: """Check if this service can generate processing metrics. diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py index 93a11f5a3..4d1120cc7 100644 --- a/src/pipecat/transports/base_output.py +++ b/src/pipecat/transports/base_output.py @@ -21,7 +21,7 @@ from loguru import logger from PIL import Image from pipecat.audio.mixers.base_audio_mixer import BaseAudioMixer -from pipecat.audio.utils import create_default_resampler +from pipecat.audio.utils import create_stream_resampler from pipecat.frames.frames import ( BotSpeakingFrame, BotStartedSpeakingFrame, @@ -358,7 +358,7 @@ class BaseOutputTransport(FrameProcessor): self._audio_buffer = bytearray() # This will be used to resample incoming audio to the output sample rate. - self._resampler = create_default_resampler() + self._resampler = create_stream_resampler() # The user can provide a single mixer, to be used by the default # destination, or a destination/mixer mapping. diff --git a/src/pipecat/transports/services/livekit.py b/src/pipecat/transports/services/livekit.py index d7363b9e7..68fc6a8a8 100644 --- a/src/pipecat/transports/services/livekit.py +++ b/src/pipecat/transports/services/livekit.py @@ -18,7 +18,7 @@ from typing import Any, Awaitable, Callable, List, Optional from loguru import logger from pydantic import BaseModel -from pipecat.audio.utils import create_default_resampler +from pipecat.audio.utils import create_stream_resampler from pipecat.audio.vad.vad_analyzer import VADAnalyzer from pipecat.frames.frames import ( AudioRawFrame, @@ -513,7 +513,7 @@ class LiveKitInputTransport(BaseInputTransport): self._audio_in_task = None self._vad_analyzer: Optional[VADAnalyzer] = params.vad_analyzer - self._resampler = create_default_resampler() + self._resampler = create_stream_resampler() # Whether we have seen a StartFrame already. self._initialized = False diff --git a/src/pipecat/transports/services/tavus.py b/src/pipecat/transports/services/tavus.py index e83dc6a20..10ae001e0 100644 --- a/src/pipecat/transports/services/tavus.py +++ b/src/pipecat/transports/services/tavus.py @@ -20,7 +20,6 @@ from daily.daily import AudioData from loguru import logger from pydantic import BaseModel -from pipecat.audio.utils import create_default_resampler from pipecat.frames.frames import ( CancelFrame, EndFrame, @@ -437,8 +436,6 @@ class TavusInputTransport(BaseInputTransport): super().__init__(params, **kwargs) self._client = client self._params = params - self._resampler = create_default_resampler() - # Whether we have seen a StartFrame already. self._initialized = False