Configured the services to use create_stream_resampler instead of create_default_resampler
This commit is contained in:
@@ -14,9 +14,8 @@ configurations and event-driven processing.
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from pipecat.audio.utils import create_default_resampler, interleave_stereo_audio, mix_audio
|
||||
from pipecat.audio.utils import create_stream_resampler, interleave_stereo_audio, mix_audio
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
BotStartedSpeakingFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
CancelFrame,
|
||||
@@ -106,7 +105,8 @@ class AudioBufferProcessor(FrameProcessor):
|
||||
|
||||
self._recording = False
|
||||
|
||||
self._resampler = create_default_resampler()
|
||||
self._input_resampler = create_stream_resampler()
|
||||
self._output_resampler = create_stream_resampler()
|
||||
|
||||
self._register_event_handler("on_audio_data")
|
||||
self._register_event_handler("on_track_audio_data")
|
||||
@@ -210,7 +210,7 @@ class AudioBufferProcessor(FrameProcessor):
|
||||
silence = self._compute_silence(self._last_user_frame_at)
|
||||
self._user_audio_buffer.extend(silence)
|
||||
# Add user audio.
|
||||
resampled = await self._resample_audio(frame)
|
||||
resampled = await self._resample_input_audio(frame)
|
||||
self._user_audio_buffer.extend(resampled)
|
||||
# Save time of frame so we can compute silence.
|
||||
self._last_user_frame_at = time.time()
|
||||
@@ -219,7 +219,7 @@ class AudioBufferProcessor(FrameProcessor):
|
||||
silence = self._compute_silence(self._last_bot_frame_at)
|
||||
self._bot_audio_buffer.extend(silence)
|
||||
# Add bot audio.
|
||||
resampled = await self._resample_audio(frame)
|
||||
resampled = await self._resample_output_audio(frame)
|
||||
self._bot_audio_buffer.extend(resampled)
|
||||
# Save time of frame so we can compute silence.
|
||||
self._last_bot_frame_at = time.time()
|
||||
@@ -247,7 +247,7 @@ class AudioBufferProcessor(FrameProcessor):
|
||||
self._bot_turn_audio_buffer = bytearray()
|
||||
|
||||
if isinstance(frame, InputAudioRawFrame):
|
||||
resampled = await self._resample_audio(frame)
|
||||
resampled = await self._resample_input_audio(frame)
|
||||
self._user_turn_audio_buffer += resampled
|
||||
# In the case of the user, we need to keep a short buffer of audio
|
||||
# since VAD notification of when the user starts speaking comes
|
||||
@@ -259,7 +259,7 @@ class AudioBufferProcessor(FrameProcessor):
|
||||
discarded = len(self._user_turn_audio_buffer) - self._audio_buffer_size_1s
|
||||
self._user_turn_audio_buffer = self._user_turn_audio_buffer[discarded:]
|
||||
elif self._bot_speaking and isinstance(frame, OutputAudioRawFrame):
|
||||
resampled = await self._resample_audio(frame)
|
||||
resampled = await self._resample_output_audio(frame)
|
||||
self._bot_turn_audio_buffer += resampled
|
||||
|
||||
async def _call_on_audio_data_handler(self):
|
||||
@@ -301,9 +301,17 @@ class AudioBufferProcessor(FrameProcessor):
|
||||
self._user_turn_audio_buffer = bytearray()
|
||||
self._bot_turn_audio_buffer = bytearray()
|
||||
|
||||
async def _resample_audio(self, frame: AudioRawFrame) -> bytes:
|
||||
async def _resample_input_audio(self, frame: InputAudioRawFrame) -> bytes:
|
||||
"""Resample audio frame to the target sample rate."""
|
||||
return await self._resampler.resample(frame.audio, frame.sample_rate, self._sample_rate)
|
||||
return await self._input_resampler.resample(
|
||||
frame.audio, frame.sample_rate, self._sample_rate
|
||||
)
|
||||
|
||||
async def _resample_output_audio(self, frame: OutputAudioRawFrame) -> bytes:
|
||||
"""Resample audio frame to the target sample rate."""
|
||||
return await self._output_resampler.resample(
|
||||
frame.audio, frame.sample_rate, self._sample_rate
|
||||
)
|
||||
|
||||
def _compute_silence(self, from_time: float) -> bytes:
|
||||
"""Compute silence to insert based on time gap."""
|
||||
|
||||
@@ -13,7 +13,7 @@ from typing import Optional
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.audio.utils import create_default_resampler
|
||||
from pipecat.audio.utils import create_stream_resampler
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
Frame,
|
||||
@@ -67,7 +67,8 @@ class ExotelFrameSerializer(FrameSerializer):
|
||||
self._exotel_sample_rate = self._params.exotel_sample_rate
|
||||
self._sample_rate = 0 # Pipeline input rate
|
||||
|
||||
self._resampler = create_default_resampler()
|
||||
self._input_resampler = create_stream_resampler()
|
||||
self._output_resampler = create_stream_resampler()
|
||||
|
||||
@property
|
||||
def type(self) -> FrameSerializerType:
|
||||
@@ -104,7 +105,7 @@ class ExotelFrameSerializer(FrameSerializer):
|
||||
data = frame.audio
|
||||
|
||||
# Output: Exotel outputs PCM audio, but we need to resample to match requested sample_rate
|
||||
serialized_data = await self._resampler.resample(
|
||||
serialized_data = await self._output_resampler.resample(
|
||||
data, frame.sample_rate, self._exotel_sample_rate
|
||||
)
|
||||
payload = base64.b64encode(serialized_data).decode("ascii")
|
||||
@@ -138,7 +139,7 @@ class ExotelFrameSerializer(FrameSerializer):
|
||||
payload_base64 = message["media"]["payload"]
|
||||
payload = base64.b64decode(payload_base64)
|
||||
|
||||
deserialized_data = await self._resampler.resample(
|
||||
deserialized_data = await self._input_resampler.resample(
|
||||
payload,
|
||||
self._exotel_sample_rate,
|
||||
self._sample_rate,
|
||||
|
||||
@@ -13,7 +13,7 @@ from typing import Optional
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.audio.utils import create_default_resampler, pcm_to_ulaw, ulaw_to_pcm
|
||||
from pipecat.audio.utils import create_stream_resampler, pcm_to_ulaw, ulaw_to_pcm
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
CancelFrame,
|
||||
@@ -81,7 +81,8 @@ class PlivoFrameSerializer(FrameSerializer):
|
||||
self._plivo_sample_rate = self._params.plivo_sample_rate
|
||||
self._sample_rate = 0 # Pipeline input rate
|
||||
|
||||
self._resampler = create_default_resampler()
|
||||
self._input_resampler = create_stream_resampler()
|
||||
self._output_resampler = create_stream_resampler()
|
||||
self._hangup_attempted = False
|
||||
|
||||
@property
|
||||
@@ -129,7 +130,7 @@ class PlivoFrameSerializer(FrameSerializer):
|
||||
|
||||
# Output: Convert PCM at frame's rate to 8kHz μ-law for Plivo
|
||||
serialized_data = await pcm_to_ulaw(
|
||||
data, frame.sample_rate, self._plivo_sample_rate, self._resampler
|
||||
data, frame.sample_rate, self._plivo_sample_rate, self._output_resampler
|
||||
)
|
||||
payload = base64.b64encode(serialized_data).decode("utf-8")
|
||||
answer = {
|
||||
@@ -224,7 +225,7 @@ class PlivoFrameSerializer(FrameSerializer):
|
||||
|
||||
# Input: Convert Plivo's 8kHz μ-law to PCM at pipeline input rate
|
||||
deserialized_data = await ulaw_to_pcm(
|
||||
payload, self._plivo_sample_rate, self._sample_rate, self._resampler
|
||||
payload, self._plivo_sample_rate, self._sample_rate, self._input_resampler
|
||||
)
|
||||
audio_frame = InputAudioRawFrame(
|
||||
audio=deserialized_data, num_channels=1, sample_rate=self._sample_rate
|
||||
|
||||
@@ -16,7 +16,7 @@ from pydantic import BaseModel
|
||||
|
||||
from pipecat.audio.utils import (
|
||||
alaw_to_pcm,
|
||||
create_default_resampler,
|
||||
create_stream_resampler,
|
||||
pcm_to_alaw,
|
||||
pcm_to_ulaw,
|
||||
ulaw_to_pcm,
|
||||
@@ -93,7 +93,8 @@ class TelnyxFrameSerializer(FrameSerializer):
|
||||
self._telnyx_sample_rate = self._params.telnyx_sample_rate
|
||||
self._sample_rate = 0 # Pipeline input rate
|
||||
|
||||
self._resampler = create_default_resampler()
|
||||
self._input_resampler = create_stream_resampler()
|
||||
self._output_resampler = create_stream_resampler()
|
||||
self._hangup_attempted = False
|
||||
|
||||
@property
|
||||
@@ -145,11 +146,11 @@ class TelnyxFrameSerializer(FrameSerializer):
|
||||
# Output: Convert PCM at frame's rate to 8kHz encoded for Telnyx
|
||||
if self._params.inbound_encoding == "PCMU":
|
||||
serialized_data = await pcm_to_ulaw(
|
||||
data, frame.sample_rate, self._telnyx_sample_rate, self._resampler
|
||||
data, frame.sample_rate, self._telnyx_sample_rate, self._output_resampler
|
||||
)
|
||||
elif self._params.inbound_encoding == "PCMA":
|
||||
serialized_data = await pcm_to_alaw(
|
||||
data, frame.sample_rate, self._telnyx_sample_rate, self._resampler
|
||||
data, frame.sample_rate, self._telnyx_sample_rate, self._output_resampler
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unsupported encoding: {self._params.inbound_encoding}")
|
||||
@@ -249,14 +250,14 @@ class TelnyxFrameSerializer(FrameSerializer):
|
||||
payload,
|
||||
self._telnyx_sample_rate,
|
||||
self._sample_rate,
|
||||
self._resampler,
|
||||
self._input_resampler,
|
||||
)
|
||||
elif self._params.outbound_encoding == "PCMA":
|
||||
deserialized_data = await alaw_to_pcm(
|
||||
payload,
|
||||
self._telnyx_sample_rate,
|
||||
self._sample_rate,
|
||||
self._resampler,
|
||||
self._input_resampler,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unsupported encoding: {self._params.outbound_encoding}")
|
||||
|
||||
@@ -13,7 +13,7 @@ from typing import Optional
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.audio.utils import create_default_resampler, pcm_to_ulaw, ulaw_to_pcm
|
||||
from pipecat.audio.utils import create_stream_resampler, pcm_to_ulaw, ulaw_to_pcm
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
CancelFrame,
|
||||
@@ -81,7 +81,8 @@ class TwilioFrameSerializer(FrameSerializer):
|
||||
self._twilio_sample_rate = self._params.twilio_sample_rate
|
||||
self._sample_rate = 0 # Pipeline input rate
|
||||
|
||||
self._resampler = create_default_resampler()
|
||||
self._input_resampler = create_stream_resampler()
|
||||
self._output_resampler = create_stream_resampler()
|
||||
self._hangup_attempted = False
|
||||
|
||||
@property
|
||||
@@ -129,7 +130,7 @@ class TwilioFrameSerializer(FrameSerializer):
|
||||
|
||||
# Output: Convert PCM at frame's rate to 8kHz μ-law for Twilio
|
||||
serialized_data = await pcm_to_ulaw(
|
||||
data, frame.sample_rate, self._twilio_sample_rate, self._resampler
|
||||
data, frame.sample_rate, self._twilio_sample_rate, self._output_resampler
|
||||
)
|
||||
payload = base64.b64encode(serialized_data).decode("utf-8")
|
||||
answer = {
|
||||
@@ -214,7 +215,7 @@ class TwilioFrameSerializer(FrameSerializer):
|
||||
|
||||
# Input: Convert Twilio's 8kHz μ-law to PCM at pipeline input rate
|
||||
deserialized_data = await ulaw_to_pcm(
|
||||
payload, self._twilio_sample_rate, self._sample_rate, self._resampler
|
||||
payload, self._twilio_sample_rate, self._sample_rate, self._input_resampler
|
||||
)
|
||||
audio_frame = InputAudioRawFrame(
|
||||
audio=deserialized_data, num_channels=1, sample_rate=self._sample_rate
|
||||
|
||||
@@ -17,7 +17,7 @@ from typing import AsyncGenerator, List, Optional
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.audio.utils import create_default_resampler
|
||||
from pipecat.audio.utils import create_stream_resampler
|
||||
from pipecat.frames.frames import (
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
@@ -195,7 +195,7 @@ class AWSPollyTTSService(TTSService):
|
||||
"lexicon_names": params.lexicon_names,
|
||||
}
|
||||
|
||||
self._resampler = create_default_resampler()
|
||||
self._resampler = create_stream_resampler()
|
||||
|
||||
self.set_voice(voice_id)
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ import aiohttp
|
||||
from daily.daily import AudioData, VideoFrame
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.utils import create_default_resampler
|
||||
from pipecat.audio.utils import create_stream_resampler
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
@@ -75,7 +75,7 @@ class TavusVideoService(AIService):
|
||||
self._client: Optional[TavusTransportClient] = None
|
||||
|
||||
self._conversation_id: str
|
||||
self._resampler = create_default_resampler()
|
||||
self._resampler = create_stream_resampler()
|
||||
|
||||
self._audio_buffer = bytearray()
|
||||
self._send_task: Optional[asyncio.Task] = None
|
||||
|
||||
@@ -15,7 +15,7 @@ from typing import Any, AsyncGenerator, Dict, Optional
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.utils import create_default_resampler
|
||||
from pipecat.audio.utils import create_stream_resampler
|
||||
from pipecat.frames.frames import (
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
@@ -121,7 +121,7 @@ class XTTSService(TTSService):
|
||||
self._studio_speakers: Optional[Dict[str, Any]] = None
|
||||
self._aiohttp_session = aiohttp_session
|
||||
|
||||
self._resampler = create_default_resampler()
|
||||
self._resampler = create_stream_resampler()
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this service can generate processing metrics.
|
||||
|
||||
@@ -21,7 +21,7 @@ from loguru import logger
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.audio.mixers.base_audio_mixer import BaseAudioMixer
|
||||
from pipecat.audio.utils import create_default_resampler
|
||||
from pipecat.audio.utils import create_stream_resampler
|
||||
from pipecat.frames.frames import (
|
||||
BotSpeakingFrame,
|
||||
BotStartedSpeakingFrame,
|
||||
@@ -358,7 +358,7 @@ class BaseOutputTransport(FrameProcessor):
|
||||
self._audio_buffer = bytearray()
|
||||
|
||||
# This will be used to resample incoming audio to the output sample rate.
|
||||
self._resampler = create_default_resampler()
|
||||
self._resampler = create_stream_resampler()
|
||||
|
||||
# The user can provide a single mixer, to be used by the default
|
||||
# destination, or a destination/mixer mapping.
|
||||
|
||||
@@ -18,7 +18,7 @@ from typing import Any, Awaitable, Callable, List, Optional
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.audio.utils import create_default_resampler
|
||||
from pipecat.audio.utils import create_stream_resampler
|
||||
from pipecat.audio.vad.vad_analyzer import VADAnalyzer
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
@@ -513,7 +513,7 @@ class LiveKitInputTransport(BaseInputTransport):
|
||||
|
||||
self._audio_in_task = None
|
||||
self._vad_analyzer: Optional[VADAnalyzer] = params.vad_analyzer
|
||||
self._resampler = create_default_resampler()
|
||||
self._resampler = create_stream_resampler()
|
||||
|
||||
# Whether we have seen a StartFrame already.
|
||||
self._initialized = False
|
||||
|
||||
@@ -20,7 +20,6 @@ from daily.daily import AudioData
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.audio.utils import create_default_resampler
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
@@ -437,8 +436,6 @@ class TavusInputTransport(BaseInputTransport):
|
||||
super().__init__(params, **kwargs)
|
||||
self._client = client
|
||||
self._params = params
|
||||
self._resampler = create_default_resampler()
|
||||
|
||||
# Whether we have seen a StartFrame already.
|
||||
self._initialized = False
|
||||
|
||||
|
||||
Reference in New Issue
Block a user