diff --git a/CHANGELOG.md b/CHANGELOG.md index 1045974a3..dde8ccec0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,17 +9,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Added a websocket service for PlayHT, called `PlayHTTTSService`. - Added input parameter options for `PlayHTTTSService` and `PlayHTHttpTTSService`. ### Changed +- Module `utils.audio` is now `audio.utils`. A new `resample_audio` function has + been added. + +- `PlayHTTTSService` now uses PlayHT websockets instead of HTTP requests. + +- The previous `PlayHTTTSService` HTTP implementation is now + `PlayHTHttpTTSService`. + - `PlayHTTTSService` and `PlayHTHttpTTSService` now use a `voice_engine` of `PlayHT3.0-mini`, which allows for multi-lingual support. -- Changed the name of the HTTP PlayHT service from `PlayHTTTSService` to - `PlayHTHttpTTSService` since there's now a websocket service, which is the - default. + - Renamed `OpenAILLMServiceRealtimeBeta` to `OpenAIRealtimeBetaLLMService` to match other services. 
diff --git a/pyproject.toml b/pyproject.toml index c67fe6c09..e294f44fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "protobuf~=4.25.4", "pydantic~=2.8.2", "pyloudnorm~=0.1.1", + "scipy~=1.14.1", ] [project.urls] @@ -61,7 +62,6 @@ silero = [ "onnxruntime>=1.16.1" ] together = [ "openai~=1.50.2" ] websocket = [ "websockets~=13.1", "fastapi~=0.115.0" ] whisper = [ "faster-whisper~=1.0.3" ] -xtts = [ "resampy~=0.4.3" ] [tool.setuptools.packages.find] # All the following settings are optional: diff --git a/src/pipecat/utils/audio.py b/src/pipecat/audio/utils.py similarity index 81% rename from src/pipecat/utils/audio.py rename to src/pipecat/audio/utils.py index 0764c6abb..6675a07ab 100644 --- a/src/pipecat/utils/audio.py +++ b/src/pipecat/audio/utils.py @@ -7,6 +7,15 @@ import audioop import numpy as np import pyloudnorm as pyln +from scipy import signal + + +def resample_audio(audio: bytes, original_rate: int, target_rate: int) -> bytes: + """Resample signed 16-bit audio samples from original_rate to target_rate.""" + audio_data = np.frombuffer(audio, dtype=np.int16) + num_samples = int(len(audio_data) * target_rate / original_rate) + resampled_audio = signal.resample(audio_data, num_samples) + return resampled_audio.astype(np.int16).tobytes() def normalize_value(value, min_value, max_value): diff --git a/src/pipecat/audio/vad/vad_analyzer.py b/src/pipecat/audio/vad/vad_analyzer.py index fe2739b28..3387a1746 100644 --- a/src/pipecat/audio/vad/vad_analyzer.py +++ b/src/pipecat/audio/vad/vad_analyzer.py @@ -10,7 +10,7 @@ from enum import Enum from loguru import logger from pydantic.main import BaseModel -from pipecat.utils.audio import calculate_audio_volume, exp_smoothing +from pipecat.audio.utils import calculate_audio_volume, exp_smoothing class VADState(Enum): diff --git a/src/pipecat/serializers/twilio.py b/src/pipecat/serializers/twilio.py index c0d4c0c47..ebc62e484 100644 --- a/src/pipecat/serializers/twilio.py +++ b/src/pipecat/serializers/twilio.py @@ -9,9 +9,9 @@ import json from pydantic import 
BaseModel +from pipecat.audio.utils import ulaw_to_pcm, pcm_to_ulaw from pipecat.frames.frames import AudioRawFrame, Frame, StartInterruptionFrame from pipecat.serializers.base_serializer import FrameSerializer -from pipecat.utils.audio import ulaw_to_pcm, pcm_to_ulaw class TwilioFrameSerializer(FrameSerializer): diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index ad6d20d8e..8e11ad6ee 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -12,6 +12,7 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple from loguru import logger +from pipecat.audio.utils import calculate_audio_volume, exp_smoothing from pipecat.frames.frames import ( AudioRawFrame, CancelFrame, @@ -35,11 +36,9 @@ from pipecat.metrics.metrics import MetricsData from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.transcriptions.language import Language -from pipecat.utils.audio import calculate_audio_volume from pipecat.utils.string import match_endofsentence from pipecat.utils.text.base_text_filter import BaseTextFilter from pipecat.utils.time import seconds_to_nanoseconds -from pipecat.utils.utils import exp_smoothing class AIService(FrameProcessor): diff --git a/src/pipecat/services/xtts.py b/src/pipecat/services/xtts.py index 6c1408553..1c444f9f1 100644 --- a/src/pipecat/services/xtts.py +++ b/src/pipecat/services/xtts.py @@ -7,9 +7,8 @@ from typing import Any, AsyncGenerator, Dict import aiohttp -import numpy as np -from loguru import logger +from pipecat.audio.utils import resample_audio from pipecat.frames.frames import ( ErrorFrame, Frame, @@ -21,12 +20,7 @@ from pipecat.frames.frames import ( from pipecat.services.ai_services import TTSService from pipecat.transcriptions.language import Language -try: - import resampy -except ModuleNotFoundError as e: - logger.error(f"Exception: 
{e}") - logger.error("In order to use XTTS, you need to `pip install pipecat-ai[xtts]`.") - raise Exception(f"Missing module: {e}") +from loguru import logger # The server below can connect to XTTS through a local running docker @@ -168,22 +162,16 @@ class XTTSService(TTSService): # Remove processed data from buffer buffer = buffer[48000:] - # Convert the byte data to numpy array for resampling - audio_np = np.frombuffer(process_data, dtype=np.int16) # Resample the audio from 24000 Hz to 16000 Hz - resampled_audio = resampy.resample(audio_np, 24000, 16000) - # Convert the numpy array back to bytes - resampled_audio_bytes = resampled_audio.astype(np.int16).tobytes() + resampled_audio = resample_audio(bytes(process_data), 24000, 16000) # Create the frame with the resampled audio - frame = TTSAudioRawFrame(resampled_audio_bytes, 16000, 1) + frame = TTSAudioRawFrame(resampled_audio, 16000, 1) yield frame # Process any remaining data in the buffer if len(buffer) > 0: - audio_np = np.frombuffer(buffer, dtype=np.int16) - resampled_audio = resampy.resample(audio_np, 24000, 16000) - resampled_audio_bytes = resampled_audio.astype(np.int16).tobytes() - frame = TTSAudioRawFrame(resampled_audio_bytes, 16000, 1) + resampled_audio = resample_audio(bytes(buffer), 24000, 16000) + frame = TTSAudioRawFrame(resampled_audio, 16000, 1) yield frame yield TTSStoppedFrame() diff --git a/src/pipecat/transports/services/livekit.py b/src/pipecat/transports/services/livekit.py index 9cf2c617d..a6d261f69 100644 --- a/src/pipecat/transports/services/livekit.py +++ b/src/pipecat/transports/services/livekit.py @@ -8,11 +8,9 @@ import asyncio from dataclasses import dataclass from typing import Any, Awaitable, Callable, List -import numpy as np -from loguru import logger from pydantic import BaseModel -from scipy import signal +from pipecat.audio.utils import resample_audio from pipecat.audio.vad.vad_analyzer import VADAnalyzer from pipecat.frames.frames import ( AudioRawFrame, @@ -30,6 +28,8 @@ 
from pipecat.transports.base_input import BaseInputTransport from pipecat.transports.base_output import BaseOutputTransport from pipecat.transports.base_transport import BaseTransport, TransportParams +from loguru import logger + try: from livekit import rtc from tenacity import retry, stop_after_attempt, wait_exponential @@ -381,12 +381,12 @@ class LiveKitInputTransport(BaseInputTransport): self, audio_frame_event: rtc.AudioFrameEvent ) -> AudioRawFrame: audio_frame = audio_frame_event.frame - audio_data = np.frombuffer(audio_frame.data, dtype=np.int16) + audio_data = audio_frame.data original_sample_rate = audio_frame.sample_rate # Allow 8kHz and 16kHz, convert anything else to 16kHz if original_sample_rate not in [8000, 16000]: - audio_data = self._resample_audio(audio_data, original_sample_rate, 16000) + audio_data = resample_audio(audio_data, original_sample_rate, 16000) sample_rate = 16000 else: sample_rate = original_sample_rate @@ -400,18 +400,11 @@ class LiveKitInputTransport(BaseInputTransport): ) return AudioRawFrame( - audio=audio_data.tobytes(), + audio=audio_data, sample_rate=sample_rate, num_channels=audio_frame.num_channels, ) - def _resample_audio( - self, audio_data: np.ndarray, original_rate: int, target_rate: int - ) -> np.ndarray: - num_samples = int(len(audio_data) * target_rate / original_rate) - resampled_audio = signal.resample(audio_data, num_samples) - return resampled_audio.astype(np.int16) - class LiveKitOutputTransport(BaseOutputTransport): def __init__(self, client: LiveKitTransportClient, params: LiveKitParams, **kwargs): diff --git a/src/pipecat/utils/utils.py b/src/pipecat/utils/utils.py index e2df99389..14f1b541a 100644 --- a/src/pipecat/utils/utils.py +++ b/src/pipecat/utils/utils.py @@ -36,7 +36,3 @@ def obj_count(obj) -> int: 0 """ return next(_COUNTS[obj.__class__.__name__]) - - -def exp_smoothing(value: float, prev_value: float, factor: float) -> float: - return prev_value + factor * (value - prev_value) diff --git 
a/test-requirements.txt b/test-requirements.txt index 62be00385..b2008606d 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -22,7 +22,7 @@ pydantic~=2.8.2 pyloudnorm~=0.1.1 pyht~=0.1.4 python-dotenv~=1.0.1 -resampy~=0.4.3 +scipy~=1.14.1 silero-vad~=5.1 together~=1.2.7 transformers~=4.44.0