Merge pull request #608 from pipecat-ai/aleix/add-audio-utils-and-resample
add audio utils and resample
This commit is contained in:
13
CHANGELOG.md
13
CHANGELOG.md
@@ -9,17 +9,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Added
|
||||
|
||||
- Added a websocket service for PlayHT, called `PlayHTTTSService`.
|
||||
- Added input parameter options for `PlayHTTTSService` and
|
||||
`PlayHTHttpTTSService`.
|
||||
|
||||
### Changed
|
||||
|
||||
- Module `utils.audio` is now `audio.utils`. A new `resample_audio` function has
|
||||
been added.
|
||||
|
||||
- `PlayHTTTSService` now uses PlayHT websockets instead of HTTP requests.
|
||||
|
||||
- The previous `PlayHTTTSService` HTTP implementation is now
|
||||
`PlayHTHttpTTSService`.
|
||||
|
||||
- `PlayHTTTSService` and `PlayHTHttpTTSService` now use a `voice_engine` of
|
||||
`PlayHT3.0-mini`, which allows for multi-lingual support.
|
||||
- Changed the name of the HTTP PlayHT service from `PlayHTTTSService` to
|
||||
`PlayHTHttpTTSService` since there's now a websocket service, which is the
|
||||
default.
|
||||
|
||||
- Renamed `OpenAILLMServiceRealtimeBeta` to `OpenAIRealtimeBetaLLMService` to
|
||||
match other services.
|
||||
|
||||
|
||||
@@ -28,6 +28,7 @@ dependencies = [
|
||||
"protobuf~=4.25.4",
|
||||
"pydantic~=2.8.2",
|
||||
"pyloudnorm~=0.1.1",
|
||||
"scipy~=1.14.1",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
@@ -61,7 +62,6 @@ silero = [ "onnxruntime>=1.16.1" ]
|
||||
together = [ "openai~=1.50.2" ]
|
||||
websocket = [ "websockets~=13.1", "fastapi~=0.115.0" ]
|
||||
whisper = [ "faster-whisper~=1.0.3" ]
|
||||
xtts = [ "resampy~=0.4.3" ]
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
# All the following settings are optional:
|
||||
|
||||
@@ -7,6 +7,14 @@
|
||||
import audioop
|
||||
import numpy as np
|
||||
import pyloudnorm as pyln
|
||||
from scipy import signal
|
||||
|
||||
|
||||
def resample_audio(audio: bytes, original_rate: int, target_rate: int) -> bytes:
    """Resample 16-bit PCM audio from one sample rate to another.

    The buffer is interpreted as a flat sequence of native-endian ``int16``
    samples and resampled as a single stream with ``scipy.signal.resample``.

    Args:
        audio: Raw PCM bytes; the length must be a multiple of 2.
        original_rate: Sample rate of ``audio`` in Hz.
        target_rate: Desired sample rate in Hz.

    Returns:
        The resampled audio as ``int16`` PCM bytes. Empty input yields empty
        output, and identical rates return the samples unchanged.

    Raises:
        ValueError: If ``audio`` does not contain a whole number of int16
            samples (raised by ``np.frombuffer``).
    """
    audio_data = np.frombuffer(audio, dtype=np.int16)

    # Nothing to do for empty buffers or when the rates already match; this
    # also avoids a lossy round trip through floating point.
    if audio_data.size == 0 or original_rate == target_rate:
        return audio_data.tobytes()

    # The output length must be derived from the number of *samples*, not the
    # number of bytes: each int16 sample occupies two bytes, so using
    # len(audio) would produce twice as many samples as intended.
    num_samples = int(audio_data.size * target_rate / original_rate)
    if num_samples <= 0:
        return b""

    resampled_audio = signal.resample(audio_data, num_samples)

    # FFT-based resampling can overshoot the int16 range; round and clip
    # before casting so that peaks saturate instead of wrapping around.
    limits = np.iinfo(np.int16)
    clipped = np.clip(np.rint(resampled_audio), limits.min, limits.max)
    return clipped.astype(np.int16).tobytes()
|
||||
|
||||
|
||||
def normalize_value(value, min_value, max_value):
|
||||
@@ -10,7 +10,7 @@ from enum import Enum
|
||||
from loguru import logger
|
||||
from pydantic.main import BaseModel
|
||||
|
||||
from pipecat.utils.audio import calculate_audio_volume, exp_smoothing
|
||||
from pipecat.audio.utils import calculate_audio_volume, exp_smoothing
|
||||
|
||||
|
||||
class VADState(Enum):
|
||||
|
||||
@@ -9,9 +9,9 @@ import json
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.audio.utils import ulaw_to_pcm, pcm_to_ulaw
|
||||
from pipecat.frames.frames import AudioRawFrame, Frame, StartInterruptionFrame
|
||||
from pipecat.serializers.base_serializer import FrameSerializer
|
||||
from pipecat.utils.audio import ulaw_to_pcm, pcm_to_ulaw
|
||||
|
||||
|
||||
class TwilioFrameSerializer(FrameSerializer):
|
||||
|
||||
@@ -12,6 +12,7 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.utils import calculate_audio_volume, exp_smoothing
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
CancelFrame,
|
||||
@@ -35,11 +36,9 @@ from pipecat.metrics.metrics import MetricsData
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.audio import calculate_audio_volume
|
||||
from pipecat.utils.string import match_endofsentence
|
||||
from pipecat.utils.text.base_text_filter import BaseTextFilter
|
||||
from pipecat.utils.time import seconds_to_nanoseconds
|
||||
from pipecat.utils.utils import exp_smoothing
|
||||
|
||||
|
||||
class AIService(FrameProcessor):
|
||||
|
||||
@@ -7,9 +7,8 @@
|
||||
from typing import Any, AsyncGenerator, Dict
|
||||
|
||||
import aiohttp
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.utils import resample_audio
|
||||
from pipecat.frames.frames import (
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
@@ -21,12 +20,7 @@ from pipecat.frames.frames import (
|
||||
from pipecat.services.ai_services import TTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
|
||||
try:
|
||||
import resampy
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use XTTS, you need to `pip install pipecat-ai[xtts]`.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
from loguru import logger
|
||||
|
||||
|
||||
# The server below can connect to XTTS through a local running docker
|
||||
@@ -168,22 +162,16 @@ class XTTSService(TTSService):
|
||||
# Remove processed data from buffer
|
||||
buffer = buffer[48000:]
|
||||
|
||||
# Convert the byte data to numpy array for resampling
|
||||
audio_np = np.frombuffer(process_data, dtype=np.int16)
|
||||
# Resample the audio from 24000 Hz to 16000 Hz
|
||||
resampled_audio = resampy.resample(audio_np, 24000, 16000)
|
||||
# Convert the numpy array back to bytes
|
||||
resampled_audio_bytes = resampled_audio.astype(np.int16).tobytes()
|
||||
resampled_audio = resample_audio(bytes(process_data), 24000, 16000)
|
||||
# Create the frame with the resampled audio
|
||||
frame = TTSAudioRawFrame(resampled_audio_bytes, 16000, 1)
|
||||
frame = TTSAudioRawFrame(resampled_audio, 16000, 1)
|
||||
yield frame
|
||||
|
||||
# Process any remaining data in the buffer
|
||||
if len(buffer) > 0:
|
||||
audio_np = np.frombuffer(buffer, dtype=np.int16)
|
||||
resampled_audio = resampy.resample(audio_np, 24000, 16000)
|
||||
resampled_audio_bytes = resampled_audio.astype(np.int16).tobytes()
|
||||
frame = TTSAudioRawFrame(resampled_audio_bytes, 16000, 1)
|
||||
resampled_audio = resample_audio(bytes(buffer), 24000, 16000)
|
||||
frame = TTSAudioRawFrame(resampled_audio, 16000, 1)
|
||||
yield frame
|
||||
|
||||
yield TTSStoppedFrame()
|
||||
|
||||
@@ -8,11 +8,9 @@ import asyncio
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Awaitable, Callable, List
|
||||
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
from scipy import signal
|
||||
|
||||
from pipecat.audio.utils import resample_audio
|
||||
from pipecat.audio.vad.vad_analyzer import VADAnalyzer
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
@@ -30,6 +28,8 @@ from pipecat.transports.base_input import BaseInputTransport
|
||||
from pipecat.transports.base_output import BaseOutputTransport
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
|
||||
from loguru import logger
|
||||
|
||||
try:
|
||||
from livekit import rtc
|
||||
from tenacity import retry, stop_after_attempt, wait_exponential
|
||||
@@ -381,12 +381,12 @@ class LiveKitInputTransport(BaseInputTransport):
|
||||
self, audio_frame_event: rtc.AudioFrameEvent
|
||||
) -> AudioRawFrame:
|
||||
audio_frame = audio_frame_event.frame
|
||||
audio_data = np.frombuffer(audio_frame.data, dtype=np.int16)
|
||||
audio_data = audio_frame.data
|
||||
original_sample_rate = audio_frame.sample_rate
|
||||
|
||||
# Allow 8kHz and 16kHz, convert anything else to 16kHz
|
||||
if original_sample_rate not in [8000, 16000]:
|
||||
audio_data = self._resample_audio(audio_data, original_sample_rate, 16000)
|
||||
audio_data = resample_audio(audio_data, original_sample_rate, 16000)
|
||||
sample_rate = 16000
|
||||
else:
|
||||
sample_rate = original_sample_rate
|
||||
@@ -400,18 +400,11 @@ class LiveKitInputTransport(BaseInputTransport):
|
||||
)
|
||||
|
||||
return AudioRawFrame(
|
||||
audio=audio_data.tobytes(),
|
||||
audio=audio_data,
|
||||
sample_rate=sample_rate,
|
||||
num_channels=audio_frame.num_channels,
|
||||
)
|
||||
|
||||
def _resample_audio(
|
||||
self, audio_data: np.ndarray, original_rate: int, target_rate: int
|
||||
) -> np.ndarray:
|
||||
num_samples = int(len(audio_data) * target_rate / original_rate)
|
||||
resampled_audio = signal.resample(audio_data, num_samples)
|
||||
return resampled_audio.astype(np.int16)
|
||||
|
||||
|
||||
class LiveKitOutputTransport(BaseOutputTransport):
|
||||
def __init__(self, client: LiveKitTransportClient, params: LiveKitParams, **kwargs):
|
||||
|
||||
@@ -36,7 +36,3 @@ def obj_count(obj) -> int:
|
||||
0
|
||||
"""
|
||||
return next(_COUNTS[obj.__class__.__name__])
|
||||
|
||||
|
||||
def exp_smoothing(value: float, prev_value: float, factor: float) -> float:
|
||||
return prev_value + factor * (value - prev_value)
|
||||
|
||||
@@ -22,7 +22,7 @@ pydantic~=2.8.2
|
||||
pyloudnorm~=0.1.1
|
||||
pyht~=0.1.4
|
||||
python-dotenv~=1.0.1
|
||||
resampy~=0.4.3
|
||||
scipy~=1.14.1
|
||||
silero-vad~=5.1
|
||||
together~=1.2.7
|
||||
transformers~=4.44.0
|
||||
|
||||
Reference in New Issue
Block a user