Merge pull request #608 from pipecat-ai/aleix/add-audio-utils-and-resample

add audio utils and resample
This commit is contained in:
Aleix Conchillo Flaqué
2024-10-17 14:00:49 -07:00
committed by GitHub
10 changed files with 34 additions and 45 deletions

View File

@@ -9,17 +9,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Added a websocket service for PlayHT, called `PlayHTTTSService`.
- Added input parameter options for `PlayHTTTSService` and
`PlayHTHttpTTSService`.
### Changed
- Module `pipecat.utils.audio` is now `pipecat.audio.utils`. A new
`resample_audio` function has been added to it.
- `PlayHTTTSService` now uses PlayHT websockets instead of HTTP requests.
- The previous `PlayHTTTSService` HTTP implementation is now
`PlayHTHttpTTSService`.
- `PlayHTTTSService` and `PlayHTHttpTTSService` now use a `voice_engine` of
`PlayHT3.0-mini`, which allows for multi-lingual support.
- Changed the name of the HTTP PlayHT service from `PlayHTTTSService` to
`PlayHTHttpTTSService` since there's now a websocket service, which is the
default.
- Renamed `OpenAILLMServiceRealtimeBeta` to `OpenAIRealtimeBetaLLMService` to
match other services.

View File

@@ -28,6 +28,7 @@ dependencies = [
"protobuf~=4.25.4",
"pydantic~=2.8.2",
"pyloudnorm~=0.1.1",
"scipy~=1.14.1",
]
[project.urls]
@@ -61,7 +62,6 @@ silero = [ "onnxruntime>=1.16.1" ]
together = [ "openai~=1.50.2" ]
websocket = [ "websockets~=13.1", "fastapi~=0.115.0" ]
whisper = [ "faster-whisper~=1.0.3" ]
xtts = [ "resampy~=0.4.3" ]
[tool.setuptools.packages.find]
# All the following settings are optional:

View File

@@ -7,6 +7,14 @@
import audioop
import numpy as np
import pyloudnorm as pyln
from scipy import signal
def resample_audio(audio: bytes, original_rate: int, target_rate: int) -> bytes:
    """Resample 16-bit PCM audio from one sample rate to another.

    Args:
        audio: Raw audio interpreted as a sequence of int16 samples.
        original_rate: Sample rate of ``audio`` in Hz.
        target_rate: Desired sample rate of the returned audio in Hz.

    Returns:
        The resampled audio as int16 sample bytes. If both rates are equal
        the input is returned unchanged (as ``bytes``); empty input yields
        ``b""``.
    """
    # Nothing to do when the rates already match.
    if original_rate == target_rate:
        return bytes(audio)

    audio_data = np.frombuffer(audio, dtype=np.int16)

    # The output length must be derived from the number of *samples*, not the
    # number of bytes: each int16 sample occupies two bytes, so using
    # len(audio) would produce twice as many samples as intended.
    num_samples = int(len(audio_data) * target_rate / original_rate)
    if num_samples <= 0:
        return b""

    resampled_audio = signal.resample(audio_data, num_samples)

    # The resampler works in floating point and may overshoot the int16 range;
    # clip before casting so out-of-range values saturate instead of wrapping.
    int16_limits = np.iinfo(np.int16)
    resampled_audio = np.clip(resampled_audio, int16_limits.min, int16_limits.max)
    return resampled_audio.astype(np.int16).tobytes()
def normalize_value(value, min_value, max_value):

View File

@@ -10,7 +10,7 @@ from enum import Enum
from loguru import logger
from pydantic.main import BaseModel
from pipecat.utils.audio import calculate_audio_volume, exp_smoothing
from pipecat.audio.utils import calculate_audio_volume, exp_smoothing
class VADState(Enum):

View File

@@ -9,9 +9,9 @@ import json
from pydantic import BaseModel
from pipecat.audio.utils import ulaw_to_pcm, pcm_to_ulaw
from pipecat.frames.frames import AudioRawFrame, Frame, StartInterruptionFrame
from pipecat.serializers.base_serializer import FrameSerializer
from pipecat.utils.audio import ulaw_to_pcm, pcm_to_ulaw
class TwilioFrameSerializer(FrameSerializer):

View File

@@ -12,6 +12,7 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
from loguru import logger
from pipecat.audio.utils import calculate_audio_volume, exp_smoothing
from pipecat.frames.frames import (
AudioRawFrame,
CancelFrame,
@@ -35,11 +36,9 @@ from pipecat.metrics.metrics import MetricsData
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.transcriptions.language import Language
from pipecat.utils.audio import calculate_audio_volume
from pipecat.utils.string import match_endofsentence
from pipecat.utils.text.base_text_filter import BaseTextFilter
from pipecat.utils.time import seconds_to_nanoseconds
from pipecat.utils.utils import exp_smoothing
class AIService(FrameProcessor):

View File

@@ -7,9 +7,8 @@
from typing import Any, AsyncGenerator, Dict
import aiohttp
import numpy as np
from loguru import logger
from pipecat.audio.utils import resample_audio
from pipecat.frames.frames import (
ErrorFrame,
Frame,
@@ -21,12 +20,7 @@ from pipecat.frames.frames import (
from pipecat.services.ai_services import TTSService
from pipecat.transcriptions.language import Language
try:
import resampy
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error("In order to use XTTS, you need to `pip install pipecat-ai[xtts]`.")
raise Exception(f"Missing module: {e}")
from loguru import logger
# The server below can connect to XTTS through a local running docker
@@ -168,22 +162,16 @@ class XTTSService(TTSService):
# Remove processed data from buffer
buffer = buffer[48000:]
# Convert the byte data to numpy array for resampling
audio_np = np.frombuffer(process_data, dtype=np.int16)
# Resample the audio from 24000 Hz to 16000 Hz
resampled_audio = resampy.resample(audio_np, 24000, 16000)
# Convert the numpy array back to bytes
resampled_audio_bytes = resampled_audio.astype(np.int16).tobytes()
resampled_audio = resample_audio(bytes(process_data), 24000, 16000)
# Create the frame with the resampled audio
frame = TTSAudioRawFrame(resampled_audio_bytes, 16000, 1)
frame = TTSAudioRawFrame(resampled_audio, 16000, 1)
yield frame
# Process any remaining data in the buffer
if len(buffer) > 0:
audio_np = np.frombuffer(buffer, dtype=np.int16)
resampled_audio = resampy.resample(audio_np, 24000, 16000)
resampled_audio_bytes = resampled_audio.astype(np.int16).tobytes()
frame = TTSAudioRawFrame(resampled_audio_bytes, 16000, 1)
resampled_audio = resample_audio(bytes(buffer), 24000, 16000)
frame = TTSAudioRawFrame(resampled_audio, 16000, 1)
yield frame
yield TTSStoppedFrame()

View File

@@ -8,11 +8,9 @@ import asyncio
from dataclasses import dataclass
from typing import Any, Awaitable, Callable, List
import numpy as np
from loguru import logger
from pydantic import BaseModel
from scipy import signal
from pipecat.audio.utils import resample_audio
from pipecat.audio.vad.vad_analyzer import VADAnalyzer
from pipecat.frames.frames import (
AudioRawFrame,
@@ -30,6 +28,8 @@ from pipecat.transports.base_input import BaseInputTransport
from pipecat.transports.base_output import BaseOutputTransport
from pipecat.transports.base_transport import BaseTransport, TransportParams
from loguru import logger
try:
from livekit import rtc
from tenacity import retry, stop_after_attempt, wait_exponential
@@ -381,12 +381,12 @@ class LiveKitInputTransport(BaseInputTransport):
self, audio_frame_event: rtc.AudioFrameEvent
) -> AudioRawFrame:
audio_frame = audio_frame_event.frame
audio_data = np.frombuffer(audio_frame.data, dtype=np.int16)
audio_data = audio_frame.data
original_sample_rate = audio_frame.sample_rate
# Allow 8kHz and 16kHz, convert anything else to 16kHz
if original_sample_rate not in [8000, 16000]:
audio_data = self._resample_audio(audio_data, original_sample_rate, 16000)
audio_data = resample_audio(audio_data, original_sample_rate, 16000)
sample_rate = 16000
else:
sample_rate = original_sample_rate
@@ -400,18 +400,11 @@ class LiveKitInputTransport(BaseInputTransport):
)
return AudioRawFrame(
audio=audio_data.tobytes(),
audio=audio_data,
sample_rate=sample_rate,
num_channels=audio_frame.num_channels,
)
def _resample_audio(
self, audio_data: np.ndarray, original_rate: int, target_rate: int
) -> np.ndarray:
num_samples = int(len(audio_data) * target_rate / original_rate)
resampled_audio = signal.resample(audio_data, num_samples)
return resampled_audio.astype(np.int16)
class LiveKitOutputTransport(BaseOutputTransport):
def __init__(self, client: LiveKitTransportClient, params: LiveKitParams, **kwargs):

View File

@@ -36,7 +36,3 @@ def obj_count(obj) -> int:
0
"""
return next(_COUNTS[obj.__class__.__name__])
def exp_smoothing(value: float, prev_value: float, factor: float) -> float:
    """Return ``prev_value`` moved toward ``value`` by the fraction ``factor``.

    A ``factor`` of 0 keeps ``prev_value``; a ``factor`` of 1 yields ``value``.
    """
    delta = value - prev_value
    return prev_value + factor * delta

View File

@@ -22,7 +22,7 @@ pydantic~=2.8.2
pyloudnorm~=0.1.1
pyht~=0.1.4
python-dotenv~=1.0.1
resampy~=0.4.3
scipy~=1.14.1
silero-vad~=5.1
together~=1.2.7
transformers~=4.44.0