Merge pull request #608 from pipecat-ai/aleix/add-audio-utils-and-resample

add audio utils and resample
2024-10-17 14:00:49 -07:00
parent c6d28bb0db d3f4ac61b6
commit d67e08be4d
10 changed files with 34 additions and 45 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,17 +9,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Added

-- Added a websocket service for PlayHT, called `PlayHTTTSService`.
 - Added input parameter options for `PlayHTTTSService` and
  `PlayHTHttpTTSService`.

 ### Changed

+- Module `utils.audio` is now `audio.utils`. A new `resample_audio` function has
+  been added.
+
+- `PlayHTTTSService` now uses PlayHT websockets instead of HTTP requests.
+
+- The previous `PlayHTTTSService` HTTP implementation is now
+  `PlayHTHttpTTSService`.
+
 - `PlayHTTTSService` and `PlayHTHttpTTSService` now use a `voice_engine` of
  `PlayHT3.0-mini`, which allows for multi-lingual support.
-- Changed the name of the HTTP PlayHT service from `PlayHTTTSService` to
-  `PlayHTHttpTTSService` since there's now a websocket service, which is the
-  default.
+
 - Renamed `OpenAILLMServiceRealtimeBeta` to `OpenAIRealtimeBetaLLMService` to
  match other services.

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
    "protobuf~=4.25.4",
    "pydantic~=2.8.2",
    "pyloudnorm~=0.1.1",
+    "scipy~=1.14.1",
 ]

 [project.urls]
@@ -61,7 +62,6 @@ silero = [ "onnxruntime>=1.16.1" ]
 together = [ "openai~=1.50.2" ]
 websocket = [ "websockets~=13.1", "fastapi~=0.115.0" ]
 whisper = [ "faster-whisper~=1.0.3" ]
-xtts = [ "resampy~=0.4.3" ]

 [tool.setuptools.packages.find]
 # All the following settings are optional:
--- a/src/pipecat/audio/utils.py
+++ b/src/pipecat/audio/utils.py
@@ -7,6 +7,32 @@
 import audioop
 import numpy as np
 import pyloudnorm as pyln
+from scipy import signal
+
+
+def resample_audio(audio: bytes, original_rate: int, target_rate: int) -> bytes:
+    """Resample 16-bit PCM audio from one sample rate to another.
+
+    Args:
+        audio: Raw audio, interpreted as signed 16-bit samples.
+        original_rate: Sample rate of `audio`, in Hz.
+        target_rate: Desired sample rate of the returned audio, in Hz.
+
+    Returns:
+        The resampled audio as raw 16-bit PCM bytes. The input is returned
+        unchanged when it is empty or already at `target_rate`.
+    """
+    if original_rate == target_rate or len(audio) == 0:
+        return audio
+    audio_data = np.frombuffer(audio, dtype=np.int16)
+    # Size the output from the number of samples, not the number of bytes:
+    # each int16 sample is two bytes, so len(audio) would double the length.
+    num_samples = int(len(audio_data) * target_rate / original_rate)
+    resampled_audio = signal.resample(audio_data, num_samples)
+    # Clamp before casting so overshoot from resampling cannot wrap around.
+    limits = np.iinfo(np.int16)
+    clipped_audio = np.clip(resampled_audio, limits.min, limits.max)
+    return clipped_audio.astype(np.int16).tobytes()


 def normalize_value(value, min_value, max_value):
--- a/src/pipecat/audio/vad/vad_analyzer.py
+++ b/src/pipecat/audio/vad/vad_analyzer.py
@@ -10,7 +10,7 @@ from enum import Enum
 from loguru import logger
 from pydantic.main import BaseModel

-from pipecat.utils.audio import calculate_audio_volume, exp_smoothing
+from pipecat.audio.utils import calculate_audio_volume, exp_smoothing


 class VADState(Enum):
--- a/src/pipecat/serializers/twilio.py
+++ b/src/pipecat/serializers/twilio.py
@@ -9,9 +9,9 @@ import json

 from pydantic import BaseModel

+from pipecat.audio.utils import ulaw_to_pcm, pcm_to_ulaw
 from pipecat.frames.frames import AudioRawFrame, Frame, StartInterruptionFrame
 from pipecat.serializers.base_serializer import FrameSerializer
-from pipecat.utils.audio import ulaw_to_pcm, pcm_to_ulaw


 class TwilioFrameSerializer(FrameSerializer):
--- a/src/pipecat/services/ai_services.py
+++ b/src/pipecat/services/ai_services.py
@@ -12,6 +12,7 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple

 from loguru import logger

+from pipecat.audio.utils import calculate_audio_volume, exp_smoothing
 from pipecat.frames.frames import (
    AudioRawFrame,
    CancelFrame,
@@ -35,11 +36,9 @@ from pipecat.metrics.metrics import MetricsData
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.transcriptions.language import Language
-from pipecat.utils.audio import calculate_audio_volume
 from pipecat.utils.string import match_endofsentence
 from pipecat.utils.text.base_text_filter import BaseTextFilter
 from pipecat.utils.time import seconds_to_nanoseconds
-from pipecat.utils.utils import exp_smoothing


 class AIService(FrameProcessor):
--- a/src/pipecat/services/xtts.py
+++ b/src/pipecat/services/xtts.py
@@ -7,9 +7,8 @@
 from typing import Any, AsyncGenerator, Dict

 import aiohttp
-import numpy as np
-from loguru import logger

+from pipecat.audio.utils import resample_audio
 from pipecat.frames.frames import (
    ErrorFrame,
    Frame,
@@ -21,12 +20,7 @@ from pipecat.frames.frames import (
 from pipecat.services.ai_services import TTSService
 from pipecat.transcriptions.language import Language

-try:
-    import resampy
-except ModuleNotFoundError as e:
-    logger.error(f"Exception: {e}")
-    logger.error("In order to use XTTS, you need to `pip install pipecat-ai[xtts]`.")
-    raise Exception(f"Missing module: {e}")
+from loguru import logger


 # The server below can connect to XTTS through a local running docker
@@ -168,22 +162,16 @@ class XTTSService(TTSService):
                        # Remove processed data from buffer
                        buffer = buffer[48000:]

-                        # Convert the byte data to numpy array for resampling
-                        audio_np = np.frombuffer(process_data, dtype=np.int16)
                        # Resample the audio from 24000 Hz to 16000 Hz
-                        resampled_audio = resampy.resample(audio_np, 24000, 16000)
-                        # Convert the numpy array back to bytes
-                        resampled_audio_bytes = resampled_audio.astype(np.int16).tobytes()
+                        resampled_audio = resample_audio(bytes(process_data), 24000, 16000)
                        # Create the frame with the resampled audio
-                        frame = TTSAudioRawFrame(resampled_audio_bytes, 16000, 1)
+                        frame = TTSAudioRawFrame(resampled_audio, 16000, 1)
                        yield frame

            # Process any remaining data in the buffer
            if len(buffer) > 0:
-                audio_np = np.frombuffer(buffer, dtype=np.int16)
-                resampled_audio = resampy.resample(audio_np, 24000, 16000)
-                resampled_audio_bytes = resampled_audio.astype(np.int16).tobytes()
-                frame = TTSAudioRawFrame(resampled_audio_bytes, 16000, 1)
+                resampled_audio = resample_audio(bytes(buffer), 24000, 16000)
+                frame = TTSAudioRawFrame(resampled_audio, 16000, 1)
                yield frame

            yield TTSStoppedFrame()
--- a/src/pipecat/transports/services/livekit.py
+++ b/src/pipecat/transports/services/livekit.py
@@ -8,11 +8,9 @@ import asyncio
 from dataclasses import dataclass
 from typing import Any, Awaitable, Callable, List

-import numpy as np
-from loguru import logger
 from pydantic import BaseModel
-from scipy import signal

+from pipecat.audio.utils import resample_audio
 from pipecat.audio.vad.vad_analyzer import VADAnalyzer
 from pipecat.frames.frames import (
    AudioRawFrame,
@@ -30,6 +28,8 @@ from pipecat.transports.base_input import BaseInputTransport
 from pipecat.transports.base_output import BaseOutputTransport
 from pipecat.transports.base_transport import BaseTransport, TransportParams

+from loguru import logger
+
 try:
    from livekit import rtc
    from tenacity import retry, stop_after_attempt, wait_exponential
@@ -381,12 +381,12 @@ class LiveKitInputTransport(BaseInputTransport):
        self, audio_frame_event: rtc.AudioFrameEvent
    ) -> AudioRawFrame:
        audio_frame = audio_frame_event.frame
-        audio_data = np.frombuffer(audio_frame.data, dtype=np.int16)
+        audio_data = audio_frame.data
        original_sample_rate = audio_frame.sample_rate

        # Allow 8kHz and 16kHz, convert anything else to 16kHz
        if original_sample_rate not in [8000, 16000]:
-            audio_data = self._resample_audio(audio_data, original_sample_rate, 16000)
+            audio_data = resample_audio(audio_data, original_sample_rate, 16000)
            sample_rate = 16000
        else:
            sample_rate = original_sample_rate
@@ -400,18 +400,11 @@ class LiveKitInputTransport(BaseInputTransport):
                )

        return AudioRawFrame(
-            audio=audio_data.tobytes(),
+            audio=audio_data,
            sample_rate=sample_rate,
            num_channels=audio_frame.num_channels,
        )

-    def _resample_audio(
-        self, audio_data: np.ndarray, original_rate: int, target_rate: int
-    ) -> np.ndarray:
-        num_samples = int(len(audio_data) * target_rate / original_rate)
-        resampled_audio = signal.resample(audio_data, num_samples)
-        return resampled_audio.astype(np.int16)
-

 class LiveKitOutputTransport(BaseOutputTransport):
    def __init__(self, client: LiveKitTransportClient, params: LiveKitParams, **kwargs):
--- a/src/pipecat/utils/utils.py
+++ b/src/pipecat/utils/utils.py
@@ -36,7 +36,3 @@ def obj_count(obj) -> int:
    0
    """
    return next(_COUNTS[obj.__class__.__name__])
-
-
-def exp_smoothing(value: float, prev_value: float, factor: float) -> float:
-    return prev_value + factor * (value - prev_value)
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -22,7 +22,7 @@ pydantic~=2.8.2
 pyloudnorm~=0.1.1
 pyht~=0.1.4
 python-dotenv~=1.0.1
-resampy~=0.4.3
+scipy~=1.14.1
 silero-vad~=5.1
 together~=1.2.7
 transformers~=4.44.0