diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1045974a3..dde8ccec0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,17 +9,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
-- Added a websocket service for PlayHT, called `PlayHTTTSService`.
 - Added input parameter options for `PlayHTTTSService` and
   `PlayHTHttpTTSService`.
 
 ### Changed
 
+- Module `utils.audio` is now `audio.utils`. A new `resample_audio` function has
+  been added.
+
+- `PlayHTTTSService` now uses PlayHT websockets instead of HTTP requests.
+
+- The previous `PlayHTTTSService` HTTP implementation is now
+  `PlayHTHttpTTSService`.
+
 - `PlayHTTTSService` and `PlayHTHttpTTSService` now use a `voice_engine` of
   `PlayHT3.0-mini`, which allows for multi-lingual support.
-- Changed the name of the HTTP PlayHT service from `PlayHTTTSService` to
-  `PlayHTHttpTTSService` since there's now a websocket service, which is the
-  default.
+
 - Renamed `OpenAILLMServiceRealtimeBeta` to `OpenAIRealtimeBetaLLMService` to
   match other services.
 
diff --git a/pyproject.toml b/pyproject.toml
index c67fe6c09..e294f44fc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
     "protobuf~=4.25.4",
     "pydantic~=2.8.2",
     "pyloudnorm~=0.1.1",
+    "scipy~=1.14.1",
 ]
 
 [project.urls]
@@ -61,7 +62,6 @@ silero = [ "onnxruntime>=1.16.1" ]
 together = [ "openai~=1.50.2" ]
 websocket = [ "websockets~=13.1", "fastapi~=0.115.0" ]
 whisper = [ "faster-whisper~=1.0.3" ]
-xtts = [ "resampy~=0.4.3" ]
 
 [tool.setuptools.packages.find]
 # All the following settings are optional:
diff --git a/src/pipecat/utils/audio.py b/src/pipecat/audio/utils.py
similarity index 81%
rename from src/pipecat/utils/audio.py
rename to src/pipecat/audio/utils.py
index 0764c6abb..6675a07ab 100644
--- a/src/pipecat/utils/audio.py
+++ b/src/pipecat/audio/utils.py
@@ -7,6 +7,37 @@
 import audioop
 import numpy as np
 import pyloudnorm as pyln
+from scipy import signal
+
+
+def resample_audio(audio: bytes, original_rate: int, target_rate: int) -> bytes:
+    """Resample signed 16-bit PCM audio to a different sample rate.
+
+    The buffer is treated as one flat sequence of samples; interleaved
+    multi-channel audio is not de-interleaved before resampling.
+
+    Args:
+        audio: Raw audio, interpreted as native-endian int16 samples.
+        original_rate: Sample rate of `audio`, in Hz.
+        target_rate: Sample rate of the returned audio, in Hz.
+
+    Returns:
+        The resampled audio as int16 PCM bytes, or `b""` when the input is
+        too short to produce any output sample.
+    """
+    audio_data = np.frombuffer(audio, dtype=np.int16)
+    # Size the output from the number of samples, not the number of bytes:
+    # every int16 sample is two bytes, so `len(audio)` would double the
+    # output length and stretch the audio.
+    num_samples = int(audio_data.size * target_rate / original_rate)
+    if num_samples <= 0:
+        return b""
+    resampled_audio = signal.resample(audio_data, num_samples)
+    # Round and saturate before narrowing so resampler overshoot cannot wrap
+    # around the int16 range.
+    limits = np.iinfo(np.int16)
+    resampled_audio = np.clip(np.rint(resampled_audio), limits.min, limits.max)
+    return resampled_audio.astype(np.int16).tobytes()
 
 
 def normalize_value(value, min_value, max_value):
diff --git a/src/pipecat/audio/vad/vad_analyzer.py b/src/pipecat/audio/vad/vad_analyzer.py
index fe2739b28..3387a1746 100644
--- a/src/pipecat/audio/vad/vad_analyzer.py
+++ b/src/pipecat/audio/vad/vad_analyzer.py
@@ -10,7 +10,7 @@ from enum import Enum
 from loguru import logger
 from pydantic.main import BaseModel
 
-from pipecat.utils.audio import calculate_audio_volume, exp_smoothing
+from pipecat.audio.utils import calculate_audio_volume, exp_smoothing
 
 
 class VADState(Enum):
diff --git a/src/pipecat/serializers/twilio.py b/src/pipecat/serializers/twilio.py
index c0d4c0c47..ebc62e484 100644
--- a/src/pipecat/serializers/twilio.py
+++ b/src/pipecat/serializers/twilio.py
@@ -9,9 +9,9 @@ import json
 
 from pydantic import BaseModel
 
+from pipecat.audio.utils import ulaw_to_pcm, pcm_to_ulaw
 from pipecat.frames.frames import AudioRawFrame, Frame, StartInterruptionFrame
 from pipecat.serializers.base_serializer import FrameSerializer
-from pipecat.utils.audio import ulaw_to_pcm, pcm_to_ulaw
 
 
 class TwilioFrameSerializer(FrameSerializer):
diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py
index ad6d20d8e..8e11ad6ee 100644
--- a/src/pipecat/services/ai_services.py
+++ b/src/pipecat/services/ai_services.py
@@ -12,6 +12,7 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
 
 from loguru import logger
 
+from pipecat.audio.utils import calculate_audio_volume, exp_smoothing
 from pipecat.frames.frames import (
     AudioRawFrame,
     CancelFrame,
@@ -35,11 +36,9 @@ from pipecat.metrics.metrics import MetricsData
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.transcriptions.language import Language
-from pipecat.utils.audio import calculate_audio_volume
 from pipecat.utils.string import match_endofsentence
 from pipecat.utils.text.base_text_filter import BaseTextFilter
 from pipecat.utils.time import seconds_to_nanoseconds
-from pipecat.utils.utils import exp_smoothing
 
 
 class AIService(FrameProcessor):
diff --git a/src/pipecat/services/xtts.py b/src/pipecat/services/xtts.py
index 6c1408553..1c444f9f1 100644
--- a/src/pipecat/services/xtts.py
+++ b/src/pipecat/services/xtts.py
@@ -7,9 +7,8 @@
 from typing import Any, AsyncGenerator, Dict
 
 import aiohttp
-import numpy as np
-from loguru import logger
 
+from pipecat.audio.utils import resample_audio
 from pipecat.frames.frames import (
     ErrorFrame,
     Frame,
@@ -21,12 +20,7 @@ from pipecat.frames.frames import (
 from pipecat.services.ai_services import TTSService
 from pipecat.transcriptions.language import Language
 
-try:
-    import resampy
-except ModuleNotFoundError as e:
-    logger.error(f"Exception: {e}")
-    logger.error("In order to use XTTS, you need to `pip install pipecat-ai[xtts]`.")
-    raise Exception(f"Missing module: {e}")
+from loguru import logger
 
 
 # The server below can connect to XTTS through a local running docker
@@ -168,22 +162,16 @@ class XTTSService(TTSService):
                         # Remove processed data from buffer
                         buffer = buffer[48000:]
 
-                        # Convert the byte data to numpy array for resampling
-                        audio_np = np.frombuffer(process_data, dtype=np.int16)
                         # Resample the audio from 24000 Hz to 16000 Hz
-                        resampled_audio = resampy.resample(audio_np, 24000, 16000)
-                        # Convert the numpy array back to bytes
-                        resampled_audio_bytes = resampled_audio.astype(np.int16).tobytes()
+                        resampled_audio = resample_audio(bytes(process_data), 24000, 16000)
                         # Create the frame with the resampled audio
-                        frame = TTSAudioRawFrame(resampled_audio_bytes, 16000, 1)
+                        frame = TTSAudioRawFrame(resampled_audio, 16000, 1)
                         yield frame
 
             # Process any remaining data in the buffer
             if len(buffer) > 0:
-                audio_np = np.frombuffer(buffer, dtype=np.int16)
-                resampled_audio = resampy.resample(audio_np, 24000, 16000)
-                resampled_audio_bytes = resampled_audio.astype(np.int16).tobytes()
-                frame = TTSAudioRawFrame(resampled_audio_bytes, 16000, 1)
+                resampled_audio = resample_audio(bytes(buffer), 24000, 16000)
+                frame = TTSAudioRawFrame(resampled_audio, 16000, 1)
                 yield frame
 
             yield TTSStoppedFrame()
diff --git a/src/pipecat/transports/services/livekit.py b/src/pipecat/transports/services/livekit.py
index 9cf2c617d..a6d261f69 100644
--- a/src/pipecat/transports/services/livekit.py
+++ b/src/pipecat/transports/services/livekit.py
@@ -8,11 +8,9 @@ import asyncio
 from dataclasses import dataclass
 from typing import Any, Awaitable, Callable, List
 
-import numpy as np
-from loguru import logger
 from pydantic import BaseModel
-from scipy import signal
 
+from pipecat.audio.utils import resample_audio
 from pipecat.audio.vad.vad_analyzer import VADAnalyzer
 from pipecat.frames.frames import (
     AudioRawFrame,
@@ -30,6 +28,8 @@ from pipecat.transports.base_input import BaseInputTransport
 from pipecat.transports.base_output import BaseOutputTransport
 from pipecat.transports.base_transport import BaseTransport, TransportParams
 
+from loguru import logger
+
 try:
     from livekit import rtc
     from tenacity import retry, stop_after_attempt, wait_exponential
@@ -381,12 +381,12 @@ class LiveKitInputTransport(BaseInputTransport):
         self, audio_frame_event: rtc.AudioFrameEvent
     ) -> AudioRawFrame:
         audio_frame = audio_frame_event.frame
-        audio_data = np.frombuffer(audio_frame.data, dtype=np.int16)
+        audio_data = audio_frame.data
         original_sample_rate = audio_frame.sample_rate
 
         # Allow 8kHz and 16kHz, convert anything else to 16kHz
         if original_sample_rate not in [8000, 16000]:
-            audio_data = self._resample_audio(audio_data, original_sample_rate, 16000)
+            audio_data = resample_audio(audio_data, original_sample_rate, 16000)
             sample_rate = 16000
         else:
             sample_rate = original_sample_rate
@@ -400,18 +400,11 @@ class LiveKitInputTransport(BaseInputTransport):
                 )
 
         return AudioRawFrame(
-            audio=audio_data.tobytes(),
+            audio=audio_data,
             sample_rate=sample_rate,
             num_channels=audio_frame.num_channels,
         )
 
-    def _resample_audio(
-        self, audio_data: np.ndarray, original_rate: int, target_rate: int
-    ) -> np.ndarray:
-        num_samples = int(len(audio_data) * target_rate / original_rate)
-        resampled_audio = signal.resample(audio_data, num_samples)
-        return resampled_audio.astype(np.int16)
-
 
 class LiveKitOutputTransport(BaseOutputTransport):
     def __init__(self, client: LiveKitTransportClient, params: LiveKitParams, **kwargs):
diff --git a/src/pipecat/utils/utils.py b/src/pipecat/utils/utils.py
index e2df99389..14f1b541a 100644
--- a/src/pipecat/utils/utils.py
+++ b/src/pipecat/utils/utils.py
@@ -36,7 +36,3 @@ def obj_count(obj) -> int:
     0
     """
     return next(_COUNTS[obj.__class__.__name__])
-
-
-def exp_smoothing(value: float, prev_value: float, factor: float) -> float:
-    return prev_value + factor * (value - prev_value)
diff --git a/test-requirements.txt b/test-requirements.txt
index 62be00385..b2008606d 100644
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -22,7 +22,7 @@ pydantic~=2.8.2
 pyloudnorm~=0.1.1
 pyht~=0.1.4
 python-dotenv~=1.0.1
-resampy~=0.4.3
+scipy~=1.14.1
 silero-vad~=5.1
 together~=1.2.7
 transformers~=4.44.0