From 81084237424d3eeedaf7ccf0d0bf7bca7f5fdb39 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Thu, 17 Oct 2024 10:41:58 -0700
Subject: [PATCH] transport(livekit): force specifying a vad analyzer

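Don't default to SileroVADAnalyzer() when no VAD analyzer is passed in
the transport params; callers that want VAD must now specify one
explicitly. Also, resample incoming audio to the configured input sample
rate (`audio_in_sample_rate`) instead of passing 8 kHz/16 kHz through
and converting everything else to 16 kHz.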
---
 CHANGELOG.md                               |  3 ++
 examples/foundational/01b-livekit-audio.py |  9 ++++--
 pyproject.toml                             |  2 +-
 src/pipecat/transports/services/livekit.py | 32 ++++------------------
 4 files changed, 16 insertions(+), 30 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index dde8ccec0..ddc78b7e2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -33,6 +33,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- Fixed an issue that would cause an error if no VAD analyzer was passed to
+  `LiveKitTransport` params.
+
 - Fixed `SileroVAD` processor to support interruptions properly.
 
 ### Other
diff --git a/examples/foundational/01b-livekit-audio.py b/examples/foundational/01b-livekit-audio.py
index 68e0d2803..a463adcf4 100644
--- a/examples/foundational/01b-livekit-audio.py
+++ b/examples/foundational/01b-livekit-audio.py
@@ -4,9 +4,6 @@ import os
 import sys
 
 import aiohttp
-from dotenv import load_dotenv
-from livekit import api  # pip install livekit-api
-from loguru import logger
 
 from pipecat.frames.frames import TextFrame
 from pipecat.pipeline.pipeline import Pipeline
@@ -15,6 +12,12 @@ from pipecat.pipeline.task import PipelineTask
 from pipecat.services.cartesia import CartesiaTTSService
 from pipecat.transports.services.livekit import LiveKitParams, LiveKitTransport
 
+from livekit import api
+
+from loguru import logger
+
+from dotenv import load_dotenv
+
 load_dotenv(override=True)
 
 logger.remove(0)
diff --git a/pyproject.toml b/pyproject.toml
index e294f44fc..9cfcf8f11 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,7 +51,7 @@ google = [ "google-generativeai~=0.7.2", "google-cloud-texttospeech~=2.17.2" ]
 gstreamer = [ "pygobject~=3.48.2" ]
 fireworks = [ "openai~=1.37.2" ]
 langchain = [ "langchain~=0.2.14", "langchain-community~=0.2.12", "langchain-openai~=0.1.20" ]
-livekit = [ "livekit~=0.13.1", "tenacity~=9.0.0" ]
+livekit = [ "livekit~=0.17.5", "livekit-api~=0.7.1", "tenacity~=8.5.0" ]
 lmnt = [ "lmnt~=1.1.4" ]
 local = [ "pyaudio~=0.2.14" ]
 moondream = [ "einops~=0.8.0", "timm~=1.0.8", "transformers~=4.44.0" ]
diff --git a/src/pipecat/transports/services/livekit.py b/src/pipecat/transports/services/livekit.py
index a6d261f69..3c0aca146 100644
--- a/src/pipecat/transports/services/livekit.py
+++ b/src/pipecat/transports/services/livekit.py
@@ -11,7 +11,6 @@ from typing import Any, Awaitable, Callable, List
 from pydantic import BaseModel
 
 from pipecat.audio.utils import resample_audio
-from pipecat.audio.vad.vad_analyzer import VADAnalyzer
 from pipecat.frames.frames import (
     AudioRawFrame,
     CancelFrame,
@@ -50,11 +49,7 @@ class LiveKitTransportMessageUrgentFrame(TransportMessageUrgentFrame):
 
 
 class LiveKitParams(TransportParams):
-    audio_out_sample_rate: int = 48000
-    audio_out_channels: int = 1
-    vad_enabled: bool = True
-    vad_analyzer: VADAnalyzer | None = None
-    audio_in_sample_rate: int = 16000
+    pass
 
 
 class LiveKitCallbacks(BaseModel):
@@ -310,11 +305,6 @@ class LiveKitInputTransport(BaseInputTransport):
         self._client = client
         self._audio_in_task = None
         self._vad_analyzer: VADAnalyzer | None = params.vad_analyzer
-        self._current_sample_rate: int = params.audio_in_sample_rate
-        if params.vad_enabled and not params.vad_analyzer:
-            self._vad_analyzer = VADAnalyzer(
-                sample_rate=self._current_sample_rate, num_channels=self._params.audio_in_channels
-            )
 
     async def start(self, frame: StartFrame):
         await super().start(frame)
@@ -384,24 +374,14 @@ class LiveKitInputTransport(BaseInputTransport):
         audio_data = audio_frame.data
         original_sample_rate = audio_frame.sample_rate
 
-        # Allow 8kHz and 16kHz, convert anything else to 16kHz
-        if original_sample_rate not in [8000, 16000]:
-            audio_data = resample_audio(audio_data, original_sample_rate, 16000)
-            sample_rate = 16000
-        else:
-            sample_rate = original_sample_rate
-
-        if sample_rate != self._current_sample_rate:
-            self._current_sample_rate = sample_rate
-            if self._params.vad_enabled:
-                self._vad_analyzer = VADAnalyzer(
-                    sample_rate=self._current_sample_rate,
-                    num_channels=self._params.audio_in_channels,
-                )
+        if original_sample_rate != self._params.audio_in_sample_rate:
+            audio_data = resample_audio(
+                audio_data, original_sample_rate, self._params.audio_in_sample_rate
+            )
 
         return AudioRawFrame(
             audio=audio_data,
-            sample_rate=sample_rate,
+            sample_rate=self._params.audio_in_sample_rate,
             num_channels=audio_frame.num_channels,
         )