From 81084237424d3eeedaf7ccf0d0bf7bca7f5fdb39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Thu, 17 Oct 2024 10:41:58 -0700 Subject: [PATCH] transport(livekit): force specifying a vad analyzer Don't default to SileroVADAnalyzer(). Also, resample to input sample rate. --- CHANGELOG.md | 3 ++ examples/foundational/01b-livekit-audio.py | 9 ++++-- pyproject.toml | 2 +- src/pipecat/transports/services/livekit.py | 32 ++++------------------ 4 files changed, 16 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dde8ccec0..ddc78b7e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- Fixed an issue that would cause an error if no VAD analyzer was passed to + `LiveKitTransport` params. + - Fixed `SileroVAD` processor to support interruptions properly. ### Other diff --git a/examples/foundational/01b-livekit-audio.py b/examples/foundational/01b-livekit-audio.py index 68e0d2803..a463adcf4 100644 --- a/examples/foundational/01b-livekit-audio.py +++ b/examples/foundational/01b-livekit-audio.py @@ -4,9 +4,6 @@ import os import sys import aiohttp -from dotenv import load_dotenv -from livekit import api # pip install livekit-api -from loguru import logger from pipecat.frames.frames import TextFrame from pipecat.pipeline.pipeline import Pipeline @@ -15,6 +12,12 @@ from pipecat.pipeline.task import PipelineTask from pipecat.services.cartesia import CartesiaTTSService from pipecat.transports.services.livekit import LiveKitParams, LiveKitTransport +from livekit import api + +from loguru import logger + +from dotenv import load_dotenv + load_dotenv(override=True) logger.remove(0) diff --git a/pyproject.toml b/pyproject.toml index e294f44fc..9cfcf8f11 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ google = [ "google-generativeai~=0.7.2", "google-cloud-texttospeech~=2.17.2" ] gstreamer = [ "pygobject~=3.48.2" ] fireworks = [ "openai~=1.37.2" ] langchain = [ "langchain~=0.2.14", "langchain-community~=0.2.12", "langchain-openai~=0.1.20" ] -livekit = [ "livekit~=0.13.1", "tenacity~=9.0.0" ] +livekit = [ "livekit~=0.17.5", "livekit-api~=0.7.1", "tenacity~=8.5.0" ] lmnt = [ "lmnt~=1.1.4" ] local = [ "pyaudio~=0.2.14" ] moondream = [ "einops~=0.8.0", "timm~=1.0.8", "transformers~=4.44.0" ] diff --git a/src/pipecat/transports/services/livekit.py b/src/pipecat/transports/services/livekit.py index a6d261f69..3c0aca146 100644 --- a/src/pipecat/transports/services/livekit.py +++ b/src/pipecat/transports/services/livekit.py @@ -11,7 +11,6 @@ from typing import Any, Awaitable, Callable, List from pydantic import BaseModel from pipecat.audio.utils import resample_audio -from pipecat.audio.vad.vad_analyzer import VADAnalyzer from pipecat.frames.frames import ( AudioRawFrame, CancelFrame, @@ -50,11 +49,7 @@ class LiveKitTransportMessageUrgentFrame(TransportMessageUrgentFrame): class LiveKitParams(TransportParams): - audio_out_sample_rate: int = 48000 - audio_out_channels: int = 1 - vad_enabled: bool = True - vad_analyzer: VADAnalyzer | None = None - audio_in_sample_rate: int = 16000 + pass class LiveKitCallbacks(BaseModel): @@ -310,11 +305,6 @@ class LiveKitInputTransport(BaseInputTransport): self._client = client self._audio_in_task = None self._vad_analyzer: VADAnalyzer | None = params.vad_analyzer - self._current_sample_rate: int = params.audio_in_sample_rate - if params.vad_enabled and not params.vad_analyzer: - self._vad_analyzer = VADAnalyzer( - sample_rate=self._current_sample_rate, num_channels=self._params.audio_in_channels - ) async def start(self, frame: StartFrame): await super().start(frame) @@ -384,24 +374,14 @@ class LiveKitInputTransport(BaseInputTransport): audio_data = audio_frame.data original_sample_rate = audio_frame.sample_rate - # Allow 8kHz and 16kHz, convert anything else to 16kHz - if original_sample_rate not in [8000, 16000]: - audio_data = resample_audio(audio_data, original_sample_rate, 16000) - sample_rate = 16000 - else: - sample_rate = original_sample_rate - - if sample_rate != self._current_sample_rate: - self._current_sample_rate = sample_rate - if self._params.vad_enabled: - self._vad_analyzer = VADAnalyzer( - sample_rate=self._current_sample_rate, - num_channels=self._params.audio_in_channels, - ) + if original_sample_rate != self._params.audio_in_sample_rate: + audio_data = resample_audio( + audio_data, original_sample_rate, self._params.audio_in_sample_rate + ) return AudioRawFrame( audio=audio_data, - sample_rate=sample_rate, + sample_rate=self._params.audio_in_sample_rate, num_channels=audio_frame.num_channels, )