diff --git a/CHANGELOG.md b/CHANGELOG.md index 60081503c..052033214 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -53,6 +53,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `GoogleLLMService` has been updated to use `google-genai` instead of the deprecated `google-generativeai`. +### Removed + +- Removed the `SileroVAD` frame processor; use `SileroVADAnalyzer` + instead. Also removed the `07a-interruptible-vad.py` example. + ### Other - Added an `open-telemetry-tracing` example, showing how to setup tracing. The diff --git a/examples/foundational/07a-interruptible-vad.py b/examples/foundational/07a-interruptible-vad.py deleted file mode 100644 index 90e8407b6..000000000 --- a/examples/foundational/07a-interruptible-vad.py +++ /dev/null @@ -1,107 +0,0 @@ -# -# Copyright (c) 2024–2025, Daily -# -# SPDX-License-Identifier: BSD 2-Clause License -# - -import argparse -import os - -from dotenv import load_dotenv -from loguru import logger - -from pipecat.pipeline.pipeline import Pipeline -from pipecat.pipeline.runner import PipelineRunner -from pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext -from pipecat.processors.audio.vad.silero import SileroVAD -from pipecat.services.cartesia.tts import CartesiaTTSService -from pipecat.services.deepgram.stt import DeepgramSTTService -from pipecat.services.openai.llm import OpenAILLMService -from pipecat.transports.base_transport import TransportParams -from pipecat.transports.network.small_webrtc import SmallWebRTCTransport -from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection - -load_dotenv(override=True) - - -async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace): - logger.info(f"Starting bot") - - transport = SmallWebRTCTransport( - webrtc_connection=webrtc_connection, - params=TransportParams( - audio_in_enabled=True, - audio_out_enabled=True, - ), 
- ) - - stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) - - vad = SileroVAD() - - tts = CartesiaTTSService( - api_key=os.getenv("CARTESIA_API_KEY"), - voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady - ) - - llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) - - messages = [ - { - "role": "system", - "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", - }, - ] - - context = OpenAILLMContext(messages) - context_aggregator = llm.create_context_aggregator(context) - - pipeline = Pipeline( - [ - transport.input(), - stt, - vad, - context_aggregator.user(), - llm, - tts, - transport.output(), - context_aggregator.assistant(), - ] - ) - - task = PipelineTask( - pipeline, - params=PipelineParams( - allow_interruptions=True, - enable_metrics=True, - enable_usage_metrics=True, - report_only_initial_ttfb=True, - ), - ) - - @transport.event_handler("on_client_connected") - async def on_client_connected(transport, client): - logger.info(f"Client connected") - # Kick off the conversation. 
- messages.append({"role": "system", "content": "Please introduce yourself to the user."}) - await task.queue_frames([context_aggregator.user().get_context_frame()]) - - @transport.event_handler("on_client_disconnected") - async def on_client_disconnected(transport, client): - logger.info(f"Client disconnected") - - @transport.event_handler("on_client_closed") - async def on_client_closed(transport, client): - logger.info(f"Client closed connection") - await task.cancel() - - runner = PipelineRunner(handle_sigint=False) - - await runner.run(task) - - -if __name__ == "__main__": - from run import main - - main() diff --git a/src/pipecat/processors/audio/vad/__init__.py b/src/pipecat/processors/audio/vad/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/pipecat/processors/audio/vad/silero.py b/src/pipecat/processors/audio/vad/silero.py deleted file mode 100644 index edfe484ba..000000000 --- a/src/pipecat/processors/audio/vad/silero.py +++ /dev/null @@ -1,97 +0,0 @@ -# -# Copyright (c) 2024–2025, Daily -# -# SPDX-License-Identifier: BSD 2-Clause License -# - -from typing import Optional - -from loguru import logger - -from pipecat.audio.vad.silero import SileroVADAnalyzer -from pipecat.audio.vad.vad_analyzer import VADParams, VADState -from pipecat.frames.frames import ( - AudioRawFrame, - Frame, - StartFrame, - StartInterruptionFrame, - StopInterruptionFrame, - UserStartedSpeakingFrame, - UserStoppedSpeakingFrame, -) -from pipecat.processors.frame_processor import FrameDirection, FrameProcessor - - -class SileroVAD(FrameProcessor): - def __init__( - self, - *, - sample_rate: Optional[int] = None, - vad_params: VADParams = VADParams(), - audio_passthrough: bool = False, - ): - super().__init__() - - self._vad_analyzer = SileroVADAnalyzer(sample_rate=sample_rate, params=vad_params) - self._audio_passthrough = audio_passthrough - - self._processor_vad_state: VADState = VADState.QUIET - - # - # FrameProcessor - # - - async def 
process_frame(self, frame: Frame, direction: FrameDirection): - await super().process_frame(frame, direction) - - if isinstance(frame, StartFrame): - self._vad_analyzer.set_sample_rate(frame.audio_in_sample_rate) - - if isinstance(frame, AudioRawFrame): - await self._analyze_audio(frame) - if self._audio_passthrough: - await self.push_frame(frame, direction) - else: - await self.push_frame(frame, direction) - - # - # Handle interruptions - # - - async def _handle_interruptions(self, frame: Frame): - if self.interruptions_allowed: - # Make sure we notify about interruptions quickly out-of-band. - if isinstance(frame, UserStartedSpeakingFrame): - logger.debug("User started speaking") - await self._start_interruption() - # Push an out-of-band frame (i.e. not using the ordered push - # frame task) to stop everything, specially at the output - # transport. - await self.push_frame(StartInterruptionFrame()) - elif isinstance(frame, UserStoppedSpeakingFrame): - logger.debug("User stopped speaking") - await self._stop_interruption() - await self.push_frame(StopInterruptionFrame()) - - await self.push_frame(frame) - - async def _analyze_audio(self, frame: AudioRawFrame): - # Check VAD and push event if necessary. We just care about changes - # from QUIET to SPEAKING and vice versa. - new_vad_state = self._vad_analyzer.analyze_audio(frame.audio) - if ( - new_vad_state != self._processor_vad_state - and new_vad_state != VADState.STARTING - and new_vad_state != VADState.STOPPING - ): - new_frame = None - - if new_vad_state == VADState.SPEAKING: - new_frame = UserStartedSpeakingFrame() - elif new_vad_state == VADState.QUIET: - new_frame = UserStoppedSpeakingFrame() - - if new_frame: - await self._handle_interruptions(new_frame) - - self._processor_vad_state = new_vad_state