diff --git a/CHANGELOG.md b/CHANGELOG.md index aa07b98f8..1045974a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Renamed `OpenAILLMServiceRealtimeBeta` to `OpenAIRealtimeBetaLLMService` to match other services. +- The `vad` package is now deprecated and `audio.vad` should be used + instead. The `vad` package will be removed in a future release. + ### Fixed - Fixed `SileroVAD` processor to support interruptions properly. diff --git a/examples/canonical-metrics/bot.py b/examples/canonical-metrics/bot.py index efad7710c..e61dd375d 100644 --- a/examples/canonical-metrics/bot.py +++ b/examples/canonical-metrics/bot.py @@ -14,6 +14,7 @@ from dotenv import load_dotenv from loguru import logger from runner import configure +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import EndFrame, LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -27,7 +28,6 @@ from pipecat.services.canonical import CanonicalMetricsService from pipecat.services.elevenlabs import ElevenLabsTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer load_dotenv(override=True) diff --git a/examples/chatbot-audio-recording/bot.py b/examples/chatbot-audio-recording/bot.py index 6acdb08e6..4cec2a996 100644 --- a/examples/chatbot-audio-recording/bot.py +++ b/examples/chatbot-audio-recording/bot.py @@ -13,6 +13,7 @@ from dotenv import load_dotenv from loguru import logger from runner import configure +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import EndFrame, LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -25,7 +26,6 @@ from pipecat.processors.audio.audio_buffer_processor 
import AudioBufferProcessor from pipecat.services.elevenlabs import ElevenLabsTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer load_dotenv(override=True) diff --git a/examples/deployment/flyio-example/bot.py b/examples/deployment/flyio-example/bot.py index b7378c0ff..079f88d95 100644 --- a/examples/deployment/flyio-example/bot.py +++ b/examples/deployment/flyio-example/bot.py @@ -3,6 +3,7 @@ import os import sys import argparse +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask @@ -14,7 +15,6 @@ from pipecat.frames.frames import LLMMessagesFrame, EndFrame from pipecat.services.openai import OpenAILLMService from pipecat.services.elevenlabs import ElevenLabsTTSService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from loguru import logger diff --git a/examples/dialin-chatbot/bot_daily.py b/examples/dialin-chatbot/bot_daily.py index 2645c65a0..f5939b4df 100644 --- a/examples/dialin-chatbot/bot_daily.py +++ b/examples/dialin-chatbot/bot_daily.py @@ -3,6 +3,7 @@ import os import sys import argparse +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask @@ -14,7 +15,7 @@ from pipecat.frames.frames import LLMMessagesFrame, EndFrame from pipecat.services.elevenlabs import ElevenLabsTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport, DailyDialinSettings -from pipecat.vad.silero import SileroVADAnalyzer + from loguru import logger from dotenv import 
load_dotenv diff --git a/examples/dialin-chatbot/bot_twilio.py b/examples/dialin-chatbot/bot_twilio.py index c2fe144a6..1cf32afdf 100644 --- a/examples/dialin-chatbot/bot_twilio.py +++ b/examples/dialin-chatbot/bot_twilio.py @@ -3,6 +3,7 @@ import os import sys import argparse +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask @@ -14,7 +15,6 @@ from pipecat.frames.frames import LLMMessagesFrame, EndFrame from pipecat.services.elevenlabs import ElevenLabsTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from twilio.rest import Client diff --git a/examples/foundational/06-listen-and-respond.py b/examples/foundational/06-listen-and-respond.py index ce9e235f5..928473056 100644 --- a/examples/foundational/06-listen-and-respond.py +++ b/examples/foundational/06-listen-and-respond.py @@ -9,6 +9,7 @@ import aiohttp import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import Frame, LLMMessagesFrame, MetricsFrame from pipecat.metrics.metrics import ( TTFBMetricsData, @@ -18,7 +19,7 @@ from pipecat.metrics.metrics import ( ) from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner -from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.pipeline.task import PipelineTask from pipecat.processors.aggregators.llm_response import ( LLMAssistantResponseAggregator, LLMUserResponseAggregator, @@ -27,7 +28,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from 
pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/foundational/06a-image-sync.py b/examples/foundational/06a-image-sync.py index 30bd8dc64..500fea3a5 100644 --- a/examples/foundational/06a-image-sync.py +++ b/examples/foundational/06a-image-sync.py @@ -11,6 +11,7 @@ import sys from PIL import Image +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import Frame, OutputImageRawFrame, SystemFrame, TextFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -23,7 +24,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.services.cartesia import CartesiaHttpTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from pipecat.transports.services.daily import DailyParams from runner import configure diff --git a/examples/foundational/07-interruptible.py b/examples/foundational/07-interruptible.py index 8026940f8..458edc8ed 100644 --- a/examples/foundational/07-interruptible.py +++ b/examples/foundational/07-interruptible.py @@ -9,6 +9,7 @@ import aiohttp import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -20,7 +21,6 @@ from pipecat.processors.aggregators.llm_response import ( from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/foundational/07a-interruptible-anthropic.py b/examples/foundational/07a-interruptible-anthropic.py index 288cb1b31..406f5fef1 100644 --- 
a/examples/foundational/07a-interruptible-anthropic.py +++ b/examples/foundational/07a-interruptible-anthropic.py @@ -13,6 +13,7 @@ from dotenv import load_dotenv from loguru import logger from runner import configure +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -21,7 +22,6 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext from pipecat.services.anthropic import AnthropicLLMService from pipecat.services.cartesia import CartesiaTTSService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer load_dotenv(override=True) diff --git a/examples/foundational/07b-interruptible-langchain.py b/examples/foundational/07b-interruptible-langchain.py index 5ebfd3388..ec6cdb0e2 100644 --- a/examples/foundational/07b-interruptible-langchain.py +++ b/examples/foundational/07b-interruptible-langchain.py @@ -10,6 +10,7 @@ import sys import aiohttp +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -21,7 +22,6 @@ from pipecat.processors.aggregators.llm_response import ( from pipecat.processors.frameworks.langchain import LangchainProcessor from pipecat.services.cartesia import CartesiaTTSService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder from langchain_community.chat_message_histories import ChatMessageHistory diff --git a/examples/foundational/07c-interruptible-deepgram.py b/examples/foundational/07c-interruptible-deepgram.py index fc33c246f..e913005e1 100644 --- a/examples/foundational/07c-interruptible-deepgram.py +++ 
b/examples/foundational/07c-interruptible-deepgram.py @@ -13,6 +13,7 @@ from dotenv import load_dotenv from loguru import logger from runner import configure +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -24,7 +25,6 @@ from pipecat.processors.aggregators.llm_response import ( from pipecat.services.deepgram import DeepgramSTTService, DeepgramTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer load_dotenv(override=True) diff --git a/examples/foundational/07d-interruptible-elevenlabs.py b/examples/foundational/07d-interruptible-elevenlabs.py index c8a32d872..fd6a5a1d5 100644 --- a/examples/foundational/07d-interruptible-elevenlabs.py +++ b/examples/foundational/07d-interruptible-elevenlabs.py @@ -13,6 +13,7 @@ from dotenv import load_dotenv from loguru import logger from runner import configure +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -24,7 +25,6 @@ from pipecat.processors.aggregators.llm_response import ( from pipecat.services.elevenlabs import ElevenLabsTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer load_dotenv(override=True) diff --git a/examples/foundational/07e-interruptible-playht.py b/examples/foundational/07e-interruptible-playht.py index 9185e2390..520fb40bd 100644 --- a/examples/foundational/07e-interruptible-playht.py +++ b/examples/foundational/07e-interruptible-playht.py @@ -13,6 +13,7 @@ from dotenv import load_dotenv from loguru import logger from runner 
import configure +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -25,7 +26,6 @@ from pipecat.services.openai import OpenAILLMService from pipecat.services.playht import PlayHTTTSService from pipecat.transcriptions.language import Language from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer load_dotenv(override=True) diff --git a/examples/foundational/07f-interruptible-azure.py b/examples/foundational/07f-interruptible-azure.py index 11bfebe53..eb7745df0 100644 --- a/examples/foundational/07f-interruptible-azure.py +++ b/examples/foundational/07f-interruptible-azure.py @@ -9,6 +9,7 @@ import asyncio import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -19,7 +20,6 @@ from pipecat.processors.aggregators.llm_response import ( ) from pipecat.services.azure import AzureLLMService, AzureSTTService, AzureTTSService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/foundational/07g-interruptible-openai-tts.py b/examples/foundational/07g-interruptible-openai-tts.py index cabf1245e..56f94d568 100644 --- a/examples/foundational/07g-interruptible-openai-tts.py +++ b/examples/foundational/07g-interruptible-openai-tts.py @@ -13,6 +13,7 @@ from dotenv import load_dotenv from loguru import logger from runner import configure +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -23,7 +24,6 @@ from 
pipecat.processors.aggregators.llm_response import ( ) from pipecat.services.openai import OpenAILLMService, OpenAITTSService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer load_dotenv(override=True) diff --git a/examples/foundational/07h-interruptible-openpipe.py b/examples/foundational/07h-interruptible-openpipe.py index b87563bd3..afe378f47 100644 --- a/examples/foundational/07h-interruptible-openpipe.py +++ b/examples/foundational/07h-interruptible-openpipe.py @@ -9,6 +9,7 @@ import aiohttp import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -20,7 +21,6 @@ from pipecat.processors.aggregators.llm_response import ( from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.openpipe import OpenPipeLLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/foundational/07i-interruptible-xtts.py b/examples/foundational/07i-interruptible-xtts.py index 2e6f95433..f51487ec2 100644 --- a/examples/foundational/07i-interruptible-xtts.py +++ b/examples/foundational/07i-interruptible-xtts.py @@ -9,6 +9,7 @@ import aiohttp import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -17,11 +18,9 @@ from pipecat.processors.aggregators.llm_response import ( LLMAssistantResponseAggregator, LLMUserResponseAggregator, ) -from pipecat.services.deepgram import DeepgramSTTService, DeepgramTTSService from pipecat.services.openai import OpenAILLMService from pipecat.services.xtts import XTTSService from 
pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/foundational/07j-interruptible-gladia.py b/examples/foundational/07j-interruptible-gladia.py index dc07ec7ba..f2d90761a 100644 --- a/examples/foundational/07j-interruptible-gladia.py +++ b/examples/foundational/07j-interruptible-gladia.py @@ -9,6 +9,7 @@ import aiohttp import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -21,7 +22,6 @@ from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.gladia import GladiaSTTService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/foundational/07k-interruptible-lmnt.py b/examples/foundational/07k-interruptible-lmnt.py index fb231c7bc..9056437ef 100644 --- a/examples/foundational/07k-interruptible-lmnt.py +++ b/examples/foundational/07k-interruptible-lmnt.py @@ -9,6 +9,7 @@ import asyncio import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -20,7 +21,6 @@ from pipecat.processors.aggregators.llm_response import ( from pipecat.services.lmnt import LmntTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/foundational/07l-interruptible-together.py b/examples/foundational/07l-interruptible-together.py index a99b07a1a..0010b9643 100644 --- 
a/examples/foundational/07l-interruptible-together.py +++ b/examples/foundational/07l-interruptible-together.py @@ -13,6 +13,7 @@ from dotenv import load_dotenv from loguru import logger from runner import configure +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -21,7 +22,6 @@ from pipecat.services.ai_services import OpenAILLMContext from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.together import TogetherLLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer load_dotenv(override=True) diff --git a/examples/foundational/07m-interruptible-aws.py b/examples/foundational/07m-interruptible-aws.py index 69d4b84c1..7cc1440ef 100644 --- a/examples/foundational/07m-interruptible-aws.py +++ b/examples/foundational/07m-interruptible-aws.py @@ -13,6 +13,7 @@ from dotenv import load_dotenv from loguru import logger from runner import configure +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -25,7 +26,6 @@ from pipecat.services.aws import AWSTTSService from pipecat.services.deepgram import DeepgramSTTService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer load_dotenv(override=True) diff --git a/examples/foundational/07n-interruptible-google.py b/examples/foundational/07n-interruptible-google.py index 55c931cf6..b25ad185f 100644 --- a/examples/foundational/07n-interruptible-google.py +++ b/examples/foundational/07n-interruptible-google.py @@ -13,6 +13,7 @@ from dotenv import load_dotenv from loguru import logger from runner 
import configure +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -25,7 +26,6 @@ from pipecat.services.deepgram import DeepgramSTTService from pipecat.services.google import GoogleTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer load_dotenv(override=True) diff --git a/examples/foundational/10-wake-phrase.py b/examples/foundational/10-wake-phrase.py index 860cda7d0..9bc9a0b9e 100644 --- a/examples/foundational/10-wake-phrase.py +++ b/examples/foundational/10-wake-phrase.py @@ -9,6 +9,7 @@ import aiohttp import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.processors.filters.wake_check_filter import WakeCheckFilter from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -20,7 +21,6 @@ from pipecat.processors.aggregators.llm_response import ( from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/foundational/11-sound-effects.py b/examples/foundational/11-sound-effects.py index 89b7ea93c..c6c486368 100644 --- a/examples/foundational/11-sound-effects.py +++ b/examples/foundational/11-sound-effects.py @@ -10,6 +10,7 @@ import os import sys import wave +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import ( Frame, LLMFullResponseEndFrame, @@ -28,7 +29,6 @@ from pipecat.processors.logger import FrameLogger from pipecat.services.cartesia import CartesiaHttpTTSService from pipecat.services.openai import OpenAILLMService from 
pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/foundational/12-describe-video.py b/examples/foundational/12-describe-video.py index 6b24190d0..3b17ba040 100644 --- a/examples/foundational/12-describe-video.py +++ b/examples/foundational/12-describe-video.py @@ -9,6 +9,7 @@ import aiohttp import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -19,7 +20,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.moondream import MoondreamService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/foundational/12a-describe-video-gemini-flash.py b/examples/foundational/12a-describe-video-gemini-flash.py index 440564d23..7f00daa68 100644 --- a/examples/foundational/12a-describe-video-gemini-flash.py +++ b/examples/foundational/12a-describe-video-gemini-flash.py @@ -9,6 +9,7 @@ import aiohttp import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -19,7 +20,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.google import GoogleLLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git 
a/examples/foundational/12b-describe-video-gpt-4o.py b/examples/foundational/12b-describe-video-gpt-4o.py index 1d2865004..035d1dddd 100644 --- a/examples/foundational/12b-describe-video-gpt-4o.py +++ b/examples/foundational/12b-describe-video-gpt-4o.py @@ -9,6 +9,7 @@ import aiohttp import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -19,7 +20,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/foundational/12c-describe-video-anthropic.py b/examples/foundational/12c-describe-video-anthropic.py index c7267467a..e11c02f49 100644 --- a/examples/foundational/12c-describe-video-anthropic.py +++ b/examples/foundational/12c-describe-video-anthropic.py @@ -9,6 +9,7 @@ import aiohttp import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -19,7 +20,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.anthropic import AnthropicLLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/foundational/14-function-calling.py b/examples/foundational/14-function-calling.py index e1432b6ca..aabc20e86 100644 --- 
a/examples/foundational/14-function-calling.py +++ b/examples/foundational/14-function-calling.py @@ -9,13 +9,13 @@ import aiohttp import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.openai import OpenAILLMContext, OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from openai.types.chat import ChatCompletionToolParam diff --git a/examples/foundational/14a-function-calling-anthropic.py b/examples/foundational/14a-function-calling-anthropic.py index 05042c65b..7afab6b84 100644 --- a/examples/foundational/14a-function-calling-anthropic.py +++ b/examples/foundational/14a-function-calling-anthropic.py @@ -9,6 +9,7 @@ import aiohttp import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask @@ -16,7 +17,6 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.anthropic import AnthropicLLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/foundational/14b-function-calling-anthropic-video.py b/examples/foundational/14b-function-calling-anthropic-video.py index 8a8110487..ff832f873 100644 --- a/examples/foundational/14b-function-calling-anthropic-video.py +++ b/examples/foundational/14b-function-calling-anthropic-video.py @@ -9,6 +9,7 @@ import aiohttp import os import sys +from pipecat.audio.vad.silero import 
SileroVADAnalyzer from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask @@ -16,7 +17,6 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.anthropic import AnthropicLLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/foundational/14c-function-calling-together.py b/examples/foundational/14c-function-calling-together.py index ebfc4b5df..391a96887 100644 --- a/examples/foundational/14c-function-calling-together.py +++ b/examples/foundational/14c-function-calling-together.py @@ -9,6 +9,7 @@ import aiohttp import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineTask @@ -16,7 +17,6 @@ from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.openai import OpenAILLMContext from pipecat.services.together import TogetherLLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from openai.types.chat import ChatCompletionToolParam diff --git a/examples/foundational/14d-function-calling-video.py b/examples/foundational/14d-function-calling-video.py index f42665d5b..5e149c815 100644 --- a/examples/foundational/14d-function-calling-video.py +++ b/examples/foundational/14d-function-calling-video.py @@ -9,13 +9,13 @@ import aiohttp import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineTask from pipecat.services.cartesia 
import CartesiaTTSService from pipecat.services.openai import OpenAILLMContext, OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from openai.types.chat import ChatCompletionToolParam diff --git a/examples/foundational/15-switch-voices.py b/examples/foundational/15-switch-voices.py index 4feaa4bbf..d874d0fba 100644 --- a/examples/foundational/15-switch-voices.py +++ b/examples/foundational/15-switch-voices.py @@ -9,6 +9,7 @@ import asyncio import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.parallel_pipeline import ParallelPipeline @@ -19,7 +20,6 @@ from pipecat.processors.filters.function_filter import FunctionFilter from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from openai.types.chat import ChatCompletionToolParam diff --git a/examples/foundational/15a-switch-languages.py b/examples/foundational/15a-switch-languages.py index 8c47ad963..f3310366a 100644 --- a/examples/foundational/15a-switch-languages.py +++ b/examples/foundational/15a-switch-languages.py @@ -9,6 +9,7 @@ import aiohttp import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.parallel_pipeline import ParallelPipeline @@ -20,7 +21,6 @@ from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.openai import OpenAILLMService from pipecat.services.whisper import Model, WhisperSTTService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from 
openai.types.chat import ChatCompletionToolParam diff --git a/examples/foundational/16-gpu-container-local-bot.py b/examples/foundational/16-gpu-container-local-bot.py index 55286eed5..ce4b923d5 100644 --- a/examples/foundational/16-gpu-container-local-bot.py +++ b/examples/foundational/16-gpu-container-local-bot.py @@ -13,6 +13,7 @@ from dotenv import load_dotenv from loguru import logger from runner import configure +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -28,7 +29,6 @@ from pipecat.transports.services.daily import ( DailyTransport, DailyTransportMessageFrame, ) -from pipecat.vad.silero import SileroVADAnalyzer load_dotenv(override=True) diff --git a/examples/foundational/17-detect-user-idle.py b/examples/foundational/17-detect-user-idle.py index 91835f8b3..79ea712ab 100644 --- a/examples/foundational/17-detect-user-idle.py +++ b/examples/foundational/17-detect-user-idle.py @@ -9,6 +9,7 @@ import aiohttp import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -21,7 +22,6 @@ from pipecat.processors.user_idle_processor import UserIdleProcessor from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/foundational/19-openai-realtime-beta.py b/examples/foundational/19-openai-realtime-beta.py index 41b0f418a..e9cb02f23 100644 --- a/examples/foundational/19-openai-realtime-beta.py +++ b/examples/foundational/19-openai-realtime-beta.py @@ -14,6 +14,8 @@ from dotenv import load_dotenv from loguru import logger 
from runner import configure +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.audio.vad.vad_analyzer import VADParams from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask @@ -25,8 +27,6 @@ from pipecat.services.openai_realtime_beta import ( TurnDetection, ) from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer -from pipecat.vad.vad_analyzer import VADParams load_dotenv(override=True) diff --git a/examples/foundational/20a-persistent-context-openai.py b/examples/foundational/20a-persistent-context-openai.py index 5767d6dbd..97ef88685 100644 --- a/examples/foundational/20a-persistent-context-openai.py +++ b/examples/foundational/20a-persistent-context-openai.py @@ -16,6 +16,8 @@ from dotenv import load_dotenv from loguru import logger from runner import configure +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.audio.vad.vad_analyzer import VADParams from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask @@ -26,8 +28,6 @@ from pipecat.services.openai import OpenAILLMService from pipecat.services.cartesia import CartesiaTTSService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer -from pipecat.vad.vad_analyzer import VADParams load_dotenv(override=True) diff --git a/examples/foundational/20b-persistent-context-openai-realtime.py b/examples/foundational/20b-persistent-context-openai-realtime.py index 4935fe281..2dc7bf19e 100644 --- a/examples/foundational/20b-persistent-context-openai-realtime.py +++ b/examples/foundational/20b-persistent-context-openai-realtime.py @@ -16,6 +16,8 @@ from dotenv import load_dotenv from loguru import logger from runner import configure +from
pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.audio.vad.vad_analyzer import VADParams from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask @@ -29,8 +31,6 @@ from pipecat.services.openai_realtime_beta import ( TurnDetection, ) from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer -from pipecat.vad.vad_analyzer import VADParams load_dotenv(override=True) diff --git a/examples/foundational/20c-persistent-context-anthropic.py b/examples/foundational/20c-persistent-context-anthropic.py index 926722aeb..02bd4febb 100644 --- a/examples/foundational/20c-persistent-context-anthropic.py +++ b/examples/foundational/20c-persistent-context-anthropic.py @@ -16,6 +16,8 @@ from dotenv import load_dotenv from loguru import logger from runner import configure +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.audio.vad.vad_analyzer import VADParams from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask @@ -26,8 +28,6 @@ from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.anthropic import AnthropicLLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer -from pipecat.vad.vad_analyzer import VADParams load_dotenv(override=True) diff --git a/examples/moondream-chatbot/bot.py b/examples/moondream-chatbot/bot.py index 86456d40f..182dceb65 100644 --- a/examples/moondream-chatbot/bot.py +++ b/examples/moondream-chatbot/bot.py @@ -11,6 +11,7 @@ import sys from PIL import Image +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import ( ImageRawFrame, OutputImageRawFrame, @@ -23,7 +24,6 @@ from pipecat.frames.frames import ( 
UserImageRawFrame, UserImageRequestFrame, ) - from pipecat.pipeline.parallel_pipeline import ParallelPipeline from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -36,7 +36,6 @@ from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.moondream import MoondreamService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/patient-intake/bot.py b/examples/patient-intake/bot.py index 52f45f75e..efdd1caa9 100644 --- a/examples/patient-intake/bot.py +++ b/examples/patient-intake/bot.py @@ -10,6 +10,7 @@ import os import sys import wave +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import OutputAudioRawFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -19,7 +20,6 @@ from pipecat.processors.frame_processor import FrameDirection from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.openai import OpenAILLMContext, OpenAILLMContextFrame, OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/simple-chatbot/bot.py b/examples/simple-chatbot/bot.py index b06721d4c..8ca764454 100644 --- a/examples/simple-chatbot/bot.py +++ b/examples/simple-chatbot/bot.py @@ -11,6 +11,7 @@ import sys from PIL import Image +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask @@ -30,7 +31,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.services.elevenlabs import ElevenLabsTTSService from 
pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/studypal/studypal.py b/examples/studypal/studypal.py index 58d5eb2f5..310eb4051 100644 --- a/examples/studypal/studypal.py +++ b/examples/studypal/studypal.py @@ -8,6 +8,7 @@ from bs4 import BeautifulSoup from pypdf import PdfReader import tiktoken +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -19,7 +20,6 @@ from pipecat.processors.aggregators.llm_response import ( from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVADAnalyzer from runner import configure diff --git a/examples/twilio-chatbot/bot.py b/examples/twilio-chatbot/bot.py index de9e395c4..32d8317ba 100644 --- a/examples/twilio-chatbot/bot.py +++ b/examples/twilio-chatbot/bot.py @@ -1,6 +1,7 @@ import os import sys +from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import EndFrame, LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -16,7 +17,6 @@ from pipecat.transports.network.fastapi_websocket import ( FastAPIWebsocketTransport, FastAPIWebsocketParams, ) -from pipecat.vad.silero import SileroVADAnalyzer from pipecat.serializers.twilio import TwilioFrameSerializer from loguru import logger diff --git a/examples/websocket-server/bot.py b/examples/websocket-server/bot.py index e223d4e3f..deb6a31a2 100644 --- a/examples/websocket-server/bot.py +++ b/examples/websocket-server/bot.py @@ -8,6 +8,7 @@ import asyncio import os import sys +from pipecat.audio.vad.silero 
import SileroVADAnalyzer from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -23,7 +24,6 @@ from pipecat.transports.network.websocket_server import ( WebsocketServerParams, WebsocketServerTransport, ) -from pipecat.vad.silero import SileroVADAnalyzer from loguru import logger diff --git a/src/pipecat/vad/data/__init__.py b/src/pipecat/audio/__init__.py similarity index 100% rename from src/pipecat/vad/data/__init__.py rename to src/pipecat/audio/__init__.py diff --git a/src/pipecat/audio/vad/__init__.py b/src/pipecat/audio/vad/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/pipecat/audio/vad/data/__init__.py b/src/pipecat/audio/vad/data/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/pipecat/vad/data/silero_vad.onnx b/src/pipecat/audio/vad/data/silero_vad.onnx similarity index 100% rename from src/pipecat/vad/data/silero_vad.onnx rename to src/pipecat/audio/vad/data/silero_vad.onnx diff --git a/src/pipecat/audio/vad/silero.py b/src/pipecat/audio/vad/silero.py new file mode 100644 index 000000000..f21037c5f --- /dev/null +++ b/src/pipecat/audio/vad/silero.py @@ -0,0 +1,245 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import time + +import numpy as np + +from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADParams, VADState +from pipecat.frames.frames import ( + AudioRawFrame, + Frame, + StartInterruptionFrame, + StopInterruptionFrame, + UserStartedSpeakingFrame, + UserStoppedSpeakingFrame, +) +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor + +from loguru import logger + +# How often should we reset internal model state +_MODEL_RESET_STATES_TIME = 5.0 + +try: + import onnxruntime + +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error("In order to use Silero VAD, you need to `pip install 
pipecat-ai[silero]`.") + raise Exception(f"Missing module(s): {e}") + + +class SileroOnnxModel: + def __init__(self, path, force_onnx_cpu=True): + import numpy as np + + global np + + opts = onnxruntime.SessionOptions() + opts.inter_op_num_threads = 1 + opts.intra_op_num_threads = 1 + + if force_onnx_cpu and "CPUExecutionProvider" in onnxruntime.get_available_providers(): + self.session = onnxruntime.InferenceSession( + path, providers=["CPUExecutionProvider"], sess_options=opts + ) + else: + self.session = onnxruntime.InferenceSession(path, sess_options=opts) + + self.reset_states() + self.sample_rates = [8000, 16000] + + def _validate_input(self, x, sr: int): + if np.ndim(x) == 1: + x = np.expand_dims(x, 0) + if np.ndim(x) > 2: + raise ValueError(f"Too many dimensions for input audio chunk {np.ndim(x)}") + + if sr not in self.sample_rates: + raise ValueError( + f"Supported sampling rates: {self.sample_rates} (or multiply of 16000)" + ) + if sr / np.shape(x)[1] > 31.25: + raise ValueError("Input audio chunk is too short") + + return x, sr + + def reset_states(self, batch_size=1): + self._state = np.zeros((2, batch_size, 128), dtype="float32") + self._context = np.zeros((batch_size, 0), dtype="float32") + self._last_sr = 0 + self._last_batch_size = 0 + + def __call__(self, x, sr: int): + x, sr = self._validate_input(x, sr) + num_samples = 512 if sr == 16000 else 256 + + if np.shape(x)[-1] != num_samples: + raise ValueError( + f"Provided number of samples is {np.shape(x)[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)" + ) + + batch_size = np.shape(x)[0] + context_size = 64 if sr == 16000 else 32 + + if not self._last_batch_size: + self.reset_states(batch_size) + if (self._last_sr) and (self._last_sr != sr): + self.reset_states(batch_size) + if (self._last_batch_size) and (self._last_batch_size != batch_size): + self.reset_states(batch_size) + + if not np.shape(self._context)[1]: + self._context = np.zeros((batch_size, context_size), dtype="float32") +
+ x = np.concatenate((self._context, x), axis=1) + + if sr in [8000, 16000]: + ort_inputs = {"input": x, "state": self._state, "sr": np.array(sr, dtype="int64")} + ort_outs = self.session.run(None, ort_inputs) + out, state = ort_outs + self._state = state + else: + raise ValueError() + + self._context = x[..., -context_size:] + self._last_sr = sr + self._last_batch_size = batch_size + + return out + + +class SileroVADAnalyzer(VADAnalyzer): + def __init__(self, *, sample_rate: int = 16000, params: VADParams = VADParams()): + super().__init__(sample_rate=sample_rate, num_channels=1, params=params) + + if sample_rate != 16000 and sample_rate != 8000: + raise ValueError("Silero VAD sample rate needs to be 16000 or 8000") + + logger.debug("Loading Silero VAD model...") + + model_name = "silero_vad.onnx" + package_path = "pipecat.audio.vad.data" + + try: + import importlib_resources as impresources + + model_file_path = str(impresources.files(package_path).joinpath(model_name)) + except BaseException: + from importlib import resources as impresources + + try: + with impresources.path(package_path, model_name) as f: + model_file_path = f + except BaseException: + model_file_path = str(impresources.files(package_path).joinpath(model_name)) + + self._model = SileroOnnxModel(model_file_path, force_onnx_cpu=True) + + self._last_reset_time = 0 + + logger.debug("Loaded Silero VAD") + + # + # VADAnalyzer + # + + def num_frames_required(self) -> int: + return 512 if self.sample_rate == 16000 else 256 + + def voice_confidence(self, buffer) -> float: + try: + audio_int16 = np.frombuffer(buffer, np.int16) + # Divide by 32768 because we have signed 16-bit data. + audio_float32 = np.frombuffer(audio_int16, dtype=np.int16).astype(np.float32) / 32768.0 + new_confidence = self._model(audio_float32, self.sample_rate)[0] + + # We need to reset the model from time to time because it doesn't + # really need all the data and memory will keep growing otherwise. 
+ curr_time = time.time() + diff_time = curr_time - self._last_reset_time + if diff_time >= _MODEL_RESET_STATES_TIME: + self._model.reset_states() + self._last_reset_time = curr_time + + return new_confidence + except Exception as e: + # This comes from an empty audio array + logger.exception(f"Error analyzing audio with Silero VAD: {e}") + return 0 + + +class SileroVAD(FrameProcessor): + def __init__( + self, + *, + sample_rate: int = 16000, + vad_params: VADParams = VADParams(), + audio_passthrough: bool = False, + ): + super().__init__() + + self._vad_analyzer = SileroVADAnalyzer(sample_rate=sample_rate, params=vad_params) + self._audio_passthrough = audio_passthrough + + self._processor_vad_state: VADState = VADState.QUIET + + # + # FrameProcessor + # + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + if isinstance(frame, AudioRawFrame): + await self._analyze_audio(frame) + if self._audio_passthrough: + await self.push_frame(frame, direction) + else: + await self.push_frame(frame, direction) + + # + # Handle interruptions + # + + async def _handle_interruptions(self, frame: Frame): + if self.interruptions_allowed: + # Make sure we notify about interruptions quickly out-of-band. + if isinstance(frame, UserStartedSpeakingFrame): + logger.debug("User started speaking") + await self._start_interruption() + # Push an out-of-band frame (i.e. not using the ordered push + # frame task) to stop everything, especially at the output + # transport. + await self.push_frame(StartInterruptionFrame()) + elif isinstance(frame, UserStoppedSpeakingFrame): + logger.debug("User stopped speaking") + await self._stop_interruption() + await self.push_frame(StopInterruptionFrame()) + + await self.push_frame(frame) + + async def _analyze_audio(self, frame: AudioRawFrame): + # Check VAD and push event if necessary. We just care about changes + # from QUIET to SPEAKING and vice versa.
+ new_vad_state = self._vad_analyzer.analyze_audio(frame.audio) + if ( + new_vad_state != self._processor_vad_state + and new_vad_state != VADState.STARTING + and new_vad_state != VADState.STOPPING + ): + new_frame = None + + if new_vad_state == VADState.SPEAKING: + new_frame = UserStartedSpeakingFrame() + elif new_vad_state == VADState.QUIET: + new_frame = UserStoppedSpeakingFrame() + + if new_frame: + await self._handle_interruptions(new_frame) + + self._processor_vad_state = new_vad_state diff --git a/src/pipecat/audio/vad/vad_analyzer.py b/src/pipecat/audio/vad/vad_analyzer.py new file mode 100644 index 000000000..fe2739b28 --- /dev/null +++ b/src/pipecat/audio/vad/vad_analyzer.py @@ -0,0 +1,129 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from abc import abstractmethod +from enum import Enum + +from loguru import logger +from pydantic.main import BaseModel + +from pipecat.utils.audio import calculate_audio_volume, exp_smoothing + + +class VADState(Enum): + QUIET = 1 + STARTING = 2 + SPEAKING = 3 + STOPPING = 4 + + +class VADParams(BaseModel): + confidence: float = 0.7 + start_secs: float = 0.2 + stop_secs: float = 0.8 + min_volume: float = 0.6 + + +class VADAnalyzer: + def __init__(self, *, sample_rate: int, num_channels: int, params: VADParams): + self._sample_rate = sample_rate + self._num_channels = num_channels + + self.set_params(params) + + self._vad_buffer = b"" + + # Volume exponential smoothing + self._smoothing_factor = 0.2 + self._prev_volume = 0 + + @property + def sample_rate(self): + return self._sample_rate + + @property + def num_channels(self): + return self._num_channels + + @abstractmethod + def num_frames_required(self) -> int: + pass + + @abstractmethod + def voice_confidence(self, buffer) -> float: + pass + + def set_params(self, params: VADParams): + logger.info(f"Setting VAD params to: {params}") + self._params = params + self._vad_frames = self.num_frames_required() + 
self._vad_frames_num_bytes = self._vad_frames * self._num_channels * 2 + + vad_frames_per_sec = self._vad_frames / self._sample_rate + + self._vad_start_frames = round(self._params.start_secs / vad_frames_per_sec) + self._vad_stop_frames = round(self._params.stop_secs / vad_frames_per_sec) + self._vad_starting_count = 0 + self._vad_stopping_count = 0 + self._vad_state: VADState = VADState.QUIET + + def _get_smoothed_volume(self, audio: bytes) -> float: + volume = calculate_audio_volume(audio, self._sample_rate) + return exp_smoothing(volume, self._prev_volume, self._smoothing_factor) + + def analyze_audio(self, buffer) -> VADState: + self._vad_buffer += buffer + + num_required_bytes = self._vad_frames_num_bytes + if len(self._vad_buffer) < num_required_bytes: + return self._vad_state + + audio_frames = self._vad_buffer[:num_required_bytes] + self._vad_buffer = self._vad_buffer[num_required_bytes:] + + confidence = self.voice_confidence(audio_frames) + + volume = self._get_smoothed_volume(audio_frames) + self._prev_volume = volume + + speaking = confidence >= self._params.confidence and volume >= self._params.min_volume + + if speaking: + match self._vad_state: + case VADState.QUIET: + self._vad_state = VADState.STARTING + self._vad_starting_count = 1 + case VADState.STARTING: + self._vad_starting_count += 1 + case VADState.STOPPING: + self._vad_state = VADState.SPEAKING + self._vad_stopping_count = 0 + else: + match self._vad_state: + case VADState.STARTING: + self._vad_state = VADState.QUIET + self._vad_starting_count = 0 + case VADState.SPEAKING: + self._vad_state = VADState.STOPPING + self._vad_stopping_count = 1 + case VADState.STOPPING: + self._vad_stopping_count += 1 + + if ( + self._vad_state == VADState.STARTING + and self._vad_starting_count >= self._vad_start_frames + ): + self._vad_state = VADState.SPEAKING + self._vad_starting_count = 0 + + if ( + self._vad_state == VADState.STOPPING + and self._vad_stopping_count >= self._vad_stop_frames + ): + 
self._vad_state = VADState.QUIET + self._vad_stopping_count = 0 + + return self._vad_state diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index db64ed0ea..517918a52 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -7,12 +7,12 @@ from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Tuple +from pipecat.audio.vad.vad_analyzer import VADParams from pipecat.clocks.base_clock import BaseClock from pipecat.metrics.metrics import MetricsData from pipecat.transcriptions.language import Language from pipecat.utils.time import nanoseconds_to_str from pipecat.utils.utils import obj_count, obj_id -from pipecat.vad.vad_analyzer import VADParams def format_pts(pts: int | None): diff --git a/src/pipecat/transports/base_input.py b/src/pipecat/transports/base_input.py index ad95d1139..d66e8aa71 100644 --- a/src/pipecat/transports/base_input.py +++ b/src/pipecat/transports/base_input.py @@ -9,6 +9,7 @@ from concurrent.futures import ThreadPoolExecutor from loguru import logger +from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADState from pipecat.frames.frames import ( BotInterruptionFrame, CancelFrame, @@ -25,7 +26,6 @@ from pipecat.frames.frames import ( ) from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.transports.base_transport import TransportParams -from pipecat.vad.vad_analyzer import VADAnalyzer, VADState class BaseInputTransport(FrameProcessor): diff --git a/src/pipecat/transports/base_transport.py b/src/pipecat/transports/base_transport.py index 5802993fa..d2f98c26c 100644 --- a/src/pipecat/transports/base_transport.py +++ b/src/pipecat/transports/base_transport.py @@ -12,8 +12,8 @@ from abc import ABC, abstractmethod from pydantic import ConfigDict from pydantic.main import BaseModel +from pipecat.audio.vad.vad_analyzer import VADAnalyzer from pipecat.processors.frame_processor import FrameProcessor -from pipecat.vad.vad_analyzer 
import VADAnalyzer from loguru import logger diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py index d1e7295d4..243db7603 100644 --- a/src/pipecat/transports/services/daily.py +++ b/src/pipecat/transports/services/daily.py @@ -22,6 +22,7 @@ from daily import ( from loguru import logger from pydantic.main import BaseModel +from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADParams from pipecat.frames.frames import ( CancelFrame, EndFrame, @@ -43,7 +44,6 @@ from pipecat.transcriptions.language import Language from pipecat.transports.base_input import BaseInputTransport from pipecat.transports.base_output import BaseOutputTransport from pipecat.transports.base_transport import BaseTransport, TransportParams -from pipecat.vad.vad_analyzer import VADAnalyzer, VADParams try: from daily import CallClient, Daily, EventHandler diff --git a/src/pipecat/transports/services/livekit.py b/src/pipecat/transports/services/livekit.py index af21cd14a..9cf2c617d 100644 --- a/src/pipecat/transports/services/livekit.py +++ b/src/pipecat/transports/services/livekit.py @@ -13,6 +13,7 @@ from loguru import logger from pydantic import BaseModel from scipy import signal +from pipecat.audio.vad.vad_analyzer import VADAnalyzer from pipecat.frames.frames import ( AudioRawFrame, CancelFrame, @@ -28,7 +29,6 @@ from pipecat.processors.frame_processor import FrameDirection from pipecat.transports.base_input import BaseInputTransport from pipecat.transports.base_output import BaseOutputTransport from pipecat.transports.base_transport import BaseTransport, TransportParams -from pipecat.vad.vad_analyzer import VADAnalyzer try: from livekit import rtc diff --git a/src/pipecat/vad/silero.py b/src/pipecat/vad/silero.py index 399ef6b48..3ecc06e49 100644 --- a/src/pipecat/vad/silero.py +++ b/src/pipecat/vad/silero.py @@ -4,242 +4,8 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import time - -import numpy as np - -from pipecat.frames.frames 
import ( - AudioRawFrame, - Frame, - StartInterruptionFrame, - StopInterruptionFrame, - UserStartedSpeakingFrame, - UserStoppedSpeakingFrame, -) -from pipecat.processors.frame_processor import FrameDirection, FrameProcessor -from pipecat.vad.vad_analyzer import VADAnalyzer, VADParams, VADState - from loguru import logger -# How often should we reset internal model state -_MODEL_RESET_STATES_TIME = 5.0 +logger.warning("DEPRECATED: Package `pipecat.vad` is deprecated, use `pipecat.audio.vad` instead.") -try: - import onnxruntime - -except ModuleNotFoundError as e: - logger.error(f"Exception: {e}") - logger.error("In order to use Silero VAD, you need to `pip install pipecat-ai[silero]`.") - raise Exception(f"Missing module(s): {e}") - - -class SileroOnnxModel: - def __init__(self, path, force_onnx_cpu=True): - import numpy as np - - global np - - opts = onnxruntime.SessionOptions() - opts.inter_op_num_threads = 1 - opts.intra_op_num_threads = 1 - - if force_onnx_cpu and "CPUExecutionProvider" in onnxruntime.get_available_providers(): - self.session = onnxruntime.InferenceSession( - path, providers=["CPUExecutionProvider"], sess_options=opts - ) - else: - self.session = onnxruntime.InferenceSession(path, sess_options=opts) - - self.reset_states() - self.sample_rates = [8000, 16000] - - def _validate_input(self, x, sr: int): - if np.ndim(x) == 1: - x = np.expand_dims(x, 0) - if np.ndim(x) > 2: - raise ValueError(f"Too many dimensions for input audio chunk {x.dim()}") - - if sr not in self.sample_rates: - raise ValueError( - f"Supported sampling rates: {self.sample_rates} (or multiply of 16000)" - ) - if sr / np.shape(x)[1] > 31.25: - raise ValueError("Input audio chunk is too short") - - return x, sr - - def reset_states(self, batch_size=1): - self._state = np.zeros((2, batch_size, 128), dtype="float32") - self._context = np.zeros((batch_size, 0), dtype="float32") - self._last_sr = 0 - self._last_batch_size = 0 - - def __call__(self, x, sr: int): - x, sr = 
self._validate_input(x, sr) - num_samples = 512 if sr == 16000 else 256 - - if np.shape(x)[-1] != num_samples: - raise ValueError( - f"Provided number of samples is {np.shape(x)[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)" - ) - - batch_size = np.shape(x)[0] - context_size = 64 if sr == 16000 else 32 - - if not self._last_batch_size: - self.reset_states(batch_size) - if (self._last_sr) and (self._last_sr != sr): - self.reset_states(batch_size) - if (self._last_batch_size) and (self._last_batch_size != batch_size): - self.reset_states(batch_size) - - if not np.shape(self._context)[1]: - self._context = np.zeros((batch_size, context_size), dtype="float32") - - x = np.concatenate((self._context, x), axis=1) - - if sr in [8000, 16000]: - ort_inputs = {"input": x, "state": self._state, "sr": np.array(sr, dtype="int64")} - ort_outs = self.session.run(None, ort_inputs) - out, state = ort_outs - self._state = state - else: - raise ValueError() - - self._context = x[..., -context_size:] - self._last_sr = sr - self._last_batch_size = batch_size - - return out - - -class SileroVADAnalyzer(VADAnalyzer): - def __init__(self, *, sample_rate: int = 16000, params: VADParams = VADParams()): - super().__init__(sample_rate=sample_rate, num_channels=1, params=params) - - if sample_rate != 16000 and sample_rate != 8000: - raise ValueError("Silero VAD sample rate needs to be 16000 or 8000") - - logger.debug("Loading Silero VAD model...") - - model_name = "silero_vad.onnx" - package_path = "pipecat.vad.data" - - try: - import importlib_resources as impresources - - model_file_path = str(impresources.files(package_path).joinpath(model_name)) - except BaseException: - from importlib import resources as impresources - - try: - with impresources.path(package_path, model_name) as f: - model_file_path = f - except BaseException: - model_file_path = str(impresources.files(package_path).joinpath(model_name)) - - self._model = SileroOnnxModel(model_file_path, 
force_onnx_cpu=True) - - self._last_reset_time = 0 - - logger.debug("Loaded Silero VAD") - - # - # VADAnalyzer - # - - def num_frames_required(self) -> int: - return 512 if self.sample_rate == 16000 else 256 - - def voice_confidence(self, buffer) -> float: - try: - audio_int16 = np.frombuffer(buffer, np.int16) - # Divide by 32768 because we have signed 16-bit data. - audio_float32 = np.frombuffer(audio_int16, dtype=np.int16).astype(np.float32) / 32768.0 - new_confidence = self._model(audio_float32, self.sample_rate)[0] - - # We need to reset the model from time to time because it doesn't - # really need all the data and memory will keep growing otherwise. - curr_time = time.time() - diff_time = curr_time - self._last_reset_time - if diff_time >= _MODEL_RESET_STATES_TIME: - self._model.reset_states() - self._last_reset_time = curr_time - - return new_confidence - except Exception as e: - # This comes from an empty audio array - logger.exception(f"Error analyzing audio with Silero VAD: {e}") - return 0 - - -class SileroVAD(FrameProcessor): - def __init__( - self, - *, - sample_rate: int = 16000, - vad_params: VADParams = VADParams(), - audio_passthrough: bool = False, - ): - super().__init__() - - self._vad_analyzer = SileroVADAnalyzer(sample_rate=sample_rate, params=vad_params) - self._audio_passthrough = audio_passthrough - - self._processor_vad_state: VADState = VADState.QUIET - - # - # FrameProcessor - # - - async def process_frame(self, frame: Frame, direction: FrameDirection): - await super().process_frame(frame, direction) - - if isinstance(frame, AudioRawFrame): - await self._analyze_audio(frame) - if self._audio_passthrough: - await self.push_frame(frame, direction) - else: - await self.push_frame(frame, direction) - - # - # Handle interruptions - # - - async def _handle_interruptions(self, frame: Frame): - if self.interruptions_allowed: - # Make sure we notify about interruptions quickly out-of-band. 
- if isinstance(frame, UserStartedSpeakingFrame): - logger.debug("User started speaking") - await self._start_interruption() - # Push an out-of-band frame (i.e. not using the ordered push - # frame task) to stop everything, specially at the output - # transport. - await self.push_frame(StartInterruptionFrame()) - elif isinstance(frame, UserStoppedSpeakingFrame): - logger.debug("User stopped speaking") - await self._stop_interruption() - await self.push_frame(StopInterruptionFrame()) - - await self.push_frame(frame) - - async def _analyze_audio(self, frame: AudioRawFrame): - # Check VAD and push event if necessary. We just care about changes - # from QUIET to SPEAKING and vice versa. - new_vad_state = self._vad_analyzer.analyze_audio(frame.audio) - if ( - new_vad_state != self._processor_vad_state - and new_vad_state != VADState.STARTING - and new_vad_state != VADState.STOPPING - ): - new_frame = None - - if new_vad_state == VADState.SPEAKING: - new_frame = UserStartedSpeakingFrame() - elif new_vad_state == VADState.QUIET: - new_frame = UserStoppedSpeakingFrame() - - if new_frame: - await self._handle_interruptions(new_frame) - - self._processor_vad_state = new_vad_state +from ..audio.vad.silero import SileroVAD, SileroVADAnalyzer diff --git a/src/pipecat/vad/vad_analyzer.py b/src/pipecat/vad/vad_analyzer.py index fe2739b28..f24508826 100644 --- a/src/pipecat/vad/vad_analyzer.py +++ b/src/pipecat/vad/vad_analyzer.py @@ -4,126 +4,8 @@ # SPDX-License-Identifier: BSD 2-Clause License # -from abc import abstractmethod -from enum import Enum - from loguru import logger -from pydantic.main import BaseModel -from pipecat.utils.audio import calculate_audio_volume, exp_smoothing +logger.warning("DEPRECATED: Package `pipecat.vad` is deprecated, use `pipecat.audio.vad` instead.") - -class VADState(Enum): - QUIET = 1 - STARTING = 2 - SPEAKING = 3 - STOPPING = 4 - - -class VADParams(BaseModel): - confidence: float = 0.7 - start_secs: float = 0.2 - stop_secs: float = 0.8 - 
min_volume: float = 0.6 - - -class VADAnalyzer: - def __init__(self, *, sample_rate: int, num_channels: int, params: VADParams): - self._sample_rate = sample_rate - self._num_channels = num_channels - - self.set_params(params) - - self._vad_buffer = b"" - - # Volume exponential smoothing - self._smoothing_factor = 0.2 - self._prev_volume = 0 - - @property - def sample_rate(self): - return self._sample_rate - - @property - def num_channels(self): - return self._num_channels - - @abstractmethod - def num_frames_required(self) -> int: - pass - - @abstractmethod - def voice_confidence(self, buffer) -> float: - pass - - def set_params(self, params: VADParams): - logger.info(f"Setting VAD params to: {params}") - self._params = params - self._vad_frames = self.num_frames_required() - self._vad_frames_num_bytes = self._vad_frames * self._num_channels * 2 - - vad_frames_per_sec = self._vad_frames / self._sample_rate - - self._vad_start_frames = round(self._params.start_secs / vad_frames_per_sec) - self._vad_stop_frames = round(self._params.stop_secs / vad_frames_per_sec) - self._vad_starting_count = 0 - self._vad_stopping_count = 0 - self._vad_state: VADState = VADState.QUIET - - def _get_smoothed_volume(self, audio: bytes) -> float: - volume = calculate_audio_volume(audio, self._sample_rate) - return exp_smoothing(volume, self._prev_volume, self._smoothing_factor) - - def analyze_audio(self, buffer) -> VADState: - self._vad_buffer += buffer - - num_required_bytes = self._vad_frames_num_bytes - if len(self._vad_buffer) < num_required_bytes: - return self._vad_state - - audio_frames = self._vad_buffer[:num_required_bytes] - self._vad_buffer = self._vad_buffer[num_required_bytes:] - - confidence = self.voice_confidence(audio_frames) - - volume = self._get_smoothed_volume(audio_frames) - self._prev_volume = volume - - speaking = confidence >= self._params.confidence and volume >= self._params.min_volume - - if speaking: - match self._vad_state: - case VADState.QUIET: - 
self._vad_state = VADState.STARTING - self._vad_starting_count = 1 - case VADState.STARTING: - self._vad_starting_count += 1 - case VADState.STOPPING: - self._vad_state = VADState.SPEAKING - self._vad_stopping_count = 0 - else: - match self._vad_state: - case VADState.STARTING: - self._vad_state = VADState.QUIET - self._vad_starting_count = 0 - case VADState.SPEAKING: - self._vad_state = VADState.STOPPING - self._vad_stopping_count = 1 - case VADState.STOPPING: - self._vad_stopping_count += 1 - - if ( - self._vad_state == VADState.STARTING - and self._vad_starting_count >= self._vad_start_frames - ): - self._vad_state = VADState.SPEAKING - self._vad_starting_count = 0 - - if ( - self._vad_state == VADState.STOPPING - and self._vad_stopping_count >= self._vad_stop_frames - ): - self._vad_state = VADState.QUIET - self._vad_stopping_count = 0 - - return self._vad_state +from ..audio.vad.vad_analyzer import VADAnalyzer, VADParams, VADState