Merge pull request #607 from pipecat-ai/aleix/pipecat-vad-deprecation

move vad package to audio.vad
This commit is contained in:
Aleix Conchillo Flaqué
2024-10-17 13:51:20 -07:00
committed by GitHub
61 changed files with 439 additions and 415 deletions

View File

@@ -23,6 +23,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Renamed `OpenAILLMServiceRealtimeBeta` to `OpenAIRealtimeBetaLLMService` to
match other services.
- The `vad` package is now deprecated and `audio.vad` should be used
instead. The `avd` package will get removed in a future release.
### Fixed
- Fixed `SileroVAD` processor to support interruptions properly.

View File

@@ -14,6 +14,7 @@ from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import EndFrame, LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -27,7 +28,6 @@ from pipecat.services.canonical import CanonicalMetricsService
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
load_dotenv(override=True)

View File

@@ -13,6 +13,7 @@ from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import EndFrame, LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -25,7 +26,6 @@ from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
load_dotenv(override=True)

View File

@@ -3,6 +3,7 @@ import os
import sys
import argparse
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -14,7 +15,6 @@ from pipecat.frames.frames import LLMMessagesFrame, EndFrame
from pipecat.services.openai import OpenAILLMService
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from loguru import logger

View File

@@ -3,6 +3,7 @@ import os
import sys
import argparse
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -14,7 +15,7 @@ from pipecat.frames.frames import LLMMessagesFrame, EndFrame
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport, DailyDialinSettings
from pipecat.vad.silero import SileroVADAnalyzer
from loguru import logger
from dotenv import load_dotenv

View File

@@ -3,6 +3,7 @@ import os
import sys
import argparse
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -14,7 +15,6 @@ from pipecat.frames.frames import LLMMessagesFrame, EndFrame
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from twilio.rest import Client

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import Frame, LLMMessagesFrame, MetricsFrame
from pipecat.metrics.metrics import (
TTFBMetricsData,
@@ -18,7 +19,7 @@ from pipecat.metrics.metrics import (
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
@@ -27,7 +28,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -11,6 +11,7 @@ import sys
from PIL import Image
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import Frame, OutputImageRawFrame, SystemFrame, TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -23,7 +24,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.cartesia import CartesiaHttpTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from pipecat.transports.services.daily import DailyParams
from runner import configure

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -20,7 +21,6 @@ from pipecat.processors.aggregators.llm_response import (
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -13,6 +13,7 @@ from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -21,7 +22,6 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.anthropic import AnthropicLLMService
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
load_dotenv(override=True)

View File

@@ -10,6 +10,7 @@ import sys
import aiohttp
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -21,7 +22,6 @@ from pipecat.processors.aggregators.llm_response import (
from pipecat.processors.frameworks.langchain import LangchainProcessor
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_community.chat_message_histories import ChatMessageHistory

View File

@@ -13,6 +13,7 @@ from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -24,7 +25,6 @@ from pipecat.processors.aggregators.llm_response import (
from pipecat.services.deepgram import DeepgramSTTService, DeepgramTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
load_dotenv(override=True)

View File

@@ -13,6 +13,7 @@ from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -24,7 +25,6 @@ from pipecat.processors.aggregators.llm_response import (
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
load_dotenv(override=True)

View File

@@ -13,6 +13,7 @@ from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -25,7 +26,6 @@ from pipecat.services.openai import OpenAILLMService
from pipecat.services.playht import PlayHTTTSService
from pipecat.transcriptions.language import Language
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
load_dotenv(override=True)

View File

@@ -9,6 +9,7 @@ import asyncio
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -19,7 +20,6 @@ from pipecat.processors.aggregators.llm_response import (
)
from pipecat.services.azure import AzureLLMService, AzureSTTService, AzureTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -13,6 +13,7 @@ from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -23,7 +24,6 @@ from pipecat.processors.aggregators.llm_response import (
)
from pipecat.services.openai import OpenAILLMService, OpenAITTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
load_dotenv(override=True)

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -20,7 +21,6 @@ from pipecat.processors.aggregators.llm_response import (
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openpipe import OpenPipeLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -17,11 +18,9 @@ from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
from pipecat.services.deepgram import DeepgramSTTService, DeepgramTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.services.xtts import XTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -21,7 +22,6 @@ from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.gladia import GladiaSTTService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -9,6 +9,7 @@ import asyncio
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -20,7 +21,6 @@ from pipecat.processors.aggregators.llm_response import (
from pipecat.services.lmnt import LmntTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -13,6 +13,7 @@ from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -21,7 +22,6 @@ from pipecat.services.ai_services import OpenAILLMContext
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.together import TogetherLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
load_dotenv(override=True)

View File

@@ -13,6 +13,7 @@ from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -25,7 +26,6 @@ from pipecat.services.aws import AWSTTSService
from pipecat.services.deepgram import DeepgramSTTService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
load_dotenv(override=True)

View File

@@ -13,6 +13,7 @@ from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -25,7 +26,6 @@ from pipecat.services.deepgram import DeepgramSTTService
from pipecat.services.google import GoogleTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
load_dotenv(override=True)

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.processors.filters.wake_check_filter import WakeCheckFilter
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -20,7 +21,6 @@ from pipecat.processors.aggregators.llm_response import (
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -10,6 +10,7 @@ import os
import sys
import wave
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import (
Frame,
LLMFullResponseEndFrame,
@@ -28,7 +29,6 @@ from pipecat.processors.logger import FrameLogger
from pipecat.services.cartesia import CartesiaHttpTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -19,7 +20,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.moondream import MoondreamService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -19,7 +20,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.google import GoogleLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -19,7 +20,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -19,7 +20,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.anthropic import AnthropicLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -9,13 +9,13 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMContext, OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from openai.types.chat import ChatCompletionToolParam

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -16,7 +17,6 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.anthropic import AnthropicLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -16,7 +17,6 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.anthropic import AnthropicLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
@@ -16,7 +17,6 @@ from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMContext
from pipecat.services.together import TogetherLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from openai.types.chat import ChatCompletionToolParam

View File

@@ -9,13 +9,13 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMContext, OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from openai.types.chat import ChatCompletionToolParam

View File

@@ -9,6 +9,7 @@ import asyncio
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
@@ -19,7 +20,6 @@ from pipecat.processors.filters.function_filter import FunctionFilter
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from openai.types.chat import ChatCompletionToolParam

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
@@ -20,7 +21,6 @@ from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.services.whisper import Model, WhisperSTTService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from openai.types.chat import ChatCompletionToolParam

View File

@@ -13,6 +13,7 @@ from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -28,7 +29,6 @@ from pipecat.transports.services.daily import (
DailyTransport,
DailyTransportMessageFrame,
)
from pipecat.vad.silero import SileroVADAnalyzer
load_dotenv(override=True)

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -21,7 +22,6 @@ from pipecat.processors.user_idle_processor import UserIdleProcessor
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -14,6 +14,8 @@ from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.vad.vad_analyzer import VADParams
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -25,8 +27,6 @@ from pipecat.services.openai_realtime_beta import (
TurnDetection,
)
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from pipecat.vad.vad_analyzer import VADParams
load_dotenv(override=True)

View File

@@ -16,6 +16,8 @@ from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -26,8 +28,6 @@ from pipecat.services.openai import OpenAILLMService
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from pipecat.vad.vad_analyzer import VADParams
load_dotenv(override=True)

View File

@@ -16,6 +16,8 @@ from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -29,8 +31,6 @@ from pipecat.services.openai_realtime_beta import (
TurnDetection,
)
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from pipecat.vad.vad_analyzer import VADParams
load_dotenv(override=True)

View File

@@ -16,6 +16,8 @@ from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -26,8 +28,6 @@ from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.anthropic import AnthropicLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from pipecat.vad.vad_analyzer import VADParams
load_dotenv(override=True)

View File

@@ -11,6 +11,7 @@ import sys
from PIL import Image
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import (
ImageRawFrame,
OutputImageRawFrame,
@@ -23,7 +24,6 @@ from pipecat.frames.frames import (
UserImageRawFrame,
UserImageRequestFrame,
)
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -36,7 +36,6 @@ from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.moondream import MoondreamService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -10,6 +10,7 @@ import os
import sys
import wave
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import OutputAudioRawFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -19,7 +20,6 @@ from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMContext, OpenAILLMContextFrame, OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -11,6 +11,7 @@ import sys
from PIL import Image
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -30,7 +31,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -8,6 +8,7 @@ from bs4 import BeautifulSoup
from pypdf import PdfReader
import tiktoken
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -19,7 +20,6 @@ from pipecat.processors.aggregators.llm_response import (
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure

View File

@@ -1,6 +1,7 @@
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import EndFrame, LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -16,7 +17,6 @@ from pipecat.transports.network.fastapi_websocket import (
FastAPIWebsocketTransport,
FastAPIWebsocketParams,
)
from pipecat.vad.silero import SileroVADAnalyzer
from pipecat.serializers.twilio import TwilioFrameSerializer
from loguru import logger

View File

@@ -8,6 +8,7 @@ import asyncio
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -23,7 +24,6 @@ from pipecat.transports.network.websocket_server import (
WebsocketServerParams,
WebsocketServerTransport,
)
from pipecat.vad.silero import SileroVADAnalyzer
from loguru import logger

View File

View File

View File

@@ -0,0 +1,245 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import time
import numpy as np
from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADParams, VADState
from pipecat.frames.frames import (
AudioRawFrame,
Frame,
StartInterruptionFrame,
StopInterruptionFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from loguru import logger
# How often should we reset internal model state
_MODEL_RESET_STATES_TIME = 5.0
try:
import onnxruntime
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error("In order to use Silero VAD, you need to `pip install pipecat-ai[silero]`.")
raise Exception(f"Missing module(s): {e}")
class SileroOnnxModel:
def __init__(self, path, force_onnx_cpu=True):
import numpy as np
global np
opts = onnxruntime.SessionOptions()
opts.inter_op_num_threads = 1
opts.intra_op_num_threads = 1
if force_onnx_cpu and "CPUExecutionProvider" in onnxruntime.get_available_providers():
self.session = onnxruntime.InferenceSession(
path, providers=["CPUExecutionProvider"], sess_options=opts
)
else:
self.session = onnxruntime.InferenceSession(path, sess_options=opts)
self.reset_states()
self.sample_rates = [8000, 16000]
def _validate_input(self, x, sr: int):
if np.ndim(x) == 1:
x = np.expand_dims(x, 0)
if np.ndim(x) > 2:
raise ValueError(f"Too many dimensions for input audio chunk {x.dim()}")
if sr not in self.sample_rates:
raise ValueError(
f"Supported sampling rates: {self.sample_rates} (or multiply of 16000)"
)
if sr / np.shape(x)[1] > 31.25:
raise ValueError("Input audio chunk is too short")
return x, sr
def reset_states(self, batch_size=1):
self._state = np.zeros((2, batch_size, 128), dtype="float32")
self._context = np.zeros((batch_size, 0), dtype="float32")
self._last_sr = 0
self._last_batch_size = 0
def __call__(self, x, sr: int):
x, sr = self._validate_input(x, sr)
num_samples = 512 if sr == 16000 else 256
if np.shape(x)[-1] != num_samples:
raise ValueError(
f"Provided number of samples is {np.shape(x)[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)"
)
batch_size = np.shape(x)[0]
context_size = 64 if sr == 16000 else 32
if not self._last_batch_size:
self.reset_states(batch_size)
if (self._last_sr) and (self._last_sr != sr):
self.reset_states(batch_size)
if (self._last_batch_size) and (self._last_batch_size != batch_size):
self.reset_states(batch_size)
if not np.shape(self._context)[1]:
self._context = np.zeros((batch_size, context_size), dtype="float32")
x = np.concatenate((self._context, x), axis=1)
if sr in [8000, 16000]:
ort_inputs = {"input": x, "state": self._state, "sr": np.array(sr, dtype="int64")}
ort_outs = self.session.run(None, ort_inputs)
out, state = ort_outs
self._state = state
else:
raise ValueError()
self._context = x[..., -context_size:]
self._last_sr = sr
self._last_batch_size = batch_size
return out
class SileroVADAnalyzer(VADAnalyzer):
def __init__(self, *, sample_rate: int = 16000, params: VADParams = VADParams()):
super().__init__(sample_rate=sample_rate, num_channels=1, params=params)
if sample_rate != 16000 and sample_rate != 8000:
raise ValueError("Silero VAD sample rate needs to be 16000 or 8000")
logger.debug("Loading Silero VAD model...")
model_name = "silero_vad.onnx"
package_path = "pipecat.audio.vad.data"
try:
import importlib_resources as impresources
model_file_path = str(impresources.files(package_path).joinpath(model_name))
except BaseException:
from importlib import resources as impresources
try:
with impresources.path(package_path, model_name) as f:
model_file_path = f
except BaseException:
model_file_path = str(impresources.files(package_path).joinpath(model_name))
self._model = SileroOnnxModel(model_file_path, force_onnx_cpu=True)
self._last_reset_time = 0
logger.debug("Loaded Silero VAD")
#
# VADAnalyzer
#
def num_frames_required(self) -> int:
return 512 if self.sample_rate == 16000 else 256
def voice_confidence(self, buffer) -> float:
try:
audio_int16 = np.frombuffer(buffer, np.int16)
# Divide by 32768 because we have signed 16-bit data.
audio_float32 = np.frombuffer(audio_int16, dtype=np.int16).astype(np.float32) / 32768.0
new_confidence = self._model(audio_float32, self.sample_rate)[0]
# We need to reset the model from time to time because it doesn't
# really need all the data and memory will keep growing otherwise.
curr_time = time.time()
diff_time = curr_time - self._last_reset_time
if diff_time >= _MODEL_RESET_STATES_TIME:
self._model.reset_states()
self._last_reset_time = curr_time
return new_confidence
except Exception as e:
# This comes from an empty audio array
logger.exception(f"Error analyzing audio with Silero VAD: {e}")
return 0
class SileroVAD(FrameProcessor):
def __init__(
self,
*,
sample_rate: int = 16000,
vad_params: VADParams = VADParams(),
audio_passthrough: bool = False,
):
super().__init__()
self._vad_analyzer = SileroVADAnalyzer(sample_rate=sample_rate, params=vad_params)
self._audio_passthrough = audio_passthrough
self._processor_vad_state: VADState = VADState.QUIET
#
# FrameProcessor
#
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
if isinstance(frame, AudioRawFrame):
await self._analyze_audio(frame)
if self._audio_passthrough:
await self.push_frame(frame, direction)
else:
await self.push_frame(frame, direction)
#
# Handle interruptions
#
async def _handle_interruptions(self, frame: Frame):
if self.interruptions_allowed:
# Make sure we notify about interruptions quickly out-of-band.
if isinstance(frame, UserStartedSpeakingFrame):
logger.debug("User started speaking")
await self._start_interruption()
# Push an out-of-band frame (i.e. not using the ordered push
# frame task) to stop everything, specially at the output
# transport.
await self.push_frame(StartInterruptionFrame())
elif isinstance(frame, UserStoppedSpeakingFrame):
logger.debug("User stopped speaking")
await self._stop_interruption()
await self.push_frame(StopInterruptionFrame())
await self.push_frame(frame)
async def _analyze_audio(self, frame: AudioRawFrame):
# Check VAD and push event if necessary. We just care about changes
# from QUIET to SPEAKING and vice versa.
new_vad_state = self._vad_analyzer.analyze_audio(frame.audio)
if (
new_vad_state != self._processor_vad_state
and new_vad_state != VADState.STARTING
and new_vad_state != VADState.STOPPING
):
new_frame = None
if new_vad_state == VADState.SPEAKING:
new_frame = UserStartedSpeakingFrame()
elif new_vad_state == VADState.QUIET:
new_frame = UserStoppedSpeakingFrame()
if new_frame:
await self._handle_interruptions(new_frame)
self._processor_vad_state = new_vad_state

View File

@@ -0,0 +1,129 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from abc import abstractmethod
from enum import Enum
from loguru import logger
from pydantic.main import BaseModel
from pipecat.utils.audio import calculate_audio_volume, exp_smoothing
class VADState(Enum):
QUIET = 1
STARTING = 2
SPEAKING = 3
STOPPING = 4
class VADParams(BaseModel):
confidence: float = 0.7
start_secs: float = 0.2
stop_secs: float = 0.8
min_volume: float = 0.6
class VADAnalyzer:
def __init__(self, *, sample_rate: int, num_channels: int, params: VADParams):
self._sample_rate = sample_rate
self._num_channels = num_channels
self.set_params(params)
self._vad_buffer = b""
# Volume exponential smoothing
self._smoothing_factor = 0.2
self._prev_volume = 0
@property
def sample_rate(self):
return self._sample_rate
@property
def num_channels(self):
return self._num_channels
@abstractmethod
def num_frames_required(self) -> int:
pass
@abstractmethod
def voice_confidence(self, buffer) -> float:
pass
def set_params(self, params: VADParams):
logger.info(f"Setting VAD params to: {params}")
self._params = params
self._vad_frames = self.num_frames_required()
self._vad_frames_num_bytes = self._vad_frames * self._num_channels * 2
vad_frames_per_sec = self._vad_frames / self._sample_rate
self._vad_start_frames = round(self._params.start_secs / vad_frames_per_sec)
self._vad_stop_frames = round(self._params.stop_secs / vad_frames_per_sec)
self._vad_starting_count = 0
self._vad_stopping_count = 0
self._vad_state: VADState = VADState.QUIET
def _get_smoothed_volume(self, audio: bytes) -> float:
volume = calculate_audio_volume(audio, self._sample_rate)
return exp_smoothing(volume, self._prev_volume, self._smoothing_factor)
def analyze_audio(self, buffer) -> VADState:
self._vad_buffer += buffer
num_required_bytes = self._vad_frames_num_bytes
if len(self._vad_buffer) < num_required_bytes:
return self._vad_state
audio_frames = self._vad_buffer[:num_required_bytes]
self._vad_buffer = self._vad_buffer[num_required_bytes:]
confidence = self.voice_confidence(audio_frames)
volume = self._get_smoothed_volume(audio_frames)
self._prev_volume = volume
speaking = confidence >= self._params.confidence and volume >= self._params.min_volume
if speaking:
match self._vad_state:
case VADState.QUIET:
self._vad_state = VADState.STARTING
self._vad_starting_count = 1
case VADState.STARTING:
self._vad_starting_count += 1
case VADState.STOPPING:
self._vad_state = VADState.SPEAKING
self._vad_stopping_count = 0
else:
match self._vad_state:
case VADState.STARTING:
self._vad_state = VADState.QUIET
self._vad_starting_count = 0
case VADState.SPEAKING:
self._vad_state = VADState.STOPPING
self._vad_stopping_count = 1
case VADState.STOPPING:
self._vad_stopping_count += 1
if (
self._vad_state == VADState.STARTING
and self._vad_starting_count >= self._vad_start_frames
):
self._vad_state = VADState.SPEAKING
self._vad_starting_count = 0
if (
self._vad_state == VADState.STOPPING
and self._vad_stopping_count >= self._vad_stop_frames
):
self._vad_state = VADState.QUIET
self._vad_stopping_count = 0
return self._vad_state

View File

@@ -7,12 +7,12 @@
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.clocks.base_clock import BaseClock
from pipecat.metrics.metrics import MetricsData
from pipecat.transcriptions.language import Language
from pipecat.utils.time import nanoseconds_to_str
from pipecat.utils.utils import obj_count, obj_id
from pipecat.vad.vad_analyzer import VADParams
def format_pts(pts: int | None):

View File

@@ -9,6 +9,7 @@ from concurrent.futures import ThreadPoolExecutor
from loguru import logger
from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADState
from pipecat.frames.frames import (
BotInterruptionFrame,
CancelFrame,
@@ -25,7 +26,6 @@ from pipecat.frames.frames import (
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.transports.base_transport import TransportParams
from pipecat.vad.vad_analyzer import VADAnalyzer, VADState
class BaseInputTransport(FrameProcessor):

View File

@@ -12,8 +12,8 @@ from abc import ABC, abstractmethod
from pydantic import ConfigDict
from pydantic.main import BaseModel
from pipecat.audio.vad.vad_analyzer import VADAnalyzer
from pipecat.processors.frame_processor import FrameProcessor
from pipecat.vad.vad_analyzer import VADAnalyzer
from loguru import logger

View File

@@ -22,6 +22,7 @@ from daily import (
from loguru import logger
from pydantic.main import BaseModel
from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADParams
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
@@ -43,7 +44,6 @@ from pipecat.transcriptions.language import Language
from pipecat.transports.base_input import BaseInputTransport
from pipecat.transports.base_output import BaseOutputTransport
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.vad.vad_analyzer import VADAnalyzer, VADParams
try:
from daily import CallClient, Daily, EventHandler

View File

@@ -13,6 +13,7 @@ from loguru import logger
from pydantic import BaseModel
from scipy import signal
from pipecat.audio.vad.vad_analyzer import VADAnalyzer
from pipecat.frames.frames import (
AudioRawFrame,
CancelFrame,
@@ -28,7 +29,6 @@ from pipecat.processors.frame_processor import FrameDirection
from pipecat.transports.base_input import BaseInputTransport
from pipecat.transports.base_output import BaseOutputTransport
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.vad.vad_analyzer import VADAnalyzer
try:
from livekit import rtc

View File

@@ -4,242 +4,8 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import time
import numpy as np
from pipecat.frames.frames import (
AudioRawFrame,
Frame,
StartInterruptionFrame,
StopInterruptionFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.vad.vad_analyzer import VADAnalyzer, VADParams, VADState
from loguru import logger
# How often should we reset internal model state
_MODEL_RESET_STATES_TIME = 5.0
logger.warning("DEPRECATED: Package `pipecat.vad` is deprecated, use `pipecat.audio.vad` instead.")
try:
import onnxruntime
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error("In order to use Silero VAD, you need to `pip install pipecat-ai[silero]`.")
raise Exception(f"Missing module(s): {e}")
class SileroOnnxModel:
def __init__(self, path, force_onnx_cpu=True):
import numpy as np
global np
opts = onnxruntime.SessionOptions()
opts.inter_op_num_threads = 1
opts.intra_op_num_threads = 1
if force_onnx_cpu and "CPUExecutionProvider" in onnxruntime.get_available_providers():
self.session = onnxruntime.InferenceSession(
path, providers=["CPUExecutionProvider"], sess_options=opts
)
else:
self.session = onnxruntime.InferenceSession(path, sess_options=opts)
self.reset_states()
self.sample_rates = [8000, 16000]
def _validate_input(self, x, sr: int):
if np.ndim(x) == 1:
x = np.expand_dims(x, 0)
if np.ndim(x) > 2:
raise ValueError(f"Too many dimensions for input audio chunk {x.dim()}")
if sr not in self.sample_rates:
raise ValueError(
f"Supported sampling rates: {self.sample_rates} (or multiply of 16000)"
)
if sr / np.shape(x)[1] > 31.25:
raise ValueError("Input audio chunk is too short")
return x, sr
def reset_states(self, batch_size=1):
self._state = np.zeros((2, batch_size, 128), dtype="float32")
self._context = np.zeros((batch_size, 0), dtype="float32")
self._last_sr = 0
self._last_batch_size = 0
def __call__(self, x, sr: int):
x, sr = self._validate_input(x, sr)
num_samples = 512 if sr == 16000 else 256
if np.shape(x)[-1] != num_samples:
raise ValueError(
f"Provided number of samples is {np.shape(x)[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)"
)
batch_size = np.shape(x)[0]
context_size = 64 if sr == 16000 else 32
if not self._last_batch_size:
self.reset_states(batch_size)
if (self._last_sr) and (self._last_sr != sr):
self.reset_states(batch_size)
if (self._last_batch_size) and (self._last_batch_size != batch_size):
self.reset_states(batch_size)
if not np.shape(self._context)[1]:
self._context = np.zeros((batch_size, context_size), dtype="float32")
x = np.concatenate((self._context, x), axis=1)
if sr in [8000, 16000]:
ort_inputs = {"input": x, "state": self._state, "sr": np.array(sr, dtype="int64")}
ort_outs = self.session.run(None, ort_inputs)
out, state = ort_outs
self._state = state
else:
raise ValueError()
self._context = x[..., -context_size:]
self._last_sr = sr
self._last_batch_size = batch_size
return out
class SileroVADAnalyzer(VADAnalyzer):
def __init__(self, *, sample_rate: int = 16000, params: VADParams = VADParams()):
super().__init__(sample_rate=sample_rate, num_channels=1, params=params)
if sample_rate != 16000 and sample_rate != 8000:
raise ValueError("Silero VAD sample rate needs to be 16000 or 8000")
logger.debug("Loading Silero VAD model...")
model_name = "silero_vad.onnx"
package_path = "pipecat.vad.data"
try:
import importlib_resources as impresources
model_file_path = str(impresources.files(package_path).joinpath(model_name))
except BaseException:
from importlib import resources as impresources
try:
with impresources.path(package_path, model_name) as f:
model_file_path = f
except BaseException:
model_file_path = str(impresources.files(package_path).joinpath(model_name))
self._model = SileroOnnxModel(model_file_path, force_onnx_cpu=True)
self._last_reset_time = 0
logger.debug("Loaded Silero VAD")
#
# VADAnalyzer
#
def num_frames_required(self) -> int:
return 512 if self.sample_rate == 16000 else 256
def voice_confidence(self, buffer) -> float:
try:
audio_int16 = np.frombuffer(buffer, np.int16)
# Divide by 32768 because we have signed 16-bit data.
audio_float32 = np.frombuffer(audio_int16, dtype=np.int16).astype(np.float32) / 32768.0
new_confidence = self._model(audio_float32, self.sample_rate)[0]
# We need to reset the model from time to time because it doesn't
# really need all the data and memory will keep growing otherwise.
curr_time = time.time()
diff_time = curr_time - self._last_reset_time
if diff_time >= _MODEL_RESET_STATES_TIME:
self._model.reset_states()
self._last_reset_time = curr_time
return new_confidence
except Exception as e:
# This comes from an empty audio array
logger.exception(f"Error analyzing audio with Silero VAD: {e}")
return 0
class SileroVAD(FrameProcessor):
def __init__(
self,
*,
sample_rate: int = 16000,
vad_params: VADParams = VADParams(),
audio_passthrough: bool = False,
):
super().__init__()
self._vad_analyzer = SileroVADAnalyzer(sample_rate=sample_rate, params=vad_params)
self._audio_passthrough = audio_passthrough
self._processor_vad_state: VADState = VADState.QUIET
#
# FrameProcessor
#
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
if isinstance(frame, AudioRawFrame):
await self._analyze_audio(frame)
if self._audio_passthrough:
await self.push_frame(frame, direction)
else:
await self.push_frame(frame, direction)
#
# Handle interruptions
#
async def _handle_interruptions(self, frame: Frame):
if self.interruptions_allowed:
# Make sure we notify about interruptions quickly out-of-band.
if isinstance(frame, UserStartedSpeakingFrame):
logger.debug("User started speaking")
await self._start_interruption()
# Push an out-of-band frame (i.e. not using the ordered push
# frame task) to stop everything, specially at the output
# transport.
await self.push_frame(StartInterruptionFrame())
elif isinstance(frame, UserStoppedSpeakingFrame):
logger.debug("User stopped speaking")
await self._stop_interruption()
await self.push_frame(StopInterruptionFrame())
await self.push_frame(frame)
async def _analyze_audio(self, frame: AudioRawFrame):
# Check VAD and push event if necessary. We just care about changes
# from QUIET to SPEAKING and vice versa.
new_vad_state = self._vad_analyzer.analyze_audio(frame.audio)
if (
new_vad_state != self._processor_vad_state
and new_vad_state != VADState.STARTING
and new_vad_state != VADState.STOPPING
):
new_frame = None
if new_vad_state == VADState.SPEAKING:
new_frame = UserStartedSpeakingFrame()
elif new_vad_state == VADState.QUIET:
new_frame = UserStoppedSpeakingFrame()
if new_frame:
await self._handle_interruptions(new_frame)
self._processor_vad_state = new_vad_state
from ..audio.vad.silero import SileroVAD, SileroVADAnalyzer

View File

@@ -4,126 +4,8 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
from abc import abstractmethod
from enum import Enum
from loguru import logger
from pydantic.main import BaseModel
from pipecat.utils.audio import calculate_audio_volume, exp_smoothing
logger.warning("DEPRECATED: Package `pipecat.vad` is deprecated, use `pipecat.audio.vad` instead.")
class VADState(Enum):
QUIET = 1
STARTING = 2
SPEAKING = 3
STOPPING = 4
class VADParams(BaseModel):
confidence: float = 0.7
start_secs: float = 0.2
stop_secs: float = 0.8
min_volume: float = 0.6
class VADAnalyzer:
def __init__(self, *, sample_rate: int, num_channels: int, params: VADParams):
self._sample_rate = sample_rate
self._num_channels = num_channels
self.set_params(params)
self._vad_buffer = b""
# Volume exponential smoothing
self._smoothing_factor = 0.2
self._prev_volume = 0
@property
def sample_rate(self):
return self._sample_rate
@property
def num_channels(self):
return self._num_channels
@abstractmethod
def num_frames_required(self) -> int:
pass
@abstractmethod
def voice_confidence(self, buffer) -> float:
pass
def set_params(self, params: VADParams):
logger.info(f"Setting VAD params to: {params}")
self._params = params
self._vad_frames = self.num_frames_required()
self._vad_frames_num_bytes = self._vad_frames * self._num_channels * 2
vad_frames_per_sec = self._vad_frames / self._sample_rate
self._vad_start_frames = round(self._params.start_secs / vad_frames_per_sec)
self._vad_stop_frames = round(self._params.stop_secs / vad_frames_per_sec)
self._vad_starting_count = 0
self._vad_stopping_count = 0
self._vad_state: VADState = VADState.QUIET
def _get_smoothed_volume(self, audio: bytes) -> float:
volume = calculate_audio_volume(audio, self._sample_rate)
return exp_smoothing(volume, self._prev_volume, self._smoothing_factor)
def analyze_audio(self, buffer) -> VADState:
self._vad_buffer += buffer
num_required_bytes = self._vad_frames_num_bytes
if len(self._vad_buffer) < num_required_bytes:
return self._vad_state
audio_frames = self._vad_buffer[:num_required_bytes]
self._vad_buffer = self._vad_buffer[num_required_bytes:]
confidence = self.voice_confidence(audio_frames)
volume = self._get_smoothed_volume(audio_frames)
self._prev_volume = volume
speaking = confidence >= self._params.confidence and volume >= self._params.min_volume
if speaking:
match self._vad_state:
case VADState.QUIET:
self._vad_state = VADState.STARTING
self._vad_starting_count = 1
case VADState.STARTING:
self._vad_starting_count += 1
case VADState.STOPPING:
self._vad_state = VADState.SPEAKING
self._vad_stopping_count = 0
else:
match self._vad_state:
case VADState.STARTING:
self._vad_state = VADState.QUIET
self._vad_starting_count = 0
case VADState.SPEAKING:
self._vad_state = VADState.STOPPING
self._vad_stopping_count = 1
case VADState.STOPPING:
self._vad_stopping_count += 1
if (
self._vad_state == VADState.STARTING
and self._vad_starting_count >= self._vad_start_frames
):
self._vad_state = VADState.SPEAKING
self._vad_starting_count = 0
if (
self._vad_state == VADState.STOPPING
and self._vad_stopping_count >= self._vad_stop_frames
):
self._vad_state = VADState.QUIET
self._vad_stopping_count = 0
return self._vad_state
from ..audio.vad.vad_analyzer import VADAnalyzer, VADParams, VADState