Merge pull request #607 from pipecat-ai/aleix/pipecat-vad-deprecation
move vad package to audio.vad
This commit is contained in:
@@ -23,6 +23,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
- Renamed `OpenAILLMServiceRealtimeBeta` to `OpenAIRealtimeBetaLLMService` to
|
||||
match other services.
|
||||
|
||||
- The `vad` package is now deprecated and `audio.vad` should be used
|
||||
instead. The `vad` package will be removed in a future release.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed `SileroVAD` processor to support interruptions properly.
|
||||
|
||||
@@ -14,6 +14,7 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from runner import configure
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import EndFrame, LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -27,7 +28,6 @@ from pipecat.services.canonical import CanonicalMetricsService
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from runner import configure
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import EndFrame, LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -25,7 +26,6 @@ from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ import os
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
@@ -14,7 +15,6 @@ from pipecat.frames.frames import LLMMessagesFrame, EndFrame
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ import os
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
@@ -14,7 +15,7 @@ from pipecat.frames.frames import LLMMessagesFrame, EndFrame
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport, DailyDialinSettings
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
@@ -3,6 +3,7 @@ import os
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
@@ -14,7 +15,6 @@ from pipecat.frames.frames import LLMMessagesFrame, EndFrame
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from twilio.rest import Client
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import Frame, LLMMessagesFrame, MetricsFrame
|
||||
from pipecat.metrics.metrics import (
|
||||
TTFBMetricsData,
|
||||
@@ -18,7 +19,7 @@ from pipecat.metrics.metrics import (
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator,
|
||||
LLMUserResponseAggregator,
|
||||
@@ -27,7 +28,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ import sys
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import Frame, OutputImageRawFrame, SystemFrame, TextFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -23,7 +24,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.cartesia import CartesiaHttpTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
from runner import configure
|
||||
|
||||
@@ -9,6 +9,7 @@ import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -20,7 +21,6 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from runner import configure
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -21,7 +22,6 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.services.anthropic import AnthropicLLMService
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@ import sys
|
||||
|
||||
import aiohttp
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -21,7 +22,6 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
from pipecat.processors.frameworks.langchain import LangchainProcessor
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
|
||||
from langchain_community.chat_message_histories import ChatMessageHistory
|
||||
|
||||
@@ -13,6 +13,7 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from runner import configure
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -24,7 +25,6 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
from pipecat.services.deepgram import DeepgramSTTService, DeepgramTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from runner import configure
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -24,7 +25,6 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from runner import configure
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -25,7 +26,6 @@ from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.services.playht import PlayHTTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -19,7 +20,6 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
)
|
||||
from pipecat.services.azure import AzureLLMService, AzureSTTService, AzureTTSService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
|
||||
from runner import configure
|
||||
|
||||
@@ -13,6 +13,7 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from runner import configure
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -23,7 +24,6 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
)
|
||||
from pipecat.services.openai import OpenAILLMService, OpenAITTSService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -20,7 +21,6 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.openpipe import OpenPipeLLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -17,11 +18,9 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator,
|
||||
LLMUserResponseAggregator,
|
||||
)
|
||||
from pipecat.services.deepgram import DeepgramSTTService, DeepgramTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.services.xtts import XTTSService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -21,7 +22,6 @@ from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.gladia import GladiaSTTService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -20,7 +21,6 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
from pipecat.services.lmnt import LmntTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from runner import configure
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -21,7 +22,6 @@ from pipecat.services.ai_services import OpenAILLMContext
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.together import TogetherLLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from runner import configure
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -25,7 +26,6 @@ from pipecat.services.aws import AWSTTSService
|
||||
from pipecat.services.deepgram import DeepgramSTTService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from runner import configure
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -25,7 +26,6 @@ from pipecat.services.deepgram import DeepgramSTTService
|
||||
from pipecat.services.google import GoogleTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.processors.filters.wake_check_filter import WakeCheckFilter
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -20,7 +21,6 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@ import os
|
||||
import sys
|
||||
import wave
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMFullResponseEndFrame,
|
||||
@@ -28,7 +29,6 @@ from pipecat.processors.logger import FrameLogger
|
||||
from pipecat.services.cartesia import CartesiaHttpTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -19,7 +20,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.moondream import MoondreamService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -19,7 +20,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.google import GoogleLLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -19,7 +20,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -19,7 +20,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.anthropic import AnthropicLLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -9,13 +9,13 @@ import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.openai import OpenAILLMContext, OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from openai.types.chat import ChatCompletionToolParam
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
@@ -16,7 +17,6 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.anthropic import AnthropicLLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
@@ -16,7 +17,6 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.anthropic import AnthropicLLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
@@ -16,7 +17,6 @@ from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.openai import OpenAILLMContext
|
||||
from pipecat.services.together import TogetherLLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from openai.types.chat import ChatCompletionToolParam
|
||||
|
||||
|
||||
@@ -9,13 +9,13 @@ import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.openai import OpenAILLMContext, OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from openai.types.chat import ChatCompletionToolParam
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
|
||||
@@ -19,7 +20,6 @@ from pipecat.processors.filters.function_filter import FunctionFilter
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from openai.types.chat import ChatCompletionToolParam
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
|
||||
@@ -20,7 +21,6 @@ from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.services.whisper import Model, WhisperSTTService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from openai.types.chat import ChatCompletionToolParam
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from runner import configure
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -28,7 +29,6 @@ from pipecat.transports.services.daily import (
|
||||
DailyTransport,
|
||||
DailyTransportMessageFrame,
|
||||
)
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -21,7 +22,6 @@ from pipecat.processors.user_idle_processor import UserIdleProcessor
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -14,6 +14,8 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from runner import configure
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.vad.vad_analyzer import VADParams
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
@@ -25,8 +27,6 @@ from pipecat.services.openai_realtime_beta import (
|
||||
TurnDetection,
|
||||
)
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.vad.vad_analyzer import VADParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
@@ -16,6 +16,8 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from runner import configure
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
@@ -26,8 +28,6 @@ from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.vad.vad_analyzer import VADParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
@@ -16,6 +16,8 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from runner import configure
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
@@ -29,8 +31,6 @@ from pipecat.services.openai_realtime_beta import (
|
||||
TurnDetection,
|
||||
)
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.vad.vad_analyzer import VADParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
@@ -16,6 +16,8 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from runner import configure
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
@@ -26,8 +28,6 @@ from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.anthropic import AnthropicLLMService
|
||||
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.vad.vad_analyzer import VADParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ import sys
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import (
|
||||
ImageRawFrame,
|
||||
OutputImageRawFrame,
|
||||
@@ -23,7 +24,6 @@ from pipecat.frames.frames import (
|
||||
UserImageRawFrame,
|
||||
UserImageRequestFrame,
|
||||
)
|
||||
|
||||
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -36,7 +36,6 @@ from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.moondream import MoondreamService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@ import os
|
||||
import sys
|
||||
import wave
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import OutputAudioRawFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -19,7 +20,6 @@ from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.openai import OpenAILLMContext, OpenAILLMContextFrame, OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ import sys
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
@@ -30,7 +31,6 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ from bs4 import BeautifulSoup
|
||||
from pypdf import PdfReader
|
||||
import tiktoken
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -19,7 +20,6 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import EndFrame, LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -16,7 +17,6 @@ from pipecat.transports.network.fastapi_websocket import (
|
||||
FastAPIWebsocketTransport,
|
||||
FastAPIWebsocketParams,
|
||||
)
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.serializers.twilio import TwilioFrameSerializer
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -8,6 +8,7 @@ import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -23,7 +24,6 @@ from pipecat.transports.network.websocket_server import (
|
||||
WebsocketServerParams,
|
||||
WebsocketServerTransport,
|
||||
)
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
0
src/pipecat/audio/vad/__init__.py
Normal file
0
src/pipecat/audio/vad/__init__.py
Normal file
0
src/pipecat/audio/vad/data/__init__.py
Normal file
0
src/pipecat/audio/vad/data/__init__.py
Normal file
245
src/pipecat/audio/vad/silero.py
Normal file
245
src/pipecat/audio/vad/silero.py
Normal file
@@ -0,0 +1,245 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADParams, VADState
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
Frame,
|
||||
StartInterruptionFrame,
|
||||
StopInterruptionFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
from loguru import logger
|
||||
|
||||
# How often (in seconds) the Silero model's internal state is reset; compared
# against elapsed wall-clock time in SileroVADAnalyzer.voice_confidence().
_MODEL_RESET_STATES_TIME = 5.0

# onnxruntime is an optional dependency: fail at import time with an
# actionable message instead of a bare ModuleNotFoundError later on.
try:
    import onnxruntime

except ModuleNotFoundError as e:
    logger.error(f"Exception: {e}")
    logger.error("In order to use Silero VAD, you need to `pip install pipecat-ai[silero]`.")
    # Chain explicitly so the traceback shows the missing module as the cause.
    raise Exception(f"Missing module(s): {e}") from e
|
||||
|
||||
|
||||
class SileroOnnxModel:
    """Stateful wrapper around an ONNX inference session for Silero VAD.

    Each call feeds the state returned by the previous call back into the
    session and prepends the last ``context_size`` samples of the previous
    input, so audio chunks must be supplied in order.
    """

    def __init__(self, path, force_onnx_cpu=True):
        """Create the inference session for the model stored at ``path``.

        Args:
            path: Location of the ``.onnx`` model file.
            force_onnx_cpu: When true and ``CPUExecutionProvider`` is
                available, restrict the session to that provider.
        """
        opts = onnxruntime.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1

        if force_onnx_cpu and "CPUExecutionProvider" in onnxruntime.get_available_providers():
            self.session = onnxruntime.InferenceSession(
                path, providers=["CPUExecutionProvider"], sess_options=opts
            )
        else:
            self.session = onnxruntime.InferenceSession(path, sess_options=opts)

        self.reset_states()
        self.sample_rates = [8000, 16000]

    def _validate_input(self, x, sr: int):
        """Normalize ``x`` to two dimensions and check it against ``sr``.

        Args:
            x: Audio samples, either 1-D or 2-D (batch, samples).
            sr: Sample rate of ``x``.

        Returns:
            The (possibly expanded) 2-D array and the sample rate.

        Raises:
            ValueError: If ``x`` has more than two dimensions, ``sr`` is not
                supported, or the chunk is empty or too short.
        """
        if np.ndim(x) == 1:
            x = np.expand_dims(x, 0)
        if np.ndim(x) > 2:
            # NumPy arrays have no ``.dim()``; use np.ndim so the intended
            # ValueError is raised instead of an AttributeError.
            raise ValueError(f"Too many dimensions for input audio chunk {np.ndim(x)}")

        if sr not in self.sample_rates:
            raise ValueError(f"Supported sampling rates: {self.sample_rates}")

        num_samples = np.shape(x)[1]
        # An empty chunk would otherwise divide by zero.
        if num_samples == 0 or sr / num_samples > 31.25:
            raise ValueError("Input audio chunk is too short")

        return x, sr

    def reset_states(self, batch_size=1):
        """Clear the model state, the audio context and the last-call bookkeeping."""
        self._state = np.zeros((2, batch_size, 128), dtype="float32")
        self._context = np.zeros((batch_size, 0), dtype="float32")
        self._last_sr = 0
        self._last_batch_size = 0

    def __call__(self, x, sr: int):
        """Run the model on one chunk of audio and return its raw output.

        Args:
            x: Audio samples; exactly 512 samples per row at 16000 Hz or 256
                at 8000 Hz.
            sr: Sample rate of ``x``.

        Returns:
            The first output of the ONNX session for this chunk.

        Raises:
            ValueError: If the input fails validation or has the wrong number
                of samples for ``sr``.
        """
        x, sr = self._validate_input(x, sr)
        num_samples = 512 if sr == 16000 else 256

        if np.shape(x)[-1] != num_samples:
            raise ValueError(
                f"Provided number of samples is {np.shape(x)[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)"
            )

        batch_size = np.shape(x)[0]
        context_size = 64 if sr == 16000 else 32

        # Start from a clean state on the first call after a reset, or when
        # the sample rate or batch size differs from the previous call.
        needs_reset = (
            not self._last_batch_size
            or (self._last_sr and self._last_sr != sr)
            or self._last_batch_size != batch_size
        )
        if needs_reset:
            self.reset_states(batch_size)

        if not np.shape(self._context)[1]:
            self._context = np.zeros((batch_size, context_size), dtype="float32")

        x = np.concatenate((self._context, x), axis=1)

        # ``sr`` was already validated above, so no further rate check is needed.
        ort_inputs = {"input": x, "state": self._state, "sr": np.array(sr, dtype="int64")}
        out, state = self.session.run(None, ort_inputs)
        self._state = state

        self._context = x[..., -context_size:]
        self._last_sr = sr
        self._last_batch_size = batch_size

        return out
|
||||
|
||||
|
||||
class SileroVADAnalyzer(VADAnalyzer):
    """``VADAnalyzer`` that scores audio with the bundled Silero ONNX model."""

    def __init__(self, *, sample_rate: int = 16000, params: VADParams = VADParams()):
        """Load the Silero model shipped in ``pipecat.audio.vad.data``.

        Args:
            sample_rate: Input sample rate; must be 16000 or 8000.
            params: VAD thresholds and timings forwarded to ``VADAnalyzer``.

        Raises:
            ValueError: If ``sample_rate`` is neither 16000 nor 8000.
        """
        # Reject unsupported rates before the base class derives its frame
        # counts from them.
        if sample_rate not in (8000, 16000):
            raise ValueError("Silero VAD sample rate needs to be 16000 or 8000")

        super().__init__(sample_rate=sample_rate, num_channels=1, params=params)

        logger.debug("Loading Silero VAD model...")

        model_name = "silero_vad.onnx"
        package_path = "pipecat.audio.vad.data"

        # Prefer the ``importlib_resources`` backport when installed and fall
        # back to the standard library. Catch ``Exception`` (not
        # ``BaseException``) so KeyboardInterrupt/SystemExit still propagate.
        try:
            import importlib_resources as impresources

            model_file_path = str(impresources.files(package_path).joinpath(model_name))
        except Exception:
            from importlib import resources as impresources

            model_file_path = str(impresources.files(package_path).joinpath(model_name))

        self._model = SileroOnnxModel(model_file_path, force_onnx_cpu=True)

        self._last_reset_time = 0

        logger.debug("Loaded Silero VAD")

    #
    # VADAnalyzer
    #

    def num_frames_required(self) -> int:
        """Return the number of samples the model needs per call (512 at 16 kHz, else 256)."""
        return 512 if self.sample_rate == 16000 else 256

    def voice_confidence(self, buffer) -> float:
        """Return the model's voice confidence for ``buffer``.

        Args:
            buffer: Raw audio bytes, interpreted as signed 16-bit samples.

        Returns:
            The first element of the model output, or 0 if analysis fails.
        """
        try:
            # Divide by 32768 because we have signed 16-bit data.
            audio_float32 = np.frombuffer(buffer, dtype=np.int16).astype(np.float32) / 32768.0
            new_confidence = self._model(audio_float32, self.sample_rate)[0]

            # We need to reset the model from time to time because it doesn't
            # really need all the data and memory will keep growing otherwise.
            curr_time = time.time()
            if curr_time - self._last_reset_time >= _MODEL_RESET_STATES_TIME:
                self._model.reset_states()
                self._last_reset_time = curr_time

            return new_confidence
        except Exception as e:
            # Per the original note, this typically comes from an empty audio array.
            logger.exception(f"Error analyzing audio with Silero VAD: {e}")
            return 0
|
||||
|
||||
|
||||
class SileroVAD(FrameProcessor):
    """Frame processor that converts Silero VAD state changes into speaking frames.

    Each ``AudioRawFrame`` is passed to a ``SileroVADAnalyzer``. When the
    reported state settles on SPEAKING or QUIET and differs from the last
    settled state, a ``UserStartedSpeakingFrame`` or
    ``UserStoppedSpeakingFrame`` is pushed, preceded by the matching
    interruption frame when interruptions are allowed.
    """

    def __init__(
        self,
        *,
        sample_rate: int = 16000,
        vad_params: VADParams = VADParams(),
        audio_passthrough: bool = False,
    ):
        """Set up the analyzer.

        Args:
            sample_rate: Sample rate handed to the analyzer.
            vad_params: VAD parameters handed to the analyzer.
            audio_passthrough: Whether audio frames are forwarded after
                being analyzed.
        """
        super().__init__()

        self._processor_vad_state: VADState = VADState.QUIET
        self._audio_passthrough = audio_passthrough
        self._vad_analyzer = SileroVADAnalyzer(sample_rate=sample_rate, params=vad_params)

    #
    # FrameProcessor
    #

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Analyze audio frames and forward frames downstream."""
        await super().process_frame(frame, direction)

        is_audio = isinstance(frame, AudioRawFrame)
        if is_audio:
            await self._analyze_audio(frame)

        # Audio is forwarded only with passthrough enabled; any other frame
        # is always forwarded.
        if not is_audio or self._audio_passthrough:
            await self.push_frame(frame, direction)

    #
    # Handle interruptions
    #

    async def _handle_interruptions(self, frame: Frame):
        """Push ``frame``, preceded by an interruption frame when allowed."""
        allowed = self.interruptions_allowed

        # Interruptions are signalled right away, ahead of the speaking frame.
        if allowed and isinstance(frame, UserStartedSpeakingFrame):
            logger.debug("User started speaking")
            await self._start_interruption()
            # Out-of-band push (i.e. not through the ordered push frame
            # task) so everything stops, especially at the output transport.
            await self.push_frame(StartInterruptionFrame())
        elif allowed and isinstance(frame, UserStoppedSpeakingFrame):
            logger.debug("User stopped speaking")
            await self._stop_interruption()
            await self.push_frame(StopInterruptionFrame())

        await self.push_frame(frame)

    async def _analyze_audio(self, frame: AudioRawFrame):
        """Run VAD on ``frame`` and emit a speaking frame on a settled state change."""
        new_vad_state = self._vad_analyzer.analyze_audio(frame.audio)

        # Only changes that land on QUIET or SPEAKING matter here; the
        # transitional states are ignored.
        if new_vad_state == self._processor_vad_state:
            return
        if new_vad_state in (VADState.STARTING, VADState.STOPPING):
            return

        frame_types = {
            VADState.SPEAKING: UserStartedSpeakingFrame,
            VADState.QUIET: UserStoppedSpeakingFrame,
        }
        frame_type = frame_types.get(new_vad_state)
        new_frame = frame_type() if frame_type else None

        if new_frame:
            await self._handle_interruptions(new_frame)

        self._processor_vad_state = new_vad_state
|
||||
129
src/pipecat/audio/vad/vad_analyzer.py
Normal file
129
src/pipecat/audio/vad/vad_analyzer.py
Normal file
@@ -0,0 +1,129 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
from abc import abstractmethod
|
||||
from enum import Enum
|
||||
|
||||
from loguru import logger
|
||||
from pydantic.main import BaseModel
|
||||
|
||||
from pipecat.utils.audio import calculate_audio_volume, exp_smoothing
|
||||
|
||||
|
||||
class VADState(Enum):
    """States of the voice-activity state machine driven by VADAnalyzer.analyze_audio()."""

    # No speech detected.
    QUIET = 1
    # Speech detected, but not yet for the configured start duration.
    STARTING = 2
    # Speech confirmed.
    SPEAKING = 3
    # Speech no longer detected, but not yet for the configured stop duration.
    STOPPING = 4
|
||||
|
||||
|
||||
class VADParams(BaseModel):
    """Thresholds and timings read by VADAnalyzer."""

    # Minimum voice confidence for a window to count as speech.
    confidence: float = 0.7
    # Seconds of consecutive speech before STARTING becomes SPEAKING.
    start_secs: float = 0.2
    # Seconds of consecutive non-speech before STOPPING becomes QUIET.
    stop_secs: float = 0.8
    # Minimum smoothed volume for a window to count as speech.
    min_volume: float = 0.6
|
||||
|
||||
|
||||
class VADAnalyzer:
|
||||
def __init__(self, *, sample_rate: int, num_channels: int, params: VADParams):
|
||||
self._sample_rate = sample_rate
|
||||
self._num_channels = num_channels
|
||||
|
||||
self.set_params(params)
|
||||
|
||||
self._vad_buffer = b""
|
||||
|
||||
# Volume exponential smoothing
|
||||
self._smoothing_factor = 0.2
|
||||
self._prev_volume = 0
|
||||
|
||||
@property
|
||||
def sample_rate(self):
|
||||
return self._sample_rate
|
||||
|
||||
@property
|
||||
def num_channels(self):
|
||||
return self._num_channels
|
||||
|
||||
@abstractmethod
|
||||
def num_frames_required(self) -> int:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def voice_confidence(self, buffer) -> float:
|
||||
pass
|
||||
|
||||
def set_params(self, params: VADParams):
|
||||
logger.info(f"Setting VAD params to: {params}")
|
||||
self._params = params
|
||||
self._vad_frames = self.num_frames_required()
|
||||
self._vad_frames_num_bytes = self._vad_frames * self._num_channels * 2
|
||||
|
||||
vad_frames_per_sec = self._vad_frames / self._sample_rate
|
||||
|
||||
self._vad_start_frames = round(self._params.start_secs / vad_frames_per_sec)
|
||||
self._vad_stop_frames = round(self._params.stop_secs / vad_frames_per_sec)
|
||||
self._vad_starting_count = 0
|
||||
self._vad_stopping_count = 0
|
||||
self._vad_state: VADState = VADState.QUIET
|
||||
|
||||
def _get_smoothed_volume(self, audio: bytes) -> float:
|
||||
volume = calculate_audio_volume(audio, self._sample_rate)
|
||||
return exp_smoothing(volume, self._prev_volume, self._smoothing_factor)
|
||||
|
||||
def analyze_audio(self, buffer) -> VADState:
|
||||
self._vad_buffer += buffer
|
||||
|
||||
num_required_bytes = self._vad_frames_num_bytes
|
||||
if len(self._vad_buffer) < num_required_bytes:
|
||||
return self._vad_state
|
||||
|
||||
audio_frames = self._vad_buffer[:num_required_bytes]
|
||||
self._vad_buffer = self._vad_buffer[num_required_bytes:]
|
||||
|
||||
confidence = self.voice_confidence(audio_frames)
|
||||
|
||||
volume = self._get_smoothed_volume(audio_frames)
|
||||
self._prev_volume = volume
|
||||
|
||||
speaking = confidence >= self._params.confidence and volume >= self._params.min_volume
|
||||
|
||||
if speaking:
|
||||
match self._vad_state:
|
||||
case VADState.QUIET:
|
||||
self._vad_state = VADState.STARTING
|
||||
self._vad_starting_count = 1
|
||||
case VADState.STARTING:
|
||||
self._vad_starting_count += 1
|
||||
case VADState.STOPPING:
|
||||
self._vad_state = VADState.SPEAKING
|
||||
self._vad_stopping_count = 0
|
||||
else:
|
||||
match self._vad_state:
|
||||
case VADState.STARTING:
|
||||
self._vad_state = VADState.QUIET
|
||||
self._vad_starting_count = 0
|
||||
case VADState.SPEAKING:
|
||||
self._vad_state = VADState.STOPPING
|
||||
self._vad_stopping_count = 1
|
||||
case VADState.STOPPING:
|
||||
self._vad_stopping_count += 1
|
||||
|
||||
if (
|
||||
self._vad_state == VADState.STARTING
|
||||
and self._vad_starting_count >= self._vad_start_frames
|
||||
):
|
||||
self._vad_state = VADState.SPEAKING
|
||||
self._vad_starting_count = 0
|
||||
|
||||
if (
|
||||
self._vad_state == VADState.STOPPING
|
||||
and self._vad_stopping_count >= self._vad_stop_frames
|
||||
):
|
||||
self._vad_state = VADState.QUIET
|
||||
self._vad_stopping_count = 0
|
||||
|
||||
return self._vad_state
|
||||
@@ -7,12 +7,12 @@
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.clocks.base_clock import BaseClock
|
||||
from pipecat.metrics.metrics import MetricsData
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.time import nanoseconds_to_str
|
||||
from pipecat.utils.utils import obj_count, obj_id
|
||||
from pipecat.vad.vad_analyzer import VADParams
|
||||
|
||||
|
||||
def format_pts(pts: int | None):
|
||||
|
||||
@@ -9,6 +9,7 @@ from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADState
|
||||
from pipecat.frames.frames import (
|
||||
BotInterruptionFrame,
|
||||
CancelFrame,
|
||||
@@ -25,7 +26,6 @@ from pipecat.frames.frames import (
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.transports.base_transport import TransportParams
|
||||
from pipecat.vad.vad_analyzer import VADAnalyzer, VADState
|
||||
|
||||
|
||||
class BaseInputTransport(FrameProcessor):
|
||||
|
||||
@@ -12,8 +12,8 @@ from abc import ABC, abstractmethod
|
||||
from pydantic import ConfigDict
|
||||
from pydantic.main import BaseModel
|
||||
|
||||
from pipecat.audio.vad.vad_analyzer import VADAnalyzer
|
||||
from pipecat.processors.frame_processor import FrameProcessor
|
||||
from pipecat.vad.vad_analyzer import VADAnalyzer
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ from daily import (
|
||||
from loguru import logger
|
||||
from pydantic.main import BaseModel
|
||||
|
||||
from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADParams
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
@@ -43,7 +44,6 @@ from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.base_input import BaseInputTransport
|
||||
from pipecat.transports.base_output import BaseOutputTransport
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.vad.vad_analyzer import VADAnalyzer, VADParams
|
||||
|
||||
try:
|
||||
from daily import CallClient, Daily, EventHandler
|
||||
|
||||
@@ -13,6 +13,7 @@ from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
from scipy import signal
|
||||
|
||||
from pipecat.audio.vad.vad_analyzer import VADAnalyzer
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
CancelFrame,
|
||||
@@ -28,7 +29,6 @@ from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.transports.base_input import BaseInputTransport
|
||||
from pipecat.transports.base_output import BaseOutputTransport
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.vad.vad_analyzer import VADAnalyzer
|
||||
|
||||
try:
|
||||
from livekit import rtc
|
||||
|
||||
@@ -4,242 +4,8 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
Frame,
|
||||
StartInterruptionFrame,
|
||||
StopInterruptionFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.vad.vad_analyzer import VADAnalyzer, VADParams, VADState
|
||||
|
||||
from loguru import logger
|
||||
|
||||
# How often (in seconds) the Silero model's internal state is reset; compared
# against elapsed wall-clock time in SileroVADAnalyzer.voice_confidence().
_MODEL_RESET_STATES_TIME = 5.0

logger.warning("DEPRECATED: Package `pipecat.vad` is deprecated, use `pipecat.audio.vad` instead.")

# onnxruntime is an optional dependency: fail at import time with an
# actionable message instead of a bare ModuleNotFoundError later on.
try:
    import onnxruntime

except ModuleNotFoundError as e:
    logger.error(f"Exception: {e}")
    logger.error("In order to use Silero VAD, you need to `pip install pipecat-ai[silero]`.")
    # Chain explicitly so the traceback shows the missing module as the cause.
    raise Exception(f"Missing module(s): {e}") from e
|
||||
|
||||
|
||||
class SileroOnnxModel:
    """Stateful wrapper around an ONNX inference session for Silero VAD.

    Each call feeds the state returned by the previous call back into the
    session and prepends the last ``context_size`` samples of the previous
    input, so audio chunks must be supplied in order.
    """

    def __init__(self, path, force_onnx_cpu=True):
        """Create the inference session for the model stored at ``path``.

        Args:
            path: Location of the ``.onnx`` model file.
            force_onnx_cpu: When true and ``CPUExecutionProvider`` is
                available, restrict the session to that provider.
        """
        opts = onnxruntime.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1

        if force_onnx_cpu and "CPUExecutionProvider" in onnxruntime.get_available_providers():
            self.session = onnxruntime.InferenceSession(
                path, providers=["CPUExecutionProvider"], sess_options=opts
            )
        else:
            self.session = onnxruntime.InferenceSession(path, sess_options=opts)

        self.reset_states()
        self.sample_rates = [8000, 16000]

    def _validate_input(self, x, sr: int):
        """Normalize ``x`` to two dimensions and check it against ``sr``.

        Args:
            x: Audio samples, either 1-D or 2-D (batch, samples).
            sr: Sample rate of ``x``.

        Returns:
            The (possibly expanded) 2-D array and the sample rate.

        Raises:
            ValueError: If ``x`` has more than two dimensions, ``sr`` is not
                supported, or the chunk is empty or too short.
        """
        if np.ndim(x) == 1:
            x = np.expand_dims(x, 0)
        if np.ndim(x) > 2:
            # NumPy arrays have no ``.dim()``; use np.ndim so the intended
            # ValueError is raised instead of an AttributeError.
            raise ValueError(f"Too many dimensions for input audio chunk {np.ndim(x)}")

        if sr not in self.sample_rates:
            raise ValueError(f"Supported sampling rates: {self.sample_rates}")

        num_samples = np.shape(x)[1]
        # An empty chunk would otherwise divide by zero.
        if num_samples == 0 or sr / num_samples > 31.25:
            raise ValueError("Input audio chunk is too short")

        return x, sr

    def reset_states(self, batch_size=1):
        """Clear the model state, the audio context and the last-call bookkeeping."""
        self._state = np.zeros((2, batch_size, 128), dtype="float32")
        self._context = np.zeros((batch_size, 0), dtype="float32")
        self._last_sr = 0
        self._last_batch_size = 0

    def __call__(self, x, sr: int):
        """Run the model on one chunk of audio and return its raw output.

        Args:
            x: Audio samples; exactly 512 samples per row at 16000 Hz or 256
                at 8000 Hz.
            sr: Sample rate of ``x``.

        Returns:
            The first output of the ONNX session for this chunk.

        Raises:
            ValueError: If the input fails validation or has the wrong number
                of samples for ``sr``.
        """
        x, sr = self._validate_input(x, sr)
        num_samples = 512 if sr == 16000 else 256

        if np.shape(x)[-1] != num_samples:
            raise ValueError(
                f"Provided number of samples is {np.shape(x)[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)"
            )

        batch_size = np.shape(x)[0]
        context_size = 64 if sr == 16000 else 32

        # Start from a clean state on the first call after a reset, or when
        # the sample rate or batch size differs from the previous call.
        needs_reset = (
            not self._last_batch_size
            or (self._last_sr and self._last_sr != sr)
            or self._last_batch_size != batch_size
        )
        if needs_reset:
            self.reset_states(batch_size)

        if not np.shape(self._context)[1]:
            self._context = np.zeros((batch_size, context_size), dtype="float32")

        x = np.concatenate((self._context, x), axis=1)

        # ``sr`` was already validated above, so no further rate check is needed.
        ort_inputs = {"input": x, "state": self._state, "sr": np.array(sr, dtype="int64")}
        out, state = self.session.run(None, ort_inputs)
        self._state = state

        self._context = x[..., -context_size:]
        self._last_sr = sr
        self._last_batch_size = batch_size

        return out
|
||||
|
||||
|
||||
class SileroVADAnalyzer(VADAnalyzer):
    """``VADAnalyzer`` that scores audio with the bundled Silero ONNX model."""

    def __init__(self, *, sample_rate: int = 16000, params: VADParams = VADParams()):
        """Load the Silero model shipped in ``pipecat.vad.data``.

        Args:
            sample_rate: Input sample rate; must be 16000 or 8000.
            params: VAD thresholds and timings forwarded to ``VADAnalyzer``.

        Raises:
            ValueError: If ``sample_rate`` is neither 16000 nor 8000.
        """
        # Reject unsupported rates before the base class derives its frame
        # counts from them.
        if sample_rate not in (8000, 16000):
            raise ValueError("Silero VAD sample rate needs to be 16000 or 8000")

        super().__init__(sample_rate=sample_rate, num_channels=1, params=params)

        logger.debug("Loading Silero VAD model...")

        model_name = "silero_vad.onnx"
        package_path = "pipecat.vad.data"

        # Prefer the ``importlib_resources`` backport when installed and fall
        # back to the standard library. Catch ``Exception`` (not
        # ``BaseException``) so KeyboardInterrupt/SystemExit still propagate.
        try:
            import importlib_resources as impresources

            model_file_path = str(impresources.files(package_path).joinpath(model_name))
        except Exception:
            from importlib import resources as impresources

            model_file_path = str(impresources.files(package_path).joinpath(model_name))

        self._model = SileroOnnxModel(model_file_path, force_onnx_cpu=True)

        self._last_reset_time = 0

        logger.debug("Loaded Silero VAD")

    #
    # VADAnalyzer
    #

    def num_frames_required(self) -> int:
        """Return the number of samples the model needs per call (512 at 16 kHz, else 256)."""
        return 512 if self.sample_rate == 16000 else 256

    def voice_confidence(self, buffer) -> float:
        """Return the model's voice confidence for ``buffer``.

        Args:
            buffer: Raw audio bytes, interpreted as signed 16-bit samples.

        Returns:
            The first element of the model output, or 0 if analysis fails.
        """
        try:
            # Divide by 32768 because we have signed 16-bit data.
            audio_float32 = np.frombuffer(buffer, dtype=np.int16).astype(np.float32) / 32768.0
            new_confidence = self._model(audio_float32, self.sample_rate)[0]

            # We need to reset the model from time to time because it doesn't
            # really need all the data and memory will keep growing otherwise.
            curr_time = time.time()
            if curr_time - self._last_reset_time >= _MODEL_RESET_STATES_TIME:
                self._model.reset_states()
                self._last_reset_time = curr_time

            return new_confidence
        except Exception as e:
            # Per the original note, this typically comes from an empty audio array.
            logger.exception(f"Error analyzing audio with Silero VAD: {e}")
            return 0
|
||||
|
||||
|
||||
class SileroVAD(FrameProcessor):
    """Frame processor that converts Silero VAD state changes into speaking frames.

    Each ``AudioRawFrame`` is passed to a ``SileroVADAnalyzer``. When the
    reported state settles on SPEAKING or QUIET and differs from the last
    settled state, a ``UserStartedSpeakingFrame`` or
    ``UserStoppedSpeakingFrame`` is pushed, preceded by the matching
    interruption frame when interruptions are allowed.
    """

    def __init__(
        self,
        *,
        sample_rate: int = 16000,
        vad_params: VADParams = VADParams(),
        audio_passthrough: bool = False,
    ):
        """Set up the analyzer.

        Args:
            sample_rate: Sample rate handed to the analyzer.
            vad_params: VAD parameters handed to the analyzer.
            audio_passthrough: Whether audio frames are forwarded after
                being analyzed.
        """
        super().__init__()

        self._processor_vad_state: VADState = VADState.QUIET
        self._audio_passthrough = audio_passthrough
        self._vad_analyzer = SileroVADAnalyzer(sample_rate=sample_rate, params=vad_params)

    #
    # FrameProcessor
    #

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Analyze audio frames and forward frames downstream."""
        await super().process_frame(frame, direction)

        is_audio = isinstance(frame, AudioRawFrame)
        if is_audio:
            await self._analyze_audio(frame)

        # Audio is forwarded only with passthrough enabled; any other frame
        # is always forwarded.
        if not is_audio or self._audio_passthrough:
            await self.push_frame(frame, direction)

    #
    # Handle interruptions
    #

    async def _handle_interruptions(self, frame: Frame):
        """Push ``frame``, preceded by an interruption frame when allowed."""
        allowed = self.interruptions_allowed

        # Interruptions are signalled right away, ahead of the speaking frame.
        if allowed and isinstance(frame, UserStartedSpeakingFrame):
            logger.debug("User started speaking")
            await self._start_interruption()
            # Out-of-band push (i.e. not through the ordered push frame
            # task) so everything stops, especially at the output transport.
            await self.push_frame(StartInterruptionFrame())
        elif allowed and isinstance(frame, UserStoppedSpeakingFrame):
            logger.debug("User stopped speaking")
            await self._stop_interruption()
            await self.push_frame(StopInterruptionFrame())

        await self.push_frame(frame)

    async def _analyze_audio(self, frame: AudioRawFrame):
        """Run VAD on ``frame`` and emit a speaking frame on a settled state change."""
        new_vad_state = self._vad_analyzer.analyze_audio(frame.audio)

        # Only changes that land on QUIET or SPEAKING matter here; the
        # transitional states are ignored.
        if new_vad_state == self._processor_vad_state:
            return
        if new_vad_state in (VADState.STARTING, VADState.STOPPING):
            return

        frame_types = {
            VADState.SPEAKING: UserStartedSpeakingFrame,
            VADState.QUIET: UserStoppedSpeakingFrame,
        }
        frame_type = frame_types.get(new_vad_state)
        new_frame = frame_type() if frame_type else None

        if new_frame:
            await self._handle_interruptions(new_frame)

        self._processor_vad_state = new_vad_state
|
||||
from ..audio.vad.silero import SileroVAD, SileroVADAnalyzer
|
||||
|
||||
@@ -4,126 +4,8 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
from abc import abstractmethod
|
||||
from enum import Enum
|
||||
|
||||
from loguru import logger
|
||||
from pydantic.main import BaseModel
|
||||
|
||||
from pipecat.utils.audio import calculate_audio_volume, exp_smoothing
|
||||
logger.warning("DEPRECATED: Package `pipecat.vad` is deprecated, use `pipecat.audio.vad` instead.")
|
||||
|
||||
|
||||
class VADState(Enum):
    """States of the voice-activity state machine driven by VADAnalyzer.analyze_audio()."""

    # No speech detected.
    QUIET = 1
    # Speech detected, but not yet for the configured start duration.
    STARTING = 2
    # Speech confirmed.
    SPEAKING = 3
    # Speech no longer detected, but not yet for the configured stop duration.
    STOPPING = 4
|
||||
|
||||
|
||||
class VADParams(BaseModel):
    """Thresholds and timings read by VADAnalyzer."""

    # Minimum voice confidence for a window to count as speech.
    confidence: float = 0.7
    # Seconds of consecutive speech before STARTING becomes SPEAKING.
    start_secs: float = 0.2
    # Seconds of consecutive non-speech before STOPPING becomes QUIET.
    stop_secs: float = 0.8
    # Minimum smoothed volume for a window to count as speech.
    min_volume: float = 0.6
|
||||
|
||||
|
||||
class VADAnalyzer:
|
||||
def __init__(self, *, sample_rate: int, num_channels: int, params: VADParams):
|
||||
self._sample_rate = sample_rate
|
||||
self._num_channels = num_channels
|
||||
|
||||
self.set_params(params)
|
||||
|
||||
self._vad_buffer = b""
|
||||
|
||||
# Volume exponential smoothing
|
||||
self._smoothing_factor = 0.2
|
||||
self._prev_volume = 0
|
||||
|
||||
@property
|
||||
def sample_rate(self):
|
||||
return self._sample_rate
|
||||
|
||||
@property
|
||||
def num_channels(self):
|
||||
return self._num_channels
|
||||
|
||||
@abstractmethod
|
||||
def num_frames_required(self) -> int:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def voice_confidence(self, buffer) -> float:
|
||||
pass
|
||||
|
||||
def set_params(self, params: VADParams):
|
||||
logger.info(f"Setting VAD params to: {params}")
|
||||
self._params = params
|
||||
self._vad_frames = self.num_frames_required()
|
||||
self._vad_frames_num_bytes = self._vad_frames * self._num_channels * 2
|
||||
|
||||
vad_frames_per_sec = self._vad_frames / self._sample_rate
|
||||
|
||||
self._vad_start_frames = round(self._params.start_secs / vad_frames_per_sec)
|
||||
self._vad_stop_frames = round(self._params.stop_secs / vad_frames_per_sec)
|
||||
self._vad_starting_count = 0
|
||||
self._vad_stopping_count = 0
|
||||
self._vad_state: VADState = VADState.QUIET
|
||||
|
||||
def _get_smoothed_volume(self, audio: bytes) -> float:
|
||||
volume = calculate_audio_volume(audio, self._sample_rate)
|
||||
return exp_smoothing(volume, self._prev_volume, self._smoothing_factor)
|
||||
|
||||
def analyze_audio(self, buffer) -> VADState:
|
||||
self._vad_buffer += buffer
|
||||
|
||||
num_required_bytes = self._vad_frames_num_bytes
|
||||
if len(self._vad_buffer) < num_required_bytes:
|
||||
return self._vad_state
|
||||
|
||||
audio_frames = self._vad_buffer[:num_required_bytes]
|
||||
self._vad_buffer = self._vad_buffer[num_required_bytes:]
|
||||
|
||||
confidence = self.voice_confidence(audio_frames)
|
||||
|
||||
volume = self._get_smoothed_volume(audio_frames)
|
||||
self._prev_volume = volume
|
||||
|
||||
speaking = confidence >= self._params.confidence and volume >= self._params.min_volume
|
||||
|
||||
if speaking:
|
||||
match self._vad_state:
|
||||
case VADState.QUIET:
|
||||
self._vad_state = VADState.STARTING
|
||||
self._vad_starting_count = 1
|
||||
case VADState.STARTING:
|
||||
self._vad_starting_count += 1
|
||||
case VADState.STOPPING:
|
||||
self._vad_state = VADState.SPEAKING
|
||||
self._vad_stopping_count = 0
|
||||
else:
|
||||
match self._vad_state:
|
||||
case VADState.STARTING:
|
||||
self._vad_state = VADState.QUIET
|
||||
self._vad_starting_count = 0
|
||||
case VADState.SPEAKING:
|
||||
self._vad_state = VADState.STOPPING
|
||||
self._vad_stopping_count = 1
|
||||
case VADState.STOPPING:
|
||||
self._vad_stopping_count += 1
|
||||
|
||||
if (
|
||||
self._vad_state == VADState.STARTING
|
||||
and self._vad_starting_count >= self._vad_start_frames
|
||||
):
|
||||
self._vad_state = VADState.SPEAKING
|
||||
self._vad_starting_count = 0
|
||||
|
||||
if (
|
||||
self._vad_state == VADState.STOPPING
|
||||
and self._vad_stopping_count >= self._vad_stop_frames
|
||||
):
|
||||
self._vad_state = VADState.QUIET
|
||||
self._vad_stopping_count = 0
|
||||
|
||||
return self._vad_state
|
||||
from ..audio.vad.vad_analyzer import VADAnalyzer, VADParams, VADState
|
||||
|
||||
Reference in New Issue
Block a user