Merge pull request #469 from pipecat-ai/lewis/remove_torch_dependency
Remove torch dependency for using silero_vad
This commit is contained in:
@@ -54,7 +54,7 @@ moondream = [ "einops~=0.8.0", "timm~=1.0.8", "transformers~=4.44.0" ]
|
||||
openai = [ "openai~=1.37.2" ]
|
||||
openpipe = [ "openpipe~=4.24.0" ]
|
||||
playht = [ "pyht~=0.0.28" ]
|
||||
silero = [ "silero-vad~=5.1" ]
|
||||
silero = [ "onnxruntime>=1.16.1" ]
|
||||
together = [ "together~=1.2.7" ]
|
||||
websocket = [ "websockets~=12.0", "fastapi~=0.112.1" ]
|
||||
whisper = [ "faster-whisper~=1.0.3" ]
|
||||
|
||||
0
src/pipecat/vad/data/__init__.py
Normal file
0
src/pipecat/vad/data/__init__.py
Normal file
BIN
src/pipecat/vad/data/silero_vad.onnx
Normal file
BIN
src/pipecat/vad/data/silero_vad.onnx
Normal file
Binary file not shown.
@@ -14,17 +14,94 @@ from pipecat.vad.vad_analyzer import VADAnalyzer, VADParams, VADState
|
||||
|
||||
from loguru import logger
|
||||
|
||||
# How often should we reset internal model state
|
||||
_MODEL_RESET_STATES_TIME = 5.0
|
||||
|
||||
try:
|
||||
from silero_vad import load_silero_vad
|
||||
import torch
|
||||
import onnxruntime
|
||||
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Silero VAD, you need to `pip install pipecat-ai[silero]`.")
|
||||
raise Exception(f"Missing module(s): {e}")
|
||||
|
||||
# How often should we reset internal model state
|
||||
_MODEL_RESET_STATES_TIME = 5.0
|
||||
|
||||
class SileroOnnxModel():
    """Thin ONNX Runtime wrapper that runs the Silero VAD model on NumPy audio.

    The wrapper keeps two pieces of rolling data between calls: the model's
    recurrent ``state`` tensor and a short ``context`` window holding the tail
    of the previously processed input, which is prepended to the next chunk.
    Both are cleared by :meth:`reset_states`.
    """

    def __init__(self, path, force_onnx_cpu=True):
        """Create the inference session and initialise the rolling state.

        Args:
            path: Location of the ``.onnx`` model file, passed straight to
                ``onnxruntime.InferenceSession``.
            force_onnx_cpu: When true and the CPU execution provider is
                available, the session is pinned to ``CPUExecutionProvider``;
                otherwise ONNX Runtime picks its default providers.
        """
        # The other methods refer to ``np`` as a module-level name, so the
        # import is published globally. The ``global`` declaration comes
        # first so the binding is unambiguous.
        global np
        import numpy as np

        # One thread for both inter- and intra-op work.
        opts = onnxruntime.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1

        if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
            self.session = onnxruntime.InferenceSession(
                path, providers=['CPUExecutionProvider'], sess_options=opts)
        else:
            self.session = onnxruntime.InferenceSession(path, sess_options=opts)

        self.reset_states()
        self.sample_rates = [8000, 16000]

    def _validate_input(self, x, sr: int):
        """Normalise ``x`` to 2-D and check it against the supported formats.

        Args:
            x: Audio chunk; a 1-D input is promoted to a batch of one.
            sr: Sample rate in Hz; must be one of ``self.sample_rates``.

        Returns:
            The ``(x, sr)`` pair, with ``x`` having two dimensions.

        Raises:
            ValueError: If ``x`` has more than two dimensions, ``sr`` is not
                supported, or the chunk is empty or too short for ``sr``.
        """
        if np.ndim(x) == 1:
            x = np.expand_dims(x, 0)
        if np.ndim(x) > 2:
            # ``x`` is a NumPy array here, so use np.ndim (not torch's .dim()).
            raise ValueError(f"Too many dimensions for input audio chunk {np.ndim(x)}")

        if sr not in self.sample_rates:
            raise ValueError(f"Supported sampling rates: {self.sample_rates}")

        # sr / samples > 31.25 means fewer than sr / 31.25 samples
        # (512 at 16 kHz, 256 at 8 kHz). An empty chunk is also too short and
        # must not reach the division.
        chunk_len = np.shape(x)[1]
        if chunk_len == 0 or sr / chunk_len > 31.25:
            raise ValueError("Input audio chunk is too short")

        return x, sr

    def reset_states(self, batch_size=1):
        """Clear the recurrent state, the context window and the bookkeeping.

        Args:
            batch_size: Batch dimension used for the freshly zeroed buffers.
        """
        self._state = np.zeros((2, batch_size, 128), dtype='float32')
        # Zero-width context marks "no context yet"; __call__ allocates it
        # lazily once the sample rate (and thus the context size) is known.
        self._context = np.zeros((batch_size, 0), dtype='float32')
        self._last_sr = 0
        self._last_batch_size = 0

    def __call__(self, x, sr: int):
        """Run the model on a single fixed-size audio chunk.

        Args:
            x: Audio chunk with exactly 512 samples per row at 16000 Hz or
                256 samples per row at 8000 Hz (1-D or ``(batch, samples)``).
            sr: Sample rate in Hz.

        Returns:
            The first output of the ONNX session for this chunk (presumably
            the speech probability per batch row -- confirm against the
            model's documentation).

        Raises:
            ValueError: If the input fails validation or does not contain the
                exact number of samples required for ``sr``.
        """
        x, sr = self._validate_input(x, sr)
        num_samples = 512 if sr == 16000 else 256

        if np.shape(x)[-1] != num_samples:
            raise ValueError(
                f"Provided number of samples is {np.shape(x)[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)")

        batch_size = np.shape(x)[0]
        context_size = 64 if sr == 16000 else 32

        # Start from a clean slate on the first call after a reset, or when
        # the stream's sample rate or batch size changes.
        first_call = not self._last_batch_size
        sr_changed = bool(self._last_sr) and self._last_sr != sr
        batch_changed = self._last_batch_size != batch_size
        if first_call or sr_changed or batch_changed:
            self.reset_states(batch_size)

        if not np.shape(self._context)[1]:
            self._context = np.zeros((batch_size, context_size), dtype='float32')

        # Prepend the tail of the previous input so the model sees continuity.
        x = np.concatenate((self._context, x), axis=1)

        # The sample rate was already validated above, so no second check.
        ort_inputs = {'input': x, 'state': self._state, 'sr': np.array(sr, dtype='int64')}
        out, state = self.session.run(None, ort_inputs)
        self._state = state

        self._context = x[..., -context_size:]
        self._last_sr = sr
        self._last_batch_size = batch_size

        return out
|
||||
|
||||
|
||||
class SileroVADAnalyzer(VADAnalyzer):
|
||||
@@ -41,7 +118,21 @@ class SileroVADAnalyzer(VADAnalyzer):
|
||||
|
||||
logger.debug("Loading Silero VAD model...")
|
||||
|
||||
self._model = load_silero_vad()
|
||||
model_name = 'silero_vad.onnx'
|
||||
package_path = "pipecat.vad.data"
|
||||
|
||||
try:
|
||||
import importlib_resources as impresources
|
||||
model_file_path = str(impresources.files(package_path).joinpath(model_name))
|
||||
except BaseException:
|
||||
from importlib import resources as impresources
|
||||
try:
|
||||
with impresources.path(package_path, model_name) as f:
|
||||
model_file_path = f
|
||||
except BaseException:
|
||||
model_file_path = str(impresources.files(package_path).joinpath(model_name))
|
||||
|
||||
self._model = SileroOnnxModel(model_file_path, force_onnx_cpu=True)
|
||||
|
||||
self._last_reset_time = 0
|
||||
|
||||
@@ -59,7 +150,7 @@ class SileroVADAnalyzer(VADAnalyzer):
|
||||
audio_int16 = np.frombuffer(buffer, np.int16)
|
||||
# Divide by 32768 because we have signed 16-bit data.
|
||||
audio_float32 = np.frombuffer(audio_int16, dtype=np.int16).astype(np.float32) / 32768.0
|
||||
new_confidence = self._model(torch.from_numpy(audio_float32), self.sample_rate).item()
|
||||
new_confidence = self._model(audio_float32, self.sample_rate)[0]
|
||||
|
||||
# We need to reset the model from time to time because it doesn't
|
||||
# really need all the data and memory will keep growing otherwise.
|
||||
|
||||
Reference in New Issue
Block a user