Add energy-based VAD fallback
This commit is contained in:
BIN
data/response.wav
Normal file
BIN
data/response.wav
Normal file
Binary file not shown.
@@ -82,8 +82,20 @@ class SileroVAD:
|
|||||||
Tuple of (label, probability) where label is "Speech" or "Silence"
|
Tuple of (label, probability) where label is "Speech" or "Silence"
|
||||||
"""
|
"""
|
||||||
if self.session is None or not ONNX_AVAILABLE:
|
if self.session is None or not ONNX_AVAILABLE:
|
||||||
# If model not loaded or onnxruntime not available, assume speech
|
# Fallback energy-based VAD when model isn't available.
|
||||||
return "Speech", 1.0
|
# Map RMS energy to a pseudo-probability so the existing threshold works.
|
||||||
|
if not pcm_bytes:
|
||||||
|
return "Silence", 0.0
|
||||||
|
audio_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
|
||||||
|
if audio_int16.size == 0:
|
||||||
|
return "Silence", 0.0
|
||||||
|
audio_float = audio_int16.astype(np.float32) / 32768.0
|
||||||
|
rms = float(np.sqrt(np.mean(audio_float * audio_float)))
|
||||||
|
# Typical speech RMS is ~0.02-0.05 at 16-bit normalized scale.
|
||||||
|
# Normalize so threshold=0.5 roughly corresponds to ~0.025 RMS.
|
||||||
|
probability = min(1.0, rms / 0.05)
|
||||||
|
label = "Speech" if probability >= 0.5 else "Silence"
|
||||||
|
return label, probability
|
||||||
|
|
||||||
# Convert bytes to numpy array of int16
|
# Convert bytes to numpy array of int16
|
||||||
audio_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
|
audio_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
|
||||||
|
|||||||
Reference in New Issue
Block a user