diff --git a/data/response.wav b/data/response.wav new file mode 100644 index 0000000..ecf1f9a Binary files /dev/null and b/data/response.wav differ diff --git a/processors/vad.py b/processors/vad.py index 65bb9ce..1c938fa 100644 --- a/processors/vad.py +++ b/processors/vad.py @@ -82,8 +82,20 @@ class SileroVAD: Tuple of (label, probability) where label is "Speech" or "Silence" """ if self.session is None or not ONNX_AVAILABLE: - # If model not loaded or onnxruntime not available, assume speech - return "Speech", 1.0 + # Fallback energy-based VAD when model isn't available. + # Map RMS energy to a pseudo-probability so the existing threshold works. + if not pcm_bytes: + return "Silence", 0.0 + audio_int16 = np.frombuffer(pcm_bytes, dtype=np.int16) + if audio_int16.size == 0: + return "Silence", 0.0 + audio_float = audio_int16.astype(np.float32) / 32768.0 + rms = float(np.sqrt(np.mean(audio_float * audio_float))) + # Typical speech RMS is ~0.02-0.05 at 16-bit normalized scale. + # Normalize so threshold=0.5 roughly corresponds to ~0.025 RMS. + probability = min(1.0, rms / 0.05) + label = "Speech" if probability >= 0.5 else "Silence" + return label, probability # Convert bytes to numpy array of int16 audio_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)