From 7be8fda4244b34b7281b79e570bd8abae66554e3 Mon Sep 17 00:00:00 2001
From: Xin Wang
Date: Fri, 6 Feb 2026 11:36:39 +0800
Subject: [PATCH] Fix microphone talk eou missing and clean chat log

---
 examples/web_client.html |  9 ++++++++-
 processors/vad.py        | 18 +++++++++++++-----
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/examples/web_client.html b/examples/web_client.html
index 0d9b6f7..bee3d28 100644
--- a/examples/web_client.html
+++ b/examples/web_client.html
@@ -719,7 +719,14 @@
       sendCommand({ command: "chat", text });
       chatInput.value = "";
     });
-    clearLogBtn.addEventListener("click", () => (logEl.innerHTML = ""));
+    clearLogBtn.addEventListener("click", () => {
+      logEl.innerHTML = "";
+      chatHistory.innerHTML = "";
+      setInterim("You", "");
+      setInterim("AI", "");
+      interimUserText = "";
+      interimAiText = "";
+    });
     inputSelect.addEventListener("change", () => {
       if (micStream) {
         stopMic();
diff --git a/processors/vad.py b/processors/vad.py
index c0256d4..cad6e8b 100644
--- a/processors/vad.py
+++ b/processors/vad.py
@@ -63,6 +63,7 @@ class SileroVAD:
         self.min_chunk_size = 512
         self.last_label = "Silence"
         self.last_probability = 0.0
+        self._energy_noise_floor = 1e-4
 
     def _reset_state(self):
         # Silero VAD V4+ expects state shape [2, 1, 128]
@@ -81,8 +82,7 @@ class SileroVAD:
             Tuple of (label, probability) where label is "Speech" or "Silence"
         """
         if self.session is None or not ONNX_AVAILABLE:
-            # Fallback energy-based VAD when model isn't available.
-            # Map RMS energy to a pseudo-probability so the existing threshold works.
+            # Fallback energy-based VAD with adaptive noise floor.
             if not pcm_bytes:
                 return "Silence", 0.0
             audio_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
@@ -90,9 +90,17 @@ class SileroVAD:
                 return "Silence", 0.0
             audio_float = audio_int16.astype(np.float32) / 32768.0
             rms = float(np.sqrt(np.mean(audio_float * audio_float)))
-            # Typical speech RMS is ~0.02-0.05 at 16-bit normalized scale.
-            # Normalize so threshold=0.5 roughly corresponds to ~0.025 RMS.
-            probability = min(1.0, rms / 0.05)
+
+            # Update adaptive noise floor (slowly rises, faster to fall)
+            if rms < self._energy_noise_floor:
+                self._energy_noise_floor = 0.95 * self._energy_noise_floor + 0.05 * rms
+            else:
+                self._energy_noise_floor = 0.995 * self._energy_noise_floor + 0.005 * rms
+
+            # Compute SNR-like ratio and map to probability
+            denom = max(self._energy_noise_floor, 1e-6)
+            snr = max(0.0, (rms - denom) / denom)
+            probability = min(1.0, snr / 3.0)  # ~3x above noise => strong speech
             label = "Speech" if probability >= 0.5 else "Silence"
             return label, probability
 