From 7be8fda4244b34b7281b79e570bd8abae66554e3 Mon Sep 17 00:00:00 2001
From: Xin Wang <wx44wx@XindeMac-mini.local>
Date: Fri, 6 Feb 2026 11:36:39 +0800
Subject: [PATCH] Fix missing EOU for microphone speech and clear chat history with log

---
 examples/web_client.html |  9 ++++++++-
 processors/vad.py        | 18 +++++++++++++-----
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/examples/web_client.html b/examples/web_client.html
index 0d9b6f7..bee3d28 100644
--- a/examples/web_client.html
+++ b/examples/web_client.html
@@ -719,7 +719,14 @@
         sendCommand({ command: "chat", text });
         chatInput.value = "";
       });
-      clearLogBtn.addEventListener("click", () => (logEl.innerHTML = ""));
+      clearLogBtn.addEventListener("click", () => {
+        logEl.innerHTML = "";
+        chatHistory.innerHTML = "";
+        setInterim("You", "");
+        setInterim("AI", "");
+        interimUserText = "";
+        interimAiText = "";
+      });
       inputSelect.addEventListener("change", () => {
         if (micStream) {
           stopMic();
diff --git a/processors/vad.py b/processors/vad.py
index c0256d4..cad6e8b 100644
--- a/processors/vad.py
+++ b/processors/vad.py
@@ -63,6 +63,7 @@ class SileroVAD:
         self.min_chunk_size = 512
         self.last_label = "Silence"
         self.last_probability = 0.0
+        self._energy_noise_floor = 1e-4
 
     def _reset_state(self):
         # Silero VAD V4+ expects state shape [2, 1, 128]
@@ -81,8 +82,7 @@ class SileroVAD:
             Tuple of (label, probability) where label is "Speech" or "Silence"
         """
         if self.session is None or not ONNX_AVAILABLE:
-            # Fallback energy-based VAD when model isn't available.
-            # Map RMS energy to a pseudo-probability so the existing threshold works.
+            # Fallback energy-based VAD with adaptive noise floor.
             if not pcm_bytes:
                 return "Silence", 0.0
             audio_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
@@ -90,9 +90,17 @@ class SileroVAD:
                 return "Silence", 0.0
             audio_float = audio_int16.astype(np.float32) / 32768.0
             rms = float(np.sqrt(np.mean(audio_float * audio_float)))
-            # Typical speech RMS is ~0.02-0.05 at 16-bit normalized scale.
-            # Normalize so threshold=0.5 roughly corresponds to ~0.025 RMS.
-            probability = min(1.0, rms / 0.05)
+
+            # Update adaptive noise floor (rises slowly, falls quickly)
+            if rms < self._energy_noise_floor:
+                self._energy_noise_floor = 0.95 * self._energy_noise_floor + 0.05 * rms
+            else:
+                self._energy_noise_floor = 0.995 * self._energy_noise_floor + 0.005 * rms
+
+            # Compute SNR-like ratio and map to probability
+            denom = max(self._energy_noise_floor, 1e-6)
+            snr = max(0.0, (rms - denom) / denom)
+            probability = min(1.0, snr / 3.0)  # snr >= 3 (rms >= 4x noise floor) => probability 1.0
             label = "Speech" if probability >= 0.5 else "Silence"
             return label, probability