Fix missing EOU for microphone talk and clear chat log
This commit is contained in:
@@ -719,7 +719,14 @@
|
|||||||
sendCommand({ command: "chat", text });
|
sendCommand({ command: "chat", text });
|
||||||
chatInput.value = "";
|
chatInput.value = "";
|
||||||
});
|
});
|
||||||
clearLogBtn.addEventListener("click", () => (logEl.innerHTML = ""));
|
clearLogBtn.addEventListener("click", () => {
|
||||||
|
logEl.innerHTML = "";
|
||||||
|
chatHistory.innerHTML = "";
|
||||||
|
setInterim("You", "");
|
||||||
|
setInterim("AI", "");
|
||||||
|
interimUserText = "";
|
||||||
|
interimAiText = "";
|
||||||
|
});
|
||||||
inputSelect.addEventListener("change", () => {
|
inputSelect.addEventListener("change", () => {
|
||||||
if (micStream) {
|
if (micStream) {
|
||||||
stopMic();
|
stopMic();
|
||||||
|
|||||||
@@ -63,6 +63,7 @@ class SileroVAD:
|
|||||||
self.min_chunk_size = 512
|
self.min_chunk_size = 512
|
||||||
self.last_label = "Silence"
|
self.last_label = "Silence"
|
||||||
self.last_probability = 0.0
|
self.last_probability = 0.0
|
||||||
|
self._energy_noise_floor = 1e-4
|
||||||
|
|
||||||
def _reset_state(self):
|
def _reset_state(self):
|
||||||
# Silero VAD V4+ expects state shape [2, 1, 128]
|
# Silero VAD V4+ expects state shape [2, 1, 128]
|
||||||
@@ -81,8 +82,7 @@ class SileroVAD:
|
|||||||
Tuple of (label, probability) where label is "Speech" or "Silence"
|
Tuple of (label, probability) where label is "Speech" or "Silence"
|
||||||
"""
|
"""
|
||||||
if self.session is None or not ONNX_AVAILABLE:
|
if self.session is None or not ONNX_AVAILABLE:
|
||||||
# Fallback energy-based VAD when model isn't available.
|
# Fallback energy-based VAD with adaptive noise floor.
|
||||||
# Map RMS energy to a pseudo-probability so the existing threshold works.
|
|
||||||
if not pcm_bytes:
|
if not pcm_bytes:
|
||||||
return "Silence", 0.0
|
return "Silence", 0.0
|
||||||
audio_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
|
audio_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
|
||||||
@@ -90,9 +90,17 @@ class SileroVAD:
|
|||||||
return "Silence", 0.0
|
return "Silence", 0.0
|
||||||
audio_float = audio_int16.astype(np.float32) / 32768.0
|
audio_float = audio_int16.astype(np.float32) / 32768.0
|
||||||
rms = float(np.sqrt(np.mean(audio_float * audio_float)))
|
rms = float(np.sqrt(np.mean(audio_float * audio_float)))
|
||||||
# Typical speech RMS is ~0.02-0.05 at 16-bit normalized scale.
|
|
||||||
# Normalize so threshold=0.5 roughly corresponds to ~0.025 RMS.
|
# Update adaptive noise floor (slowly rises, faster to fall)
|
||||||
probability = min(1.0, rms / 0.05)
|
if rms < self._energy_noise_floor:
|
||||||
|
self._energy_noise_floor = 0.95 * self._energy_noise_floor + 0.05 * rms
|
||||||
|
else:
|
||||||
|
self._energy_noise_floor = 0.995 * self._energy_noise_floor + 0.005 * rms
|
||||||
|
|
||||||
|
# Compute SNR-like ratio and map to probability
|
||||||
|
denom = max(self._energy_noise_floor, 1e-6)
|
||||||
|
snr = max(0.0, (rms - denom) / denom)
|
||||||
|
probability = min(1.0, snr / 3.0) # ~3x above noise => strong speech
|
||||||
label = "Speech" if probability >= 0.5 else "Silence"
|
label = "Speech" if probability >= 0.5 else "Silence"
|
||||||
return label, probability
|
return label, probability
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user