From 4ceb3ec96f7608e47819cd6209e0964958787b3a Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Fri, 6 Feb 2026 07:23:31 +0800 Subject: [PATCH] Fix Duplicate / inconsistent EOU --- core/duplex_pipeline.py | 4 ++-- processors/vad.py | 14 +------------- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/core/duplex_pipeline.py b/core/duplex_pipeline.py index 3f12c7c..8a0211f 100644 --- a/core/duplex_pipeline.py +++ b/core/duplex_pipeline.py @@ -85,8 +85,8 @@ class DuplexPipeline: # Initialize EOU detector self.eou_detector = EouDetector( - silence_threshold_ms=600, - min_speech_duration_ms=200 + silence_threshold_ms=settings.vad_eou_threshold_ms, + min_speech_duration_ms=settings.vad_min_speech_duration_ms ) # Initialize services diff --git a/processors/vad.py b/processors/vad.py index 1c938fa..c0256d4 100644 --- a/processors/vad.py +++ b/processors/vad.py @@ -6,7 +6,6 @@ from typing import Tuple, Optional import numpy as np from loguru import logger -from processors.eou import EouDetector # Try to import onnxruntime (optional for VAD functionality) try: @@ -160,25 +159,19 @@ class VADProcessor: Tracks speech/silence state and emits events on transitions. """ - def __init__(self, vad_model: SileroVAD, threshold: float = 0.5, - silence_threshold_ms: int = 1000, min_speech_duration_ms: int = 250): + def __init__(self, vad_model: SileroVAD, threshold: float = 0.5): """ Initialize VAD processor. Args: vad_model: Silero VAD model instance threshold: Speech detection threshold - silence_threshold_ms: EOU silence threshold in ms (longer = one EOU across short pauses) - min_speech_duration_ms: EOU min speech duration in ms (ignore very short noises) """ self.vad = vad_model self.threshold = threshold - self._eou_silence_ms = silence_threshold_ms - self._eou_min_speech_ms = min_speech_duration_ms self.is_speaking = False self.speech_start_time: Optional[float] = None self.silence_start_time: Optional[float] = None - self.eou_detector = EouDetector(silence_threshold_ms, min_speech_duration_ms) def process(self, pcm_bytes: bytes, chunk_size_ms: int = 20) -> Optional[Tuple[str, float]]: """ @@ -196,10 +189,6 @@ class VADProcessor: # Check if this is speech based on threshold is_speech = probability >= self.threshold - # Check EOU - if self.eou_detector.process("Speech" if is_speech else "Silence"): - return ("eou", probability) - # State transition: Silence -> Speech if is_speech and not self.is_speaking: self.is_speaking = True @@ -222,4 +211,3 @@ class VADProcessor: self.is_speaking = False self.speech_start_time = None self.silence_start_time = None - self.eou_detector = EouDetector(self._eou_silence_ms, self._eou_min_speech_ms)