Fix duplicate / inconsistent EOU detection
This commit is contained in:
@@ -85,8 +85,8 @@ class DuplexPipeline:
|
|||||||
|
|
||||||
# Initialize EOU detector
|
# Initialize EOU detector
|
||||||
self.eou_detector = EouDetector(
|
self.eou_detector = EouDetector(
|
||||||
silence_threshold_ms=600,
|
silence_threshold_ms=settings.vad_eou_threshold_ms,
|
||||||
min_speech_duration_ms=200
|
min_speech_duration_ms=settings.vad_min_speech_duration_ms
|
||||||
)
|
)
|
||||||
|
|
||||||
# Initialize services
|
# Initialize services
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ from typing import Tuple, Optional
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from processors.eou import EouDetector
|
|
||||||
|
|
||||||
# Try to import onnxruntime (optional for VAD functionality)
|
# Try to import onnxruntime (optional for VAD functionality)
|
||||||
try:
|
try:
|
||||||
@@ -160,25 +159,19 @@ class VADProcessor:
|
|||||||
Tracks speech/silence state and emits events on transitions.
|
Tracks speech/silence state and emits events on transitions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, vad_model: SileroVAD, threshold: float = 0.5,
|
def __init__(self, vad_model: SileroVAD, threshold: float = 0.5):
|
||||||
silence_threshold_ms: int = 1000, min_speech_duration_ms: int = 250):
|
|
||||||
"""
|
"""
|
||||||
Initialize VAD processor.
|
Initialize VAD processor.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vad_model: Silero VAD model instance
|
vad_model: Silero VAD model instance
|
||||||
threshold: Speech detection threshold
|
threshold: Speech detection threshold
|
||||||
silence_threshold_ms: EOU silence threshold in ms (longer = one EOU across short pauses)
|
|
||||||
min_speech_duration_ms: EOU min speech duration in ms (ignore very short noises)
|
|
||||||
"""
|
"""
|
||||||
self.vad = vad_model
|
self.vad = vad_model
|
||||||
self.threshold = threshold
|
self.threshold = threshold
|
||||||
self._eou_silence_ms = silence_threshold_ms
|
|
||||||
self._eou_min_speech_ms = min_speech_duration_ms
|
|
||||||
self.is_speaking = False
|
self.is_speaking = False
|
||||||
self.speech_start_time: Optional[float] = None
|
self.speech_start_time: Optional[float] = None
|
||||||
self.silence_start_time: Optional[float] = None
|
self.silence_start_time: Optional[float] = None
|
||||||
self.eou_detector = EouDetector(silence_threshold_ms, min_speech_duration_ms)
|
|
||||||
|
|
||||||
def process(self, pcm_bytes: bytes, chunk_size_ms: int = 20) -> Optional[Tuple[str, float]]:
|
def process(self, pcm_bytes: bytes, chunk_size_ms: int = 20) -> Optional[Tuple[str, float]]:
|
||||||
"""
|
"""
|
||||||
@@ -196,10 +189,6 @@ class VADProcessor:
|
|||||||
# Check if this is speech based on threshold
|
# Check if this is speech based on threshold
|
||||||
is_speech = probability >= self.threshold
|
is_speech = probability >= self.threshold
|
||||||
|
|
||||||
# Check EOU
|
|
||||||
if self.eou_detector.process("Speech" if is_speech else "Silence"):
|
|
||||||
return ("eou", probability)
|
|
||||||
|
|
||||||
# State transition: Silence -> Speech
|
# State transition: Silence -> Speech
|
||||||
if is_speech and not self.is_speaking:
|
if is_speech and not self.is_speaking:
|
||||||
self.is_speaking = True
|
self.is_speaking = True
|
||||||
@@ -222,4 +211,3 @@ class VADProcessor:
|
|||||||
self.is_speaking = False
|
self.is_speaking = False
|
||||||
self.speech_start_time = None
|
self.speech_start_time = None
|
||||||
self.silence_start_time = None
|
self.silence_start_time = None
|
||||||
self.eou_detector = EouDetector(self._eou_silence_ms, self._eou_min_speech_ms)
|
|
||||||
|
|||||||
Reference in New Issue
Block a user