81 lines
2.8 KiB
Python
81 lines
2.8 KiB
Python
"""End-of-Utterance Detection."""
|
|
|
|
import time
|
|
from typing import Optional
|
|
|
|
|
|
class EouDetector:
    """Detect end of utterance (EOU) from a stream of VAD statuses.

    An EOU fires only after silence has lasted continuously for
    ``silence_threshold_ms`` following a speech segment of at least
    ``min_speech_duration_ms``. Any speech frame clears the silence timer, so
    a short pause between sentences does not end the turn and at most one EOU
    is emitted per turn.

    Attributes:
        threshold: Required continuous silence, in seconds.
        min_speech: Minimum speech duration for a turn to count, in seconds.
        is_speaking: True while a turn is in progress (speech has started and
            neither an EOU nor a too-short-speech discard has ended it).
        speech_start_time: Timestamp of the first speech frame of the turn.
        silence_start_time: Timestamp of the first silence frame of the
            current pause, or None when no pause is being timed.
        triggered: True once an EOU has fired; cleared when a new turn starts.
    """

    def __init__(self, silence_threshold_ms: int = 1000, min_speech_duration_ms: int = 250):
        """Initialize the detector.

        Args:
            silence_threshold_ms: How long silence must last to trigger EOU
                (default 1000 ms).
            min_speech_duration_ms: Minimum speech duration for the turn to be
                considered valid (default 250 ms).
        """
        # Internal comparisons are done in seconds; the raw millisecond values
        # are kept alongside for callers that want to inspect the configuration.
        self.threshold = silence_threshold_ms / 1000.0
        self.min_speech = min_speech_duration_ms / 1000.0
        self._silence_threshold_ms = silence_threshold_ms
        self._min_speech_duration_ms = min_speech_duration_ms

        # State
        self.is_speaking = False
        self.speech_start_time = 0.0
        self.silence_start_time: Optional[float] = None
        self.triggered = False

    def process(self, vad_status: str, now: Optional[float] = None) -> bool:
        """Consume one VAD status and report whether the utterance just ended.

        Args:
            vad_status: "Speech" or "Silence" (from VAD). Any other value is
                ignored and yields False.
            now: Timestamp of this frame in seconds. When omitted, the
                monotonic clock is read. Pass explicit timestamps (for example
                derived from the audio stream position) for offline or
                deterministic processing; do not mix explicit and implicit
                timestamps on the same detector, since they come from
                different time bases.

        Returns:
            True exactly when this call completes an utterance, False
            otherwise.
        """
        if now is None:
            # Intervals must be measured with a clock that cannot jump;
            # wall-clock time (time.time) can move in either direction when
            # the system clock is adjusted.
            now = time.monotonic()

        if vad_status == "Speech":
            if not self.is_speaking:
                # First speech frame of a new turn.
                self.is_speaking = True
                self.speech_start_time = now
                self.triggered = False
            # Any speech resets the silence timer: short pause + more speech
            # is still one utterance.
            self.silence_start_time = None
            return False

        # Silence outside a turn, or an unrecognized status: nothing to do.
        if vad_status != "Silence" or not self.is_speaking:
            return False

        if self.silence_start_time is None:
            # First silence frame of this pause starts the timer.
            self.silence_start_time = now

        # Speech is measured from turn start to the beginning of this pause.
        speech_duration = self.silence_start_time - self.speech_start_time
        if speech_duration < self.min_speech:
            # Too short to be a real utterance: drop the turn without an EOU.
            self.is_speaking = False
            self.silence_start_time = None
            return False

        silence_duration = now - self.silence_start_time
        if silence_duration >= self.threshold and not self.triggered:
            # Turn is over; clear turn state so further silence stays quiet
            # until speech starts again.
            self.triggered = True
            self.is_speaking = False
            self.silence_start_time = None
            return True

        return False

    def reset(self) -> None:
        """Reset EOU detector state (configuration is left untouched)."""
        self.is_speaking = False
        self.speech_start_time = 0.0
        self.silence_start_time = None
        self.triggered = False
|