add min speech duration.

This commit is contained in:
Gökmen Görgen
2026-01-19 17:33:27 +01:00
parent abebcf37bd
commit 5fd43faec3
3 changed files with 37 additions and 10 deletions

View File

@@ -62,7 +62,9 @@ transport_params = {
lambda aic: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=aic.create_vad_analyzer(speech_hold_duration=0.05, sensitivity=6.0),
vad_analyzer=aic.create_vad_analyzer(
speech_hold_duration=0.05, minimum_speech_duration=0.0, sensitivity=6.0
),
audio_in_filter=aic,
)
)(_create_aic_filter()),
@@ -70,7 +72,9 @@ transport_params = {
lambda aic: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=aic.create_vad_analyzer(speech_hold_duration=0.05, sensitivity=6.0),
vad_analyzer=aic.create_vad_analyzer(
speech_hold_duration=0.05, minimum_speech_duration=0.0, sensitivity=6.0
),
audio_in_filter=aic,
)
)(_create_aic_filter()),
@@ -78,7 +82,9 @@ transport_params = {
lambda aic: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=aic.create_vad_analyzer(speech_hold_duration=0.05, sensitivity=6.0),
vad_analyzer=aic.create_vad_analyzer(
speech_hold_duration=0.05, minimum_speech_duration=0.0, sensitivity=6.0
),
audio_in_filter=aic,
)
)(_create_aic_filter()),

View File

@@ -121,6 +121,7 @@ class AICFilter(BaseAudioFilter):
self,
*,
speech_hold_duration: Optional[float] = None,
minimum_speech_duration: Optional[float] = None,
sensitivity: Optional[float] = None,
):
"""Return an analyzer that will lazily instantiate the AIC VAD when ready.
@@ -129,6 +130,9 @@ class AICFilter(BaseAudioFilter):
- speech_hold_duration:
How long VAD continues detecting after speech ends (in seconds).
Range: 0.0 .. 20x model window length, Default (SDK): 0.05s
- minimum_speech_duration:
Minimum duration of speech required before VAD reports speech detected
(in seconds). Range: 0.0 .. 20x model window length, Default (SDK): 0.0s
- sensitivity:
Energy threshold sensitivity. Energy threshold = 10 ** (-sensitivity).
Range: 1.0 .. 15.0, Default (SDK): 6.0
@@ -136,6 +140,8 @@ class AICFilter(BaseAudioFilter):
Args:
speech_hold_duration: Optional speech hold duration to configure on the VAD.
If None, SDK default (0.05s) is used.
minimum_speech_duration: Optional minimum speech duration before VAD reports
speech detected. If None, SDK default (0.0s) is used.
sensitivity: Optional sensitivity (energy threshold) to configure on the VAD.
Range: 1.0 .. 15.0. If None, SDK default (6.0) is used.
@@ -146,6 +152,7 @@ class AICFilter(BaseAudioFilter):
return AICVADAnalyzer(
vad_context_factory=lambda: self.get_vad_context(),
speech_hold_duration=speech_hold_duration,
minimum_speech_duration=minimum_speech_duration,
sensitivity=sensitivity,
)

View File

@@ -10,6 +10,7 @@ Classes:
from typing import Any, Callable, Optional
from aic_sdk import VadParameter
from loguru import logger
from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADParams
@@ -29,6 +30,10 @@ class AICVADAnalyzer(VADAnalyzer):
no longer contains speech (in seconds).
Range: 0.0 .. 20x model window length
Default (SDK): 0.05s (50ms)
- minimum_speech_duration:
Minimum duration of speech required before VAD reports speech detected (in seconds).
Range: 0.0 .. 20x model window length
Default (SDK): 0.0s
- sensitivity:
Controls the energy threshold sensitivity. Higher values make the detector
less sensitive (require more energy to count as speech).
@@ -37,7 +42,7 @@ class AICVADAnalyzer(VADAnalyzer):
Default (SDK): 6.0
.. note::
This class requires aic-sdk >= 2.0.0 (uses 'aic_sdk' module).
This class requires aic-sdk ~= 2.0.0 (uses 'aic_sdk' module).
"""
def __init__(
@@ -45,6 +50,7 @@ class AICVADAnalyzer(VADAnalyzer):
*,
vad_context_factory: Optional[Callable[[], Any]] = None,
speech_hold_duration: Optional[float] = None,
minimum_speech_duration: Optional[float] = None,
sensitivity: Optional[float] = None,
):
"""Create an AIC VAD analyzer.
@@ -58,6 +64,11 @@ class AICVADAnalyzer(VADAnalyzer):
Optional override for AIC VAD speech hold duration (in seconds).
Range: 0.0 .. 20x model window length.
If None, the SDK default (0.05s) is used.
minimum_speech_duration:
Optional override for minimum speech duration before VAD reports
speech detected (in seconds).
Range: 0.0 .. 20x model window length.
If None, the SDK default (0.0s) is used.
sensitivity:
Optional override for AIC VAD sensitivity (energy threshold).
Range: 1.0 .. 15.0. Energy threshold = 10 ** (-sensitivity).
@@ -66,9 +77,11 @@ class AICVADAnalyzer(VADAnalyzer):
# Use fixed VAD parameters for AIC: no user override
fixed_params = VADParams(confidence=0.5, start_secs=0.0, stop_secs=0.0, min_volume=0.0)
super().__init__(sample_rate=None, params=fixed_params)
self._vad_context_factory = vad_context_factory
self._vad_ctx: Optional[Any] = None
self._pending_speech_hold_duration: Optional[float] = speech_hold_duration
self._pending_minimum_speech_duration: Optional[float] = minimum_speech_duration
self._pending_sensitivity: Optional[float] = sensitivity
def bind_vad_context_factory(self, vad_context_factory: Callable[[], Any]):
@@ -78,19 +91,20 @@ class AICVADAnalyzer(VADAnalyzer):
def _apply_vad_params(self):
"""Apply optional AIC VAD parameters if available."""
from aic_sdk import VadParameter
if self._vad_ctx is None or VadParameter is None:
return
try:
if self._pending_speech_hold_duration is not None:
self._vad_ctx.set_parameter(
VadParameter.SpeechHoldDuration, float(self._pending_speech_hold_duration)
VadParameter.SpeechHoldDuration, self._pending_speech_hold_duration
)
if self._pending_minimum_speech_duration is not None:
self._vad_ctx.set_parameter(
VadParameter.MinimumSpeechDuration, self._pending_minimum_speech_duration
)
if self._pending_sensitivity is not None:
self._vad_ctx.set_parameter(
VadParameter.Sensitivity, float(self._pending_sensitivity)
)
self._vad_ctx.set_parameter(VadParameter.Sensitivity, self._pending_sensitivity)
except Exception as e: # noqa: BLE001
logger.debug(f"AIC VAD parameter application deferred/failed: {e}")