Rework audio idle detection with timestamp-based adaptive sleep
Replaces the per-frame asyncio.Event signaling with a monotonic timestamp updated on each audio frame. The handler sleeps until the next deadline (last_audio_time + timeout), recomputing on each wake-up to account for audio arriving during sleep. This avoids waking the handler on every audio frame (~50/s at 20ms chunks), and guarantees detection latency is bounded by timeout rather than 2 * timeout. Also renames audio_starvation_timeout to audio_idle_timeout and associated identifiers for consistency with existing pipecat naming (user_idle_timeout, etc.).
This commit is contained in:
1
changelog/4244.fixed.md
Normal file
1
changelog/4244.fixed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Fixed `VADController` getting stuck in the `SPEAKING` state when audio frames stop arriving mid-speech (e.g. user mutes mic). A new `audio_idle_timeout` parameter (default 1s, set to 0 to disable) forces a transition back to `QUIET` and emits `on_speech_stopped` when no audio is received while speaking.
|
||||
@@ -39,8 +39,8 @@ class VADController(BaseObject):
|
||||
Event handlers available:
|
||||
|
||||
- on_speech_started: Called when speech begins.
|
||||
- on_speech_stopped: Called when speech ends, including forced stop on
|
||||
audio starvation (no frames received while speaking).
|
||||
- on_speech_stopped: Called when speech ends, including forced stop when
|
||||
the audio stream goes idle (no frames received while speaking).
|
||||
- on_speech_activity: Called periodically while speech is detected.
|
||||
- on_push_frame: Called when the controller wants to push a frame.
|
||||
- on_broadcast_frame: Called when the controller wants to broadcast a frame.
|
||||
@@ -73,7 +73,7 @@ class VADController(BaseObject):
|
||||
vad_analyzer: VADAnalyzer,
|
||||
*,
|
||||
speech_activity_period: float = 0.2,
|
||||
audio_starvation_timeout: float = 1.0,
|
||||
audio_idle_timeout: float = 1.0,
|
||||
):
|
||||
"""Initialize the VAD controller.
|
||||
|
||||
@@ -81,7 +81,7 @@ class VADController(BaseObject):
|
||||
vad_analyzer: The `VADAnalyzer` instance for processing audio.
|
||||
speech_activity_period: Minimum interval in seconds between
|
||||
`on_speech_activity` events. Defaults to 0.2.
|
||||
audio_starvation_timeout: Timeout in seconds to force speech stop
|
||||
audio_idle_timeout: Timeout in seconds to force speech stop
|
||||
when no audio frames are received while in SPEAKING state.
|
||||
This handles cases like mic mute mid-speech.
|
||||
Set to 0 to disable. Defaults to 1.0.
|
||||
@@ -90,18 +90,19 @@ class VADController(BaseObject):
|
||||
self._vad_analyzer = vad_analyzer
|
||||
self._vad_state: VADState = VADState.QUIET
|
||||
|
||||
self._task_manager: Optional[BaseTaskManager] = None
|
||||
|
||||
# Last time a on_speech_activity was triggered.
|
||||
self._speech_activity_time = 0
|
||||
# How often a on_speech_activity event should be triggered (value should
|
||||
# be greater than the audio chunks to have any effect).
|
||||
self._speech_activity_period = speech_activity_period
|
||||
|
||||
# Audio starvation detection: force speech stop when no audio arrives
|
||||
# Audio idle detection: force speech stop when no audio arrives
|
||||
# while in SPEAKING state (e.g. user mutes mic mid-speech).
|
||||
self._audio_starvation_timeout = audio_starvation_timeout
|
||||
self._task_manager: Optional[BaseTaskManager] = None
|
||||
self._audio_received_event = asyncio.Event()
|
||||
self._starvation_task: Optional[asyncio.Task] = None
|
||||
self._last_audio_time: float = 0.0
|
||||
self._audio_idle_timeout = audio_idle_timeout
|
||||
self._audio_idle_task: Optional[asyncio.Task] = None
|
||||
|
||||
self._register_event_handler("on_speech_started", sync=True)
|
||||
self._register_event_handler("on_speech_stopped", sync=True)
|
||||
@@ -116,11 +117,11 @@ class VADController(BaseObject):
|
||||
task_manager: The task manager to be associated with this instance.
|
||||
"""
|
||||
self._task_manager = task_manager
|
||||
self._audio_received_event.clear()
|
||||
if self._audio_starvation_timeout > 0 and not self._starvation_task:
|
||||
self._starvation_task = self._task_manager.create_task(
|
||||
self._audio_starvation_handler(),
|
||||
f"{self}::_audio_starvation_handler",
|
||||
self._last_audio_time = time.monotonic()
|
||||
if self._audio_idle_timeout > 0 and not self._audio_idle_task:
|
||||
self._audio_idle_task = self._task_manager.create_task(
|
||||
self._audio_idle_handler(),
|
||||
f"{self}::_audio_idle_handler",
|
||||
)
|
||||
|
||||
async def process_frame(self, frame: Frame):
|
||||
@@ -153,9 +154,9 @@ class VADController(BaseObject):
|
||||
before returning.
|
||||
"""
|
||||
await super().cleanup()
|
||||
if self._starvation_task and self._task_manager:
|
||||
await self._task_manager.cancel_task(self._starvation_task)
|
||||
self._starvation_task = None
|
||||
if self._audio_idle_task and self._task_manager:
|
||||
await self._task_manager.cancel_task(self._audio_idle_task)
|
||||
self._audio_idle_task = None
|
||||
if self._vad_analyzer:
|
||||
await self._vad_analyzer.cleanup()
|
||||
|
||||
@@ -168,7 +169,7 @@ class VADController(BaseObject):
|
||||
Args:
|
||||
frame: Audio frame to process.
|
||||
"""
|
||||
self._audio_received_event.set()
|
||||
self._last_audio_time = time.monotonic()
|
||||
|
||||
self._vad_state = await self._handle_vad(frame.audio, self._vad_state)
|
||||
|
||||
@@ -191,25 +192,28 @@ class VADController(BaseObject):
|
||||
vad_state = new_vad_state
|
||||
return vad_state
|
||||
|
||||
async def _audio_starvation_handler(self):
|
||||
"""Monitor for audio starvation while in SPEAKING state.
|
||||
async def _audio_idle_handler(self):
|
||||
"""Monitor for an idle audio stream while in SPEAKING state.
|
||||
|
||||
When no audio frames arrive for `audio_starvation_timeout` seconds
|
||||
When no audio frames arrive for `audio_idle_timeout` seconds
|
||||
(e.g. user mutes mic mid-speech), forces a transition to QUIET and
|
||||
emits `on_speech_stopped`.
|
||||
"""
|
||||
while True:
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
self._audio_received_event.wait(),
|
||||
timeout=self._audio_starvation_timeout,
|
||||
)
|
||||
self._audio_received_event.clear()
|
||||
except asyncio.TimeoutError:
|
||||
if self._vad_state == VADState.SPEAKING:
|
||||
logger.warning(f"{self}: no audio received while speaking, forcing speech stop")
|
||||
self._vad_state = VADState.QUIET
|
||||
await self._call_event_handler("on_speech_stopped")
|
||||
deadline = self._last_audio_time + self._audio_idle_timeout
|
||||
remaining = deadline - time.monotonic()
|
||||
if remaining > 0:
|
||||
# Audio is still recent; sleep only for the remaining window.
|
||||
await asyncio.sleep(remaining)
|
||||
continue
|
||||
|
||||
if self._vad_state == VADState.SPEAKING:
|
||||
logger.warning(f"{self}: no audio received while speaking, forcing speech stop")
|
||||
self._vad_state = VADState.QUIET
|
||||
await self._call_event_handler("on_speech_stopped")
|
||||
|
||||
# Wait for the next potential idle window.
|
||||
await asyncio.sleep(self._audio_idle_timeout)
|
||||
|
||||
async def _maybe_speech_activity(self):
|
||||
"""Handle user speaking frame."""
|
||||
|
||||
@@ -107,7 +107,7 @@ class LLMUserAggregatorParams:
|
||||
has been idle (not speaking) for this duration. Set to 0 to disable
|
||||
idle detection.
|
||||
vad_analyzer: Voice Activity Detection analyzer instance.
|
||||
audio_starvation_timeout: Timeout in seconds to force speech stop when
|
||||
audio_idle_timeout: Timeout in seconds to force speech stop when
|
||||
no audio frames are received while in SPEAKING state (e.g. user mutes
|
||||
mic mid-speech). Set to 0 to disable. Defaults to 1.0.
|
||||
filter_incomplete_user_turns: Whether to filter out incomplete user turns.
|
||||
@@ -124,7 +124,7 @@ class LLMUserAggregatorParams:
|
||||
user_turn_stop_timeout: float = 5.0
|
||||
user_idle_timeout: float = 0
|
||||
vad_analyzer: Optional[VADAnalyzer] = None
|
||||
audio_starvation_timeout: float = 1.0
|
||||
audio_idle_timeout: float = 1.0
|
||||
filter_incomplete_user_turns: bool = False
|
||||
user_turn_completion_config: Optional[UserTurnCompletionConfig] = None
|
||||
|
||||
@@ -473,7 +473,7 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
if self._params.vad_analyzer:
|
||||
self._vad_controller = VADController(
|
||||
self._params.vad_analyzer,
|
||||
audio_starvation_timeout=self._params.audio_starvation_timeout,
|
||||
audio_idle_timeout=self._params.audio_idle_timeout,
|
||||
)
|
||||
self._vad_controller.add_event_handler("on_speech_started", self._on_vad_speech_started)
|
||||
self._vad_controller.add_event_handler("on_speech_stopped", self._on_vad_speech_stopped)
|
||||
|
||||
@@ -46,7 +46,7 @@ class VADProcessor(FrameProcessor):
|
||||
*,
|
||||
vad_analyzer: VADAnalyzer,
|
||||
speech_activity_period: float = 0.2,
|
||||
audio_starvation_timeout: float = 1.0,
|
||||
audio_idle_timeout: float = 1.0,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the VAD processor.
|
||||
@@ -55,7 +55,7 @@ class VADProcessor(FrameProcessor):
|
||||
vad_analyzer: The VADAnalyzer instance for processing audio.
|
||||
speech_activity_period: Minimum interval in seconds between
|
||||
UserSpeakingFrame pushes. Defaults to 0.2.
|
||||
audio_starvation_timeout: Timeout in seconds to force speech stop
|
||||
audio_idle_timeout: Timeout in seconds to force speech stop
|
||||
when no audio frames are received while in SPEAKING state.
|
||||
Set to 0 to disable. Defaults to 1.0.
|
||||
**kwargs: Additional arguments passed to parent class.
|
||||
@@ -64,7 +64,7 @@ class VADProcessor(FrameProcessor):
|
||||
self._vad_controller = VADController(
|
||||
vad_analyzer,
|
||||
speech_activity_period=speech_activity_period,
|
||||
audio_starvation_timeout=audio_starvation_timeout,
|
||||
audio_idle_timeout=audio_idle_timeout,
|
||||
)
|
||||
|
||||
# Push VAD frames when speech events are detected
|
||||
|
||||
@@ -208,18 +208,18 @@ class TestVADController(unittest.IsolatedAsyncioTestCase):
|
||||
self.assertIsInstance(broadcast_calls[0][1]["vad_params"], VADParams)
|
||||
|
||||
|
||||
AUDIO_STARVATION_TIMEOUT = 0.1
|
||||
AUDIO_IDLE_TIMEOUT = 0.1
|
||||
|
||||
|
||||
class TestVADControllerStarvation(unittest.IsolatedAsyncioTestCase):
|
||||
class TestVADControllerAudioIdle(unittest.IsolatedAsyncioTestCase):
|
||||
async def asyncSetUp(self):
|
||||
self.task_manager = TaskManager()
|
||||
self.task_manager.setup(TaskManagerParams(loop=asyncio.get_running_loop()))
|
||||
|
||||
async def test_audio_starvation_forces_speech_stop(self):
|
||||
async def test_audio_idle_forces_speech_stop(self):
|
||||
"""Test that on_speech_stopped fires when no audio arrives while SPEAKING."""
|
||||
analyzer = MockVADAnalyzer()
|
||||
controller = VADController(analyzer, audio_starvation_timeout=AUDIO_STARVATION_TIMEOUT)
|
||||
controller = VADController(analyzer, audio_idle_timeout=AUDIO_IDLE_TIMEOUT)
|
||||
|
||||
speech_stopped = False
|
||||
|
||||
@@ -238,16 +238,16 @@ class TestVADControllerStarvation(unittest.IsolatedAsyncioTestCase):
|
||||
await controller.process_frame(audio_frame)
|
||||
self.assertFalse(speech_stopped)
|
||||
|
||||
# Stop sending audio, wait for starvation timeout
|
||||
await asyncio.sleep(AUDIO_STARVATION_TIMEOUT + 0.1)
|
||||
# Stop sending audio, wait for idle timeout
|
||||
await asyncio.sleep(AUDIO_IDLE_TIMEOUT + 0.1)
|
||||
self.assertTrue(speech_stopped)
|
||||
|
||||
await controller.cleanup()
|
||||
|
||||
async def test_audio_starvation_does_not_fire_when_quiet(self):
|
||||
"""Test that starvation timeout does NOT fire when VAD is in QUIET state."""
|
||||
async def test_audio_idle_does_not_fire_when_quiet(self):
|
||||
"""Test that idle timeout does NOT fire when VAD is in QUIET state."""
|
||||
analyzer = MockVADAnalyzer()
|
||||
controller = VADController(analyzer, audio_starvation_timeout=AUDIO_STARVATION_TIMEOUT)
|
||||
controller = VADController(analyzer, audio_idle_timeout=AUDIO_IDLE_TIMEOUT)
|
||||
|
||||
speech_stopped = False
|
||||
|
||||
@@ -260,8 +260,8 @@ class TestVADControllerStarvation(unittest.IsolatedAsyncioTestCase):
|
||||
await controller.process_frame(start_frame)
|
||||
await controller.setup(self.task_manager)
|
||||
|
||||
# Stay in QUIET state, wait past starvation timeout
|
||||
await asyncio.sleep(AUDIO_STARVATION_TIMEOUT + 0.1)
|
||||
# Stay in QUIET state, wait past idle timeout
|
||||
await asyncio.sleep(AUDIO_IDLE_TIMEOUT + 0.1)
|
||||
self.assertFalse(speech_stopped)
|
||||
|
||||
await controller.cleanup()
|
||||
|
||||
Reference in New Issue
Block a user