Rework audio idle detection with timestamp-based adaptive sleep

Replaces the per-frame asyncio.Event signaling with a monotonic
timestamp updated on each audio frame. The handler sleeps until the
next deadline (last_audio_time + timeout), recomputing on each wake-up
to account for audio arriving during sleep.

This avoids waking the handler on every audio frame (~50/s at 20ms
chunks), and guarantees detection latency is bounded by timeout rather
than 2 * timeout.

Also renames audio_starvation_timeout, and the identifiers associated
with it, to use the audio_idle naming (audio_idle_timeout, etc.) for
consistency with existing pipecat naming (user_idle_timeout, etc.).
This commit is contained in:
Aleix Conchillo Flaqué
2026-04-10 09:55:54 -07:00
parent cb2c1868b0
commit dcd21e7ff4
5 changed files with 54 additions and 49 deletions

1
changelog/4244.fixed.md Normal file
View File

@@ -0,0 +1 @@
- Fixed `VADController` getting stuck in the `SPEAKING` state when audio frames stop arriving mid-speech (e.g. user mutes mic). A new `audio_idle_timeout` parameter (default 1s, set to 0 to disable) forces a transition back to `QUIET` and emits `on_speech_stopped` when no audio is received while speaking.

View File

@@ -39,8 +39,8 @@ class VADController(BaseObject):
Event handlers available:
- on_speech_started: Called when speech begins.
- on_speech_stopped: Called when speech ends, including forced stop on
audio starvation (no frames received while speaking).
- on_speech_stopped: Called when speech ends, including forced stop when
the audio stream goes idle (no frames received while speaking).
- on_speech_activity: Called periodically while speech is detected.
- on_push_frame: Called when the controller wants to push a frame.
- on_broadcast_frame: Called when the controller wants to broadcast a frame.
@@ -73,7 +73,7 @@ class VADController(BaseObject):
vad_analyzer: VADAnalyzer,
*,
speech_activity_period: float = 0.2,
audio_starvation_timeout: float = 1.0,
audio_idle_timeout: float = 1.0,
):
"""Initialize the VAD controller.
@@ -81,7 +81,7 @@ class VADController(BaseObject):
vad_analyzer: The `VADAnalyzer` instance for processing audio.
speech_activity_period: Minimum interval in seconds between
`on_speech_activity` events. Defaults to 0.2.
audio_starvation_timeout: Timeout in seconds to force speech stop
audio_idle_timeout: Timeout in seconds to force speech stop
when no audio frames are received while in SPEAKING state.
This handles cases like mic mute mid-speech.
Set to 0 to disable. Defaults to 1.0.
@@ -90,18 +90,19 @@ class VADController(BaseObject):
self._vad_analyzer = vad_analyzer
self._vad_state: VADState = VADState.QUIET
self._task_manager: Optional[BaseTaskManager] = None
# Last time an on_speech_activity event was triggered.
self._speech_activity_time = 0
# How often an on_speech_activity event should be triggered (value should
# be longer than the audio chunk duration to have any effect).
self._speech_activity_period = speech_activity_period
# Audio starvation detection: force speech stop when no audio arrives
# Audio idle detection: force speech stop when no audio arrives
# while in SPEAKING state (e.g. user mutes mic mid-speech).
self._audio_starvation_timeout = audio_starvation_timeout
self._task_manager: Optional[BaseTaskManager] = None
self._audio_received_event = asyncio.Event()
self._starvation_task: Optional[asyncio.Task] = None
self._last_audio_time: float = 0.0
self._audio_idle_timeout = audio_idle_timeout
self._audio_idle_task: Optional[asyncio.Task] = None
self._register_event_handler("on_speech_started", sync=True)
self._register_event_handler("on_speech_stopped", sync=True)
@@ -116,11 +117,11 @@ class VADController(BaseObject):
task_manager: The task manager to be associated with this instance.
"""
self._task_manager = task_manager
self._audio_received_event.clear()
if self._audio_starvation_timeout > 0 and not self._starvation_task:
self._starvation_task = self._task_manager.create_task(
self._audio_starvation_handler(),
f"{self}::_audio_starvation_handler",
self._last_audio_time = time.monotonic()
if self._audio_idle_timeout > 0 and not self._audio_idle_task:
self._audio_idle_task = self._task_manager.create_task(
self._audio_idle_handler(),
f"{self}::_audio_idle_handler",
)
async def process_frame(self, frame: Frame):
@@ -153,9 +154,9 @@ class VADController(BaseObject):
before returning.
"""
await super().cleanup()
if self._starvation_task and self._task_manager:
await self._task_manager.cancel_task(self._starvation_task)
self._starvation_task = None
if self._audio_idle_task and self._task_manager:
await self._task_manager.cancel_task(self._audio_idle_task)
self._audio_idle_task = None
if self._vad_analyzer:
await self._vad_analyzer.cleanup()
@@ -168,7 +169,7 @@ class VADController(BaseObject):
Args:
frame: Audio frame to process.
"""
self._audio_received_event.set()
self._last_audio_time = time.monotonic()
self._vad_state = await self._handle_vad(frame.audio, self._vad_state)
@@ -191,25 +192,28 @@ class VADController(BaseObject):
vad_state = new_vad_state
return vad_state
async def _audio_starvation_handler(self):
"""Monitor for audio starvation while in SPEAKING state.
async def _audio_idle_handler(self):
"""Monitor for an idle audio stream while in SPEAKING state.
When no audio frames arrive for `audio_starvation_timeout` seconds
When no audio frames arrive for `audio_idle_timeout` seconds
(e.g. user mutes mic mid-speech), forces a transition to QUIET and
emits `on_speech_stopped`.
"""
while True:
try:
await asyncio.wait_for(
self._audio_received_event.wait(),
timeout=self._audio_starvation_timeout,
)
self._audio_received_event.clear()
except asyncio.TimeoutError:
if self._vad_state == VADState.SPEAKING:
logger.warning(f"{self}: no audio received while speaking, forcing speech stop")
self._vad_state = VADState.QUIET
await self._call_event_handler("on_speech_stopped")
deadline = self._last_audio_time + self._audio_idle_timeout
remaining = deadline - time.monotonic()
if remaining > 0:
# Audio is still recent; sleep only for the remaining window.
await asyncio.sleep(remaining)
continue
if self._vad_state == VADState.SPEAKING:
logger.warning(f"{self}: no audio received while speaking, forcing speech stop")
self._vad_state = VADState.QUIET
await self._call_event_handler("on_speech_stopped")
# Wait for the next potential idle window.
await asyncio.sleep(self._audio_idle_timeout)
async def _maybe_speech_activity(self):
"""Handle user speaking frame."""

View File

@@ -107,7 +107,7 @@ class LLMUserAggregatorParams:
has been idle (not speaking) for this duration. Set to 0 to disable
idle detection.
vad_analyzer: Voice Activity Detection analyzer instance.
audio_starvation_timeout: Timeout in seconds to force speech stop when
audio_idle_timeout: Timeout in seconds to force speech stop when
no audio frames are received while in SPEAKING state (e.g. user mutes
mic mid-speech). Set to 0 to disable. Defaults to 1.0.
filter_incomplete_user_turns: Whether to filter out incomplete user turns.
@@ -124,7 +124,7 @@ class LLMUserAggregatorParams:
user_turn_stop_timeout: float = 5.0
user_idle_timeout: float = 0
vad_analyzer: Optional[VADAnalyzer] = None
audio_starvation_timeout: float = 1.0
audio_idle_timeout: float = 1.0
filter_incomplete_user_turns: bool = False
user_turn_completion_config: Optional[UserTurnCompletionConfig] = None
@@ -473,7 +473,7 @@ class LLMUserAggregator(LLMContextAggregator):
if self._params.vad_analyzer:
self._vad_controller = VADController(
self._params.vad_analyzer,
audio_starvation_timeout=self._params.audio_starvation_timeout,
audio_idle_timeout=self._params.audio_idle_timeout,
)
self._vad_controller.add_event_handler("on_speech_started", self._on_vad_speech_started)
self._vad_controller.add_event_handler("on_speech_stopped", self._on_vad_speech_stopped)

View File

@@ -46,7 +46,7 @@ class VADProcessor(FrameProcessor):
*,
vad_analyzer: VADAnalyzer,
speech_activity_period: float = 0.2,
audio_starvation_timeout: float = 1.0,
audio_idle_timeout: float = 1.0,
**kwargs,
):
"""Initialize the VAD processor.
@@ -55,7 +55,7 @@ class VADProcessor(FrameProcessor):
vad_analyzer: The VADAnalyzer instance for processing audio.
speech_activity_period: Minimum interval in seconds between
UserSpeakingFrame pushes. Defaults to 0.2.
audio_starvation_timeout: Timeout in seconds to force speech stop
audio_idle_timeout: Timeout in seconds to force speech stop
when no audio frames are received while in SPEAKING state.
Set to 0 to disable. Defaults to 1.0.
**kwargs: Additional arguments passed to parent class.
@@ -64,7 +64,7 @@ class VADProcessor(FrameProcessor):
self._vad_controller = VADController(
vad_analyzer,
speech_activity_period=speech_activity_period,
audio_starvation_timeout=audio_starvation_timeout,
audio_idle_timeout=audio_idle_timeout,
)
# Push VAD frames when speech events are detected

View File

@@ -208,18 +208,18 @@ class TestVADController(unittest.IsolatedAsyncioTestCase):
self.assertIsInstance(broadcast_calls[0][1]["vad_params"], VADParams)
AUDIO_STARVATION_TIMEOUT = 0.1
AUDIO_IDLE_TIMEOUT = 0.1
class TestVADControllerStarvation(unittest.IsolatedAsyncioTestCase):
class TestVADControllerAudioIdle(unittest.IsolatedAsyncioTestCase):
async def asyncSetUp(self):
self.task_manager = TaskManager()
self.task_manager.setup(TaskManagerParams(loop=asyncio.get_running_loop()))
async def test_audio_starvation_forces_speech_stop(self):
async def test_audio_idle_forces_speech_stop(self):
"""Test that on_speech_stopped fires when no audio arrives while SPEAKING."""
analyzer = MockVADAnalyzer()
controller = VADController(analyzer, audio_starvation_timeout=AUDIO_STARVATION_TIMEOUT)
controller = VADController(analyzer, audio_idle_timeout=AUDIO_IDLE_TIMEOUT)
speech_stopped = False
@@ -238,16 +238,16 @@ class TestVADControllerStarvation(unittest.IsolatedAsyncioTestCase):
await controller.process_frame(audio_frame)
self.assertFalse(speech_stopped)
# Stop sending audio, wait for starvation timeout
await asyncio.sleep(AUDIO_STARVATION_TIMEOUT + 0.1)
# Stop sending audio, wait for idle timeout
await asyncio.sleep(AUDIO_IDLE_TIMEOUT + 0.1)
self.assertTrue(speech_stopped)
await controller.cleanup()
async def test_audio_starvation_does_not_fire_when_quiet(self):
"""Test that starvation timeout does NOT fire when VAD is in QUIET state."""
async def test_audio_idle_does_not_fire_when_quiet(self):
"""Test that idle timeout does NOT fire when VAD is in QUIET state."""
analyzer = MockVADAnalyzer()
controller = VADController(analyzer, audio_starvation_timeout=AUDIO_STARVATION_TIMEOUT)
controller = VADController(analyzer, audio_idle_timeout=AUDIO_IDLE_TIMEOUT)
speech_stopped = False
@@ -260,8 +260,8 @@ class TestVADControllerStarvation(unittest.IsolatedAsyncioTestCase):
await controller.process_frame(start_frame)
await controller.setup(self.task_manager)
# Stay in QUIET state, wait past starvation timeout
await asyncio.sleep(AUDIO_STARVATION_TIMEOUT + 0.1)
# Stay in QUIET state, wait past idle timeout
await asyncio.sleep(AUDIO_IDLE_TIMEOUT + 0.1)
self.assertFalse(speech_stopped)
await controller.cleanup()