From dcd21e7ff4fcd72a392a89292f8857f42d5bff7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Fri, 10 Apr 2026 09:55:54 -0700 Subject: [PATCH] Rework audio idle detection with timestamp-based adaptive sleep Replaces the per-frame asyncio.Event signaling with a monotonic timestamp updated on each audio frame. The handler sleeps until the next deadline (last_audio_time + timeout), recomputing on each wake-up to account for audio arriving during sleep. This avoids waking the handler on every audio frame (~50/s at 20ms chunks), and guarantees detection latency is bounded by timeout rather than 2 * timeout. Also renames audio_starvation_timeout to audio_idle_timeout and associated identifiers for consistency with existing pipecat naming (user_idle_timeout, etc.). --- changelog/4244.fixed.md | 1 + src/pipecat/audio/vad/vad_controller.py | 68 ++++++++++--------- .../aggregators/llm_response_universal.py | 6 +- src/pipecat/processors/audio/vad_processor.py | 6 +- tests/test_vad_controller.py | 22 +++--- 5 files changed, 54 insertions(+), 49 deletions(-) create mode 100644 changelog/4244.fixed.md diff --git a/changelog/4244.fixed.md b/changelog/4244.fixed.md new file mode 100644 index 000000000..271e39c37 --- /dev/null +++ b/changelog/4244.fixed.md @@ -0,0 +1 @@ +- Fixed `VADController` getting stuck in the `SPEAKING` state when audio frames stop arriving mid-speech (e.g. user mutes mic). A new `audio_idle_timeout` parameter (default 1s, set to 0 to disable) forces a transition back to `QUIET` and emits `on_speech_stopped` when no audio is received while speaking. diff --git a/src/pipecat/audio/vad/vad_controller.py b/src/pipecat/audio/vad/vad_controller.py index 2d7bbf817..fefe3bec1 100644 --- a/src/pipecat/audio/vad/vad_controller.py +++ b/src/pipecat/audio/vad/vad_controller.py @@ -39,8 +39,8 @@ class VADController(BaseObject): Event handlers available: - on_speech_started: Called when speech begins. - - on_speech_stopped: Called when speech ends, including forced stop on - audio starvation (no frames received while speaking). + - on_speech_stopped: Called when speech ends, including forced stop when + the audio stream goes idle (no frames received while speaking). - on_speech_activity: Called periodically while speech is detected. - on_push_frame: Called when the controller wants to push a frame. - on_broadcast_frame: Called when the controller wants to broadcast a frame. @@ -73,7 +73,7 @@ class VADController(BaseObject): vad_analyzer: VADAnalyzer, *, speech_activity_period: float = 0.2, - audio_starvation_timeout: float = 1.0, + audio_idle_timeout: float = 1.0, ): """Initialize the VAD controller. @@ -81,7 +81,7 @@ class VADController(BaseObject): vad_analyzer: The `VADAnalyzer` instance for processing audio. speech_activity_period: Minimum interval in seconds between `on_speech_activity` events. Defaults to 0.2. - audio_starvation_timeout: Timeout in seconds to force speech stop + audio_idle_timeout: Timeout in seconds to force speech stop when no audio frames are received while in SPEAKING state. This handles cases like mic mute mid-speech. Set to 0 to disable. Defaults to 1.0. @@ -90,18 +90,19 @@ class VADController(BaseObject): self._vad_analyzer = vad_analyzer self._vad_state: VADState = VADState.QUIET + self._task_manager: Optional[BaseTaskManager] = None + # Last time a on_speech_activity was triggered. self._speech_activity_time = 0 # How often a on_speech_activity event should be triggered (value should # be greater than the audio chunks to have any effect). self._speech_activity_period = speech_activity_period - # Audio starvation detection: force speech stop when no audio arrives + # Audio idle detection: force speech stop when no audio arrives # while in SPEAKING state (e.g. user mutes mic mid-speech). - self._audio_starvation_timeout = audio_starvation_timeout - self._task_manager: Optional[BaseTaskManager] = None - self._audio_received_event = asyncio.Event() - self._starvation_task: Optional[asyncio.Task] = None + self._last_audio_time: float = 0.0 + self._audio_idle_timeout = audio_idle_timeout + self._audio_idle_task: Optional[asyncio.Task] = None self._register_event_handler("on_speech_started", sync=True) self._register_event_handler("on_speech_stopped", sync=True) @@ -116,11 +117,11 @@ class VADController(BaseObject): task_manager: The task manager to be associated with this instance. """ self._task_manager = task_manager - self._audio_received_event.clear() - if self._audio_starvation_timeout > 0 and not self._starvation_task: - self._starvation_task = self._task_manager.create_task( - self._audio_starvation_handler(), - f"{self}::_audio_starvation_handler", + self._last_audio_time = time.monotonic() + if self._audio_idle_timeout > 0 and not self._audio_idle_task: + self._audio_idle_task = self._task_manager.create_task( + self._audio_idle_handler(), + f"{self}::_audio_idle_handler", ) async def process_frame(self, frame: Frame): @@ -153,9 +154,9 @@ class VADController(BaseObject): before returning. """ await super().cleanup() - if self._starvation_task and self._task_manager: - await self._task_manager.cancel_task(self._starvation_task) - self._starvation_task = None + if self._audio_idle_task and self._task_manager: + await self._task_manager.cancel_task(self._audio_idle_task) + self._audio_idle_task = None if self._vad_analyzer: await self._vad_analyzer.cleanup() @@ -168,7 +169,7 @@ class VADController(BaseObject): Args: frame: Audio frame to process. """ - self._audio_received_event.set() + self._last_audio_time = time.monotonic() self._vad_state = await self._handle_vad(frame.audio, self._vad_state) @@ -191,25 +192,28 @@ class VADController(BaseObject): vad_state = new_vad_state return vad_state - async def _audio_starvation_handler(self): - """Monitor for audio starvation while in SPEAKING state. + async def _audio_idle_handler(self): + """Monitor for an idle audio stream while in SPEAKING state. - When no audio frames arrive for `audio_starvation_timeout` seconds + When no audio frames arrive for `audio_idle_timeout` seconds (e.g. user mutes mic mid-speech), forces a transition to QUIET and emits `on_speech_stopped`. """ while True: - try: - await asyncio.wait_for( - self._audio_received_event.wait(), - timeout=self._audio_starvation_timeout, - ) - self._audio_received_event.clear() - except asyncio.TimeoutError: - if self._vad_state == VADState.SPEAKING: - logger.warning(f"{self}: no audio received while speaking, forcing speech stop") - self._vad_state = VADState.QUIET - await self._call_event_handler("on_speech_stopped") + deadline = self._last_audio_time + self._audio_idle_timeout + remaining = deadline - time.monotonic() + if remaining > 0: + # Audio is still recent; sleep only for the remaining window. + await asyncio.sleep(remaining) + continue + + if self._vad_state == VADState.SPEAKING: + logger.warning(f"{self}: no audio received while speaking, forcing speech stop") + self._vad_state = VADState.QUIET + await self._call_event_handler("on_speech_stopped") + + # Wait for the next potential idle window. + await asyncio.sleep(self._audio_idle_timeout) async def _maybe_speech_activity(self): """Handle user speaking frame.""" diff --git a/src/pipecat/processors/aggregators/llm_response_universal.py b/src/pipecat/processors/aggregators/llm_response_universal.py index e64147d3b..94e56a972 100644 --- a/src/pipecat/processors/aggregators/llm_response_universal.py +++ b/src/pipecat/processors/aggregators/llm_response_universal.py @@ -107,7 +107,7 @@ class LLMUserAggregatorParams: has been idle (not speaking) for this duration. Set to 0 to disable idle detection. vad_analyzer: Voice Activity Detection analyzer instance. - audio_starvation_timeout: Timeout in seconds to force speech stop when + audio_idle_timeout: Timeout in seconds to force speech stop when no audio frames are received while in SPEAKING state (e.g. user mutes mic mid-speech). Set to 0 to disable. Defaults to 1.0. filter_incomplete_user_turns: Whether to filter out incomplete user turns. @@ -124,7 +124,7 @@ class LLMUserAggregatorParams: user_turn_stop_timeout: float = 5.0 user_idle_timeout: float = 0 vad_analyzer: Optional[VADAnalyzer] = None - audio_starvation_timeout: float = 1.0 + audio_idle_timeout: float = 1.0 filter_incomplete_user_turns: bool = False user_turn_completion_config: Optional[UserTurnCompletionConfig] = None @@ -473,7 +473,7 @@ class LLMUserAggregator(LLMContextAggregator): if self._params.vad_analyzer: self._vad_controller = VADController( self._params.vad_analyzer, - audio_starvation_timeout=self._params.audio_starvation_timeout, + audio_idle_timeout=self._params.audio_idle_timeout, ) self._vad_controller.add_event_handler("on_speech_started", self._on_vad_speech_started) self._vad_controller.add_event_handler("on_speech_stopped", self._on_vad_speech_stopped) diff --git a/src/pipecat/processors/audio/vad_processor.py b/src/pipecat/processors/audio/vad_processor.py index a2f13460a..aaa769061 100644 --- a/src/pipecat/processors/audio/vad_processor.py +++ b/src/pipecat/processors/audio/vad_processor.py @@ -46,7 +46,7 @@ class VADProcessor(FrameProcessor): *, vad_analyzer: VADAnalyzer, speech_activity_period: float = 0.2, - audio_starvation_timeout: float = 1.0, + audio_idle_timeout: float = 1.0, **kwargs, ): """Initialize the VAD processor. @@ -55,7 +55,7 @@ class VADProcessor(FrameProcessor): vad_analyzer: The VADAnalyzer instance for processing audio. speech_activity_period: Minimum interval in seconds between UserSpeakingFrame pushes. Defaults to 0.2. - audio_starvation_timeout: Timeout in seconds to force speech stop + audio_idle_timeout: Timeout in seconds to force speech stop when no audio frames are received while in SPEAKING state. Set to 0 to disable. Defaults to 1.0. **kwargs: Additional arguments passed to parent class. @@ -64,7 +64,7 @@ class VADProcessor(FrameProcessor): self._vad_controller = VADController( vad_analyzer, speech_activity_period=speech_activity_period, - audio_starvation_timeout=audio_starvation_timeout, + audio_idle_timeout=audio_idle_timeout, ) # Push VAD frames when speech events are detected diff --git a/tests/test_vad_controller.py b/tests/test_vad_controller.py index ddae23549..30f2fe56c 100644 --- a/tests/test_vad_controller.py +++ b/tests/test_vad_controller.py @@ -208,18 +208,18 @@ class TestVADController(unittest.IsolatedAsyncioTestCase): self.assertIsInstance(broadcast_calls[0][1]["vad_params"], VADParams) -AUDIO_STARVATION_TIMEOUT = 0.1 +AUDIO_IDLE_TIMEOUT = 0.1 -class TestVADControllerStarvation(unittest.IsolatedAsyncioTestCase): +class TestVADControllerAudioIdle(unittest.IsolatedAsyncioTestCase): async def asyncSetUp(self): self.task_manager = TaskManager() self.task_manager.setup(TaskManagerParams(loop=asyncio.get_running_loop())) - async def test_audio_starvation_forces_speech_stop(self): + async def test_audio_idle_forces_speech_stop(self): """Test that on_speech_stopped fires when no audio arrives while SPEAKING.""" analyzer = MockVADAnalyzer() - controller = VADController(analyzer, audio_starvation_timeout=AUDIO_STARVATION_TIMEOUT) + controller = VADController(analyzer, audio_idle_timeout=AUDIO_IDLE_TIMEOUT) speech_stopped = False @@ -238,16 +238,16 @@ class TestVADControllerStarvation(unittest.IsolatedAsyncioTestCase): await controller.process_frame(audio_frame) self.assertFalse(speech_stopped) - # Stop sending audio, wait for starvation timeout - await asyncio.sleep(AUDIO_STARVATION_TIMEOUT + 0.1) + # Stop sending audio, wait for idle timeout + await asyncio.sleep(AUDIO_IDLE_TIMEOUT + 0.1) self.assertTrue(speech_stopped) await controller.cleanup() - async def test_audio_starvation_does_not_fire_when_quiet(self): - """Test that starvation timeout does NOT fire when VAD is in QUIET state.""" + async def test_audio_idle_does_not_fire_when_quiet(self): + """Test that idle timeout does NOT fire when VAD is in QUIET state.""" analyzer = MockVADAnalyzer() - controller = VADController(analyzer, audio_starvation_timeout=AUDIO_STARVATION_TIMEOUT) + controller = VADController(analyzer, audio_idle_timeout=AUDIO_IDLE_TIMEOUT) speech_stopped = False @@ -260,8 +260,8 @@ class TestVADControllerStarvation(unittest.IsolatedAsyncioTestCase): await controller.process_frame(start_frame) await controller.setup(self.task_manager) - # Stay in QUIET state, wait past starvation timeout - await asyncio.sleep(AUDIO_STARVATION_TIMEOUT + 0.1) + # Stay in QUIET state, wait past idle timeout + await asyncio.sleep(AUDIO_IDLE_TIMEOUT + 0.1) self.assertFalse(speech_stopped) await controller.cleanup()