Rework audio idle detection with timestamp-based adaptive sleep

Replaces the per-frame asyncio.Event signaling with a monotonic timestamp that is updated on each audio frame. The idle handler sleeps until the next deadline (last_audio_time + timeout) and recomputes that deadline on each wake-up to account for audio that arrived while it was sleeping. This avoids waking the handler on every audio frame (~50/s at 20 ms chunks) and guarantees that detection latency is bounded by timeout rather than 2 * timeout. Also renames audio_starvation_timeout to audio_idle_timeout, along with its associated identifiers, for consistency with existing pipecat naming (user_idle_timeout, etc.).
2026-04-10 09:55:54 -07:00
parent cb2c1868b0
commit dcd21e7ff4
5 changed files with 54 additions and 49 deletions
--- a/changelog/4244.fixed.md
+++ b/changelog/4244.fixed.md
@@ -0,0 +1 @@
+- Fixed `VADController` getting stuck in the `SPEAKING` state when audio frames stop arriving mid-speech (e.g. user mutes mic). A new `audio_idle_timeout` parameter (default 1s, set to 0 to disable) forces a transition back to `QUIET` and emits `on_speech_stopped` when no audio is received while speaking.
--- a/src/pipecat/audio/vad/vad_controller.py
+++ b/src/pipecat/audio/vad/vad_controller.py
@@ -39,8 +39,8 @@ class VADController(BaseObject):
    Event handlers available:

    - on_speech_started: Called when speech begins.
-    - on_speech_stopped: Called when speech ends, including forced stop on
-      audio starvation (no frames received while speaking).
+    - on_speech_stopped: Called when speech ends, including forced stop when
+      the audio stream goes idle (no frames received while speaking).
    - on_speech_activity: Called periodically while speech is detected.
    - on_push_frame: Called when the controller wants to push a frame.
    - on_broadcast_frame: Called when the controller wants to broadcast a frame.
@@ -73,7 +73,7 @@ class VADController(BaseObject):
        vad_analyzer: VADAnalyzer,
        *,
        speech_activity_period: float = 0.2,
-        audio_starvation_timeout: float = 1.0,
+        audio_idle_timeout: float = 1.0,
    ):
        """Initialize the VAD controller.

@@ -81,7 +81,7 @@ class VADController(BaseObject):
            vad_analyzer: The `VADAnalyzer` instance for processing audio.
            speech_activity_period: Minimum interval in seconds between
                `on_speech_activity` events. Defaults to 0.2.
-            audio_starvation_timeout: Timeout in seconds to force speech stop
+            audio_idle_timeout: Timeout in seconds to force speech stop
                when no audio frames are received while in SPEAKING state.
                This handles cases like mic mute mid-speech.
                Set to 0 to disable. Defaults to 1.0.
@@ -90,18 +90,19 @@ class VADController(BaseObject):
        self._vad_analyzer = vad_analyzer
        self._vad_state: VADState = VADState.QUIET

+        self._task_manager: Optional[BaseTaskManager] = None
+
        # Last time a on_speech_activity was triggered.
        self._speech_activity_time = 0
        # How often a on_speech_activity event should be triggered (value should
        # be greater than the audio chunks to have any effect).
        self._speech_activity_period = speech_activity_period

-        # Audio starvation detection: force speech stop when no audio arrives
+        # Audio idle detection: force speech stop when no audio arrives
        # while in SPEAKING state (e.g. user mutes mic mid-speech).
-        self._audio_starvation_timeout = audio_starvation_timeout
-        self._task_manager: Optional[BaseTaskManager] = None
-        self._audio_received_event = asyncio.Event()
-        self._starvation_task: Optional[asyncio.Task] = None
+        self._last_audio_time: float = 0.0
+        self._audio_idle_timeout = audio_idle_timeout
+        self._audio_idle_task: Optional[asyncio.Task] = None

        self._register_event_handler("on_speech_started", sync=True)
        self._register_event_handler("on_speech_stopped", sync=True)
@@ -116,11 +117,11 @@ class VADController(BaseObject):
            task_manager: The task manager to be associated with this instance.
        """
        self._task_manager = task_manager
-        self._audio_received_event.clear()
-        if self._audio_starvation_timeout > 0 and not self._starvation_task:
-            self._starvation_task = self._task_manager.create_task(
-                self._audio_starvation_handler(),
-                f"{self}::_audio_starvation_handler",
+        self._last_audio_time = time.monotonic()
+        if self._audio_idle_timeout > 0 and not self._audio_idle_task:
+            self._audio_idle_task = self._task_manager.create_task(
+                self._audio_idle_handler(),
+                f"{self}::_audio_idle_handler",
            )

    async def process_frame(self, frame: Frame):
@@ -153,9 +154,9 @@ class VADController(BaseObject):
        before returning.
        """
        await super().cleanup()
-        if self._starvation_task and self._task_manager:
-            await self._task_manager.cancel_task(self._starvation_task)
-            self._starvation_task = None
+        if self._audio_idle_task and self._task_manager:
+            await self._task_manager.cancel_task(self._audio_idle_task)
+            self._audio_idle_task = None
        if self._vad_analyzer:
            await self._vad_analyzer.cleanup()

@@ -168,7 +169,7 @@ class VADController(BaseObject):
        Args:
            frame: Audio frame to process.
        """
-        self._audio_received_event.set()
+        self._last_audio_time = time.monotonic()

        self._vad_state = await self._handle_vad(frame.audio, self._vad_state)

@@ -191,25 +192,28 @@ class VADController(BaseObject):
            vad_state = new_vad_state
        return vad_state

-    async def _audio_starvation_handler(self):
-        """Monitor for audio starvation while in SPEAKING state.
+    async def _audio_idle_handler(self):
+        """Monitor for an idle audio stream while in SPEAKING state.

-        When no audio frames arrive for `audio_starvation_timeout` seconds
+        When no audio frames arrive for `audio_idle_timeout` seconds
        (e.g. user mutes mic mid-speech), forces a transition to QUIET and
        emits `on_speech_stopped`.
        """
        while True:
-            try:
-                await asyncio.wait_for(
-                    self._audio_received_event.wait(),
-                    timeout=self._audio_starvation_timeout,
-                )
-                self._audio_received_event.clear()
-            except asyncio.TimeoutError:
-                if self._vad_state == VADState.SPEAKING:
-                    logger.warning(f"{self}: no audio received while speaking, forcing speech stop")
-                    self._vad_state = VADState.QUIET
-                    await self._call_event_handler("on_speech_stopped")
+            deadline = self._last_audio_time + self._audio_idle_timeout
+            remaining = deadline - time.monotonic()
+            if remaining > 0:
+                # Audio is still recent; sleep only for the remaining window.
+                await asyncio.sleep(remaining)
+                continue
+
+            if self._vad_state == VADState.SPEAKING:
+                logger.warning(f"{self}: no audio received while speaking, forcing speech stop")
+                self._vad_state = VADState.QUIET
+                await self._call_event_handler("on_speech_stopped")
+
+            # Wait for the next potential idle window.
+            await asyncio.sleep(self._audio_idle_timeout)

    async def _maybe_speech_activity(self):
        """Handle user speaking frame."""
--- a/src/pipecat/processors/aggregators/llm_response_universal.py
+++ b/src/pipecat/processors/aggregators/llm_response_universal.py
@@ -107,7 +107,7 @@ class LLMUserAggregatorParams:
            has been idle (not speaking) for this duration. Set to 0 to disable
            idle detection.
        vad_analyzer: Voice Activity Detection analyzer instance.
-        audio_starvation_timeout: Timeout in seconds to force speech stop when
+        audio_idle_timeout: Timeout in seconds to force speech stop when
            no audio frames are received while in SPEAKING state (e.g. user mutes
            mic mid-speech). Set to 0 to disable. Defaults to 1.0.
        filter_incomplete_user_turns: Whether to filter out incomplete user turns.
@@ -124,7 +124,7 @@ class LLMUserAggregatorParams:
    user_turn_stop_timeout: float = 5.0
    user_idle_timeout: float = 0
    vad_analyzer: Optional[VADAnalyzer] = None
-    audio_starvation_timeout: float = 1.0
+    audio_idle_timeout: float = 1.0
    filter_incomplete_user_turns: bool = False
    user_turn_completion_config: Optional[UserTurnCompletionConfig] = None

@@ -473,7 +473,7 @@ class LLMUserAggregator(LLMContextAggregator):
        if self._params.vad_analyzer:
            self._vad_controller = VADController(
                self._params.vad_analyzer,
-                audio_starvation_timeout=self._params.audio_starvation_timeout,
+                audio_idle_timeout=self._params.audio_idle_timeout,
            )
            self._vad_controller.add_event_handler("on_speech_started", self._on_vad_speech_started)
            self._vad_controller.add_event_handler("on_speech_stopped", self._on_vad_speech_stopped)
--- a/src/pipecat/processors/audio/vad_processor.py
+++ b/src/pipecat/processors/audio/vad_processor.py
@@ -46,7 +46,7 @@ class VADProcessor(FrameProcessor):
        *,
        vad_analyzer: VADAnalyzer,
        speech_activity_period: float = 0.2,
-        audio_starvation_timeout: float = 1.0,
+        audio_idle_timeout: float = 1.0,
        **kwargs,
    ):
        """Initialize the VAD processor.
@@ -55,7 +55,7 @@ class VADProcessor(FrameProcessor):
            vad_analyzer: The VADAnalyzer instance for processing audio.
            speech_activity_period: Minimum interval in seconds between
                UserSpeakingFrame pushes. Defaults to 0.2.
-            audio_starvation_timeout: Timeout in seconds to force speech stop
+            audio_idle_timeout: Timeout in seconds to force speech stop
                when no audio frames are received while in SPEAKING state.
                Set to 0 to disable. Defaults to 1.0.
            **kwargs: Additional arguments passed to parent class.
@@ -64,7 +64,7 @@ class VADProcessor(FrameProcessor):
        self._vad_controller = VADController(
            vad_analyzer,
            speech_activity_period=speech_activity_period,
-            audio_starvation_timeout=audio_starvation_timeout,
+            audio_idle_timeout=audio_idle_timeout,
        )

        # Push VAD frames when speech events are detected
--- a/tests/test_vad_controller.py
+++ b/tests/test_vad_controller.py
@@ -208,18 +208,18 @@ class TestVADController(unittest.IsolatedAsyncioTestCase):
        self.assertIsInstance(broadcast_calls[0][1]["vad_params"], VADParams)


-AUDIO_STARVATION_TIMEOUT = 0.1
+AUDIO_IDLE_TIMEOUT = 0.1


-class TestVADControllerStarvation(unittest.IsolatedAsyncioTestCase):
+class TestVADControllerAudioIdle(unittest.IsolatedAsyncioTestCase):
    async def asyncSetUp(self):
        self.task_manager = TaskManager()
        self.task_manager.setup(TaskManagerParams(loop=asyncio.get_running_loop()))

-    async def test_audio_starvation_forces_speech_stop(self):
+    async def test_audio_idle_forces_speech_stop(self):
        """Test that on_speech_stopped fires when no audio arrives while SPEAKING."""
        analyzer = MockVADAnalyzer()
-        controller = VADController(analyzer, audio_starvation_timeout=AUDIO_STARVATION_TIMEOUT)
+        controller = VADController(analyzer, audio_idle_timeout=AUDIO_IDLE_TIMEOUT)

        speech_stopped = False

@@ -238,16 +238,16 @@ class TestVADControllerStarvation(unittest.IsolatedAsyncioTestCase):
        await controller.process_frame(audio_frame)
        self.assertFalse(speech_stopped)

-        # Stop sending audio, wait for starvation timeout
-        await asyncio.sleep(AUDIO_STARVATION_TIMEOUT + 0.1)
+        # Stop sending audio, wait for idle timeout
+        await asyncio.sleep(AUDIO_IDLE_TIMEOUT + 0.1)
        self.assertTrue(speech_stopped)

        await controller.cleanup()

-    async def test_audio_starvation_does_not_fire_when_quiet(self):
-        """Test that starvation timeout does NOT fire when VAD is in QUIET state."""
+    async def test_audio_idle_does_not_fire_when_quiet(self):
+        """Test that idle timeout does NOT fire when VAD is in QUIET state."""
        analyzer = MockVADAnalyzer()
-        controller = VADController(analyzer, audio_starvation_timeout=AUDIO_STARVATION_TIMEOUT)
+        controller = VADController(analyzer, audio_idle_timeout=AUDIO_IDLE_TIMEOUT)

        speech_stopped = False

@@ -260,8 +260,8 @@ class TestVADControllerStarvation(unittest.IsolatedAsyncioTestCase):
        await controller.process_frame(start_frame)
        await controller.setup(self.task_manager)

-        # Stay in QUIET state, wait past starvation timeout
-        await asyncio.sleep(AUDIO_STARVATION_TIMEOUT + 0.1)
+        # Stay in QUIET state, wait past idle timeout
+        await asyncio.sleep(AUDIO_IDLE_TIMEOUT + 0.1)
        self.assertFalse(speech_stopped)

        await controller.cleanup()
				`@@ -0,0 +1 @@`
				- Fixed `VADController` getting stuck in the `SPEAKING` state when audio frames stop arriving mid-speech (e.g. user mutes mic). A new `audio_idle_timeout` parameter (default 1s, set to 0 to disable) forces a transition back to `QUIET` and emits `on_speech_stopped` when no audio is received while speaking.