From dcd21e7ff4fcd72a392a89292f8857f42d5bff7c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Fri, 10 Apr 2026 09:55:54 -0700
Subject: [PATCH] Rework audio idle detection with timestamp-based adaptive
 sleep

Replaces the per-frame asyncio.Event signaling with a monotonic
timestamp updated on each audio frame. The handler sleeps until the
next deadline (last_audio_time + timeout), recomputing on each wake-up
to account for audio arriving during sleep.

This avoids waking the handler on every audio frame (~50/s at 20ms
chunks), and guarantees that detection latency, measured from the last
audio frame received, is bounded by timeout rather than 2 * timeout.

Also renames audio_starvation_timeout to audio_idle_timeout, along with
the associated identifiers, for consistency with existing pipecat naming
(user_idle_timeout, etc.).
---
 changelog/4244.fixed.md                       |  1 +
 src/pipecat/audio/vad/vad_controller.py       | 68 ++++++++++---------
 .../aggregators/llm_response_universal.py     |  6 +-
 src/pipecat/processors/audio/vad_processor.py |  6 +-
 tests/test_vad_controller.py                  | 22 +++---
 5 files changed, 54 insertions(+), 49 deletions(-)
 create mode 100644 changelog/4244.fixed.md

diff --git a/changelog/4244.fixed.md b/changelog/4244.fixed.md
new file mode 100644
index 000000000..271e39c37
--- /dev/null
+++ b/changelog/4244.fixed.md
@@ -0,0 +1 @@
+- Fixed `VADController` getting stuck in the `SPEAKING` state when audio frames stop arriving mid-speech (e.g. user mutes mic). A new `audio_idle_timeout` parameter (default 1s, set to 0 to disable) forces a transition back to `QUIET` and emits `on_speech_stopped` when no audio is received while speaking.
diff --git a/src/pipecat/audio/vad/vad_controller.py b/src/pipecat/audio/vad/vad_controller.py
index 2d7bbf817..fefe3bec1 100644
--- a/src/pipecat/audio/vad/vad_controller.py
+++ b/src/pipecat/audio/vad/vad_controller.py
@@ -39,8 +39,8 @@ class VADController(BaseObject):
     Event handlers available:
 
     - on_speech_started: Called when speech begins.
-    - on_speech_stopped: Called when speech ends, including forced stop on
-      audio starvation (no frames received while speaking).
+    - on_speech_stopped: Called when speech ends, including forced stop when
+      the audio stream goes idle (no frames received while speaking).
     - on_speech_activity: Called periodically while speech is detected.
     - on_push_frame: Called when the controller wants to push a frame.
     - on_broadcast_frame: Called when the controller wants to broadcast a frame.
@@ -73,7 +73,7 @@ class VADController(BaseObject):
         vad_analyzer: VADAnalyzer,
         *,
         speech_activity_period: float = 0.2,
-        audio_starvation_timeout: float = 1.0,
+        audio_idle_timeout: float = 1.0,
     ):
         """Initialize the VAD controller.
 
@@ -81,7 +81,7 @@ class VADController(BaseObject):
             vad_analyzer: The `VADAnalyzer` instance for processing audio.
             speech_activity_period: Minimum interval in seconds between
                 `on_speech_activity` events. Defaults to 0.2.
-            audio_starvation_timeout: Timeout in seconds to force speech stop
+            audio_idle_timeout: Timeout in seconds to force speech stop
                 when no audio frames are received while in SPEAKING state.
                 This handles cases like mic mute mid-speech.
                 Set to 0 to disable. Defaults to 1.0.
@@ -90,18 +90,19 @@ class VADController(BaseObject):
         self._vad_analyzer = vad_analyzer
         self._vad_state: VADState = VADState.QUIET
 
+        self._task_manager: Optional[BaseTaskManager] = None
+
         # Last time a on_speech_activity was triggered.
         self._speech_activity_time = 0
         # How often a on_speech_activity event should be triggered (value should
         # be greater than the audio chunks to have any effect).
         self._speech_activity_period = speech_activity_period
 
-        # Audio starvation detection: force speech stop when no audio arrives
+        # Audio idle detection: force speech stop when no audio arrives
         # while in SPEAKING state (e.g. user mutes mic mid-speech).
-        self._audio_starvation_timeout = audio_starvation_timeout
-        self._task_manager: Optional[BaseTaskManager] = None
-        self._audio_received_event = asyncio.Event()
-        self._starvation_task: Optional[asyncio.Task] = None
+        self._last_audio_time: float = 0.0
+        self._audio_idle_timeout = audio_idle_timeout
+        self._audio_idle_task: Optional[asyncio.Task] = None
 
         self._register_event_handler("on_speech_started", sync=True)
         self._register_event_handler("on_speech_stopped", sync=True)
@@ -116,11 +117,11 @@ class VADController(BaseObject):
             task_manager: The task manager to be associated with this instance.
         """
         self._task_manager = task_manager
-        self._audio_received_event.clear()
-        if self._audio_starvation_timeout > 0 and not self._starvation_task:
-            self._starvation_task = self._task_manager.create_task(
-                self._audio_starvation_handler(),
-                f"{self}::_audio_starvation_handler",
+        self._last_audio_time = time.monotonic()
+        if self._audio_idle_timeout > 0 and not self._audio_idle_task:
+            self._audio_idle_task = self._task_manager.create_task(
+                self._audio_idle_handler(),
+                f"{self}::_audio_idle_handler",
             )
 
     async def process_frame(self, frame: Frame):
@@ -153,9 +154,9 @@ class VADController(BaseObject):
         before returning.
         """
         await super().cleanup()
-        if self._starvation_task and self._task_manager:
-            await self._task_manager.cancel_task(self._starvation_task)
-            self._starvation_task = None
+        if self._audio_idle_task and self._task_manager:
+            await self._task_manager.cancel_task(self._audio_idle_task)
+            self._audio_idle_task = None
         if self._vad_analyzer:
             await self._vad_analyzer.cleanup()
 
@@ -168,7 +169,7 @@ class VADController(BaseObject):
         Args:
             frame: Audio frame to process.
         """
-        self._audio_received_event.set()
+        self._last_audio_time = time.monotonic()
 
         self._vad_state = await self._handle_vad(frame.audio, self._vad_state)
 
@@ -191,25 +192,28 @@ class VADController(BaseObject):
             vad_state = new_vad_state
         return vad_state
 
-    async def _audio_starvation_handler(self):
-        """Monitor for audio starvation while in SPEAKING state.
+    async def _audio_idle_handler(self):
+        """Monitor for an idle audio stream while in SPEAKING state.
 
-        When no audio frames arrive for `audio_starvation_timeout` seconds
+        When no audio frames arrive for `audio_idle_timeout` seconds
         (e.g. user mutes mic mid-speech), forces a transition to QUIET and
         emits `on_speech_stopped`.
         """
         while True:
-            try:
-                await asyncio.wait_for(
-                    self._audio_received_event.wait(),
-                    timeout=self._audio_starvation_timeout,
-                )
-                self._audio_received_event.clear()
-            except asyncio.TimeoutError:
-                if self._vad_state == VADState.SPEAKING:
-                    logger.warning(f"{self}: no audio received while speaking, forcing speech stop")
-                    self._vad_state = VADState.QUIET
-                    await self._call_event_handler("on_speech_stopped")
+            deadline = self._last_audio_time + self._audio_idle_timeout
+            remaining = deadline - time.monotonic()
+            if remaining > 0:
+                # Audio is still recent; sleep only for the remaining window.
+                await asyncio.sleep(remaining)
+                continue
+
+            if self._vad_state == VADState.SPEAKING:
+                logger.warning(f"{self}: no audio received while speaking, forcing speech stop")
+                self._vad_state = VADState.QUIET
+                await self._call_event_handler("on_speech_stopped")
+
+            # Wait for the next potential idle window.
+            await asyncio.sleep(self._audio_idle_timeout)
 
     async def _maybe_speech_activity(self):
         """Handle user speaking frame."""
diff --git a/src/pipecat/processors/aggregators/llm_response_universal.py b/src/pipecat/processors/aggregators/llm_response_universal.py
index e64147d3b..94e56a972 100644
--- a/src/pipecat/processors/aggregators/llm_response_universal.py
+++ b/src/pipecat/processors/aggregators/llm_response_universal.py
@@ -107,7 +107,7 @@ class LLMUserAggregatorParams:
             has been idle (not speaking) for this duration. Set to 0 to disable
             idle detection.
         vad_analyzer: Voice Activity Detection analyzer instance.
-        audio_starvation_timeout: Timeout in seconds to force speech stop when
+        audio_idle_timeout: Timeout in seconds to force speech stop when
             no audio frames are received while in SPEAKING state (e.g. user mutes
             mic mid-speech). Set to 0 to disable. Defaults to 1.0.
         filter_incomplete_user_turns: Whether to filter out incomplete user turns.
@@ -124,7 +124,7 @@ class LLMUserAggregatorParams:
     user_turn_stop_timeout: float = 5.0
     user_idle_timeout: float = 0
     vad_analyzer: Optional[VADAnalyzer] = None
-    audio_starvation_timeout: float = 1.0
+    audio_idle_timeout: float = 1.0
     filter_incomplete_user_turns: bool = False
     user_turn_completion_config: Optional[UserTurnCompletionConfig] = None
 
@@ -473,7 +473,7 @@ class LLMUserAggregator(LLMContextAggregator):
         if self._params.vad_analyzer:
             self._vad_controller = VADController(
                 self._params.vad_analyzer,
-                audio_starvation_timeout=self._params.audio_starvation_timeout,
+                audio_idle_timeout=self._params.audio_idle_timeout,
             )
             self._vad_controller.add_event_handler("on_speech_started", self._on_vad_speech_started)
             self._vad_controller.add_event_handler("on_speech_stopped", self._on_vad_speech_stopped)
diff --git a/src/pipecat/processors/audio/vad_processor.py b/src/pipecat/processors/audio/vad_processor.py
index a2f13460a..aaa769061 100644
--- a/src/pipecat/processors/audio/vad_processor.py
+++ b/src/pipecat/processors/audio/vad_processor.py
@@ -46,7 +46,7 @@ class VADProcessor(FrameProcessor):
         *,
         vad_analyzer: VADAnalyzer,
         speech_activity_period: float = 0.2,
-        audio_starvation_timeout: float = 1.0,
+        audio_idle_timeout: float = 1.0,
         **kwargs,
     ):
         """Initialize the VAD processor.
@@ -55,7 +55,7 @@ class VADProcessor(FrameProcessor):
             vad_analyzer: The VADAnalyzer instance for processing audio.
             speech_activity_period: Minimum interval in seconds between
                 UserSpeakingFrame pushes. Defaults to 0.2.
-            audio_starvation_timeout: Timeout in seconds to force speech stop
+            audio_idle_timeout: Timeout in seconds to force speech stop
                 when no audio frames are received while in SPEAKING state.
                 Set to 0 to disable. Defaults to 1.0.
             **kwargs: Additional arguments passed to parent class.
@@ -64,7 +64,7 @@ class VADProcessor(FrameProcessor):
         self._vad_controller = VADController(
             vad_analyzer,
             speech_activity_period=speech_activity_period,
-            audio_starvation_timeout=audio_starvation_timeout,
+            audio_idle_timeout=audio_idle_timeout,
         )
 
         # Push VAD frames when speech events are detected
diff --git a/tests/test_vad_controller.py b/tests/test_vad_controller.py
index ddae23549..30f2fe56c 100644
--- a/tests/test_vad_controller.py
+++ b/tests/test_vad_controller.py
@@ -208,18 +208,18 @@ class TestVADController(unittest.IsolatedAsyncioTestCase):
         self.assertIsInstance(broadcast_calls[0][1]["vad_params"], VADParams)
 
 
-AUDIO_STARVATION_TIMEOUT = 0.1
+AUDIO_IDLE_TIMEOUT = 0.1
 
 
-class TestVADControllerStarvation(unittest.IsolatedAsyncioTestCase):
+class TestVADControllerAudioIdle(unittest.IsolatedAsyncioTestCase):
     async def asyncSetUp(self):
         self.task_manager = TaskManager()
         self.task_manager.setup(TaskManagerParams(loop=asyncio.get_running_loop()))
 
-    async def test_audio_starvation_forces_speech_stop(self):
+    async def test_audio_idle_forces_speech_stop(self):
         """Test that on_speech_stopped fires when no audio arrives while SPEAKING."""
         analyzer = MockVADAnalyzer()
-        controller = VADController(analyzer, audio_starvation_timeout=AUDIO_STARVATION_TIMEOUT)
+        controller = VADController(analyzer, audio_idle_timeout=AUDIO_IDLE_TIMEOUT)
 
         speech_stopped = False
 
@@ -238,16 +238,16 @@ class TestVADControllerStarvation(unittest.IsolatedAsyncioTestCase):
         await controller.process_frame(audio_frame)
         self.assertFalse(speech_stopped)
 
-        # Stop sending audio, wait for starvation timeout
-        await asyncio.sleep(AUDIO_STARVATION_TIMEOUT + 0.1)
+        # Stop sending audio, wait for idle timeout
+        await asyncio.sleep(AUDIO_IDLE_TIMEOUT + 0.1)
         self.assertTrue(speech_stopped)
 
         await controller.cleanup()
 
-    async def test_audio_starvation_does_not_fire_when_quiet(self):
-        """Test that starvation timeout does NOT fire when VAD is in QUIET state."""
+    async def test_audio_idle_does_not_fire_when_quiet(self):
+        """Test that idle timeout does NOT fire when VAD is in QUIET state."""
         analyzer = MockVADAnalyzer()
-        controller = VADController(analyzer, audio_starvation_timeout=AUDIO_STARVATION_TIMEOUT)
+        controller = VADController(analyzer, audio_idle_timeout=AUDIO_IDLE_TIMEOUT)
 
         speech_stopped = False
 
@@ -260,8 +260,8 @@ class TestVADControllerStarvation(unittest.IsolatedAsyncioTestCase):
         await controller.process_frame(start_frame)
         await controller.setup(self.task_manager)
 
-        # Stay in QUIET state, wait past starvation timeout
-        await asyncio.sleep(AUDIO_STARVATION_TIMEOUT + 0.1)
+        # Stay in QUIET state, wait past idle timeout
+        await asyncio.sleep(AUDIO_IDLE_TIMEOUT + 0.1)
         self.assertFalse(speech_stopped)
 
         await controller.cleanup()