From 6e8a1408ebc118994e977403ac6ec0dae1e81550 Mon Sep 17 00:00:00 2001
From: James Hush
Date: Mon, 24 Nov 2025 14:13:12 +0100
Subject: [PATCH] fix: Handle race condition in ElevenLabs TTS context management

Fixes an issue where audio generated before user interruption was being
discarded, causing the bot to appear unresponsive (ai_not_responsive).

The problem occurred when:
1. TTS generation starts
2. User interrupts early (e.g., 43ms later)
3. InterruptionFrame closes context and sets context_id to None
4. Audio arrives from ElevenLabs (e.g., 227ms after generation)
5. Audio is incorrectly dropped as 'unavailable context'

Changes:
- Added _last_closed_context_id to track recently closed contexts
- Store context_id before clearing it during interruption
- Check for recently closed context when receiving delayed audio
- Properly discard delayed audio from interrupted contexts with clear logging
- Clear _last_closed_context_id when creating new context

This ensures delayed audio from interrupted contexts is properly identified
and discarded, while legitimate delayed messages are still handled correctly.
--- src/pipecat/services/elevenlabs/tts.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/pipecat/services/elevenlabs/tts.py b/src/pipecat/services/elevenlabs/tts.py index bbe05f9dc..61025e585 100644 --- a/src/pipecat/services/elevenlabs/tts.py +++ b/src/pipecat/services/elevenlabs/tts.py @@ -33,10 +33,7 @@ from pipecat.frames.frames import ( TTSStoppedFrame, ) from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.tts_service import ( - AudioContextWordTTSService, - WordTTSService, -) +from pipecat.services.tts_service import AudioContextWordTTSService, WordTTSService from pipecat.transcriptions.language import Language, resolve_language from pipecat.utils.tracing.service_decorators import traced_tts @@ -347,6 +344,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService): # Context management for v1 multi API self._context_id = None + self._last_closed_context_id = None self._receive_task = None self._keepalive_task = None @@ -586,6 +584,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService): except Exception as e: logger.error(f"{self} exception: {e}") await self.push_error(ErrorFrame(error=f"{self} error: {e}")) + # Store the context ID before clearing it so we can identify delayed messages + self._last_closed_context_id = self._context_id self._context_id = None self._started = False self._partial_word = "" @@ -612,6 +612,14 @@ class ElevenLabsTTSService(AudioContextWordTTSService): f"Received a delayed message, recreating the context: {self._context_id}" ) await self.create_audio_context(self._context_id) + elif self._last_closed_context_id == received_ctx_id: + # This message belongs to a context we recently closed due to interruption. + # The audio was already generated by ElevenLabs but arrived after we closed + # the context, so we should discard it to avoid playing interrupted speech. 
+ logger.debug( + f"Discarding delayed audio from recently closed context: {received_ctx_id}" + ) + continue else: # This can happen if a message is received _after_ we have closed a context # due to user interruption but _before_ the `isFinal` message for the context @@ -720,6 +728,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService): # an interruption, which resets the context ID. if not self._context_id: self._context_id = str(uuid.uuid4()) + # Clear the last closed context when starting a new one + self._last_closed_context_id = None if not self.audio_context_available(self._context_id): await self.create_audio_context(self._context_id)