From 6e8a1408ebc118994e977403ac6ec0dae1e81550 Mon Sep 17 00:00:00 2001
From: James Hush <james@daily.co>
Date: Mon, 24 Nov 2025 14:13:12 +0100
Subject: [PATCH] fix: Handle race condition in ElevenLabs TTS context
 management

Fixes an issue where audio generated before user interruption was being
discarded, causing the bot to appear unresponsive (ai_not_responsive).

The problem occurred when:
1. TTS generation starts
2. User interrupts early (e.g., 43ms later)
3. InterruptionFrame closes context and sets context_id to None
4. Audio arrives from ElevenLabs (e.g., 227ms after generation)
5. Audio is incorrectly dropped as 'unavailable context'

Changes:
- Add _last_closed_context_id to track the most recently closed context
- Store context_id before clearing it during interruption
- Check for recently closed context when receiving delayed audio
- Properly discard delayed audio from interrupted contexts with clear logging
- Clear _last_closed_context_id when creating a new context

This ensures delayed audio from interrupted contexts is properly
identified and discarded, while legitimate delayed messages are
still handled correctly.
---
 src/pipecat/services/elevenlabs/tts.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/pipecat/services/elevenlabs/tts.py b/src/pipecat/services/elevenlabs/tts.py
index bbe05f9dc..61025e585 100644
--- a/src/pipecat/services/elevenlabs/tts.py
+++ b/src/pipecat/services/elevenlabs/tts.py
@@ -33,10 +33,7 @@ from pipecat.frames.frames import (
     TTSStoppedFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
-from pipecat.services.tts_service import (
-    AudioContextWordTTSService,
-    WordTTSService,
-)
+from pipecat.services.tts_service import AudioContextWordTTSService, WordTTSService
 from pipecat.transcriptions.language import Language, resolve_language
 from pipecat.utils.tracing.service_decorators import traced_tts
 
@@ -347,6 +344,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
 
         # Context management for v1 multi API
         self._context_id = None
+        self._last_closed_context_id = None
         self._receive_task = None
         self._keepalive_task = None
 
@@ -586,6 +584,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
             except Exception as e:
                 logger.error(f"{self} exception: {e}")
                 await self.push_error(ErrorFrame(error=f"{self} error: {e}"))
+            # Store the context ID before clearing it so we can identify delayed messages
+            self._last_closed_context_id = self._context_id
             self._context_id = None
             self._started = False
             self._partial_word = ""
@@ -612,6 +612,14 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
                         f"Received a delayed message, recreating the context: {self._context_id}"
                     )
                     await self.create_audio_context(self._context_id)
+                elif self._last_closed_context_id == received_ctx_id:
+                    # This message belongs to a context we recently closed due to interruption.
+                    # The audio was already generated by ElevenLabs but arrived after we closed
+                    # the context, so we should discard it to avoid playing interrupted speech.
+                    logger.debug(
+                        f"Discarding delayed audio from recently closed context: {received_ctx_id}"
+                    )
+                    continue
                 else:
                     # This can happen if a message is received _after_ we have closed a context
                     # due to user interruption but _before_ the `isFinal` message for the context
@@ -720,6 +728,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
                     # an interruption, which resets the context ID.
                     if not self._context_id:
                         self._context_id = str(uuid.uuid4())
+                        # Clear the last closed context when starting a new one
+                        self._last_closed_context_id = None
                     if not self.audio_context_available(self._context_id):
                         await self.create_audio_context(self._context_id)