fix: Handle race condition in ElevenLabs TTS context management
Fixes an issue where audio generated before a user interruption was being discarded, causing the bot to appear unresponsive (ai_not_responsive).

The problem occurred when:
1. TTS generation starts.
2. The user interrupts early (e.g., 43ms later).
3. InterruptionFrame closes the context and sets context_id to None.
4. Audio arrives from ElevenLabs (e.g., 227ms after generation).
5. The audio is incorrectly dropped as 'unavailable context'.

Changes:
- Added _last_closed_context_id to track recently closed contexts.
- Store context_id before clearing it during interruption.
- Check for a recently closed context when receiving delayed audio.
- Properly discard delayed audio from interrupted contexts, with clear logging.
- Clear _last_closed_context_id when creating a new context.

This ensures delayed audio from interrupted contexts is properly identified and discarded, while legitimate delayed messages are still handled correctly.
This commit is contained in:
@@ -33,10 +33,7 @@ from pipecat.frames.frames import (
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.tts_service import (
|
||||
AudioContextWordTTSService,
|
||||
WordTTSService,
|
||||
)
|
||||
from pipecat.services.tts_service import AudioContextWordTTSService, WordTTSService
|
||||
from pipecat.transcriptions.language import Language, resolve_language
|
||||
from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
|
||||
@@ -347,6 +344,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
||||
|
||||
# Context management for v1 multi API
|
||||
self._context_id = None
|
||||
self._last_closed_context_id = None
|
||||
self._receive_task = None
|
||||
self._keepalive_task = None
|
||||
|
||||
@@ -586,6 +584,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
||||
except Exception as e:
|
||||
logger.error(f"{self} exception: {e}")
|
||||
await self.push_error(ErrorFrame(error=f"{self} error: {e}"))
|
||||
# Store the context ID before clearing it so we can identify delayed messages
|
||||
self._last_closed_context_id = self._context_id
|
||||
self._context_id = None
|
||||
self._started = False
|
||||
self._partial_word = ""
|
||||
@@ -612,6 +612,14 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
||||
f"Received a delayed message, recreating the context: {self._context_id}"
|
||||
)
|
||||
await self.create_audio_context(self._context_id)
|
||||
elif self._last_closed_context_id == received_ctx_id:
|
||||
# This message belongs to a context we recently closed due to interruption.
|
||||
# The audio was already generated by ElevenLabs but arrived after we closed
|
||||
# the context, so we should discard it to avoid playing interrupted speech.
|
||||
logger.debug(
|
||||
f"Discarding delayed audio from recently closed context: {received_ctx_id}"
|
||||
)
|
||||
continue
|
||||
else:
|
||||
# This can happen if a message is received _after_ we have closed a context
|
||||
# due to user interruption but _before_ the `isFinal` message for the context
|
||||
@@ -720,6 +728,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
||||
# an interruption, which resets the context ID.
|
||||
if not self._context_id:
|
||||
self._context_id = str(uuid.uuid4())
|
||||
# Clear the last closed context when starting a new one
|
||||
self._last_closed_context_id = None
|
||||
if not self.audio_context_available(self._context_id):
|
||||
await self.create_audio_context(self._context_id)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user