From 6d95a2425ca30f775d1da2f15a7546cfff0b30a6 Mon Sep 17 00:00:00 2001 From: filipi87 Date: Thu, 12 Feb 2026 12:54:47 -0300 Subject: [PATCH 1/7] Fixing ElevenLabs TTS word timestamp interleaving across sentences. --- src/pipecat/services/elevenlabs/tts.py | 56 ++++++++++++++++---------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/src/pipecat/services/elevenlabs/tts.py b/src/pipecat/services/elevenlabs/tts.py index 4dab0c01a..7df891f0e 100644 --- a/src/pipecat/services/elevenlabs/tts.py +++ b/src/pipecat/services/elevenlabs/tts.py @@ -13,6 +13,7 @@ with support for streaming audio, word timestamps, and voice customization. import asyncio import base64 import json +import uuid from typing import Any, AsyncGenerator, Dict, List, Literal, Mapping, Optional, Tuple, Union import aiohttp @@ -680,6 +681,20 @@ class ElevenLabsTTSService(AudioContextWordTTSService): msg = {"text": text, "context_id": self._context_id} await self._websocket.send(json.dumps(msg)) + def create_context_id(self) -> str: + """Generate a unique context ID for a TTS request in case we don't have one already in progress. + + Returns: + A unique string identifier for the TTS context. + """ + # If a context ID does not exist, create a new one. + # If an ID exists, continue using the current ID. + # When interruptions happen, user speech results in + # an interruption, which resets the context ID. + if not self._context_id: + return str(uuid.uuid4()) + return self._context_id + @traced_tts async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]: """Generate speech from text using ElevenLabs' streaming WebSocket API. @@ -698,31 +713,28 @@ class ElevenLabsTTSService(AudioContextWordTTSService): await self._connect() try: - await self.start_ttfb_metrics() - yield TTSStartedFrame(context_id=context_id) - self._cumulative_time = 0 - self._partial_word = "" - self._partial_word_start_time = 0.0 - # If a context ID does not exist, use the provided one. 
- # If an ID exists, that means the Pipeline doesn't allow - # user interruptions, so continue using the current ID. - # When interruptions are allowed, user speech results in - # an interruption, which resets the context ID. if not self._context_id: + await self.start_ttfb_metrics() + yield TTSStartedFrame(context_id=context_id) self._context_id = context_id - if not self.audio_context_available(self._context_id): - await self.create_audio_context(self._context_id) + self._cumulative_time = 0 + self._partial_word = "" + self._partial_word_start_time = 0.0 - # Initialize context with voice settings and pronunciation dictionaries - msg = {"text": " ", "context_id": self._context_id} - if self._voice_settings: - msg["voice_settings"] = self._voice_settings - if self._pronunciation_dictionary_locators: - msg["pronunciation_dictionary_locators"] = [ - locator.model_dump() for locator in self._pronunciation_dictionary_locators - ] - await self._websocket.send(json.dumps(msg)) - logger.trace(f"Created new context {self._context_id}") + if not self.audio_context_available(self._context_id): + await self.create_audio_context(self._context_id) + + # Initialize context with voice settings and pronunciation dictionaries + msg = {"text": " ", "context_id": self._context_id} + if self._voice_settings: + msg["voice_settings"] = self._voice_settings + if self._pronunciation_dictionary_locators: + msg["pronunciation_dictionary_locators"] = [ + locator.model_dump() + for locator in self._pronunciation_dictionary_locators + ] + await self._websocket.send(json.dumps(msg)) + logger.trace(f"Created new context {self._context_id}") await self._send_text(text) await self.start_tts_usage_metrics(text) From abea22ec574a379f90f6858c10580243f90cf2f1 Mon Sep 17 00:00:00 2001 From: filipi87 Date: Thu, 12 Feb 2026 15:17:47 -0300 Subject: [PATCH 2/7] Fixing AsyncAITTSService to reuse the same context when needed. 
--- src/pipecat/services/asyncai/tts.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/pipecat/services/asyncai/tts.py b/src/pipecat/services/asyncai/tts.py index 4ff6c928d..e9170f8b6 100644 --- a/src/pipecat/services/asyncai/tts.py +++ b/src/pipecat/services/asyncai/tts.py @@ -9,6 +9,7 @@ import asyncio import base64 import json +import uuid from typing import AsyncGenerator, Optional import aiohttp @@ -270,6 +271,20 @@ class AsyncAITTSService(AudioContextTTSService): return self._websocket raise Exception("Websocket not connected") + def create_context_id(self) -> str: + """Generate a unique context ID for a TTS request in case we don't have one already in progress. + + Returns: + A unique string identifier for the TTS context. + """ + # If a context ID does not exist, create a new one. + # If an ID exists, continue using the current ID. + # When interruptions happen, user speech results in + # an interruption, which resets the context ID. 
+ if not self._context_id: + return str(uuid.uuid4()) + return self._context_id + async def flush_audio(self): """Flush any pending audio.""" if not self._context_id or not self._websocket: @@ -379,13 +394,14 @@ class AsyncAITTSService(AudioContextTTSService): await self._connect() try: - await self.start_ttfb_metrics() - yield TTSStartedFrame(context_id=context_id) - if not self._context_id: + await self.start_ttfb_metrics() + yield TTSStartedFrame(context_id=context_id) + self._context_id = context_id - if not self.audio_context_available(self._context_id): - await self.create_audio_context(self._context_id) + + if not self.audio_context_available(self._context_id): + await self.create_audio_context(self._context_id) msg = self._build_msg(text=text, force=True, context_id=self._context_id) await self._get_websocket().send(msg) From 3410eb82b37803e84576112e7ab9d88a48cdf91b Mon Sep 17 00:00:00 2001 From: filipi87 Date: Thu, 12 Feb 2026 15:26:49 -0300 Subject: [PATCH 3/7] Fixing CartesiaTTSService to reuse the same context when needed. --- src/pipecat/services/cartesia/tts.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/pipecat/services/cartesia/tts.py b/src/pipecat/services/cartesia/tts.py index 791c60a18..1fa9a026a 100644 --- a/src/pipecat/services/cartesia/tts.py +++ b/src/pipecat/services/cartesia/tts.py @@ -8,6 +8,7 @@ import base64 import json +import uuid import warnings from enum import Enum from typing import AsyncGenerator, List, Literal, Optional @@ -539,6 +540,20 @@ class CartesiaTTSService(AudioContextWordTTSService): await self._get_websocket().send(cancel_msg) self._context_id = None + def create_context_id(self) -> str: + """Generate a unique context ID for a TTS request in case we don't have one already in progress. + + Returns: + A unique string identifier for the TTS context. + """ + # If a context ID does not exist, create a new one. + # If an ID exists, continue using the current ID. 
+ # When interruptions happen, user speech results in + # an interruption, which resets the context ID. + if not self._context_id: + return str(uuid.uuid4()) + return self._context_id + async def flush_audio(self): """Flush any pending audio and finalize the current context.""" if not self._context_id or not self._websocket: From 136732afae720857f80a849f8b6221836d2481ac Mon Sep 17 00:00:00 2001 From: filipi87 Date: Thu, 12 Feb 2026 15:46:59 -0300 Subject: [PATCH 4/7] Fixing InworldTTSService to reuse the same context when needed. --- src/pipecat/services/inworld/tts.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index 7ba7d6b9d..845a37bdd 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -924,6 +924,20 @@ class InworldTTSService(AudioContextWordTTSService): msg = {"close_context": {}, "contextId": context_id} await self.send_with_retry(json.dumps(msg), self._report_error) + def create_context_id(self) -> str: + """Generate a unique context ID for a TTS request in case we don't have one already in progress. + + Returns: + A unique string identifier for the TTS context. + """ + # If a context ID does not exist, create a new one. + # If an ID exists, continue using the current ID. + # When interruptions happen, user speech results in + # an interruption, which resets the context ID. + if not self._context_id: + return str(uuid.uuid4()) + return self._context_id + @traced_tts async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]: """Generate TTS audio for the given text using the Inworld WebSocket TTS service. 
@@ -942,10 +956,9 @@ class InworldTTSService(AudioContextWordTTSService): await self._connect() try: - await self.start_ttfb_metrics() - yield TTSStartedFrame(context_id=context_id) - if not self._context_id: + await self.start_ttfb_metrics() + yield TTSStartedFrame(context_id=context_id) self._context_id = context_id logger.trace(f"{self}: Creating new context {self._context_id}") await self.create_audio_context(self._context_id) From f0995164d9ae8550422dde7afc950ff8c92ab6e8 Mon Sep 17 00:00:00 2001 From: filipi87 Date: Thu, 12 Feb 2026 15:50:18 -0300 Subject: [PATCH 5/7] Fixing PlayHTTTSService to reuse the same context when needed. --- src/pipecat/services/playht/tts.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/pipecat/services/playht/tts.py b/src/pipecat/services/playht/tts.py index 2d4cd0427..287463186 100644 --- a/src/pipecat/services/playht/tts.py +++ b/src/pipecat/services/playht/tts.py @@ -13,6 +13,7 @@ supporting both WebSocket streaming and HTTP-based synthesis. import io import json import struct +import uuid import warnings from typing import AsyncGenerator, Optional @@ -323,6 +324,20 @@ class PlayHTTTSService(InterruptibleTTSService): return self._websocket raise Exception("Websocket not connected") + def create_context_id(self) -> str: + """Generate a unique context ID for a TTS request in case we don't have one already in progress. + + Returns: + A unique string identifier for the TTS context. + """ + # If a context ID does not exist, create a new one. + # If an ID exists, continue using the current ID. + # When interruptions happen, user speech results in + # an interruption, which resets the context ID. 
+ if not self._context_id: + return str(uuid.uuid4()) + return self._context_id + async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection): """Handle interruption by stopping metrics and clearing request ID.""" await super()._handle_interruption(frame, direction) From 8866ab1585e257d8f880b71bd46b0ad4091e98ab Mon Sep 17 00:00:00 2001 From: filipi87 Date: Thu, 12 Feb 2026 15:53:38 -0300 Subject: [PATCH 6/7] Fixing RimeTTSService to reuse the same context when needed. --- src/pipecat/services/rime/tts.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/pipecat/services/rime/tts.py b/src/pipecat/services/rime/tts.py index e38e840e6..22f1cf4e1 100644 --- a/src/pipecat/services/rime/tts.py +++ b/src/pipecat/services/rime/tts.py @@ -12,6 +12,7 @@ using Rime's API for streaming and batch audio synthesis. import base64 import json +import uuid from typing import Any, AsyncGenerator, Mapping, Optional import aiohttp @@ -369,6 +370,20 @@ class RimeTTSService(AudioContextWordTTSService): return word_pairs + def create_context_id(self) -> str: + """Generate a unique context ID for a TTS request in case we don't have one already in progress. + + Returns: + A unique string identifier for the TTS context. + """ + # If a context ID does not exist, create a new one. + # If an ID exists, continue using the current ID. + # When interruptions happen, user speech results in + # an interruption, which resets the context ID. + if not self._context_id: + return str(uuid.uuid4()) + return self._context_id + async def flush_audio(self): """Flush any pending audio synthesis.""" if not self._context_id or not self._websocket: From 9569625f03ad013945e00d335adc5c9a2bd25e44 Mon Sep 17 00:00:00 2001 From: filipi87 Date: Thu, 12 Feb 2026 16:11:02 -0300 Subject: [PATCH 7/7] Changelog entries for the TTS fixes. 
--- changelog/3729.fixed.2.md | 1 + changelog/3729.fixed.md | 1 + 2 files changed, 2 insertions(+) create mode 100644 changelog/3729.fixed.2.md create mode 100644 changelog/3729.fixed.md diff --git a/changelog/3729.fixed.2.md b/changelog/3729.fixed.2.md new file mode 100644 index 000000000..6d4f33d93 --- /dev/null +++ b/changelog/3729.fixed.2.md @@ -0,0 +1 @@ +- Fixed context ID reuse issue in `ElevenLabsTTSService`, `InworldTTSService`, `RimeTTSService`, `CartesiaTTSService`, `AsyncAITTSService`, and `PlayHTTTSService`. Services now properly reuse the same context ID across multiple `run_tts()` invocations within a single LLM turn, preventing context tracking issues and incorrect lifecycle signaling. diff --git a/changelog/3729.fixed.md b/changelog/3729.fixed.md new file mode 100644 index 000000000..b8be759fb --- /dev/null +++ b/changelog/3729.fixed.md @@ -0,0 +1 @@ +- Fixed word timestamp interleaving issue in `ElevenLabsTTSService` when processing multiple sentences within a single LLM turn.