From 6d95a2425ca30f775d1da2f15a7546cfff0b30a6 Mon Sep 17 00:00:00 2001
From: filipi87 <filipi87@gmail.com>
Date: Thu, 12 Feb 2026 12:54:47 -0300
Subject: [PATCH 1/7] Fixing ElevenLabs TTS word timestamp interleaving across
 sentences.

---
 src/pipecat/services/elevenlabs/tts.py | 56 ++++++++++++++++----------
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/src/pipecat/services/elevenlabs/tts.py b/src/pipecat/services/elevenlabs/tts.py
index 4dab0c01a..7df891f0e 100644
--- a/src/pipecat/services/elevenlabs/tts.py
+++ b/src/pipecat/services/elevenlabs/tts.py
@@ -13,6 +13,7 @@ with support for streaming audio, word timestamps, and voice customization.
 import asyncio
 import base64
 import json
+import uuid
 from typing import Any, AsyncGenerator, Dict, List, Literal, Mapping, Optional, Tuple, Union
 
 import aiohttp
@@ -680,6 +681,20 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
             msg = {"text": text, "context_id": self._context_id}
             await self._websocket.send(json.dumps(msg))
 
+    def create_context_id(self) -> str:
+        """Return the active context ID, or generate a new unique one if none is in progress.
+
+        Returns:
+            A unique string identifier for the TTS context.
+        """
+        # If a context ID does not exist, create a new one.
+        # If an ID exists, continue using the current ID.
+        # When interruptions happen, user speech results in
+        # an interruption, which resets the context ID.
+        if not self._context_id:
+            return str(uuid.uuid4())
+        return self._context_id
+
     @traced_tts
     async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
         """Generate speech from text using ElevenLabs' streaming WebSocket API.
@@ -698,31 +713,28 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
                 await self._connect()
 
             try:
-                await self.start_ttfb_metrics()
-                yield TTSStartedFrame(context_id=context_id)
-                self._cumulative_time = 0
-                self._partial_word = ""
-                self._partial_word_start_time = 0.0
-                # If a context ID does not exist, use the provided one.
-                # If an ID exists, that means the Pipeline doesn't allow
-                # user interruptions, so continue using the current ID.
-                # When interruptions are allowed, user speech results in
-                # an interruption, which resets the context ID.
                 if not self._context_id:
+                    await self.start_ttfb_metrics()
+                    yield TTSStartedFrame(context_id=context_id)
                     self._context_id = context_id
-                if not self.audio_context_available(self._context_id):
-                    await self.create_audio_context(self._context_id)
+                    self._cumulative_time = 0
+                    self._partial_word = ""
+                    self._partial_word_start_time = 0.0
 
-                # Initialize context with voice settings and pronunciation dictionaries
-                msg = {"text": " ", "context_id": self._context_id}
-                if self._voice_settings:
-                    msg["voice_settings"] = self._voice_settings
-                if self._pronunciation_dictionary_locators:
-                    msg["pronunciation_dictionary_locators"] = [
-                        locator.model_dump() for locator in self._pronunciation_dictionary_locators
-                    ]
-                await self._websocket.send(json.dumps(msg))
-                logger.trace(f"Created new context {self._context_id}")
+                    if not self.audio_context_available(self._context_id):
+                        await self.create_audio_context(self._context_id)
+
+                    # Initialize context with voice settings and pronunciation dictionaries
+                    msg = {"text": " ", "context_id": self._context_id}
+                    if self._voice_settings:
+                        msg["voice_settings"] = self._voice_settings
+                    if self._pronunciation_dictionary_locators:
+                        msg["pronunciation_dictionary_locators"] = [
+                            locator.model_dump()
+                            for locator in self._pronunciation_dictionary_locators
+                        ]
+                    await self._websocket.send(json.dumps(msg))
+                    logger.trace(f"Created new context {self._context_id}")
 
                 await self._send_text(text)
                 await self.start_tts_usage_metrics(text)

From abea22ec574a379f90f6858c10580243f90cf2f1 Mon Sep 17 00:00:00 2001
From: filipi87 <filipi87@gmail.com>
Date: Thu, 12 Feb 2026 15:17:47 -0300
Subject: [PATCH 2/7] Fixing AsyncAITTSService to reuse the same context when
 needed.

---
 src/pipecat/services/asyncai/tts.py | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/src/pipecat/services/asyncai/tts.py b/src/pipecat/services/asyncai/tts.py
index 4ff6c928d..e9170f8b6 100644
--- a/src/pipecat/services/asyncai/tts.py
+++ b/src/pipecat/services/asyncai/tts.py
@@ -9,6 +9,7 @@
 import asyncio
 import base64
 import json
+import uuid
 from typing import AsyncGenerator, Optional
 
 import aiohttp
@@ -270,6 +271,20 @@ class AsyncAITTSService(AudioContextTTSService):
             return self._websocket
         raise Exception("Websocket not connected")
 
+    def create_context_id(self) -> str:
+        """Return the active context ID, or generate a new unique one if none is in progress.
+
+        Returns:
+            A unique string identifier for the TTS context.
+        """
+        # If a context ID does not exist, create a new one.
+        # If an ID exists, continue using the current ID.
+        # When interruptions happen, user speech results in
+        # an interruption, which resets the context ID.
+        if not self._context_id:
+            return str(uuid.uuid4())
+        return self._context_id
+
     async def flush_audio(self):
         """Flush any pending audio."""
         if not self._context_id or not self._websocket:
@@ -379,13 +394,14 @@ class AsyncAITTSService(AudioContextTTSService):
                 await self._connect()
 
             try:
-                await self.start_ttfb_metrics()
-                yield TTSStartedFrame(context_id=context_id)
-
                 if not self._context_id:
+                    await self.start_ttfb_metrics()
+                    yield TTSStartedFrame(context_id=context_id)
+
                     self._context_id = context_id
-                if not self.audio_context_available(self._context_id):
-                    await self.create_audio_context(self._context_id)
+
+                    if not self.audio_context_available(self._context_id):
+                        await self.create_audio_context(self._context_id)
 
                 msg = self._build_msg(text=text, force=True, context_id=self._context_id)
                 await self._get_websocket().send(msg)

From 3410eb82b37803e84576112e7ab9d88a48cdf91b Mon Sep 17 00:00:00 2001
From: filipi87 <filipi87@gmail.com>
Date: Thu, 12 Feb 2026 15:26:49 -0300
Subject: [PATCH 3/7] Fixing CartesiaTTSService to reuse the same context when
 needed.

---
 src/pipecat/services/cartesia/tts.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/pipecat/services/cartesia/tts.py b/src/pipecat/services/cartesia/tts.py
index 791c60a18..1fa9a026a 100644
--- a/src/pipecat/services/cartesia/tts.py
+++ b/src/pipecat/services/cartesia/tts.py
@@ -8,6 +8,7 @@
 
 import base64
 import json
+import uuid
 import warnings
 from enum import Enum
 from typing import AsyncGenerator, List, Literal, Optional
@@ -539,6 +540,20 @@ class CartesiaTTSService(AudioContextWordTTSService):
             await self._get_websocket().send(cancel_msg)
             self._context_id = None
 
+    def create_context_id(self) -> str:
+        """Return the active context ID, or generate a new unique one if none is in progress.
+
+        Returns:
+            A unique string identifier for the TTS context.
+        """
+        # If a context ID does not exist, create a new one.
+        # If an ID exists, continue using the current ID.
+        # When interruptions happen, user speech results in
+        # an interruption, which resets the context ID.
+        if not self._context_id:
+            return str(uuid.uuid4())
+        return self._context_id
+
     async def flush_audio(self):
         """Flush any pending audio and finalize the current context."""
         if not self._context_id or not self._websocket:

From 136732afae720857f80a849f8b6221836d2481ac Mon Sep 17 00:00:00 2001
From: filipi87 <filipi87@gmail.com>
Date: Thu, 12 Feb 2026 15:46:59 -0300
Subject: [PATCH 4/7] Fixing InworldTTSService to reuse the same context when
 needed.

---
 src/pipecat/services/inworld/tts.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index 7ba7d6b9d..845a37bdd 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -924,6 +924,20 @@ class InworldTTSService(AudioContextWordTTSService):
         msg = {"close_context": {}, "contextId": context_id}
         await self.send_with_retry(json.dumps(msg), self._report_error)
 
+    def create_context_id(self) -> str:
+        """Return the active context ID, or generate a new unique one if none is in progress.
+
+        Returns:
+            A unique string identifier for the TTS context.
+        """
+        # If a context ID does not exist, create a new one.
+        # If an ID exists, continue using the current ID.
+        # When interruptions happen, user speech results in
+        # an interruption, which resets the context ID.
+        if not self._context_id:
+            return str(uuid.uuid4())
+        return self._context_id
+
     @traced_tts
     async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
         """Generate TTS audio for the given text using the Inworld WebSocket TTS service.
@@ -942,10 +956,9 @@ class InworldTTSService(AudioContextWordTTSService):
                 await self._connect()
 
             try:
-                await self.start_ttfb_metrics()
-                yield TTSStartedFrame(context_id=context_id)
-
                 if not self._context_id:
+                    await self.start_ttfb_metrics()
+                    yield TTSStartedFrame(context_id=context_id)
                     self._context_id = context_id
                     logger.trace(f"{self}: Creating new context {self._context_id}")
                     await self.create_audio_context(self._context_id)

From f0995164d9ae8550422dde7afc950ff8c92ab6e8 Mon Sep 17 00:00:00 2001
From: filipi87 <filipi87@gmail.com>
Date: Thu, 12 Feb 2026 15:50:18 -0300
Subject: [PATCH 5/7] Fixing PlayHTTTSService to reuse the same context when
 needed.

---
 src/pipecat/services/playht/tts.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/pipecat/services/playht/tts.py b/src/pipecat/services/playht/tts.py
index 2d4cd0427..287463186 100644
--- a/src/pipecat/services/playht/tts.py
+++ b/src/pipecat/services/playht/tts.py
@@ -13,6 +13,7 @@ supporting both WebSocket streaming and HTTP-based synthesis.
 import io
 import json
 import struct
+import uuid
 import warnings
 from typing import AsyncGenerator, Optional
 
@@ -323,6 +324,20 @@ class PlayHTTTSService(InterruptibleTTSService):
             return self._websocket
         raise Exception("Websocket not connected")
 
+    def create_context_id(self) -> str:
+        """Return the active context ID, or generate a new unique one if none is in progress.
+
+        Returns:
+            A unique string identifier for the TTS context.
+        """
+        # If a context ID does not exist, create a new one.
+        # If an ID exists, continue using the current ID.
+        # When interruptions happen, user speech results in
+        # an interruption, which resets the context ID.
+        if not self._context_id:
+            return str(uuid.uuid4())
+        return self._context_id
+
     async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         """Handle interruption by stopping metrics and clearing request ID."""
         await super()._handle_interruption(frame, direction)

From 8866ab1585e257d8f880b71bd46b0ad4091e98ab Mon Sep 17 00:00:00 2001
From: filipi87 <filipi87@gmail.com>
Date: Thu, 12 Feb 2026 15:53:38 -0300
Subject: [PATCH 6/7] Fixing RimeTTSService to reuse the same context when
 needed.

---
 src/pipecat/services/rime/tts.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/pipecat/services/rime/tts.py b/src/pipecat/services/rime/tts.py
index e38e840e6..22f1cf4e1 100644
--- a/src/pipecat/services/rime/tts.py
+++ b/src/pipecat/services/rime/tts.py
@@ -12,6 +12,7 @@ using Rime's API for streaming and batch audio synthesis.
 
 import base64
 import json
+import uuid
 from typing import Any, AsyncGenerator, Mapping, Optional
 
 import aiohttp
@@ -369,6 +370,20 @@ class RimeTTSService(AudioContextWordTTSService):
 
         return word_pairs
 
+    def create_context_id(self) -> str:
+        """Return the active context ID, or generate a new unique one if none is in progress.
+
+        Returns:
+            A unique string identifier for the TTS context.
+        """
+        # If a context ID does not exist, create a new one.
+        # If an ID exists, continue using the current ID.
+        # When interruptions happen, user speech results in
+        # an interruption, which resets the context ID.
+        if not self._context_id:
+            return str(uuid.uuid4())
+        return self._context_id
+
     async def flush_audio(self):
         """Flush any pending audio synthesis."""
         if not self._context_id or not self._websocket:

From 9569625f03ad013945e00d335adc5c9a2bd25e44 Mon Sep 17 00:00:00 2001
From: filipi87 <filipi87@gmail.com>
Date: Thu, 12 Feb 2026 16:11:02 -0300
Subject: [PATCH 7/7] Changelog entries for the TTS fixes.

---
 changelog/3729.fixed.2.md | 1 +
 changelog/3729.fixed.md   | 1 +
 2 files changed, 2 insertions(+)
 create mode 100644 changelog/3729.fixed.2.md
 create mode 100644 changelog/3729.fixed.md

diff --git a/changelog/3729.fixed.2.md b/changelog/3729.fixed.2.md
new file mode 100644
index 000000000..6d4f33d93
--- /dev/null
+++ b/changelog/3729.fixed.2.md
@@ -0,0 +1 @@
+- Fixed context ID reuse issue in `ElevenLabsTTSService`, `InworldTTSService`, `RimeTTSService`, `CartesiaTTSService`, `AsyncAITTSService`, and `PlayHTTTSService`. Services now properly reuse the same context ID across multiple `run_tts()` invocations within a single LLM turn, preventing context tracking issues and incorrect lifecycle signaling.
diff --git a/changelog/3729.fixed.md b/changelog/3729.fixed.md
new file mode 100644
index 000000000..b8be759fb
--- /dev/null
+++ b/changelog/3729.fixed.md
@@ -0,0 +1 @@
+- Fixed word timestamp interleaving issue in `ElevenLabsTTSService` when processing multiple sentences within a single LLM turn.