Merge pull request #3240 from pipecat-ai/aleix/cartesia-ensure-word-timestamps-started

WordTTSService: make sure word timestamps are always started
2025-12-16 14:02:55 -08:00
parent afa7573834 ac7b06faba
commit 2bb6ba59fc
9 changed files with 32 additions and 14 deletions
--- a/changelog/3240.changed.md
+++ b/changelog/3240.changed.md
@@ -0,0 +1,2 @@
+- ⚠️ Breaking change: `WordTTSService.start_word_timestamps()` and
+  `WordTTSService.reset_word_timestamps()` are now async and must be awaited.
--- a/changelog/3240.fixed.md
+++ b/changelog/3240.fixed.md
@@ -0,0 +1,2 @@
+- Fixed a TTS service word-timestamp issue that could cause generated
+  `TTSTextFrame` instances to have an incorrect pts (`pts = -1`).
--- a/src/pipecat/services/cartesia/tts.py
+++ b/src/pipecat/services/cartesia/tts.py
@@ -554,7 +554,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
                await self.add_word_timestamps(processed_timestamps)
            elif msg["type"] == "chunk":
                await self.stop_ttfb_metrics()
-                self.start_word_timestamps()
+                await self.start_word_timestamps()
                frame = TTSAudioRawFrame(
                    audio=base64.b64decode(msg["data"]),
                    sample_rate=self.sample_rate,
--- a/src/pipecat/services/elevenlabs/tts.py
+++ b/src/pipecat/services/elevenlabs/tts.py
@@ -617,7 +617,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):

            if msg.get("audio"):
                await self.stop_ttfb_metrics()
-                self.start_word_timestamps()
+                await self.start_word_timestamps()

                audio = base64.b64decode(msg["audio"])
                frame = TTSAudioRawFrame(audio, self.sample_rate, 1)
@@ -1047,7 +1047,7 @@ class ElevenLabsHttpTTSService(WordTTSService):

                # Start TTS sequence if not already started
                if not self._started:
-                    self.start_word_timestamps()
+                    await self.start_word_timestamps()
                    yield TTSStartedFrame()
                    self._started = True

--- a/src/pipecat/services/gradium/tts.py
+++ b/src/pipecat/services/gradium/tts.py
@@ -253,7 +253,7 @@ class GradiumTTSService(InterruptibleWordTTSService):
            if msg["type"] == "audio":
                # Process audio chunk
                await self.stop_ttfb_metrics()
-                self.start_word_timestamps()
+                await self.start_word_timestamps()
                frame = TTSAudioRawFrame(
                    audio=base64.b64decode(msg["audio"]),
                    sample_rate=self.sample_rate,
--- a/src/pipecat/services/hume/tts.py
+++ b/src/pipecat/services/hume/tts.py
@@ -245,7 +245,7 @@ class HumeTTSService(WordTTSService):

        # Start TTS sequence if not already started
        if not self._started:
-            self.start_word_timestamps()
+            await self.start_word_timestamps()
            yield TTSStartedFrame()
            self._started = True

--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -243,7 +243,7 @@ class InworldHttpTTSService(WordTTSService):
            await self.start_ttfb_metrics()

            if not self._started:
-                self.start_word_timestamps()
+                await self.start_word_timestamps()
                yield TTSStartedFrame()
                self._started = True

@@ -699,7 +699,7 @@ class InworldTTSService(AudioContextWordTTSService):

            if audio_b64:
                await self.stop_ttfb_metrics()
-                self.start_word_timestamps()
+                await self.start_word_timestamps()
                audio = base64.b64decode(audio_b64)
                if len(audio) > 44 and audio.startswith(b"RIFF"):
                    audio = audio[44:]
--- a/src/pipecat/services/rime/tts.py
+++ b/src/pipecat/services/rime/tts.py
@@ -385,7 +385,7 @@ class RimeTTSService(AudioContextWordTTSService):
            if msg["type"] == "chunk":
                # Process audio chunk
                await self.stop_ttfb_metrics()
-                self.start_word_timestamps()
+                await self.start_word_timestamps()
                frame = TTSAudioRawFrame(
                    audio=base64.b64decode(msg["data"]),
                    sample_rate=self.sample_rate,
--- a/src/pipecat/services/tts_service.py
+++ b/src/pipecat/services/tts_service.py
@@ -651,15 +651,21 @@ class WordTTSService(TTSService):
        """
        super().__init__(**kwargs)
        self._initial_word_timestamp = -1
+        self._initial_word_times = []
        self._words_task = None
        self._llm_response_started: bool = False

-    def start_word_timestamps(self):
+    async def start_word_timestamps(self):
        """Start tracking word timestamps from the current time."""
        if self._initial_word_timestamp == -1:
            self._initial_word_timestamp = self.get_clock().get_time()
+            # If we cached some initial word times (because we didn't receive
+            # audio), let's add them now.
+            if self._initial_word_times:
+                await self._add_word_timestamps(self._initial_word_times)
+                self._initial_word_times = []

-    def reset_word_timestamps(self):
+    async def reset_word_timestamps(self):
        """Reset word timestamp tracking."""
        self._initial_word_timestamp = -1

@@ -669,8 +675,12 @@ class WordTTSService(TTSService):
        Args:
            word_times: List of (word, timestamp) tuples where timestamp is in seconds.
        """
-        for word, timestamp in word_times:
-            await self._words_queue.put((word, seconds_to_nanoseconds(timestamp)))
+        if self._initial_word_timestamp == -1:
+            # Cache word timestamps and don't add them until we have started
+            # (i.e. we have some audio).
+            self._initial_word_times.extend(word_times)
+        else:
+            await self._add_word_timestamps(word_times)

    async def start(self, frame: StartFrame):
        """Start the word TTS service.
@@ -716,7 +726,7 @@ class WordTTSService(TTSService):
    async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
        await super()._handle_interruption(frame, direction)
        self._llm_response_started = False
-        self.reset_word_timestamps()
+        await self.reset_word_timestamps()

    def _create_words_task(self):
        if not self._words_task:
@@ -728,13 +738,17 @@ class WordTTSService(TTSService):
            await self.cancel_task(self._words_task)
            self._words_task = None

+    async def _add_word_timestamps(self, word_times: List[Tuple[str, float]]):
+        for word, timestamp in word_times:
+            await self._words_queue.put((word, seconds_to_nanoseconds(timestamp)))
+
    async def _words_task_handler(self):
        last_pts = 0
        while True:
            frame = None
            (word, timestamp) = await self._words_queue.get()
            if word == "Reset" and timestamp == 0:
-                self.reset_word_timestamps()
+                await self.reset_word_timestamps()
                if self._llm_response_started:
                    self._llm_response_started = False
                    frame = LLMFullResponseEndFrame()