Merge pull request #3240 from pipecat-ai/aleix/cartesia-ensure-word-timestamps-started

WordTTSService: make sure word timestamps are always started
This commit is contained in:
Aleix Conchillo Flaqué
2025-12-16 14:02:55 -08:00
committed by GitHub
9 changed files with 32 additions and 14 deletions

View File

@@ -0,0 +1,2 @@
- ⚠️ Breaking change: `WordTTSService.start_word_timestamps()` and
`WordTTSService.reset_word_timestamps()` are now async and must be awaited.

2
changelog/3240.fixed.md Normal file
View File

@@ -0,0 +1,2 @@
- Fixed a TTS service word-timestamp issue that could cause generated
`TTSTextFrame` instances to have an incorrect pts (`pts = -1`) if word timestamps arrived before any audio.

View File

@@ -554,7 +554,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
await self.add_word_timestamps(processed_timestamps)
elif msg["type"] == "chunk":
await self.stop_ttfb_metrics()
self.start_word_timestamps()
await self.start_word_timestamps()
frame = TTSAudioRawFrame(
audio=base64.b64decode(msg["data"]),
sample_rate=self.sample_rate,

View File

@@ -617,7 +617,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
if msg.get("audio"):
await self.stop_ttfb_metrics()
self.start_word_timestamps()
await self.start_word_timestamps()
audio = base64.b64decode(msg["audio"])
frame = TTSAudioRawFrame(audio, self.sample_rate, 1)
@@ -1047,7 +1047,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
# Start TTS sequence if not already started
if not self._started:
self.start_word_timestamps()
await self.start_word_timestamps()
yield TTSStartedFrame()
self._started = True

View File

@@ -253,7 +253,7 @@ class GradiumTTSService(InterruptibleWordTTSService):
if msg["type"] == "audio":
# Process audio chunk
await self.stop_ttfb_metrics()
self.start_word_timestamps()
await self.start_word_timestamps()
frame = TTSAudioRawFrame(
audio=base64.b64decode(msg["audio"]),
sample_rate=self.sample_rate,

View File

@@ -245,7 +245,7 @@ class HumeTTSService(WordTTSService):
# Start TTS sequence if not already started
if not self._started:
self.start_word_timestamps()
await self.start_word_timestamps()
yield TTSStartedFrame()
self._started = True

View File

@@ -243,7 +243,7 @@ class InworldHttpTTSService(WordTTSService):
await self.start_ttfb_metrics()
if not self._started:
self.start_word_timestamps()
await self.start_word_timestamps()
yield TTSStartedFrame()
self._started = True
@@ -699,7 +699,7 @@ class InworldTTSService(AudioContextWordTTSService):
if audio_b64:
await self.stop_ttfb_metrics()
self.start_word_timestamps()
await self.start_word_timestamps()
audio = base64.b64decode(audio_b64)
if len(audio) > 44 and audio.startswith(b"RIFF"):
audio = audio[44:]

View File

@@ -385,7 +385,7 @@ class RimeTTSService(AudioContextWordTTSService):
if msg["type"] == "chunk":
# Process audio chunk
await self.stop_ttfb_metrics()
self.start_word_timestamps()
await self.start_word_timestamps()
frame = TTSAudioRawFrame(
audio=base64.b64decode(msg["data"]),
sample_rate=self.sample_rate,

View File

@@ -651,15 +651,21 @@ class WordTTSService(TTSService):
"""
super().__init__(**kwargs)
self._initial_word_timestamp = -1
self._initial_word_times = []
self._words_task = None
self._llm_response_started: bool = False
def start_word_timestamps(self):
async def start_word_timestamps(self):
"""Start tracking word timestamps from the current time."""
if self._initial_word_timestamp == -1:
self._initial_word_timestamp = self.get_clock().get_time()
# If we cached some initial word times (because we didn't receive
# audio), let's add them now.
if self._initial_word_times:
await self._add_word_timestamps(self._initial_word_times)
self._initial_word_times = []
def reset_word_timestamps(self):
async def reset_word_timestamps(self):
"""Reset word timestamp tracking."""
self._initial_word_timestamp = -1
@@ -669,8 +675,12 @@ class WordTTSService(TTSService):
Args:
word_times: List of (word, timestamp) tuples where timestamp is in seconds.
"""
for word, timestamp in word_times:
await self._words_queue.put((word, seconds_to_nanoseconds(timestamp)))
if self._initial_word_timestamp == -1:
# Cache word timestamps and don't add them until we have started
# (i.e. we have some audio).
self._initial_word_times.extend(word_times)
else:
await self._add_word_timestamps(word_times)
async def start(self, frame: StartFrame):
"""Start the word TTS service.
@@ -716,7 +726,7 @@ class WordTTSService(TTSService):
async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
await super()._handle_interruption(frame, direction)
self._llm_response_started = False
self.reset_word_timestamps()
await self.reset_word_timestamps()
def _create_words_task(self):
if not self._words_task:
@@ -728,13 +738,17 @@ class WordTTSService(TTSService):
await self.cancel_task(self._words_task)
self._words_task = None
async def _add_word_timestamps(self, word_times: List[Tuple[str, float]]):
for word, timestamp in word_times:
await self._words_queue.put((word, seconds_to_nanoseconds(timestamp)))
async def _words_task_handler(self):
last_pts = 0
while True:
frame = None
(word, timestamp) = await self._words_queue.get()
if word == "Reset" and timestamp == 0:
self.reset_word_timestamps()
await self.reset_word_timestamps()
if self._llm_response_started:
self._llm_response_started = False
frame = LLMFullResponseEndFrame()