Merge pull request #3240 from pipecat-ai/aleix/cartesia-ensure-word-timestamps-started
WordTTSService: make sure word timestamps are always started
This commit is contained in:
2
changelog/3240.changed.md
Normal file
2
changelog/3240.changed.md
Normal file
@@ -0,0 +1,2 @@
|
||||
- ⚠️ Breaking change: `WordTTSService.start_word_timestamps()` and
|
||||
`WordTTSService.reset_word_timestamps()` are now async and must be awaited by callers.
|
||||
2
changelog/3240.fixed.md
Normal file
2
changelog/3240.fixed.md
Normal file
@@ -0,0 +1,2 @@
|
||||
- Fixed a TTS service word-timestamp issue that could cause generated
|
||||
`TTSTextFrame` instances to have an incorrect pts (`pts = -1`).
|
||||
@@ -554,7 +554,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
||||
await self.add_word_timestamps(processed_timestamps)
|
||||
elif msg["type"] == "chunk":
|
||||
await self.stop_ttfb_metrics()
|
||||
self.start_word_timestamps()
|
||||
await self.start_word_timestamps()
|
||||
frame = TTSAudioRawFrame(
|
||||
audio=base64.b64decode(msg["data"]),
|
||||
sample_rate=self.sample_rate,
|
||||
|
||||
@@ -617,7 +617,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
||||
|
||||
if msg.get("audio"):
|
||||
await self.stop_ttfb_metrics()
|
||||
self.start_word_timestamps()
|
||||
await self.start_word_timestamps()
|
||||
|
||||
audio = base64.b64decode(msg["audio"])
|
||||
frame = TTSAudioRawFrame(audio, self.sample_rate, 1)
|
||||
@@ -1047,7 +1047,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
||||
|
||||
# Start TTS sequence if not already started
|
||||
if not self._started:
|
||||
self.start_word_timestamps()
|
||||
await self.start_word_timestamps()
|
||||
yield TTSStartedFrame()
|
||||
self._started = True
|
||||
|
||||
|
||||
@@ -253,7 +253,7 @@ class GradiumTTSService(InterruptibleWordTTSService):
|
||||
if msg["type"] == "audio":
|
||||
# Process audio chunk
|
||||
await self.stop_ttfb_metrics()
|
||||
self.start_word_timestamps()
|
||||
await self.start_word_timestamps()
|
||||
frame = TTSAudioRawFrame(
|
||||
audio=base64.b64decode(msg["audio"]),
|
||||
sample_rate=self.sample_rate,
|
||||
|
||||
@@ -245,7 +245,7 @@ class HumeTTSService(WordTTSService):
|
||||
|
||||
# Start TTS sequence if not already started
|
||||
if not self._started:
|
||||
self.start_word_timestamps()
|
||||
await self.start_word_timestamps()
|
||||
yield TTSStartedFrame()
|
||||
self._started = True
|
||||
|
||||
|
||||
@@ -243,7 +243,7 @@ class InworldHttpTTSService(WordTTSService):
|
||||
await self.start_ttfb_metrics()
|
||||
|
||||
if not self._started:
|
||||
self.start_word_timestamps()
|
||||
await self.start_word_timestamps()
|
||||
yield TTSStartedFrame()
|
||||
self._started = True
|
||||
|
||||
@@ -699,7 +699,7 @@ class InworldTTSService(AudioContextWordTTSService):
|
||||
|
||||
if audio_b64:
|
||||
await self.stop_ttfb_metrics()
|
||||
self.start_word_timestamps()
|
||||
await self.start_word_timestamps()
|
||||
audio = base64.b64decode(audio_b64)
|
||||
if len(audio) > 44 and audio.startswith(b"RIFF"):
|
||||
audio = audio[44:]
|
||||
|
||||
@@ -385,7 +385,7 @@ class RimeTTSService(AudioContextWordTTSService):
|
||||
if msg["type"] == "chunk":
|
||||
# Process audio chunk
|
||||
await self.stop_ttfb_metrics()
|
||||
self.start_word_timestamps()
|
||||
await self.start_word_timestamps()
|
||||
frame = TTSAudioRawFrame(
|
||||
audio=base64.b64decode(msg["data"]),
|
||||
sample_rate=self.sample_rate,
|
||||
|
||||
@@ -651,15 +651,21 @@ class WordTTSService(TTSService):
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self._initial_word_timestamp = -1
|
||||
self._initial_word_times = []
|
||||
self._words_task = None
|
||||
self._llm_response_started: bool = False
|
||||
|
||||
def start_word_timestamps(self):
|
||||
async def start_word_timestamps(self):
|
||||
"""Start tracking word timestamps from the current time."""
|
||||
if self._initial_word_timestamp == -1:
|
||||
self._initial_word_timestamp = self.get_clock().get_time()
|
||||
# If we cached some initial word times (because we didn't receive
|
||||
# audio), let's add them now.
|
||||
if self._initial_word_times:
|
||||
await self._add_word_timestamps(self._initial_word_times)
|
||||
self._initial_word_times = []
|
||||
|
||||
def reset_word_timestamps(self):
|
||||
async def reset_word_timestamps(self):
|
||||
"""Reset word timestamp tracking."""
|
||||
self._initial_word_timestamp = -1
|
||||
|
||||
@@ -669,8 +675,12 @@ class WordTTSService(TTSService):
|
||||
Args:
|
||||
word_times: List of (word, timestamp) tuples where timestamp is in seconds.
|
||||
"""
|
||||
for word, timestamp in word_times:
|
||||
await self._words_queue.put((word, seconds_to_nanoseconds(timestamp)))
|
||||
if self._initial_word_timestamp == -1:
|
||||
# Cache word timestamps and don't add them until we have started
|
||||
# (i.e. we have some audio).
|
||||
self._initial_word_times.extend(word_times)
|
||||
else:
|
||||
await self._add_word_timestamps(word_times)
|
||||
|
||||
async def start(self, frame: StartFrame):
|
||||
"""Start the word TTS service.
|
||||
@@ -716,7 +726,7 @@ class WordTTSService(TTSService):
|
||||
async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
|
||||
await super()._handle_interruption(frame, direction)
|
||||
self._llm_response_started = False
|
||||
self.reset_word_timestamps()
|
||||
await self.reset_word_timestamps()
|
||||
|
||||
def _create_words_task(self):
|
||||
if not self._words_task:
|
||||
@@ -728,13 +738,17 @@ class WordTTSService(TTSService):
|
||||
await self.cancel_task(self._words_task)
|
||||
self._words_task = None
|
||||
|
||||
async def _add_word_timestamps(self, word_times: List[Tuple[str, float]]):
|
||||
for word, timestamp in word_times:
|
||||
await self._words_queue.put((word, seconds_to_nanoseconds(timestamp)))
|
||||
|
||||
async def _words_task_handler(self):
|
||||
last_pts = 0
|
||||
while True:
|
||||
frame = None
|
||||
(word, timestamp) = await self._words_queue.get()
|
||||
if word == "Reset" and timestamp == 0:
|
||||
self.reset_word_timestamps()
|
||||
await self.reset_word_timestamps()
|
||||
if self._llm_response_started:
|
||||
self._llm_response_started = False
|
||||
frame = LLMFullResponseEndFrame()
|
||||
|
||||
Reference in New Issue
Block a user