diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index f1e01a18e..15b9cb9b7 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -434,6 +434,9 @@ class WordTTSService(TTSService): if word == "LLMFullResponseEndFrame" and timestamp == 0: frame = LLMFullResponseEndFrame() frame.pts = last_pts + elif word == "TTSStoppedFrame" and timestamp == 0: + frame = TTSStoppedFrame() + frame.pts = last_pts else: frame = TextFrame(word) frame.pts = self._initial_word_timestamp + timestamp diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index 21074bbcc..f5657126c 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -227,12 +227,13 @@ class CartesiaTTSService(WordTTSService): continue if msg["type"] == "done": await self.stop_ttfb_metrics() - await self.push_frame(TTSStoppedFrame()) # Unset _context_id but not the _context_id_start_timestamp # because we are likely still playing out audio and need the # timestamp to set send context frames. self._context_id = None - await self.add_word_timestamps([("LLMFullResponseEndFrame", 0)]) + await self.add_word_timestamps( + [("TTSStoppedFrame", 0), ("LLMFullResponseEndFrame", 0)] + ) elif msg["type"] == "timestamps": await self.add_word_timestamps( list(zip(msg["word_timestamps"]["words"], msg["word_timestamps"]["start"]))