fix: strip Cartesia SSML tags from word timestamp entries

SSML markup (e.g. <spell>, <emotion>, <break>) was leaking into word entries returned by the Cartesia word-timestamps API. Tags are now stripped before processing so word-to-text attribution remains accurate when SSML is present in the TTS input.
2026-05-20 10:03:15 -03:00
parent 6b9deefbe3
commit 185a89bb3b
1 changed files with 22 additions and 7 deletions
--- a/src/pipecat/services/cartesia/tts.py
+++ b/src/pipecat/services/cartesia/tts.py
@@ -8,6 +8,7 @@

 import base64
 import json
+import re
 from collections.abc import AsyncGenerator
 from dataclasses import dataclass, field
 from enum import StrEnum
@@ -431,10 +432,20 @@ class CartesiaTTSService(WebsocketTTSService):
        base_lang = language.split("-")[0].lower()
        return base_lang in {"zh", "ja"}

-    def _process_word_timestamps_for_language(
+    _CARTESIA_TAG_RE = re.compile(r"</?(?:spell|emotion|break|volume|speed)\b[^>]*>", re.IGNORECASE)
+
+    def _strip_cartesia_tags(self, text: str) -> str:
+        text = self._CARTESIA_TAG_RE.sub(" ", text)
+        text = re.sub(r"\s+", " ", text)
+        return text.strip()
+
+    def _normalize_word_timestamps(
        self, words: list[str], starts: list[float]
    ) -> list[tuple[str, float]]:
-        """Process word timestamps based on the current language.
+        """Normalize raw word timestamps from Cartesia before further processing.
+
+        Strips Cartesia SSML tags (spell, emotion, break, volume, speed) from each word
+        and drops entries that become empty after stripping.

        For Chinese and Japanese, Cartesia groups related characters in the same timestamp
        message.
@@ -458,14 +469,18 @@ class CartesiaTTSService(WebsocketTTSService):
            # For Chinese/Japanese, combine all characters in this message into one word
            # using the first character's start time.
            if words and starts:
-                combined_word = "".join(words)
+                combined_word = "".join(self._strip_cartesia_tags(w) for w in words)
                first_start = starts[0]
-                return [(combined_word, first_start)]
+                return [(combined_word, first_start)] if combined_word else []
            else:
                return []
        else:
-            # For non-CJK languages, use as-is
-            return list(zip(words, starts))
+            result = []
+            for word, start in zip(words, starts):
+                cleaned = self._strip_cartesia_tags(word)
+                if cleaned:
+                    result.append((cleaned, start))
+            return result

    def _word_timestamps_include_inter_frame_spaces(self) -> bool:
        """Whether timestamp text should be treated as carrying its own spacing."""
@@ -662,7 +677,7 @@ class CartesiaTTSService(WebsocketTTSService):
                await self.remove_audio_context(ctx_id)
            elif msg["type"] == "timestamps":
                # Process the timestamps based on language before adding them
-                processed_timestamps = self._process_word_timestamps_for_language(
+                processed_timestamps = self._normalize_word_timestamps(
                    msg["word_timestamps"]["words"], msg["word_timestamps"]["start"]
                )
                await self.add_word_timestamps(