fix: strip Cartesia SSML tags from word timestamp entries

SSML markup (e.g. <spell>, <emotion>, <break>) was leaking into word entries
returned by the Cartesia word-timestamps API. Tags are now stripped before
processing so word-to-text attribution remains accurate when SSML is present
in the TTS input.
This commit is contained in:
filipi87
2026-05-20 10:03:15 -03:00
parent 6b9deefbe3
commit 185a89bb3b

View File

@@ -8,6 +8,7 @@
import base64
import json
import re
from collections.abc import AsyncGenerator
from dataclasses import dataclass, field
from enum import StrEnum
@@ -431,10 +432,20 @@ class CartesiaTTSService(WebsocketTTSService):
base_lang = language.split("-")[0].lower()
return base_lang in {"zh", "ja"}
def _process_word_timestamps_for_language(
_CARTESIA_TAG_RE = re.compile(r"</?(?:spell|emotion|break|volume|speed)\b[^>]*>", re.IGNORECASE)
def _strip_cartesia_tags(self, text: str) -> str:
    """Remove Cartesia SSML tags from ``text`` and tidy the leftover whitespace.

    Every match of ``_CARTESIA_TAG_RE`` is replaced by a single space (rather than
    being deleted outright), every run of whitespace is then collapsed to one
    space, and leading/trailing whitespace is trimmed.

    Args:
        text: The raw text that may contain SSML tags.

    Returns:
        The text without the matched tags; an empty string if nothing but
        tags and whitespace was present.
    """
    without_tags = self._CARTESIA_TAG_RE.sub(" ", text)
    return re.sub(r"\s+", " ", without_tags).strip()
def _normalize_word_timestamps(
self, words: list[str], starts: list[float]
) -> list[tuple[str, float]]:
"""Process word timestamps based on the current language.
"""Normalize raw word timestamps from Cartesia before further processing.
Strips Cartesia SSML tags (spell, emotion, break, volume, speed) from each word
and drops entries that become empty after stripping.
For Chinese and Japanese, Cartesia groups related characters in the same timestamp
message.
@@ -458,14 +469,18 @@ class CartesiaTTSService(WebsocketTTSService):
# For Chinese/Japanese, combine all characters in this message into one word
# using the first character's start time.
if words and starts:
combined_word = "".join(words)
combined_word = "".join(self._strip_cartesia_tags(w) for w in words)
first_start = starts[0]
return [(combined_word, first_start)]
return [(combined_word, first_start)] if combined_word else []
else:
return []
else:
# For non-CJK languages, use as-is
return list(zip(words, starts))
result = []
for word, start in zip(words, starts):
cleaned = self._strip_cartesia_tags(word)
if cleaned:
result.append((cleaned, start))
return result
def _word_timestamps_include_inter_frame_spaces(self) -> bool:
"""Whether timestamp text should be treated as carrying its own spacing."""
@@ -662,7 +677,7 @@ class CartesiaTTSService(WebsocketTTSService):
await self.remove_audio_context(ctx_id)
elif msg["type"] == "timestamps":
# Process the timestamps based on language before adding them
processed_timestamps = self._process_word_timestamps_for_language(
processed_timestamps = self._normalize_word_timestamps(
msg["word_timestamps"]["words"], msg["word_timestamps"]["start"]
)
await self.add_word_timestamps(