fix: strip Cartesia SSML tags from word timestamp entries
SSML markup (e.g. <spell>, <emotion>, <break>) was leaking into word entries returned by the Cartesia word-timestamps API. Tags are now stripped before processing so word-to-text attribution remains accurate when SSML is present in the TTS input.
This commit is contained in:
@@ -8,6 +8,7 @@
|
||||
|
||||
import base64
|
||||
import json
|
||||
import re
|
||||
from collections.abc import AsyncGenerator
|
||||
from dataclasses import dataclass, field
|
||||
from enum import StrEnum
|
||||
@@ -431,10 +432,20 @@ class CartesiaTTSService(WebsocketTTSService):
|
||||
base_lang = language.split("-")[0].lower()
|
||||
return base_lang in {"zh", "ja"}
|
||||
|
||||
def _process_word_timestamps_for_language(
|
||||
_CARTESIA_TAG_RE = re.compile(r"</?(?:spell|emotion|break|volume|speed)\b[^>]*>", re.IGNORECASE)
|
||||
|
||||
def _strip_cartesia_tags(self, text: str) -> str:
|
||||
text = self._CARTESIA_TAG_RE.sub(" ", text)
|
||||
text = re.sub(r"\s+", " ", text)
|
||||
return text.strip()
|
||||
|
||||
def _normalize_word_timestamps(
|
||||
self, words: list[str], starts: list[float]
|
||||
) -> list[tuple[str, float]]:
|
||||
"""Process word timestamps based on the current language.
|
||||
"""Normalize raw word timestamps from Cartesia before further processing.
|
||||
|
||||
Strips Cartesia SSML tags (spell, emotion, break, volume, speed) from each word
|
||||
and drops entries that become empty after stripping.
|
||||
|
||||
For Chinese and Japanese, Cartesia groups related characters in the same timestamp
|
||||
message.
|
||||
@@ -458,14 +469,18 @@ class CartesiaTTSService(WebsocketTTSService):
|
||||
# For Chinese/Japanese, combine all characters in this message into one word
|
||||
# using the first character's start time.
|
||||
if words and starts:
|
||||
combined_word = "".join(words)
|
||||
combined_word = "".join(self._strip_cartesia_tags(w) for w in words)
|
||||
first_start = starts[0]
|
||||
return [(combined_word, first_start)]
|
||||
return [(combined_word, first_start)] if combined_word else []
|
||||
else:
|
||||
return []
|
||||
else:
|
||||
# For non-CJK languages, use as-is
|
||||
return list(zip(words, starts))
|
||||
result = []
|
||||
for word, start in zip(words, starts):
|
||||
cleaned = self._strip_cartesia_tags(word)
|
||||
if cleaned:
|
||||
result.append((cleaned, start))
|
||||
return result
|
||||
|
||||
def _word_timestamps_include_inter_frame_spaces(self) -> bool:
|
||||
"""Whether timestamp text should be treated as carrying its own spacing."""
|
||||
@@ -662,7 +677,7 @@ class CartesiaTTSService(WebsocketTTSService):
|
||||
await self.remove_audio_context(ctx_id)
|
||||
elif msg["type"] == "timestamps":
|
||||
# Process the timestamps based on language before adding them
|
||||
processed_timestamps = self._process_word_timestamps_for_language(
|
||||
processed_timestamps = self._normalize_word_timestamps(
|
||||
msg["word_timestamps"]["words"], msg["word_timestamps"]["start"]
|
||||
)
|
||||
await self.add_word_timestamps(
|
||||
|
||||
Reference in New Issue
Block a user