Use ElevenLabs normalized_alignment so word timestamps match spoken audio

This commit is contained in:
Mark Backman
2026-04-21 09:09:19 -04:00
parent 9ded7bab1b
commit a0f79b4700
2 changed files with 66 additions and 21 deletions

View File

@@ -245,6 +245,35 @@ class ElevenLabsHttpTTSSettings(TTSSettings):
)
def _strip_leading_space(
alignment: Mapping[str, Any], keys: tuple[str, str, str]
) -> Mapping[str, Any]:
"""Return alignment with a prepended space char removed, if present.
Normalized alignment chunks from ElevenLabs begin with a leading space that
marks the prosody/chunk boundary. Left in place, it would prematurely
terminate a partial word carried over from the previous chunk. Stripping it
is lossless for timing: the dropped space's duration is still reflected in
the next char's `charStartTimesMs`, and the chunk's last-element values
(used to advance cumulative time) are untouched.
Args:
alignment: Alignment dict from the API.
keys: Tuple of (chars_key, start_times_key, durations_or_end_times_key)
naming the three parallel arrays — these differ between the
WebSocket and HTTP response schemas.
"""
chars_key, starts_key, tail_key = keys
chars = alignment.get(chars_key) or []
if chars and chars[0] == " ":
return {
chars_key: chars[1:],
starts_key: alignment.get(starts_key, [])[1:],
tail_key: alignment.get(tail_key, [])[1:],
}
return alignment
def calculate_word_times(
alignment_info: Mapping[str, Any],
cumulative_time: float,
@@ -790,8 +819,15 @@ class ElevenLabsTTSService(WebsocketTTSService):
frame = TTSAudioRawFrame(audio, self.sample_rate, 1, context_id=received_ctx_id)
await self.append_to_audio_context(received_ctx_id, frame)
if msg.get("alignment"):
alignment = msg["alignment"]
if msg.get("normalizedAlignment"):
# Use normalizedAlignment (what was actually spoken) rather than
# alignment (the input text), so word timestamps stay accurate
# when a pronunciation dictionary or text normalization rewrites
# the input.
alignment = _strip_leading_space(
msg["normalizedAlignment"],
("chars", "charStartTimesMs", "charDurationsMs"),
)
word_times, self._partial_word, self._partial_word_start_time = (
calculate_word_times(
alignment,
@@ -1296,21 +1332,30 @@ class ElevenLabsHttpTTSService(TTSService):
audio, self.sample_rate, 1, context_id=context_id
)
# Process alignment if present
if data and "alignment" in data:
alignment = data["alignment"]
if alignment: # Ensure alignment is not None
# Get end time of the last character in this chunk
char_end_times = alignment.get("character_end_times_seconds", [])
if char_end_times:
chunk_end_time = char_end_times[-1]
# Update to the longest end time seen so far
utterance_duration = max(utterance_duration, chunk_end_time)
# Process alignment if present. Use normalized_alignment
# (what was actually spoken) so word timestamps stay
# accurate when a pronunciation dictionary or text
# normalization rewrites the input.
if data and data.get("normalized_alignment"):
alignment = _strip_leading_space(
data["normalized_alignment"],
(
"characters",
"character_start_times_seconds",
"character_end_times_seconds",
),
)
# Get end time of the last character in this chunk
char_end_times = alignment.get("character_end_times_seconds", [])
if char_end_times:
chunk_end_time = char_end_times[-1]
# Update to the longest end time seen so far
utterance_duration = max(utterance_duration, chunk_end_time)
# Calculate word timestamps
word_times = self.calculate_word_times(alignment)
if word_times:
await self.add_word_timestamps(word_times, context_id)
# Calculate word timestamps
word_times = self.calculate_word_times(alignment)
if word_times:
await self.add_word_timestamps(word_times, context_id)
except json.JSONDecodeError as e:
logger.warning(f"Failed to parse JSON from stream: {e}")
continue

10
uv.lock generated
View File

@@ -1,5 +1,5 @@
version = 1
revision = 2
revision = 3
requires-python = ">=3.11"
resolution-markers = [
"python_full_version >= '3.14'",
@@ -4522,7 +4522,7 @@ requires-dist = [
{ name = "requests", marker = "extra == 'kokoro'", specifier = ">=2.32.5,<3" },
{ name = "requests", marker = "extra == 'piper'", specifier = ">=2.32.5,<3" },
{ name = "resampy", specifier = "~=0.4.3" },
{ name = "sarvamai", marker = "extra == 'sarvam'", specifier = "==0.1.26" },
{ name = "sarvamai", marker = "extra == 'sarvam'", specifier = "==0.1.28" },
{ name = "sentry-sdk", marker = "extra == 'sentry'", specifier = ">=2.28.0,<3" },
{ name = "simli-ai", marker = "extra == 'simli'", specifier = "~=2.0.1" },
{ name = "soundfile", marker = "extra == 'soundfile'", specifier = "~=0.13.1" },
@@ -5891,7 +5891,7 @@ wheels = [
[[package]]
name = "sarvamai"
version = "0.1.26"
version = "0.1.28"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "httpx" },
@@ -5900,9 +5900,9 @@ dependencies = [
{ name = "typing-extensions" },
{ name = "websockets" },
]
sdist = { url = "https://files.pythonhosted.org/packages/d7/31/13f65e8533b667514e1cfe838d12a14494cbc5943fd8f0c101305127459b/sarvamai-0.1.26.tar.gz", hash = "sha256:d51a213c27feb33d65f5b71e4882dcdb873dc5e0d720390b7ba18d1bdeec2471", size = 113050, upload-time = "2026-03-06T16:40:36.647Z" }
sdist = { url = "https://files.pythonhosted.org/packages/23/44/57a7a37be64953bb0bec9f674e92a9f8fb7070ce6aeb44c6f22720458b40/sarvamai-0.1.28.tar.gz", hash = "sha256:bc52f0c849e429d1a4493e49b1b4b65bf06965f3936d4eb6ee3da3130452e7ea", size = 139022, upload-time = "2026-04-20T08:05:12.185Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/76/c9/c03a807ace9cafbfe26418be995e4959142a55313c9f26564586e111f31d/sarvamai-0.1.26-py3-none-any.whl", hash = "sha256:39e79ba0932f4501a2aa28f84fd2de64d34fc9a7af2b0d4ead1efa617517b3bd", size = 229057, upload-time = "2026-03-06T16:40:35.584Z" },
{ url = "https://files.pythonhosted.org/packages/84/90/95cd195a3a2ae9a9973c6a05705207e8dd97b5aea90339bc24343cb54850/sarvamai-0.1.28-py3-none-any.whl", hash = "sha256:52e71c0a0f521552d4d948ec452699b44554a56e64ee354b14da2efc9d9046b3", size = 269335, upload-time = "2026-04-20T08:05:10.712Z" },
]
[[package]]