Use ElevenLabs normalized_alignment so word timestamps match spoken audio

2026-04-21 09:09:19 -04:00
parent 9ded7bab1b
commit a0f79b4700
2 changed files with 66 additions and 21 deletions
--- a/src/pipecat/services/elevenlabs/tts.py
+++ b/src/pipecat/services/elevenlabs/tts.py
@@ -245,6 +245,35 @@ class ElevenLabsHttpTTSSettings(TTSSettings):
    )


+def _strip_leading_space(
+    alignment: Mapping[str, Any], keys: tuple[str, str, str]
+) -> Mapping[str, Any]:
+    """Return alignment with a prepended space char removed, if present.
+
+    Normalized alignment chunks from ElevenLabs begin with a leading space that
+    marks the prosody/chunk boundary. Left in place, it would prematurely
+    terminate a partial word carried over from the previous chunk. Stripping it
+    is lossless for timing: the dropped space's duration is still reflected in
+    the next char's start time, and the chunk's last-element values (used to
+    advance cumulative time) are untouched. A stripped result is a new dict
+    holding only the three `keys`; otherwise `alignment` is returned as-is.
+
+    Args:
+        alignment: Alignment dict from the API.
+        keys: Tuple of (chars_key, start_times_key, durations_or_end_times_key)
+            naming the three parallel arrays (WebSocket and HTTP schemas differ).
+    """
+    chars_key, starts_key, tail_key = keys
+    chars = alignment.get(chars_key) or []  # missing or None chars -> nothing to strip
+    if chars and chars[0] == " ":
+        return {
+            chars_key: chars[1:],
+            starts_key: alignment.get(starts_key, [])[1:],
+            tail_key: alignment.get(tail_key, [])[1:],
+        }
+    return alignment
+
+
 def calculate_word_times(
    alignment_info: Mapping[str, Any],
    cumulative_time: float,
@@ -790,8 +819,15 @@ class ElevenLabsTTSService(WebsocketTTSService):
                frame = TTSAudioRawFrame(audio, self.sample_rate, 1, context_id=received_ctx_id)
                await self.append_to_audio_context(received_ctx_id, frame)

-            if msg.get("alignment"):
-                alignment = msg["alignment"]
+            if msg.get("normalizedAlignment"):
+                # Use normalizedAlignment (what was actually spoken) rather than
+                # alignment (the input text), so word timestamps stay accurate
+                # when a pronunciation dictionary or text normalization rewrites
+                # the input.
+                alignment = _strip_leading_space(
+                    msg["normalizedAlignment"],
+                    ("chars", "charStartTimesMs", "charDurationsMs"),
+                )
                word_times, self._partial_word, self._partial_word_start_time = (
                    calculate_word_times(
                        alignment,
@@ -1296,21 +1332,30 @@ class ElevenLabsHttpTTSService(TTSService):
                                audio, self.sample_rate, 1, context_id=context_id
                            )

-                        # Process alignment if present
-                        if data and "alignment" in data:
-                            alignment = data["alignment"]
-                            if alignment:  # Ensure alignment is not None
-                                # Get end time of the last character in this chunk
-                                char_end_times = alignment.get("character_end_times_seconds", [])
-                                if char_end_times:
-                                    chunk_end_time = char_end_times[-1]
-                                    # Update to the longest end time seen so far
-                                    utterance_duration = max(utterance_duration, chunk_end_time)
+                        # Process alignment if present. Use normalized_alignment
+                        # (what was actually spoken) so word timestamps stay
+                        # accurate when a pronunciation dictionary or text
+                        # normalization rewrites the input.
+                        if data and data.get("normalized_alignment"):
+                            alignment = _strip_leading_space(
+                                data["normalized_alignment"],
+                                (
+                                    "characters",
+                                    "character_start_times_seconds",
+                                    "character_end_times_seconds",
+                                ),
+                            )
+                            # Get end time of the last character in this chunk
+                            char_end_times = alignment.get("character_end_times_seconds", [])
+                            if char_end_times:
+                                chunk_end_time = char_end_times[-1]
+                                # Update to the longest end time seen so far
+                                utterance_duration = max(utterance_duration, chunk_end_time)

-                                # Calculate word timestamps
-                                word_times = self.calculate_word_times(alignment)
-                                if word_times:
-                                    await self.add_word_timestamps(word_times, context_id)
+                            # Calculate word timestamps
+                            word_times = self.calculate_word_times(alignment)
+                            if word_times:
+                                await self.add_word_timestamps(word_times, context_id)
                    except json.JSONDecodeError as e:
                        logger.warning(f"Failed to parse JSON from stream: {e}")
                        continue
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 2
+revision = 3
 requires-python = ">=3.11"
 resolution-markers = [
    "python_full_version >= '3.14'",
@@ -4522,7 +4522,7 @@ requires-dist = [
    { name = "requests", marker = "extra == 'kokoro'", specifier = ">=2.32.5,<3" },
    { name = "requests", marker = "extra == 'piper'", specifier = ">=2.32.5,<3" },
    { name = "resampy", specifier = "~=0.4.3" },
-    { name = "sarvamai", marker = "extra == 'sarvam'", specifier = "==0.1.26" },
+    { name = "sarvamai", marker = "extra == 'sarvam'", specifier = "==0.1.28" },
    { name = "sentry-sdk", marker = "extra == 'sentry'", specifier = ">=2.28.0,<3" },
    { name = "simli-ai", marker = "extra == 'simli'", specifier = "~=2.0.1" },
    { name = "soundfile", marker = "extra == 'soundfile'", specifier = "~=0.13.1" },
@@ -5891,7 +5891,7 @@ wheels = [

 [[package]]
 name = "sarvamai"
-version = "0.1.26"
+version = "0.1.28"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "httpx" },
@@ -5900,9 +5900,9 @@ dependencies = [
    { name = "typing-extensions" },
    { name = "websockets" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/d7/31/13f65e8533b667514e1cfe838d12a14494cbc5943fd8f0c101305127459b/sarvamai-0.1.26.tar.gz", hash = "sha256:d51a213c27feb33d65f5b71e4882dcdb873dc5e0d720390b7ba18d1bdeec2471", size = 113050, upload-time = "2026-03-06T16:40:36.647Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/23/44/57a7a37be64953bb0bec9f674e92a9f8fb7070ce6aeb44c6f22720458b40/sarvamai-0.1.28.tar.gz", hash = "sha256:bc52f0c849e429d1a4493e49b1b4b65bf06965f3936d4eb6ee3da3130452e7ea", size = 139022, upload-time = "2026-04-20T08:05:12.185Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/76/c9/c03a807ace9cafbfe26418be995e4959142a55313c9f26564586e111f31d/sarvamai-0.1.26-py3-none-any.whl", hash = "sha256:39e79ba0932f4501a2aa28f84fd2de64d34fc9a7af2b0d4ead1efa617517b3bd", size = 229057, upload-time = "2026-03-06T16:40:35.584Z" },
+    { url = "https://files.pythonhosted.org/packages/84/90/95cd195a3a2ae9a9973c6a05705207e8dd97b5aea90339bc24343cb54850/sarvamai-0.1.28-py3-none-any.whl", hash = "sha256:52e71c0a0f521552d4d948ec452699b44554a56e64ee354b14da2efc9d9046b3", size = 269335, upload-time = "2026-04-20T08:05:10.712Z" },
 ]

 [[package]]