From a0f79b47002d946410e552f5cfcbec726aed3975 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Tue, 21 Apr 2026 09:09:19 -0400 Subject: [PATCH] Use ElevenLabs normalized_alignment so word timestamps match spoken audio --- src/pipecat/services/elevenlabs/tts.py | 77 ++++++++++++++++++++------ uv.lock | 10 ++-- 2 files changed, 66 insertions(+), 21 deletions(-) diff --git a/src/pipecat/services/elevenlabs/tts.py b/src/pipecat/services/elevenlabs/tts.py index 02e6383ff..f72864ef4 100644 --- a/src/pipecat/services/elevenlabs/tts.py +++ b/src/pipecat/services/elevenlabs/tts.py @@ -245,6 +245,35 @@ class ElevenLabsHttpTTSSettings(TTSSettings): ) +def _strip_leading_space( + alignment: Mapping[str, Any], keys: tuple[str, str, str] +) -> Mapping[str, Any]: + """Return alignment with a prepended space char removed, if present. + + Normalized alignment chunks from ElevenLabs begin with a leading space that + marks the prosody/chunk boundary. Left in place, it would prematurely + terminate a partial word carried over from the previous chunk. Stripping it + is lossless for timing: the dropped space's duration is still reflected in + the next char's `charStartTimesMs`, and the chunk's last-element values + (used to advance cumulative time) are untouched. + + Args: + alignment: Alignment dict from the API. + keys: Tuple of (chars_key, start_times_key, durations_or_end_times_key) + naming the three parallel arrays — these differ between the + WebSocket and HTTP response schemas. + """ + chars_key, starts_key, tail_key = keys + chars = alignment.get(chars_key) or [] + if chars and chars[0] == " ": + return { + chars_key: chars[1:], + starts_key: alignment.get(starts_key, [])[1:], + tail_key: alignment.get(tail_key, [])[1:], + } + return alignment + + def calculate_word_times( alignment_info: Mapping[str, Any], cumulative_time: float, @@ -790,8 +819,15 @@ class ElevenLabsTTSService(WebsocketTTSService): frame = TTSAudioRawFrame(audio, self.sample_rate, 1, context_id=received_ctx_id) await self.append_to_audio_context(received_ctx_id, frame) - if msg.get("alignment"): - alignment = msg["alignment"] + if msg.get("normalizedAlignment"): + # Use normalizedAlignment (what was actually spoken) rather than + # alignment (the input text), so word timestamps stay accurate + # when a pronunciation dictionary or text normalization rewrites + # the input. + alignment = _strip_leading_space( + msg["normalizedAlignment"], + ("chars", "charStartTimesMs", "charDurationsMs"), + ) word_times, self._partial_word, self._partial_word_start_time = ( calculate_word_times( alignment, @@ -1296,21 +1332,30 @@ class ElevenLabsHttpTTSService(TTSService): audio, self.sample_rate, 1, context_id=context_id ) - # Process alignment if present - if data and "alignment" in data: - alignment = data["alignment"] - if alignment: # Ensure alignment is not None - # Get end time of the last character in this chunk - char_end_times = alignment.get("character_end_times_seconds", []) - if char_end_times: - chunk_end_time = char_end_times[-1] - # Update to the longest end time seen so far - utterance_duration = max(utterance_duration, chunk_end_time) + # Process alignment if present. Use normalized_alignment + # (what was actually spoken) so word timestamps stay + # accurate when a pronunciation dictionary or text + # normalization rewrites the input. + if data and data.get("normalized_alignment"): + alignment = _strip_leading_space( + data["normalized_alignment"], + ( + "characters", + "character_start_times_seconds", + "character_end_times_seconds", + ), + ) + # Get end time of the last character in this chunk + char_end_times = alignment.get("character_end_times_seconds", []) + if char_end_times: + chunk_end_time = char_end_times[-1] + # Update to the longest end time seen so far + utterance_duration = max(utterance_duration, chunk_end_time) - # Calculate word timestamps - word_times = self.calculate_word_times(alignment) - if word_times: - await self.add_word_timestamps(word_times, context_id) + # Calculate word timestamps + word_times = self.calculate_word_times(alignment) + if word_times: + await self.add_word_timestamps(word_times, context_id) except json.JSONDecodeError as e: logger.warning(f"Failed to parse JSON from stream: {e}") continue diff --git a/uv.lock b/uv.lock index 49ae58ec4..16e9e5e66 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.11" resolution-markers = [ "python_full_version >= '3.14'", @@ -4522,7 +4522,7 @@ requires-dist = [ { name = "requests", marker = "extra == 'kokoro'", specifier = ">=2.32.5,<3" }, { name = "requests", marker = "extra == 'piper'", specifier = ">=2.32.5,<3" }, { name = "resampy", specifier = "~=0.4.3" }, - { name = "sarvamai", marker = "extra == 'sarvam'", specifier = "==0.1.26" }, + { name = "sarvamai", marker = "extra == 'sarvam'", specifier = "==0.1.28" }, { name = "sentry-sdk", marker = "extra == 'sentry'", specifier = ">=2.28.0,<3" }, { name = "simli-ai", marker = "extra == 'simli'", specifier = "~=2.0.1" }, { name = "soundfile", marker = "extra == 'soundfile'", specifier = "~=0.13.1" }, @@ -5891,7 +5891,7 @@ wheels = [ [[package]] name = "sarvamai" -version = "0.1.26" +version = "0.1.28" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "httpx" }, @@ -5900,9 +5900,9 @@ dependencies = [ { name = "typing-extensions" }, { name = "websockets" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d7/31/13f65e8533b667514e1cfe838d12a14494cbc5943fd8f0c101305127459b/sarvamai-0.1.26.tar.gz", hash = "sha256:d51a213c27feb33d65f5b71e4882dcdb873dc5e0d720390b7ba18d1bdeec2471", size = 113050, upload-time = "2026-03-06T16:40:36.647Z" } +sdist = { url = "https://files.pythonhosted.org/packages/23/44/57a7a37be64953bb0bec9f674e92a9f8fb7070ce6aeb44c6f22720458b40/sarvamai-0.1.28.tar.gz", hash = "sha256:bc52f0c849e429d1a4493e49b1b4b65bf06965f3936d4eb6ee3da3130452e7ea", size = 139022, upload-time = "2026-04-20T08:05:12.185Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/76/c9/c03a807ace9cafbfe26418be995e4959142a55313c9f26564586e111f31d/sarvamai-0.1.26-py3-none-any.whl", hash = "sha256:39e79ba0932f4501a2aa28f84fd2de64d34fc9a7af2b0d4ead1efa617517b3bd", size = 229057, upload-time = "2026-03-06T16:40:35.584Z" }, + { url = "https://files.pythonhosted.org/packages/84/90/95cd195a3a2ae9a9973c6a05705207e8dd97b5aea90339bc24343cb54850/sarvamai-0.1.28-py3-none-any.whl", hash = "sha256:52e71c0a0f521552d4d948ec452699b44554a56e64ee354b14da2efc9d9046b3", size = 269335, upload-time = "2026-04-20T08:05:10.712Z" }, ] [[package]]