From 90e6b51acd804220bea707f03e6b48b940616272 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Mon, 4 May 2026 15:15:37 -0400 Subject: [PATCH 1/2] Fix ElevenLabs alignment chunk spacing --- src/pipecat/services/elevenlabs/tts.py | 48 +++++++++----- tests/test_elevenlabs_tts.py | 92 ++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 18 deletions(-) create mode 100644 tests/test_elevenlabs_tts.py diff --git a/src/pipecat/services/elevenlabs/tts.py b/src/pipecat/services/elevenlabs/tts.py index fe8e7ab84..6bd02eb87 100644 --- a/src/pipecat/services/elevenlabs/tts.py +++ b/src/pipecat/services/elevenlabs/tts.py @@ -248,32 +248,37 @@ class ElevenLabsHttpTTSSettings(TTSSettings): ) -def _strip_leading_space( - alignment: Mapping[str, Any], keys: tuple[str, str, str] +def _strip_utterance_leading_spaces( + alignment: Mapping[str, Any], keys: tuple[str, str, str], should_strip: bool ) -> Mapping[str, Any]: - """Return alignment with a prepended space char removed, if present. + """Return alignment with utterance-leading space chars removed, if requested. - Normalized alignment chunks from ElevenLabs begin with a leading space that - marks the prosody/chunk boundary. Left in place, it would prematurely - terminate a partial word carried over from the previous chunk. Stripping it - is lossless for timing: the dropped space's duration is still reflected in - the next char's `charStartTimesMs`, and the chunk's last-element values - (used to advance cumulative time) are untouched. + Normalized alignment chunks from ElevenLabs often begin with a space. On the + first chunk of an utterance, that space is leading whitespace and should not + become a text token. On subsequent chunks, however, a leading space can be a + real inter-word separator (Flash models commonly split sentences this way), + so it must be preserved for ``calculate_word_times`` to flush any partial + word carried over from the previous chunk. Args: alignment: Alignment dict from the API. keys: Tuple of (chars_key, start_times_key, durations_or_end_times_key) - naming the three parallel arrays — these differ between the + naming the three parallel arrays - these differ between the WebSocket and HTTP response schemas. + should_strip: Whether this is still utterance-leading alignment data. """ chars_key, starts_key, tail_key = keys chars = alignment.get(chars_key) or [] - if chars and chars[0] == " ": - return { - chars_key: chars[1:], - starts_key: alignment.get(starts_key, [])[1:], - tail_key: alignment.get(tail_key, [])[1:], - } + if should_strip and chars and chars[0] == " ": + strip_count = 0 + while strip_count < len(chars) and chars[strip_count] == " ": + strip_count += 1 + + stripped = dict(alignment) + stripped[chars_key] = chars[strip_count:] + stripped[starts_key] = alignment.get(starts_key, [])[strip_count:] + stripped[tail_key] = alignment.get(tail_key, [])[strip_count:] + return stripped return alignment @@ -548,6 +553,7 @@ class ElevenLabsTTSService(WebsocketTTSService): # Track partial words that span across alignment chunks self._partial_word = "" self._partial_word_start_time = 0.0 + self._alignment_started_context_ids: set[str | None] = set() # Context management for v1 multi API self._receive_task = None @@ -773,6 +779,7 @@ class ElevenLabsTTSService(WebsocketTTSService): self._cumulative_time = 0.0 self._partial_word = "" self._partial_word_start_time = 0.0 + self._alignment_started_context_ids.discard(context_id) async def on_audio_context_interrupted(self, context_id: str): """Close the ElevenLabs context when the bot is interrupted.""" @@ -827,10 +834,12 @@ class ElevenLabsTTSService(WebsocketTTSService): # alignment (the input text), so word timestamps stay accurate # when a pronunciation dictionary or text normalization rewrites # the input. - alignment = _strip_leading_space( + alignment = _strip_utterance_leading_spaces( msg["normalizedAlignment"], ("chars", "charStartTimesMs", "charDurationsMs"), + received_ctx_id not in self._alignment_started_context_ids, ) + self._alignment_started_context_ids.add(received_ctx_id) word_times, self._partial_word, self._partial_word_start_time = ( calculate_word_times( alignment, @@ -1326,6 +1335,7 @@ class ElevenLabsHttpTTSService(TTSService): # Track the duration of this utterance based on the last character's end time utterance_duration = 0 + alignment_started = False async for line in response.content: line_str = line.decode("utf-8").strip() if not line_str: @@ -1348,14 +1358,16 @@ class ElevenLabsHttpTTSService(TTSService): # accurate when a pronunciation dictionary or text # normalization rewrites the input. if data and data.get("normalized_alignment"): - alignment = _strip_leading_space( + alignment = _strip_utterance_leading_spaces( data["normalized_alignment"], ( "characters", "character_start_times_seconds", "character_end_times_seconds", ), + not alignment_started, ) + alignment_started = True # Get end time of the last character in this chunk char_end_times = alignment.get("character_end_times_seconds", []) if char_end_times: diff --git a/tests/test_elevenlabs_tts.py b/tests/test_elevenlabs_tts.py new file mode 100644 index 000000000..1dafc4eff --- /dev/null +++ b/tests/test_elevenlabs_tts.py @@ -0,0 +1,92 @@ +# +# Copyright (c) 2024-2026, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +"""Tests for ElevenLabs TTS alignment handling.""" + +from typing import Any + +from pipecat.services.elevenlabs.tts import ( + _strip_utterance_leading_spaces, + calculate_word_times, +) + +_WS_ALIGNMENT_KEYS = ("chars", "charStartTimesMs", "charDurationsMs") + + +def _chunk(text: str) -> dict[str, list[Any]]: + chars = list(text) + return { + "chars": chars, + "charStartTimesMs": [i * 100 for i in range(len(chars))], + "charDurationsMs": [100 for _ in chars], + } + + +def _words_from_chunks(chunks: list[dict[str, list[Any]]]) -> list[str]: + cumulative_time = 0.0 + partial_word = "" + partial_word_start_time = 0.0 + word_times = [] + alignment_started = False + + for chunk in chunks: + alignment = _strip_utterance_leading_spaces( + chunk, + _WS_ALIGNMENT_KEYS, + not alignment_started, + ) + alignment_started = True + chunk_word_times, partial_word, partial_word_start_time = calculate_word_times( + alignment, + cumulative_time, + partial_word, + partial_word_start_time, + ) + word_times.extend(chunk_word_times) + + starts = alignment["charStartTimesMs"] + durations = alignment["charDurationsMs"] + if starts and durations: + cumulative_time += (starts[-1] + durations[-1]) / 1000.0 + + if partial_word: + word_times.append((partial_word, partial_word_start_time)) + + return [word for word, _ in word_times] + + +def test_elevenlabs_flash_alignment_preserves_inter_word_chunk_space(): + chunks = [ + _chunk(" Why did the math book"), + _chunk(" look so sad? "), + _chunk(" Because it had too m"), + _chunk("any problems. "), + ] + + assert _words_from_chunks(chunks) == [ + "Why", + "did", + "the", + "math", + "book", + "look", + "so", + "sad?", + "Because", + "it", + "had", + "too", + "many", + "problems.", + ] + + +def test_elevenlabs_alignment_strips_only_utterance_leading_spaces(): + first = _strip_utterance_leading_spaces(_chunk(" Hello"), _WS_ALIGNMENT_KEYS, True) + subsequent = _strip_utterance_leading_spaces(_chunk(" world"), _WS_ALIGNMENT_KEYS, False) + + assert first["chars"] == list("Hello") + assert subsequent["chars"] == list(" world") From 9886d72f5ea37886b1a07ece1e18cb32feceaf33 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Mon, 4 May 2026 15:18:15 -0400 Subject: [PATCH 2/2] Add changelog for PR #4415 --- changelog/4415.fixed.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog/4415.fixed.md diff --git a/changelog/4415.fixed.md b/changelog/4415.fixed.md new file mode 100644 index 000000000..c84bff37b --- /dev/null +++ b/changelog/4415.fixed.md @@ -0,0 +1 @@ +- Fixed `ElevenLabsTTSService` and `ElevenLabsHttpTTSService` producing merged words (e.g. `bookLook`) when using Flash models. Flash often splits sentences mid-stream into alignment chunks that begin with a real inter-word space, but the previous fix unconditionally stripped that space from every chunk. Leading spaces are now stripped only on the first alignment chunk of an utterance, so subsequent chunks correctly flush partial words across boundaries.