Merge pull request #4415 from pipecat-ai/mb/fix-elevenlabs-leading-spaces-flash

2026-05-04 18:08:31 -04:00
parent b363b91d12 9886d72f5e
commit b2b7e9ee6f
3 changed files with 123 additions and 18 deletions
--- a/changelog/4415.fixed.md
+++ b/changelog/4415.fixed.md
@@ -0,0 +1 @@
+- Fixed `ElevenLabsTTSService` and `ElevenLabsHttpTTSService` producing merged words (e.g. `bookLook`) when using Flash models. Flash often splits sentences mid-stream into alignment chunks that begin with a real inter-word space, but the previous fix unconditionally stripped that space from every chunk. Leading spaces are now stripped only on the first alignment chunk of an utterance, so subsequent chunks correctly flush partial words across boundaries.
--- a/src/pipecat/services/elevenlabs/tts.py
+++ b/src/pipecat/services/elevenlabs/tts.py
@@ -248,32 +248,37 @@ class ElevenLabsHttpTTSSettings(TTSSettings):
    )


-def _strip_leading_space(
-    alignment: Mapping[str, Any], keys: tuple[str, str, str]
+def _strip_utterance_leading_spaces(
+    alignment: Mapping[str, Any], keys: tuple[str, str, str], should_strip: bool
 ) -> Mapping[str, Any]:
-    """Return alignment with a prepended space char removed, if present.
+    """Return alignment with utterance-leading space chars removed, if requested.

-    Normalized alignment chunks from ElevenLabs begin with a leading space that
-    marks the prosody/chunk boundary. Left in place, it would prematurely
-    terminate a partial word carried over from the previous chunk. Stripping it
-    is lossless for timing: the dropped space's duration is still reflected in
-    the next char's `charStartTimesMs`, and the chunk's last-element values
-    (used to advance cumulative time) are untouched.
+    Normalized alignment chunks from ElevenLabs often begin with a space. On the
+    first chunk of an utterance, that space is leading whitespace and should not
+    become a text token. On subsequent chunks, however, a leading space can be a
+    real inter-word separator (Flash models commonly split sentences this way),
+    so it must be preserved for ``calculate_word_times`` to flush any partial
+    word carried over from the previous chunk.

    Args:
        alignment: Alignment dict from the API.
        keys: Tuple of (chars_key, start_times_key, durations_or_end_times_key)
-            naming the three parallel arrays — these differ between the
+            naming the three parallel arrays - these differ between the
            WebSocket and HTTP response schemas.
+        should_strip: Strip leading spaces only when True (callers pass True for an utterance's first alignment chunk); otherwise return the alignment unchanged.
    """
    chars_key, starts_key, tail_key = keys
    chars = alignment.get(chars_key) or []
-    if chars and chars[0] == " ":
-        return {
-            chars_key: chars[1:],
-            starts_key: alignment.get(starts_key, [])[1:],
-            tail_key: alignment.get(tail_key, [])[1:],
-        }
+    if should_strip and chars and chars[0] == " ":
+        strip_count = 0
+        while strip_count < len(chars) and chars[strip_count] == " ":
+            strip_count += 1
+
+        stripped = dict(alignment)
+        stripped[chars_key] = chars[strip_count:]
+        stripped[starts_key] = alignment.get(starts_key, [])[strip_count:]
+        stripped[tail_key] = alignment.get(tail_key, [])[strip_count:]
+        return stripped
    return alignment


@@ -548,6 +553,7 @@ class ElevenLabsTTSService(WebsocketTTSService):
        # Track partial words that span across alignment chunks
        self._partial_word = ""
        self._partial_word_start_time = 0.0
+        self._alignment_started_context_ids: set[str | None] = set()

        # Context management for v1 multi API
        self._receive_task = None
@@ -773,6 +779,7 @@ class ElevenLabsTTSService(WebsocketTTSService):
        self._cumulative_time = 0.0
        self._partial_word = ""
        self._partial_word_start_time = 0.0
+        self._alignment_started_context_ids.discard(context_id)

    async def on_audio_context_interrupted(self, context_id: str):
        """Close the ElevenLabs context when the bot is interrupted."""
@@ -827,10 +834,12 @@ class ElevenLabsTTSService(WebsocketTTSService):
                # alignment (the input text), so word timestamps stay accurate
                # when a pronunciation dictionary or text normalization rewrites
                # the input.
-                alignment = _strip_leading_space(
+                alignment = _strip_utterance_leading_spaces(
                    msg["normalizedAlignment"],
                    ("chars", "charStartTimesMs", "charDurationsMs"),
+                    received_ctx_id not in self._alignment_started_context_ids,
                )
+                self._alignment_started_context_ids.add(received_ctx_id)
                word_times, self._partial_word, self._partial_word_start_time = (
                    calculate_word_times(
                        alignment,
@@ -1326,6 +1335,7 @@ class ElevenLabsHttpTTSService(TTSService):

                # Track the duration of this utterance based on the last character's end time
                utterance_duration = 0
+                alignment_started = False
                async for line in response.content:
                    line_str = line.decode("utf-8").strip()
                    if not line_str:
@@ -1348,14 +1358,16 @@ class ElevenLabsHttpTTSService(TTSService):
                        # accurate when a pronunciation dictionary or text
                        # normalization rewrites the input.
                        if data and data.get("normalized_alignment"):
-                            alignment = _strip_leading_space(
+                            alignment = _strip_utterance_leading_spaces(
                                data["normalized_alignment"],
                                (
                                    "characters",
                                    "character_start_times_seconds",
                                    "character_end_times_seconds",
                                ),
+                                not alignment_started,
                            )
+                            alignment_started = True
                            # Get end time of the last character in this chunk
                            char_end_times = alignment.get("character_end_times_seconds", [])
                            if char_end_times:
--- a/tests/test_elevenlabs_tts.py
+++ b/tests/test_elevenlabs_tts.py
@@ -0,0 +1,92 @@
+#
+# Copyright (c) 2024-2026, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""Tests for ElevenLabs TTS alignment handling."""
+
+from typing import Any
+
+from pipecat.services.elevenlabs.tts import (
+    _strip_utterance_leading_spaces,
+    calculate_word_times,
+)
+
+_WS_ALIGNMENT_KEYS = ("chars", "charStartTimesMs", "charDurationsMs")
+
+
+def _chunk(text: str) -> dict[str, list[Any]]:
+    chars = list(text)
+    return {
+        "chars": chars,
+        "charStartTimesMs": [i * 100 for i in range(len(chars))],
+        "charDurationsMs": [100 for _ in chars],
+    }
+
+
+def _words_from_chunks(chunks: list[dict[str, list[Any]]]) -> list[str]:
+    cumulative_time = 0.0
+    partial_word = ""
+    partial_word_start_time = 0.0
+    word_times = []
+    alignment_started = False
+
+    for chunk in chunks:
+        alignment = _strip_utterance_leading_spaces(
+            chunk,
+            _WS_ALIGNMENT_KEYS,
+            not alignment_started,
+        )
+        alignment_started = True
+        chunk_word_times, partial_word, partial_word_start_time = calculate_word_times(
+            alignment,
+            cumulative_time,
+            partial_word,
+            partial_word_start_time,
+        )
+        word_times.extend(chunk_word_times)
+
+        starts = alignment["charStartTimesMs"]
+        durations = alignment["charDurationsMs"]
+        if starts and durations:
+            cumulative_time += (starts[-1] + durations[-1]) / 1000.0
+
+    if partial_word:
+        word_times.append((partial_word, partial_word_start_time))
+
+    return [word for word, _ in word_times]
+
+
+def test_elevenlabs_flash_alignment_preserves_inter_word_chunk_space():
+    chunks = [
+        _chunk(" Why did the math book"),
+        _chunk(" look so sad? "),
+        _chunk(" Because it had too m"),
+        _chunk("any problems. "),
+    ]
+
+    assert _words_from_chunks(chunks) == [
+        "Why",
+        "did",
+        "the",
+        "math",
+        "book",
+        "look",
+        "so",
+        "sad?",
+        "Because",
+        "it",
+        "had",
+        "too",
+        "many",
+        "problems.",
+    ]
+
+
+def test_elevenlabs_alignment_strips_only_utterance_leading_spaces():
+    first = _strip_utterance_leading_spaces(_chunk("  Hello"), _WS_ALIGNMENT_KEYS, True)
+    subsequent = _strip_utterance_leading_spaces(_chunk(" world"), _WS_ALIGNMENT_KEYS, False)
+
+    assert first["chars"] == list("Hello")
+    assert subsequent["chars"] == list(" world")
				`@@ -0,0 +1 @@`
				- Fixed `ElevenLabsTTSService` and `ElevenLabsHttpTTSService` producing merged words (e.g. `bookLook`) when using Flash models. Flash often splits sentences mid-stream into alignment chunks that begin with a real inter-word space, but the previous fix unconditionally stripped that space from every chunk. Leading spaces are now stripped only on the first alignment chunk of an utterance, so subsequent chunks correctly flush partial words across boundaries.