Merge pull request #4415 from pipecat-ai/mb/fix-elevenlabs-leading-spaces-flash

This commit is contained in:
Mark Backman
2026-05-04 18:08:31 -04:00
committed by GitHub
3 changed files with 123 additions and 18 deletions

1
changelog/4415.fixed.md Normal file
View File

@@ -0,0 +1 @@
- Fixed `ElevenLabsTTSService` and `ElevenLabsHttpTTSService` producing merged words (e.g. `bookLook`) when using Flash models. Flash often splits sentences mid-stream into alignment chunks that begin with a real inter-word space, but the previous fix unconditionally stripped that space from every chunk. Leading spaces are now stripped only on the first alignment chunk of an utterance, so subsequent chunks correctly flush partial words across boundaries.

View File

@@ -248,32 +248,37 @@ class ElevenLabsHttpTTSSettings(TTSSettings):
)
def _strip_leading_space(
alignment: Mapping[str, Any], keys: tuple[str, str, str]
def _strip_utterance_leading_spaces(
alignment: Mapping[str, Any], keys: tuple[str, str, str], should_strip: bool
) -> Mapping[str, Any]:
"""Return alignment with a prepended space char removed, if present.
"""Return alignment with utterance-leading space chars removed, if requested.
Normalized alignment chunks from ElevenLabs begin with a leading space that
marks the prosody/chunk boundary. Left in place, it would prematurely
terminate a partial word carried over from the previous chunk. Stripping it
is lossless for timing: the dropped space's duration is still reflected in
the next char's `charStartTimesMs`, and the chunk's last-element values
(used to advance cumulative time) are untouched.
Normalized alignment chunks from ElevenLabs often begin with a space. On the
first chunk of an utterance, that space is leading whitespace and should not
become a text token. On subsequent chunks, however, a leading space can be a
real inter-word separator (Flash models commonly split sentences this way),
so it must be preserved for ``calculate_word_times`` to flush any partial
word carried over from the previous chunk.
Args:
alignment: Alignment dict from the API.
keys: Tuple of (chars_key, start_times_key, durations_or_end_times_key)
naming the three parallel arrays these differ between the
naming the three parallel arrays - these differ between the
WebSocket and HTTP response schemas.
should_strip: Whether this is still utterance-leading alignment data.
"""
chars_key, starts_key, tail_key = keys
chars = alignment.get(chars_key) or []
if chars and chars[0] == " ":
return {
chars_key: chars[1:],
starts_key: alignment.get(starts_key, [])[1:],
tail_key: alignment.get(tail_key, [])[1:],
}
if should_strip and chars and chars[0] == " ":
strip_count = 0
while strip_count < len(chars) and chars[strip_count] == " ":
strip_count += 1
stripped = dict(alignment)
stripped[chars_key] = chars[strip_count:]
stripped[starts_key] = alignment.get(starts_key, [])[strip_count:]
stripped[tail_key] = alignment.get(tail_key, [])[strip_count:]
return stripped
return alignment
@@ -548,6 +553,7 @@ class ElevenLabsTTSService(WebsocketTTSService):
# Track partial words that span across alignment chunks
self._partial_word = ""
self._partial_word_start_time = 0.0
self._alignment_started_context_ids: set[str | None] = set()
# Context management for v1 multi API
self._receive_task = None
@@ -773,6 +779,7 @@ class ElevenLabsTTSService(WebsocketTTSService):
self._cumulative_time = 0.0
self._partial_word = ""
self._partial_word_start_time = 0.0
self._alignment_started_context_ids.discard(context_id)
async def on_audio_context_interrupted(self, context_id: str):
"""Close the ElevenLabs context when the bot is interrupted."""
@@ -827,10 +834,12 @@ class ElevenLabsTTSService(WebsocketTTSService):
# alignment (the input text), so word timestamps stay accurate
# when a pronunciation dictionary or text normalization rewrites
# the input.
alignment = _strip_leading_space(
alignment = _strip_utterance_leading_spaces(
msg["normalizedAlignment"],
("chars", "charStartTimesMs", "charDurationsMs"),
received_ctx_id not in self._alignment_started_context_ids,
)
self._alignment_started_context_ids.add(received_ctx_id)
word_times, self._partial_word, self._partial_word_start_time = (
calculate_word_times(
alignment,
@@ -1326,6 +1335,7 @@ class ElevenLabsHttpTTSService(TTSService):
# Track the duration of this utterance based on the last character's end time
utterance_duration = 0
alignment_started = False
async for line in response.content:
line_str = line.decode("utf-8").strip()
if not line_str:
@@ -1348,14 +1358,16 @@ class ElevenLabsHttpTTSService(TTSService):
# accurate when a pronunciation dictionary or text
# normalization rewrites the input.
if data and data.get("normalized_alignment"):
alignment = _strip_leading_space(
alignment = _strip_utterance_leading_spaces(
data["normalized_alignment"],
(
"characters",
"character_start_times_seconds",
"character_end_times_seconds",
),
not alignment_started,
)
alignment_started = True
# Get end time of the last character in this chunk
char_end_times = alignment.get("character_end_times_seconds", [])
if char_end_times:

View File

@@ -0,0 +1,92 @@
#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""Tests for ElevenLabs TTS alignment handling."""
from typing import Any
from pipecat.services.elevenlabs.tts import (
_strip_utterance_leading_spaces,
calculate_word_times,
)
# Names of the three parallel alignment arrays (characters, start times in ms,
# durations in ms) in the shape produced by the `_chunk` helper below.
_WS_ALIGNMENT_KEYS = ("chars", "charStartTimesMs", "charDurationsMs")
def _chunk(text: str) -> dict[str, list[Any]]:
chars = list(text)
return {
"chars": chars,
"charStartTimesMs": [i * 100 for i in range(len(chars))],
"charDurationsMs": [100 for _ in chars],
}
def _words_from_chunks(chunks: list[dict[str, list[Any]]]) -> list[str]:
    """Feed alignment chunks through the strip + word-timing steps; return the words.

    Only the first chunk is treated as utterance-leading. Each chunk is passed
    to ``calculate_word_times`` together with the running time offset and the
    partial word carried over from the previous chunk. Any partial word still
    pending after the last chunk is appended as a final word.
    """
    elapsed = 0.0
    carry_word = ""
    carry_start = 0.0
    timed_words = []

    for index, chunk in enumerate(chunks):
        # Leading spaces are stripped on the first chunk only.
        alignment = _strip_utterance_leading_spaces(chunk, _WS_ALIGNMENT_KEYS, index == 0)

        new_words, carry_word, carry_start = calculate_word_times(
            alignment, elapsed, carry_word, carry_start
        )
        timed_words.extend(new_words)

        # Advance the running offset by the end time of the chunk's last char.
        start_ms = alignment["charStartTimesMs"]
        duration_ms = alignment["charDurationsMs"]
        if start_ms and duration_ms:
            elapsed += (start_ms[-1] + duration_ms[-1]) / 1000.0

    # Flush a word that was still open when the chunks ran out.
    if carry_word:
        timed_words.append((carry_word, carry_start))

    return [word for word, _start in timed_words]
def test_elevenlabs_flash_alignment_preserves_inter_word_chunk_space():
    """Chunks split at word boundaries and mid-word must yield the right words."""
    # The second and third chunks start with a real inter-word space; the
    # fourth continues the word "many" that was cut off at the end of the third.
    texts = (
        " Why did the math book",
        " look so sad? ",
        " Because it had too m",
        "any problems. ",
    )
    expected = [
        "Why",
        "did",
        "the",
        "math",
        "book",
        "look",
        "so",
        "sad?",
        "Because",
        "it",
        "had",
        "too",
        "many",
        "problems.",
    ]

    words = _words_from_chunks([_chunk(text) for text in texts])

    assert words == expected
def test_elevenlabs_alignment_strips_only_utterance_leading_spaces():
    """A leading space is dropped only when ``should_strip`` is true."""
    cases = (
        # (chunk text, should_strip, expected chars as a string)
        (" Hello", True, "Hello"),
        (" world", False, " world"),
    )
    for text, should_strip, expected in cases:
        result = _strip_utterance_leading_spaces(_chunk(text), _WS_ALIGNMENT_KEYS, should_strip)
        assert result["chars"] == list(expected)