Compare commits

...

2 Commits

Author SHA1 Message Date
Mark Backman
eb3dce1c73 Add changelog for #4517 2026-05-18 15:56:42 -04:00
Mark Backman
15577496e3 Fix ElevenLabs Chinese/Japanese timestamp spacing 2026-05-18 15:44:08 -04:00
3 changed files with 99 additions and 3 deletions

1
changelog/4517.fixed.md Normal file
View File

@@ -0,0 +1 @@
- Fixed `ElevenLabsTTSService` and `ElevenLabsHttpTTSService` inserting unwanted spaces between words when synthesizing Chinese or Japanese. Word timestamps for these languages already include their own spacing, so they are now forwarded with `includes_inter_frame_spaces=True` to avoid double-spacing in transcripts and context.

View File

@@ -149,6 +149,17 @@ def output_format_from_sample_rate(sample_rate: int) -> str:
return "pcm_24000"
def _is_chinese_or_japanese_language(language: str) -> bool:
"""Check if the given language is Chinese or Japanese."""
base_lang = language.split("-")[0].lower()
return base_lang in {"zh", "ja"}
def _word_timestamps_include_inter_frame_spaces(language: str | None) -> bool:
"""Whether timestamp text should be treated as carrying its own spacing."""
return bool(language and _is_chinese_or_japanese_language(language))
def build_elevenlabs_voice_settings(
settings: Union[dict[str, Any], "TTSSettings"],
) -> dict[str, float | bool] | None:
@@ -890,7 +901,17 @@ class ElevenLabsTTSService(WebsocketTTSService):
)
if word_times:
await self.add_word_timestamps(word_times, received_ctx_id)
await self.add_word_timestamps(
word_times,
received_ctx_id,
includes_inter_frame_spaces=(
True
if _word_timestamps_include_inter_frame_spaces(
assert_given(self._settings.language)
)
else None
),
)
# Calculate the actual end time of this audio chunk
char_start_times_ms = alignment.get("charStartTimesMs", [])
@@ -1420,7 +1441,17 @@ class ElevenLabsHttpTTSService(TTSService):
# Calculate word timestamps
word_times = self.calculate_word_times(alignment)
if word_times:
await self.add_word_timestamps(word_times, context_id)
await self.add_word_timestamps(
word_times,
context_id,
includes_inter_frame_spaces=(
True
if _word_timestamps_include_inter_frame_spaces(
assert_given(self._settings.language)
)
else None
),
)
except json.JSONDecodeError as e:
logger.warning(f"Failed to parse JSON from stream: {e}")
continue
@@ -1432,7 +1463,17 @@ class ElevenLabsHttpTTSService(TTSService):
# since this is the end of the utterance
if self._partial_word:
final_word_time = [(self._partial_word, self._partial_word_start_time)]
await self.add_word_timestamps(final_word_time, context_id)
await self.add_word_timestamps(
final_word_time,
context_id,
includes_inter_frame_spaces=(
True
if _word_timestamps_include_inter_frame_spaces(
assert_given(self._settings.language)
)
else None
),
)
self._partial_word = ""
self._partial_word_start_time = 0.0

View File

@@ -11,8 +11,10 @@ from typing import Any
from pipecat.services.elevenlabs.tts import (
_select_alignment,
_strip_utterance_leading_spaces,
_word_timestamps_include_inter_frame_spaces,
calculate_word_times,
)
from pipecat.utils.string import TextPartForConcatenation, concatenate_aggregated_text
_WS_ALIGNMENT_KEYS = ("chars", "charStartTimesMs", "charDurationsMs")
@@ -59,6 +61,19 @@ def _words_from_chunks(chunks: list[dict[str, list[Any]]]) -> list[str]:
return [word for word, _ in word_times]
def _concatenate_words_for_language(words: list[str], language: str) -> str:
    """Join timestamp words using the spacing rule selected for ``language``.

    Each word is wrapped in a ``TextPartForConcatenation`` whose
    ``includes_inter_part_spaces`` flag is the result of
    ``_word_timestamps_include_inter_frame_spaces(language)``; the parts are
    then passed to ``concatenate_aggregated_text``.

    Args:
        words: Word strings in utterance order.
        language: Language code used to pick the spacing flag.

    Returns:
        The string produced by ``concatenate_aggregated_text``.
    """
    carries_own_spacing = _word_timestamps_include_inter_frame_spaces(language)
    parts = [
        TextPartForConcatenation(token, includes_inter_part_spaces=carries_own_spacing)
        for token in words
    ]
    return concatenate_aggregated_text(parts)
def test_elevenlabs_flash_alignment_preserves_inter_word_chunk_space():
chunks = [
_chunk(" Why did the math book"),
@@ -85,6 +100,45 @@ def test_elevenlabs_flash_alignment_preserves_inter_word_chunk_space():
]
def test_elevenlabs_japanese_timestamp_chunks_reassemble_without_spaces():
    """Japanese text split across two chunks rejoins with no inserted space."""
    chunks = [
        _chunk("どんなことでも気 "),
        _chunk("軽に相談してくださいね。 "),
    ]
    words = _words_from_chunks(chunks)

    # The trailing space on each chunk must not survive into the word text.
    assert words == ["どんなことでも気", "軽に相談してくださいね。"]

    rejoined = _concatenate_words_for_language(words, language="ja")
    assert rejoined == "どんなことでも気軽に相談してくださいね。"
def test_elevenlabs_chinese_timestamp_chunks_reassemble_without_spaces():
    """Chinese text split across two chunks rejoins with no inserted space."""
    chunks = [
        _chunk("你好,我是 "),
        _chunk("你的智能助手。 "),
    ]
    words = _words_from_chunks(chunks)

    # The trailing space on each chunk must not survive into the word text.
    assert words == ["你好,我是", "你的智能助手。"]

    # A region-qualified code is used here rather than the bare "zh".
    rejoined = _concatenate_words_for_language(words, language="zh-CN")
    assert rejoined == "你好,我是你的智能助手。"
def test_elevenlabs_english_timestamp_chunks_reassemble_with_spaces():
    """English words are joined with a single space between them."""
    rejoined = _concatenate_words_for_language(["Hello", "world."], language="en")
    assert rejoined == "Hello world."
def test_elevenlabs_timestamp_spacing_languages():
    """Check which language values are treated as carrying their own spacing."""
    # Chinese and Japanese match, with or without a region subtag.
    assert _word_timestamps_include_inter_frame_spaces("ja") is True
    assert _word_timestamps_include_inter_frame_spaces("zh-CN") is True
    # The primary subtag is lowercased before matching, so case must not matter.
    assert _word_timestamps_include_inter_frame_spaces("JA-JP") is True
    assert _word_timestamps_include_inter_frame_spaces("en") is False
    # The helper is typed ``str | None``: an unset or empty language must not opt in.
    assert _word_timestamps_include_inter_frame_spaces(None) is False
    assert _word_timestamps_include_inter_frame_spaces("") is False
def test_elevenlabs_alignment_strips_only_utterance_leading_spaces():
first = _strip_utterance_leading_spaces(_chunk(" Hello"), _WS_ALIGNMENT_KEYS, True)
subsequent = _strip_utterance_leading_spaces(_chunk(" world"), _WS_ALIGNMENT_KEYS, False)