Add changelog for #4517

Fix ElevenLabs Chinese/Japanese timestamp spacing
2026-05-18 15:56:42 -04:00 · 2026-05-18 15:44:08 -04:00
3 changed files with 99 additions and 3 deletions
--- a/changelog/4517.fixed.md
+++ b/changelog/4517.fixed.md
@@ -0,0 +1 @@
+- Fixed `ElevenLabsTTSService` and `ElevenLabsHttpTTSService` inserting unwanted spaces between words when synthesizing Chinese or Japanese. Word timestamps for these languages already carry whatever spacing they need, so they are now forwarded with `includes_inter_frame_spaces=True`, which prevents extra spaces from being added between them in transcripts and context.
--- a/src/pipecat/services/elevenlabs/tts.py
+++ b/src/pipecat/services/elevenlabs/tts.py
@@ -149,6 +149,17 @@ def output_format_from_sample_rate(sample_rate: int) -> str:
    return "pcm_24000"


+def _is_chinese_or_japanese_language(language: str) -> bool:
+    """Check if the given language is Chinese or Japanese."""
+    base_lang = language.split("-")[0].lower()
+    return base_lang in {"zh", "ja"}
+
+
+def _word_timestamps_include_inter_frame_spaces(language: str | None) -> bool:
+    """Whether timestamp text should be treated as carrying its own spacing."""
+    return bool(language and _is_chinese_or_japanese_language(language))
+
+
 def build_elevenlabs_voice_settings(
    settings: Union[dict[str, Any], "TTSSettings"],
 ) -> dict[str, float | bool] | None:
@@ -890,7 +901,17 @@ class ElevenLabsTTSService(WebsocketTTSService):
                )

                if word_times:
-                    await self.add_word_timestamps(word_times, received_ctx_id)
+                    await self.add_word_timestamps(
+                        word_times,
+                        received_ctx_id,
+                        includes_inter_frame_spaces=(
+                            True
+                            if _word_timestamps_include_inter_frame_spaces(
+                                assert_given(self._settings.language)
+                            )
+                            else None
+                        ),
+                    )

                    # Calculate the actual end time of this audio chunk
                    char_start_times_ms = alignment.get("charStartTimesMs", [])
@@ -1420,7 +1441,17 @@ class ElevenLabsHttpTTSService(TTSService):
                            # Calculate word timestamps
                            word_times = self.calculate_word_times(alignment)
                            if word_times:
-                                await self.add_word_timestamps(word_times, context_id)
+                                await self.add_word_timestamps(
+                                    word_times,
+                                    context_id,
+                                    includes_inter_frame_spaces=(
+                                        True
+                                        if _word_timestamps_include_inter_frame_spaces(
+                                            assert_given(self._settings.language)
+                                        )
+                                        else None
+                                    ),
+                                )
                    except json.JSONDecodeError as e:
                        logger.warning(f"Failed to parse JSON from stream: {e}")
                        continue
@@ -1432,7 +1463,17 @@ class ElevenLabsHttpTTSService(TTSService):
                # since this is the end of the utterance
                if self._partial_word:
                    final_word_time = [(self._partial_word, self._partial_word_start_time)]
-                    await self.add_word_timestamps(final_word_time, context_id)
+                    await self.add_word_timestamps(
+                        final_word_time,
+                        context_id,
+                        includes_inter_frame_spaces=(
+                            True
+                            if _word_timestamps_include_inter_frame_spaces(
+                                assert_given(self._settings.language)
+                            )
+                            else None
+                        ),
+                    )
                    self._partial_word = ""
                    self._partial_word_start_time = 0.0

--- a/tests/test_elevenlabs_tts.py
+++ b/tests/test_elevenlabs_tts.py
@@ -11,8 +11,10 @@ from typing import Any
 from pipecat.services.elevenlabs.tts import (
    _select_alignment,
    _strip_utterance_leading_spaces,
+    _word_timestamps_include_inter_frame_spaces,
    calculate_word_times,
 )
+from pipecat.utils.string import TextPartForConcatenation, concatenate_aggregated_text

 _WS_ALIGNMENT_KEYS = ("chars", "charStartTimesMs", "charDurationsMs")

@@ -59,6 +61,19 @@ def _words_from_chunks(chunks: list[dict[str, list[Any]]]) -> list[str]:
    return [word for word, _ in word_times]


+def _concatenate_words_for_language(words: list[str], language: str) -> str:
+    includes_inter_frame_spaces = _word_timestamps_include_inter_frame_spaces(language)
+    return concatenate_aggregated_text(
+        [
+            TextPartForConcatenation(
+                word,
+                includes_inter_part_spaces=includes_inter_frame_spaces,
+            )
+            for word in words
+        ]
+    )
+
+
 def test_elevenlabs_flash_alignment_preserves_inter_word_chunk_space():
    chunks = [
        _chunk(" Why did the math book"),
@@ -85,6 +100,45 @@ def test_elevenlabs_flash_alignment_preserves_inter_word_chunk_space():
    ]


+def test_elevenlabs_japanese_timestamp_chunks_reassemble_without_spaces():
+    words = _words_from_chunks(
+        [
+            _chunk("どんなことでも気 "),
+            _chunk("軽に相談してくださいね。 "),
+        ]
+    )
+
+    assert words == ["どんなことでも気", "軽に相談してくださいね。"]
+    assert (
+        _concatenate_words_for_language(words, language="ja")
+        == "どんなことでも気軽に相談してくださいね。"
+    )
+
+
+def test_elevenlabs_chinese_timestamp_chunks_reassemble_without_spaces():
+    words = _words_from_chunks(
+        [
+            _chunk("你好，我是 "),
+            _chunk("你的智能助手。 "),
+        ]
+    )
+
+    assert words == ["你好，我是", "你的智能助手。"]
+    assert _concatenate_words_for_language(words, language="zh-CN") == "你好，我是你的智能助手。"
+
+
+def test_elevenlabs_english_timestamp_chunks_reassemble_with_spaces():
+    words = ["Hello", "world."]
+
+    assert _concatenate_words_for_language(words, language="en") == "Hello world."
+
+
+def test_elevenlabs_timestamp_spacing_languages():
+    assert _word_timestamps_include_inter_frame_spaces("ja") is True
+    assert _word_timestamps_include_inter_frame_spaces("zh-CN") is True
+    assert _word_timestamps_include_inter_frame_spaces("en") is False
+
+
 def test_elevenlabs_alignment_strips_only_utterance_leading_spaces():
    first = _strip_utterance_leading_spaces(_chunk("  Hello"), _WS_ALIGNMENT_KEYS, True)
    subsequent = _strip_utterance_leading_spaces(_chunk(" world"), _WS_ALIGNMENT_KEYS, False)
Author	SHA1	Message	Date
Mark Backman	eb3dce1c73	Add changelog for #4517	2026-05-18 15:56:42 -04:00
Mark Backman	15577496e3	Fix ElevenLabs Chinese/Japanese timestamp spacing	2026-05-18 15:44:08 -04:00
				`@@ -0,0 +1 @@`
				- Fixed `ElevenLabsTTSService` and `ElevenLabsHttpTTSService` inserting unwanted spaces between words when synthesizing Chinese or Japanese. Word timestamps for these languages already carry whatever spacing they need, so they are now forwarded with `includes_inter_frame_spaces=True`, which prevents extra spaces from being added between them in transcripts and context.