Compare commits
2 Commits
main
...
mb/fix-ele
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
eb3dce1c73 | ||
|
|
15577496e3 |
1
changelog/4517.fixed.md
Normal file
1
changelog/4517.fixed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Fixed `ElevenLabsTTSService` and `ElevenLabsHttpTTSService` inserting unwanted spaces between words when synthesizing Chinese or Japanese. Word timestamps for these languages already include their own spacing, so they are now forwarded with `includes_inter_frame_spaces=True` to avoid double-spacing in transcripts and context.
|
||||
@@ -149,6 +149,17 @@ def output_format_from_sample_rate(sample_rate: int) -> str:
|
||||
return "pcm_24000"
|
||||
|
||||
|
||||
def _is_chinese_or_japanese_language(language: str) -> bool:
    """Check if the given language is Chinese or Japanese.

    Only the primary subtag is compared, case-insensitively, so regional
    variants such as ``zh-CN`` or ``ja-JP`` match. Both ``-`` and ``_`` are
    accepted as subtag separators, so ``zh_CN`` is treated like ``zh-CN``.

    Args:
        language: Language code, e.g. ``"ja"``, ``"zh-CN"`` or ``"zh_TW"``.

    Returns:
        True if the primary subtag is ``zh`` or ``ja``, otherwise False.
    """
    # Normalise underscore-separated locale codes so "zh_CN" splits like "zh-CN".
    base_lang = language.replace("_", "-").split("-")[0].lower()
    return base_lang in {"zh", "ja"}
|
||||
|
||||
|
||||
def _word_timestamps_include_inter_frame_spaces(language: str | None) -> bool:
    """Report whether timestamp text for ``language`` carries its own spacing.

    Args:
        language: Language code, or ``None`` / empty when no language is set.

    Returns:
        False when no language is given; otherwise whatever
        ``_is_chinese_or_japanese_language`` reports for it.
    """
    if not language:
        # No language configured: fall back to the default spacing behaviour.
        return False
    return _is_chinese_or_japanese_language(language)
|
||||
|
||||
|
||||
def build_elevenlabs_voice_settings(
|
||||
settings: Union[dict[str, Any], "TTSSettings"],
|
||||
) -> dict[str, float | bool] | None:
|
||||
@@ -890,7 +901,17 @@ class ElevenLabsTTSService(WebsocketTTSService):
|
||||
)
|
||||
|
||||
if word_times:
|
||||
await self.add_word_timestamps(word_times, received_ctx_id)
|
||||
await self.add_word_timestamps(
|
||||
word_times,
|
||||
received_ctx_id,
|
||||
includes_inter_frame_spaces=(
|
||||
True
|
||||
if _word_timestamps_include_inter_frame_spaces(
|
||||
assert_given(self._settings.language)
|
||||
)
|
||||
else None
|
||||
),
|
||||
)
|
||||
|
||||
# Calculate the actual end time of this audio chunk
|
||||
char_start_times_ms = alignment.get("charStartTimesMs", [])
|
||||
@@ -1420,7 +1441,17 @@ class ElevenLabsHttpTTSService(TTSService):
|
||||
# Calculate word timestamps
|
||||
word_times = self.calculate_word_times(alignment)
|
||||
if word_times:
|
||||
await self.add_word_timestamps(word_times, context_id)
|
||||
await self.add_word_timestamps(
|
||||
word_times,
|
||||
context_id,
|
||||
includes_inter_frame_spaces=(
|
||||
True
|
||||
if _word_timestamps_include_inter_frame_spaces(
|
||||
assert_given(self._settings.language)
|
||||
)
|
||||
else None
|
||||
),
|
||||
)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"Failed to parse JSON from stream: {e}")
|
||||
continue
|
||||
@@ -1432,7 +1463,17 @@ class ElevenLabsHttpTTSService(TTSService):
|
||||
# since this is the end of the utterance
|
||||
if self._partial_word:
|
||||
final_word_time = [(self._partial_word, self._partial_word_start_time)]
|
||||
await self.add_word_timestamps(final_word_time, context_id)
|
||||
await self.add_word_timestamps(
|
||||
final_word_time,
|
||||
context_id,
|
||||
includes_inter_frame_spaces=(
|
||||
True
|
||||
if _word_timestamps_include_inter_frame_spaces(
|
||||
assert_given(self._settings.language)
|
||||
)
|
||||
else None
|
||||
),
|
||||
)
|
||||
self._partial_word = ""
|
||||
self._partial_word_start_time = 0.0
|
||||
|
||||
|
||||
@@ -11,8 +11,10 @@ from typing import Any
|
||||
from pipecat.services.elevenlabs.tts import (
|
||||
_select_alignment,
|
||||
_strip_utterance_leading_spaces,
|
||||
_word_timestamps_include_inter_frame_spaces,
|
||||
calculate_word_times,
|
||||
)
|
||||
from pipecat.utils.string import TextPartForConcatenation, concatenate_aggregated_text
|
||||
|
||||
_WS_ALIGNMENT_KEYS = ("chars", "charStartTimesMs", "charDurationsMs")
|
||||
|
||||
@@ -59,6 +61,19 @@ def _words_from_chunks(chunks: list[dict[str, list[Any]]]) -> list[str]:
|
||||
return [word for word, _ in word_times]
|
||||
|
||||
|
||||
def _concatenate_words_for_language(words: list[str], language: str) -> str:
    """Join ``words`` the way the service would for the given ``language``.

    Each word is wrapped in a ``TextPartForConcatenation`` whose
    ``includes_inter_part_spaces`` flag comes from
    ``_word_timestamps_include_inter_frame_spaces(language)``, and the parts
    are handed to ``concatenate_aggregated_text``.
    """
    carries_own_spacing = _word_timestamps_include_inter_frame_spaces(language)
    parts = []
    for token in words:
        parts.append(
            TextPartForConcatenation(token, includes_inter_part_spaces=carries_own_spacing)
        )
    return concatenate_aggregated_text(parts)
|
||||
|
||||
|
||||
def test_elevenlabs_flash_alignment_preserves_inter_word_chunk_space():
|
||||
chunks = [
|
||||
_chunk(" Why did the math book"),
|
||||
@@ -85,6 +100,45 @@ def test_elevenlabs_flash_alignment_preserves_inter_word_chunk_space():
|
||||
]
|
||||
|
||||
|
||||
def test_elevenlabs_japanese_timestamp_chunks_reassemble_without_spaces():
    """Japanese text split across chunks must rejoin with no inserted spaces."""
    japanese_chunks = [
        _chunk("どんなことでも気 "),
        _chunk("軽に相談してくださいね。 "),
    ]
    words = _words_from_chunks(japanese_chunks)

    # The trailing space of each chunk is not part of the extracted word.
    assert words == ["どんなことでも気", "軽に相談してくださいね。"]

    rejoined = _concatenate_words_for_language(words, language="ja")
    assert rejoined == "どんなことでも気軽に相談してくださいね。"
|
||||
|
||||
|
||||
def test_elevenlabs_chinese_timestamp_chunks_reassemble_without_spaces():
    """Chinese text split across chunks must rejoin with no inserted spaces."""
    chinese_chunks = [
        _chunk("你好,我是 "),
        _chunk("你的智能助手。 "),
    ]
    words = _words_from_chunks(chinese_chunks)

    # The trailing space of each chunk is not part of the extracted word.
    assert words == ["你好,我是", "你的智能助手。"]

    rejoined = _concatenate_words_for_language(words, language="zh-CN")
    assert rejoined == "你好,我是你的智能助手。"
|
||||
|
||||
|
||||
def test_elevenlabs_english_timestamp_chunks_reassemble_with_spaces():
    """English words must still be rejoined with a space between them."""
    rejoined = _concatenate_words_for_language(["Hello", "world."], language="en")

    assert rejoined == "Hello world."
|
||||
|
||||
|
||||
def test_elevenlabs_timestamp_spacing_languages():
    """Pin which language values are treated as carrying their own spacing."""
    assert _word_timestamps_include_inter_frame_spaces("ja") is True
    assert _word_timestamps_include_inter_frame_spaces("zh-CN") is True
    assert _word_timestamps_include_inter_frame_spaces("en") is False
    # The helper accepts ``None``; a missing or empty language must yield a
    # plain ``False`` rather than a falsy non-bool value.
    assert _word_timestamps_include_inter_frame_spaces(None) is False
    assert _word_timestamps_include_inter_frame_spaces("") is False
|
||||
|
||||
|
||||
def test_elevenlabs_alignment_strips_only_utterance_leading_spaces():
|
||||
first = _strip_utterance_leading_spaces(_chunk(" Hello"), _WS_ALIGNMENT_KEYS, True)
|
||||
subsequent = _strip_utterance_leading_spaces(_chunk(" world"), _WS_ALIGNMENT_KEYS, False)
|
||||
|
||||
Reference in New Issue
Block a user