Merge pull request #3617 from pipecat-ai/fix/cjk-sentence-splitting

Fix sentence splitting for CJK and other non-Latin languages
This commit is contained in:
Mark Backman
2026-02-02 18:16:51 -05:00
committed by GitHub
4 changed files with 123 additions and 1 deletions

5
changelog/3617.fixed.md Normal file
View File

@@ -0,0 +1,5 @@
- Fixed sentence splitting for Japanese, Chinese, Korean, and other non-Latin
languages in TTS pipeline. NLTK's sentence tokenizer does not support CJK
languages, causing text to accumulate until flush instead of being split at
sentence boundaries. Added fallback detection for unambiguous non-Latin
sentence-ending punctuation (e.g., `。`, `？`, `！`).

View File

@@ -89,6 +89,17 @@ SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = frozenset(
}
)
# Latin punctuation that NLTK handles well — these need NLTK's disambiguation
# because "." can appear in abbreviations, decimals, etc. The final member is
# the horizontal ellipsis (U+2026); it must be a real character here, because an
# empty string would subtract nothing from the set difference below.
_LATIN_SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = frozenset({".", "!", "?", ";", "…"})

# Non-Latin sentence-ending punctuation that is always unambiguous and never needs
# NLTK's disambiguation logic. Used as a fallback when NLTK doesn't support the
# language (e.g., Japanese, Chinese, Korean, Hindi, Arabic).
# Computed as a set difference so any mark added to SENTENCE_ENDING_PUNCTUATION
# is treated as unambiguous unless it is explicitly listed as Latin above.
UNAMBIGUOUS_SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = (
    SENTENCE_ENDING_PUNCTUATION - _LATIN_SENTENCE_ENDING_PUNCTUATION
)
StartEndTags = Tuple[str, str]
@@ -144,7 +155,17 @@ def match_endofsentence(text: str) -> int:
# common for text to be single words, so we need to ensure
# sentence-ending punctuation is present.
if len(sentences) == 1 and first_sentence == text:
return len(text) if text and text[-1] in SENTENCE_ENDING_PUNCTUATION else 0
if text and text[-1] in SENTENCE_ENDING_PUNCTUATION:
return len(text)
# Fallback for languages not supported by NLTK (e.g., Japanese, Chinese,
# Korean, Hindi, Arabic). NLTK returned the entire text as a single
# sentence, and the last character is not sentence-ending punctuation
# (it's a lookahead character). Scan for unambiguous non-Latin sentence-
# ending punctuation that doesn't need NLTK's disambiguation.
for i, ch in enumerate(text):
if ch in UNAMBIGUOUS_SENTENCE_ENDING_PUNCTUATION:
return i + 1
return 0
# If there are multiple sentences, the first one is complete by definition
# (NLTK found a boundary, so there must be proper punctuation)

View File

@@ -124,6 +124,62 @@ class TestSimpleTextAggregator(unittest.IsolatedAsyncioTestCase):
result = await self.aggregator.flush()
assert result.text == "W"
async def test_japanese_multiple_sentences(self):
    """Two Japanese sentences: the first is emitted while streaming, the second on flush."""
    emitted = []
    async for aggregation in self.aggregator.aggregate("こんにちは。元気ですか?"):
        emitted.append(aggregation)

    # Only the first sentence comes out during aggregation; it is detected
    # once 元 arrives as the lookahead character following 。
    assert len(emitted) == 1
    assert emitted[0].text == "こんにちは。"

    # The second sentence stays buffered until an explicit flush.
    remainder = await self.aggregator.flush()
    assert remainder.text == "元気ですか?"
async def test_japanese_sentence_with_lookahead(self):
    """A Japanese sentence followed by one lookahead character is emitted."""
    emitted = []
    async for aggregation in self.aggregator.aggregate("こんにちは。元"):
        emitted.append(aggregation)

    # 。 starts the lookahead wait and the following 元 confirms the boundary.
    assert len(emitted) == 1
    assert emitted[0].text == "こんにちは。"

    # Flushing afterwards hands back whatever remains in the buffer.
    remainder = await self.aggregator.flush()
    assert remainder.text == ""
async def test_chinese_streaming_tokens(self):
    """Test Chinese text split across multiple streaming tokens.

    The ideographic full stop "。" is delivered as its own token, the way an
    LLM may stream it, so the sentence boundary is only confirmed once the
    first character of the following token arrives.
    """
    aggregator = SimpleTextAggregator()
    # The "。" tokens are essential: without them the stream contains no
    # sentence-ending punctuation and nothing could be emitted before flush.
    tokens = ["你好", "世界", "。", "下一", "句话", "。"]
    all_results = []
    for token in tokens:
        results = [agg async for agg in aggregator.aggregate(token)]
        all_results.extend(results)

    # First sentence detected when 下 arrives after 。
    assert len(all_results) == 1
    assert all_results[0].text == "你好世界。"

    # Flush returns the second sentence
    result = await aggregator.flush()
    assert result.text == "下一句话。"
async def test_japanese_single_sentence_flush(self):
    """A lone Japanese sentence with no lookahead is only released by flush."""
    emitted = []
    async for aggregation in self.aggregator.aggregate("こんにちは。"):
        emitted.append(aggregation)

    # Nothing follows 。 yet, so the aggregator is still waiting for lookahead.
    assert len(emitted) == 0

    # Flushing releases the complete buffered sentence.
    flushed = await self.aggregator.flush()
    assert flushed.text == "こんにちは。"
# Run this module's tests through unittest when it is executed as a script.
if __name__ == "__main__":
unittest.main()

View File

@@ -153,6 +153,46 @@ class TestUtilsString(unittest.IsolatedAsyncioTestCase):
for sentence in latin_script_sentences:
assert match_endofsentence(sentence), f"Failed for Latin script: {sentence}"
async def test_endofsentence_cjk_with_lookahead(self):
    """Test sentence detection for CJK text with lookahead characters.

    This tests the NLTK fallback path: NLTK returns entire text as one
    sentence because it doesn't support CJK languages, but unambiguous
    punctuation is detected via the fallback scan. Each expected value is
    the index just past the first sentence-ending mark, i.e. the length of
    the first sentence including its punctuation.
    """
    # The question and exclamation marks below are the full-width forms
    # ("？", "！"). The ASCII forms are part of the Latin punctuation set,
    # which the fallback scan deliberately ignores.

    # Japanese: sentence + lookahead character
    assert match_endofsentence("こんにちは。元") == 6
    assert match_endofsentence("元気ですか？は") == 6
    assert match_endofsentence("ありがとう！次") == 6

    # Chinese: sentence + lookahead character
    assert match_endofsentence("你好世界。下") == 5
    assert match_endofsentence("你好吗？我") == 4

    # Korean: sentence + lookahead character
    assert match_endofsentence("안녕하세요。다") == 6

    # Multiple CJK sentences with lookahead - should return first sentence
    assert match_endofsentence("こんにちは。元気ですか？は") == 6

    # Indic script with lookahead
    assert match_endofsentence("हैलो।अ") == 5

    # Arabic with lookahead
    assert match_endofsentence("مرحبا؟ك") == 6
async def test_endofsentence_latin_not_affected_by_fallback(self):
    """Guard against the CJK fallback altering results for Latin-script text."""
    # Latin "." is NOT in the unambiguous set, so none of these unfinished
    # fragments may be reported as a completed sentence.
    unfinished_fragments = (
        "Mr. S",
        "Ok, Mr. Smith let's ",
        "The number pi is 3.14159",
        "America, or the U.S",
    )
    for fragment in unfinished_fragments:
        assert not match_endofsentence(fragment)

    # Text with a real boundary must still resolve through the NLTK path.
    expected_boundaries = {
        "This is a sentence. This is another one": 19,
        "For information, call 411.": 26,
    }
    for sentence, boundary in expected_boundaries.items():
        assert match_endofsentence(sentence) == boundary
async def test_endofsentence_streaming_tokens(self):
"""Test the specific use case of streaming LLM tokens."""