From 763002f2bcb621b30fbd4fcab447a380a316aa08 Mon Sep 17 00:00:00 2001 From: James Hush Date: Mon, 2 Feb 2026 14:27:49 +0800 Subject: [PATCH] Fix sentence splitting for CJK and other non-Latin languages in TTS pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NLTK's sent_tokenize() only supports ~15 European languages and defaults to English. For Japanese, Chinese, Korean, Hindi, Arabic, and other non-Latin languages, NLTK fails to recognize sentence boundaries like 。？！ causing text to accumulate until flush instead of being emitted sentence-by-sentence. Add a fallback in match_endofsentence() that scans for unambiguous non-Latin sentence-ending punctuation when NLTK fails to split the text. Latin punctuation (. ! ? ; …) is excluded from the fallback since NLTK handles those correctly and they can be ambiguous (abbreviations, decimals, etc.). Co-Authored-By: Claude Opus 4.5 --- src/pipecat/utils/string.py | 23 +++++++++++- tests/test_simple_text_aggregator.py | 56 ++++++++++++++++++++++++++++ tests/test_utils_string.py | 40 ++++++++++++++++++++ 3 files changed, 118 insertions(+), 1 deletion(-) diff --git a/src/pipecat/utils/string.py b/src/pipecat/utils/string.py index 3a5d69cad..20fcdb2e0 100644 --- a/src/pipecat/utils/string.py +++ b/src/pipecat/utils/string.py @@ -89,6 +89,17 @@ SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = frozenset( } ) +# Latin punctuation that NLTK handles well — these need NLTK's disambiguation +# because "." can appear in abbreviations, decimals, etc. +_LATIN_SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = frozenset({".", "!", "?", ";", "…"}) + +# Non-Latin sentence-ending punctuation that is always unambiguous and never needs +# NLTK's disambiguation logic. Used as a fallback when NLTK doesn't support the +# language (e.g., Japanese, Chinese, Korean, Hindi, Arabic). 
+UNAMBIGUOUS_SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = ( + SENTENCE_ENDING_PUNCTUATION - _LATIN_SENTENCE_ENDING_PUNCTUATION +) + StartEndTags = Tuple[str, str] @@ -144,7 +155,17 @@ def match_endofsentence(text: str) -> int: # common for text to be single words, so we need to ensure # sentence-ending punctuation is present. if len(sentences) == 1 and first_sentence == text: - return len(text) if text and text[-1] in SENTENCE_ENDING_PUNCTUATION else 0 + if text and text[-1] in SENTENCE_ENDING_PUNCTUATION: + return len(text) + # Fallback for languages not supported by NLTK (e.g., Japanese, Chinese, + # Korean, Hindi, Arabic). NLTK returned the entire text as a single + # sentence, and the last character is not sentence-ending punctuation + # (it's a lookahead character). Scan for unambiguous non-Latin sentence- + # ending punctuation that doesn't need NLTK's disambiguation. + for i, ch in enumerate(text): + if ch in UNAMBIGUOUS_SENTENCE_ENDING_PUNCTUATION: + return i + 1 + return 0 # If there are multiple sentences, the first one is complete by definition # (NLTK found a boundary, so there must be proper punctuation) diff --git a/tests/test_simple_text_aggregator.py b/tests/test_simple_text_aggregator.py index ef51cfc49..4b3613e27 100644 --- a/tests/test_simple_text_aggregator.py +++ b/tests/test_simple_text_aggregator.py @@ -124,6 +124,62 @@ class TestSimpleTextAggregator(unittest.IsolatedAsyncioTestCase): result = await self.aggregator.flush() assert result.text == "W" + async def test_japanese_multiple_sentences(self): + """Test that Japanese sentences are properly split during streaming.""" + text = "こんにちは。元気ですか?" + results = [agg async for agg in self.aggregator.aggregate(text)] + + # First sentence detected when 元 arrives as lookahead after 。 + assert len(results) == 1 + assert results[0].text == "こんにちは。" + + # Flush returns the second sentence + result = await self.aggregator.flush() + assert result.text == "元気ですか?" 
+ + async def test_japanese_sentence_with_lookahead(self): + """Test that a Japanese sentence is detected with a lookahead character.""" + text = "こんにちは。元" + results = [agg async for agg in self.aggregator.aggregate(text)] + + # 。 triggers lookahead, then 元 confirms it + assert len(results) == 1 + assert results[0].text == "こんにちは。" + + # Flush returns remainder + result = await self.aggregator.flush() + assert result.text == "元" + + async def test_chinese_streaming_tokens(self): + """Test Chinese text split across multiple streaming tokens.""" + aggregator = SimpleTextAggregator() + + tokens = ["你好", "世界", "。", "下一", "句话", "。"] + all_results = [] + for token in tokens: + results = [agg async for agg in aggregator.aggregate(token)] + all_results.extend(results) + + # First sentence detected when 下 arrives after 。 + assert len(all_results) == 1 + assert all_results[0].text == "你好世界。" + + # Flush returns the second sentence + result = await aggregator.flush() + assert result.text == "下一句话。" + + async def test_japanese_single_sentence_flush(self): + """Test that a single Japanese sentence with no lookahead flushes correctly.""" + text = "こんにちは。" + results = [agg async for agg in self.aggregator.aggregate(text)] + + # No lookahead yet - waiting + assert len(results) == 0 + + # Flush returns the complete sentence + result = await self.aggregator.flush() + assert result.text == "こんにちは。" + if __name__ == "__main__": unittest.main() diff --git a/tests/test_utils_string.py b/tests/test_utils_string.py index 4afde718c..5130c1daa 100644 --- a/tests/test_utils_string.py +++ b/tests/test_utils_string.py @@ -153,6 +153,46 @@ class TestUtilsString(unittest.IsolatedAsyncioTestCase): for sentence in latin_script_sentences: assert match_endofsentence(sentence), f"Failed for Latin script: {sentence}" + async def test_endofsentence_cjk_with_lookahead(self): + """Test sentence detection for CJK text with lookahead characters. 
+ + This tests the NLTK fallback path: NLTK returns entire text as one + sentence because it doesn't support CJK languages, but unambiguous + punctuation is detected via the fallback scan. + """ + # Japanese: sentence + lookahead character + assert match_endofsentence("こんにちは。元") == 6 + assert match_endofsentence("元気ですか?は") == 6 + assert match_endofsentence("ありがとう!次") == 6 + + # Chinese: sentence + lookahead character + assert match_endofsentence("你好世界。下") == 5 + assert match_endofsentence("你好吗?我") == 4 + + # Korean: sentence + lookahead character + assert match_endofsentence("안녕하세요。다") == 6 + + # Multiple CJK sentences with lookahead - should return first sentence + assert match_endofsentence("こんにちは。元気ですか?は") == 6 + + # Indic script with lookahead + assert match_endofsentence("हैलो।अ") == 5 + + # Arabic with lookahead + assert match_endofsentence("مرحبا؟ك") == 6 + + async def test_endofsentence_latin_not_affected_by_fallback(self): + """Verify that the CJK fallback does not change behavior for Latin text.""" + # These should still return 0 - Latin "." is NOT in the unambiguous set + assert not match_endofsentence("Mr. S") + assert not match_endofsentence("Ok, Mr. Smith let's ") + assert not match_endofsentence("The number pi is 3.14159") + assert not match_endofsentence("America, or the U.S") + + # These should still return correct values via NLTK path + assert match_endofsentence("This is a sentence. This is another one") == 19 + assert match_endofsentence("For information, call 411.") == 26 + async def test_endofsentence_streaming_tokens(self): """Test the specific use case of streaming LLM tokens."""