Fix sentence splitting for CJK and other non-Latin languages in TTS pipeline

NLTK's sent_tokenize() only supports ~15 European languages and defaults to English. For Japanese, Chinese, Korean, Hindi, Arabic, and other non-Latin languages, NLTK fails to recognize sentence boundaries such as 。？！, causing text to accumulate until flush instead of being emitted sentence-by-sentence. Add a fallback in match_endofsentence() that scans for unambiguous non-Latin sentence-ending punctuation when NLTK returns the whole text as a single sentence and the text does not already end in sentence-ending punctuation. Latin punctuation (. ! ? ; …) is excluded from the fallback since NLTK handles those correctly and they can be ambiguous (abbreviations, decimals, etc.). Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-02 14:27:49 +08:00
parent f453227ba3
commit 763002f2bc
3 changed files with 118 additions and 1 deletions
--- a/src/pipecat/utils/string.py
+++ b/src/pipecat/utils/string.py
@@ -89,6 +89,17 @@ SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = frozenset(
    }
 )

+# Latin punctuation that NLTK handles well — these need NLTK's disambiguation
+# because "." can appear in abbreviations, decimals, etc.
+_LATIN_SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = frozenset({".", "!", "?", ";", "…"})
+
+# Non-Latin sentence-ending punctuation that is always unambiguous and never needs
+# NLTK's disambiguation logic. Used as a fallback when NLTK doesn't support the
+# language (e.g., Japanese, Chinese, Korean, Hindi, Arabic).
+UNAMBIGUOUS_SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = (
+    SENTENCE_ENDING_PUNCTUATION - _LATIN_SENTENCE_ENDING_PUNCTUATION
+)
+
 StartEndTags = Tuple[str, str]


@@ -144,7 +155,17 @@ def match_endofsentence(text: str) -> int:
    # common for text to be single words, so we need to ensure
    # sentence-ending punctuation is present.
    if len(sentences) == 1 and first_sentence == text:
-        return len(text) if text and text[-1] in SENTENCE_ENDING_PUNCTUATION else 0
+        if text and text[-1] in SENTENCE_ENDING_PUNCTUATION:
+            return len(text)
+        # Fallback for languages not supported by NLTK (e.g., Japanese, Chinese,
+        # Korean, Hindi, Arabic). NLTK returned the entire text as a single
+        # sentence, and the last character is not sentence-ending punctuation
+        # (e.g., it is a lookahead character). Return the end of the first
+        # unambiguous non-Latin sentence ending, which needs no disambiguation.
+        for i, ch in enumerate(text):
+            if ch in UNAMBIGUOUS_SENTENCE_ENDING_PUNCTUATION:
+                return i + 1
+        return 0

    # If there are multiple sentences, the first one is complete by definition
    # (NLTK found a boundary, so there must be proper punctuation)
--- a/tests/test_simple_text_aggregator.py
+++ b/tests/test_simple_text_aggregator.py
@@ -124,6 +124,62 @@ class TestSimpleTextAggregator(unittest.IsolatedAsyncioTestCase):
        result = await self.aggregator.flush()
        assert result.text == "W"

+    async def test_japanese_multiple_sentences(self):
+        """Test that Japanese sentences are properly split during streaming."""
+        text = "こんにちは。元気ですか？"
+        results = [agg async for agg in self.aggregator.aggregate(text)]
+
+        # First sentence detected when 元 arrives as lookahead after 。
+        assert len(results) == 1
+        assert results[0].text == "こんにちは。"
+
+        # Flush returns the second sentence
+        result = await self.aggregator.flush()
+        assert result.text == "元気ですか？"
+
+    async def test_japanese_sentence_with_lookahead(self):
+        """Test that a Japanese sentence is detected with a lookahead character."""
+        text = "こんにちは。元"
+        results = [agg async for agg in self.aggregator.aggregate(text)]
+
+        # 。 triggers lookahead, then 元 confirms it
+        assert len(results) == 1
+        assert results[0].text == "こんにちは。"
+
+        # Flush returns remainder
+        result = await self.aggregator.flush()
+        assert result.text == "元"
+
+    async def test_chinese_streaming_tokens(self):
+        """Test Chinese text split across multiple streaming tokens."""
+        aggregator = SimpleTextAggregator()
+
+        tokens = ["你好", "世界", "。", "下一", "句话", "。"]
+        all_results = []
+        for token in tokens:
+            results = [agg async for agg in aggregator.aggregate(token)]
+            all_results.extend(results)
+
+        # First sentence detected when 下 arrives after 。
+        assert len(all_results) == 1
+        assert all_results[0].text == "你好世界。"
+
+        # Flush returns the second sentence
+        result = await aggregator.flush()
+        assert result.text == "下一句话。"
+
+    async def test_japanese_single_sentence_flush(self):
+        """Test that a single Japanese sentence with no lookahead flushes correctly."""
+        text = "こんにちは。"
+        results = [agg async for agg in self.aggregator.aggregate(text)]
+
+        # No lookahead yet - waiting
+        assert len(results) == 0
+
+        # Flush returns the complete sentence
+        result = await self.aggregator.flush()
+        assert result.text == "こんにちは。"
+

 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_utils_string.py
+++ b/tests/test_utils_string.py
@@ -153,6 +153,46 @@ class TestUtilsString(unittest.IsolatedAsyncioTestCase):
        for sentence in latin_script_sentences:
            assert match_endofsentence(sentence), f"Failed for Latin script: {sentence}"

+    async def test_endofsentence_cjk_with_lookahead(self):
+        """Test sentence detection for CJK text with lookahead characters.
+
+        This tests the NLTK fallback path: NLTK returns entire text as one
+        sentence because it doesn't support CJK languages, but unambiguous
+        punctuation is detected via the fallback scan.
+        """
+        # Japanese: sentence + lookahead character
+        assert match_endofsentence("こんにちは。元") == 6
+        assert match_endofsentence("元気ですか？は") == 6
+        assert match_endofsentence("ありがとう！次") == 6
+
+        # Chinese: sentence + lookahead character
+        assert match_endofsentence("你好世界。下") == 5
+        assert match_endofsentence("你好吗？我") == 4
+
+        # Korean: sentence + lookahead character
+        assert match_endofsentence("안녕하세요。다") == 6
+
+        # Multiple CJK sentences with lookahead - should return first sentence
+        assert match_endofsentence("こんにちは。元気ですか？は") == 6
+
+        # Indic script with lookahead
+        assert match_endofsentence("हैलो।अ") == 5
+
+        # Arabic with lookahead
+        assert match_endofsentence("مرحبا؟ك") == 6
+
+    async def test_endofsentence_latin_not_affected_by_fallback(self):
+        """Verify that the CJK fallback does not change behavior for Latin text."""
+        # These should still return 0 - Latin "." is NOT in the unambiguous set
+        assert not match_endofsentence("Mr. S")
+        assert not match_endofsentence("Ok, Mr. Smith let's ")
+        assert not match_endofsentence("The number pi is 3.14159")
+        assert not match_endofsentence("America, or the U.S")
+
+        # These should still return correct values via NLTK path
+        assert match_endofsentence("This is a sentence. This is another one") == 19
+        assert match_endofsentence("For information, call 411.") == 26
+
    async def test_endofsentence_streaming_tokens(self):
        """Test the specific use case of streaming LLM tokens."""