Fix sentence splitting for CJK and other non-Latin languages in TTS pipeline

NLTK's sent_tokenize() only supports ~15 European languages and defaults to
English. For Japanese, Chinese, Korean, Hindi, Arabic, and other non-Latin
languages, NLTK fails to recognize sentence boundaries like 。？！, causing
text to accumulate until flush instead of being emitted sentence-by-sentence.

Add a fallback in match_endofsentence() that scans for unambiguous non-Latin
sentence-ending punctuation when NLTK fails to split the text. Latin
punctuation (. ! ? ; …) is excluded from the fallback since NLTK handles
those correctly and they can be ambiguous (abbreviations, decimals, etc.).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
James Hush
2026-02-02 14:27:49 +08:00
parent f453227ba3
commit 763002f2bc
3 changed files with 118 additions and 1 deletions

View File

@@ -89,6 +89,17 @@ SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = frozenset(
}
)
# Latin punctuation that NLTK handles well — these need NLTK's disambiguation
# because "." can appear in abbreviations, decimals, etc. The ellipsis "…" is
# part of this set so that it is excluded from the fallback set below.
_LATIN_SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = frozenset({".", "!", "?", ";", "…"})

# Non-Latin sentence-ending punctuation that is always unambiguous and never needs
# NLTK's disambiguation logic. Used as a fallback when NLTK doesn't support the
# language (e.g., Japanese, Chinese, Korean, Hindi, Arabic).
UNAMBIGUOUS_SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = (
    SENTENCE_ENDING_PUNCTUATION - _LATIN_SENTENCE_ENDING_PUNCTUATION
)
StartEndTags = Tuple[str, str]
@@ -144,7 +155,17 @@ def match_endofsentence(text: str) -> int:
# common for text to be single words, so we need to ensure
# sentence-ending punctuation is present.
if len(sentences) == 1 and first_sentence == text:
return len(text) if text and text[-1] in SENTENCE_ENDING_PUNCTUATION else 0
if text and text[-1] in SENTENCE_ENDING_PUNCTUATION:
return len(text)
# Fallback for languages not supported by NLTK (e.g., Japanese, Chinese,
# Korean, Hindi, Arabic). NLTK returned the entire text as a single
# sentence, and the last character is not sentence-ending punctuation
# (it's a lookahead character). Scan for unambiguous non-Latin sentence-
# ending punctuation that doesn't need NLTK's disambiguation.
for i, ch in enumerate(text):
if ch in UNAMBIGUOUS_SENTENCE_ENDING_PUNCTUATION:
return i + 1
return 0
# If there are multiple sentences, the first one is complete by definition
# (NLTK found a boundary, so there must be proper punctuation)

View File

@@ -124,6 +124,62 @@ class TestSimpleTextAggregator(unittest.IsolatedAsyncioTestCase):
result = await self.aggregator.flush()
assert result.text == "W"
async def test_japanese_multiple_sentences(self):
    """Stream two Japanese sentences and check they come out one at a time.

    The first sentence is emitted by ``aggregate()`` once the character after
    ``。`` arrives as lookahead; the second has no lookahead character and is
    only returned by ``flush()``.
    """
    emitted = []
    async for aggregation in self.aggregator.aggregate("こんにちは。元気ですか?"):
        emitted.append(aggregation)

    # Only the first sentence is confirmed while streaming (元 is its lookahead).
    assert len(emitted) == 1
    assert emitted[0].text == "こんにちは。"

    # The second sentence comes out on flush.
    flushed = await self.aggregator.flush()
    assert flushed.text == "元気ですか?"
async def test_japanese_sentence_with_lookahead(self):
    """Test that a Japanese sentence is detected with a lookahead character.

    The input is one complete sentence followed by a single extra character.
    ``aggregate()`` should emit the sentence, and ``flush()`` should return
    whatever is left over afterwards.
    """
    text = "こんにちは。元"
    results = [agg async for agg in self.aggregator.aggregate(text)]

    # 。 triggers lookahead, then 元 confirms it
    assert len(results) == 1
    assert results[0].text == "こんにちは。"

    # Flush returns the remainder: the lookahead character itself, which
    # confirmed the boundary but is not part of the emitted sentence.
    result = await self.aggregator.flush()
    assert result.text == "元"
async def test_chinese_streaming_tokens(self):
    """Test Chinese text split across multiple streaming tokens.

    Tokens are fed to a fresh aggregator one at a time, the way an LLM would
    stream them. The sentence-ending 。 arrives as its own token, so the first
    sentence can only be confirmed once the next token supplies a lookahead
    character.
    """
    aggregator = SimpleTextAggregator()
    tokens = ["你好", "世界", "。", "下一", "句话", "。"]

    all_results = []
    for token in tokens:
        results = [agg async for agg in aggregator.aggregate(token)]
        all_results.extend(results)

    # First sentence detected when 下 arrives after 。
    assert len(all_results) == 1
    assert all_results[0].text == "你好世界。"

    # Flush returns the second sentence
    result = await aggregator.flush()
    assert result.text == "下一句话。"
async def test_japanese_single_sentence_flush(self):
    """Check that a lone Japanese sentence is held back until flush.

    With nothing after the final ``。`` there is no lookahead character, so
    ``aggregate()`` yields nothing and the sentence is returned by ``flush()``.
    """
    sentence = "こんにちは。"

    emitted = []
    async for aggregation in self.aggregator.aggregate(sentence):
        emitted.append(aggregation)

    # Still waiting for a lookahead character, so nothing is emitted yet.
    assert len(emitted) == 0

    # Flushing hands back the complete sentence.
    flushed = await self.aggregator.flush()
    assert flushed.text == "こんにちは。"
if __name__ == "__main__":
unittest.main()

View File

@@ -153,6 +153,46 @@ class TestUtilsString(unittest.IsolatedAsyncioTestCase):
for sentence in latin_script_sentences:
assert match_endofsentence(sentence), f"Failed for Latin script: {sentence}"
async def test_endofsentence_cjk_with_lookahead(self):
    """Test sentence detection for CJK text with lookahead characters.

    This tests the NLTK fallback path: NLTK returns entire text as one
    sentence because it doesn't support CJK languages, but unambiguous
    punctuation is detected via the fallback scan.

    Each input is a sentence followed by one lookahead character; the expected
    value is the index just past the sentence-ending punctuation mark.
    """
    # NOTE: the question and exclamation marks below are the fullwidth forms
    # (？ U+FF1F, ！ U+FF01). The ASCII "?" and "!" are treated as Latin
    # punctuation and are deliberately not matched by the fallback scan.

    # Japanese: sentence + lookahead character
    assert match_endofsentence("こんにちは。元") == 6
    assert match_endofsentence("元気ですか？は") == 6
    assert match_endofsentence("ありがとう！次") == 6

    # Chinese: sentence + lookahead character
    assert match_endofsentence("你好世界。下") == 5
    assert match_endofsentence("你好吗？我") == 4

    # Korean: sentence + lookahead character
    assert match_endofsentence("안녕하세요。다") == 6

    # Multiple CJK sentences with lookahead - should return first sentence
    assert match_endofsentence("こんにちは。元気ですか？は") == 6

    # Indic script with lookahead
    assert match_endofsentence("हैलो।अ") == 5

    # Arabic with lookahead
    assert match_endofsentence("مرحبا؟ك") == 6
async def test_endofsentence_latin_not_affected_by_fallback(self):
    """Verify that the CJK fallback does not change behavior for Latin text."""
    # Fragments whose only "." is ambiguous (abbreviation, decimal) must still
    # report no sentence end - Latin "." is NOT in the unambiguous set.
    unfinished_fragments = (
        "Mr. S",
        "Ok, Mr. Smith let's ",
        "The number pi is 3.14159",
        "America, or the U.S",
    )
    for fragment in unfinished_fragments:
        assert not match_endofsentence(fragment)

    # Genuine Latin sentences should still return correct values via the
    # NLTK path.
    expected_boundaries = {
        "This is a sentence. This is another one": 19,
        "For information, call 411.": 26,
    }
    for sentence, boundary in expected_boundaries.items():
        assert match_endofsentence(sentence) == boundary
async def test_endofsentence_streaming_tokens(self):
"""Test the specific use case of streaming LLM tokens."""