Merge pull request #3617 from pipecat-ai/fix/cjk-sentence-splitting
Fix sentence splitting for CJK and other non-Latin languages
This commit is contained in:
5
changelog/3617.fixed.md
Normal file
5
changelog/3617.fixed.md
Normal file
@@ -0,0 +1,5 @@
|
||||
- Fixed sentence splitting for Japanese, Chinese, Korean, and other non-Latin
|
||||
languages in TTS pipeline. NLTK's sentence tokenizer does not support CJK
|
||||
languages, causing text to accumulate until flush instead of being split at
|
||||
sentence boundaries. Added fallback detection for unambiguous non-Latin
|
||||
sentence-ending punctuation (e.g., `。`, `?`, `!`).
|
||||
@@ -89,6 +89,17 @@ SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = frozenset(
|
||||
}
|
||||
)
|
||||
|
||||
# Latin punctuation that NLTK handles well — these need NLTK's disambiguation
|
||||
# because "." can appear in abbreviations, decimals, etc.
|
||||
_LATIN_SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = frozenset({".", "!", "?", ";", "…"})
|
||||
|
||||
# Non-Latin sentence-ending punctuation that is always unambiguous and never needs
|
||||
# NLTK's disambiguation logic. Used as a fallback when NLTK doesn't support the
|
||||
# language (e.g., Japanese, Chinese, Korean, Hindi, Arabic).
|
||||
UNAMBIGUOUS_SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = (
|
||||
SENTENCE_ENDING_PUNCTUATION - _LATIN_SENTENCE_ENDING_PUNCTUATION
|
||||
)
|
||||
|
||||
StartEndTags = Tuple[str, str]
|
||||
|
||||
|
||||
@@ -144,7 +155,17 @@ def match_endofsentence(text: str) -> int:
|
||||
# common for text to be single words, so we need to ensure
|
||||
# sentence-ending punctuation is present.
|
||||
if len(sentences) == 1 and first_sentence == text:
|
||||
return len(text) if text and text[-1] in SENTENCE_ENDING_PUNCTUATION else 0
|
||||
if text and text[-1] in SENTENCE_ENDING_PUNCTUATION:
|
||||
return len(text)
|
||||
# Fallback for languages not supported by NLTK (e.g., Japanese, Chinese,
|
||||
# Korean, Hindi, Arabic). NLTK returned the entire text as a single
|
||||
# sentence, and the last character is not sentence-ending punctuation
|
||||
# (it's a lookahead character). Scan for unambiguous non-Latin sentence-
|
||||
# ending punctuation that doesn't need NLTK's disambiguation.
|
||||
for i, ch in enumerate(text):
|
||||
if ch in UNAMBIGUOUS_SENTENCE_ENDING_PUNCTUATION:
|
||||
return i + 1
|
||||
return 0
|
||||
|
||||
# If there are multiple sentences, the first one is complete by definition
|
||||
# (NLTK found a boundary, so there must be proper punctuation)
|
||||
|
||||
@@ -124,6 +124,62 @@ class TestSimpleTextAggregator(unittest.IsolatedAsyncioTestCase):
|
||||
result = await self.aggregator.flush()
|
||||
assert result.text == "W"
|
||||
|
||||
async def test_japanese_multiple_sentences(self):
|
||||
"""Test that Japanese sentences are properly split during streaming."""
|
||||
text = "こんにちは。元気ですか?"
|
||||
results = [agg async for agg in self.aggregator.aggregate(text)]
|
||||
|
||||
# First sentence detected when 元 arrives as lookahead after 。
|
||||
assert len(results) == 1
|
||||
assert results[0].text == "こんにちは。"
|
||||
|
||||
# Flush returns the second sentence
|
||||
result = await self.aggregator.flush()
|
||||
assert result.text == "元気ですか?"
|
||||
|
||||
async def test_japanese_sentence_with_lookahead(self):
|
||||
"""Test that a Japanese sentence is detected with a lookahead character."""
|
||||
text = "こんにちは。元"
|
||||
results = [agg async for agg in self.aggregator.aggregate(text)]
|
||||
|
||||
# 。 triggers lookahead, then 元 confirms it
|
||||
assert len(results) == 1
|
||||
assert results[0].text == "こんにちは。"
|
||||
|
||||
# Flush returns remainder
|
||||
result = await self.aggregator.flush()
|
||||
assert result.text == "元"
|
||||
|
||||
async def test_chinese_streaming_tokens(self):
|
||||
"""Test Chinese text split across multiple streaming tokens."""
|
||||
aggregator = SimpleTextAggregator()
|
||||
|
||||
tokens = ["你好", "世界", "。", "下一", "句话", "。"]
|
||||
all_results = []
|
||||
for token in tokens:
|
||||
results = [agg async for agg in aggregator.aggregate(token)]
|
||||
all_results.extend(results)
|
||||
|
||||
# First sentence detected when 下 arrives after 。
|
||||
assert len(all_results) == 1
|
||||
assert all_results[0].text == "你好世界。"
|
||||
|
||||
# Flush returns the second sentence
|
||||
result = await aggregator.flush()
|
||||
assert result.text == "下一句话。"
|
||||
|
||||
async def test_japanese_single_sentence_flush(self):
|
||||
"""Test that a single Japanese sentence with no lookahead flushes correctly."""
|
||||
text = "こんにちは。"
|
||||
results = [agg async for agg in self.aggregator.aggregate(text)]
|
||||
|
||||
# No lookahead yet - waiting
|
||||
assert len(results) == 0
|
||||
|
||||
# Flush returns the complete sentence
|
||||
result = await self.aggregator.flush()
|
||||
assert result.text == "こんにちは。"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -153,6 +153,46 @@ class TestUtilsString(unittest.IsolatedAsyncioTestCase):
|
||||
for sentence in latin_script_sentences:
|
||||
assert match_endofsentence(sentence), f"Failed for Latin script: {sentence}"
|
||||
|
||||
async def test_endofsentence_cjk_with_lookahead(self):
|
||||
"""Test sentence detection for CJK text with lookahead characters.
|
||||
|
||||
This tests the NLTK fallback path: NLTK returns entire text as one
|
||||
sentence because it doesn't support CJK languages, but unambiguous
|
||||
punctuation is detected via the fallback scan.
|
||||
"""
|
||||
# Japanese: sentence + lookahead character
|
||||
assert match_endofsentence("こんにちは。元") == 6
|
||||
assert match_endofsentence("元気ですか?は") == 6
|
||||
assert match_endofsentence("ありがとう!次") == 6
|
||||
|
||||
# Chinese: sentence + lookahead character
|
||||
assert match_endofsentence("你好世界。下") == 5
|
||||
assert match_endofsentence("你好吗?我") == 4
|
||||
|
||||
# Korean: sentence + lookahead character
|
||||
assert match_endofsentence("안녕하세요。다") == 6
|
||||
|
||||
# Multiple CJK sentences with lookahead - should return first sentence
|
||||
assert match_endofsentence("こんにちは。元気ですか?は") == 6
|
||||
|
||||
# Indic script with lookahead
|
||||
assert match_endofsentence("हैलो।अ") == 5
|
||||
|
||||
# Arabic with lookahead
|
||||
assert match_endofsentence("مرحبا؟ك") == 6
|
||||
|
||||
async def test_endofsentence_latin_not_affected_by_fallback(self):
|
||||
"""Verify that the CJK fallback does not change behavior for Latin text."""
|
||||
# These should still return 0 - Latin "." is NOT in the unambiguous set
|
||||
assert not match_endofsentence("Mr. S")
|
||||
assert not match_endofsentence("Ok, Mr. Smith let's ")
|
||||
assert not match_endofsentence("The number pi is 3.14159")
|
||||
assert not match_endofsentence("America, or the U.S")
|
||||
|
||||
# These should still return correct values via NLTK path
|
||||
assert match_endofsentence("This is a sentence. This is another one") == 19
|
||||
assert match_endofsentence("For information, call 411.") == 26
|
||||
|
||||
async def test_endofsentence_streaming_tokens(self):
|
||||
"""Test the specific use case of streaming LLM tokens."""
|
||||
|
||||
|
||||
Reference in New Issue
Block a user