Files
ZNJJ-api-server/test/test_text_chunker.py
Xin Wang cf0a8b71fd Refactor text chunking implementation and update configuration
- Moved SentenceTextChunker and SentenceTextChunkerConfig to the utils module for better organization.
- Updated pytest.ini to include the current directory in the Python path.
- Added a new utils module with shared utility helpers.
- Adjusted import paths in the test files to reflect the new location of text chunking classes.
2026-06-18 10:15:07 +08:00

58 lines
1.6 KiB
Python

from src.utils.text_chunker import SentenceTextChunker, SentenceTextChunkerConfig
def test_chinese_sentence_chunks_wait_for_lookahead():
    """Stream Chinese text token by token and check when sentences are emitted.

    The first sentence must come out of ``feed`` while streaming; the second
    sentence, whose terminator is the very last token, must stay buffered
    until ``flush`` is called.
    """
    chunker = SentenceTextChunker()
    chunks = []
    # The sentence terminator "。" has to be part of the token stream:
    # the expected chunks below end in "。", so it must have been fed in.
    for token in ["你好", "世界", "。", "下一", "句话", "。"]:
        chunks.extend(chunker.feed(token))

    # Only the first sentence is released during streaming.
    assert chunks == ["你好世界。"]
    # Nothing follows the final "。", so that sentence is only returned on flush
    # (presumably because the chunker waits for a lookahead token — confirm
    # against the SentenceTextChunker implementation).
    assert chunker.flush() == "下一句话。"
def test_flush_returns_pending_text():
    """Text without a sentence terminator is held back until ``flush``.

    ``feed`` is expected to emit nothing, the first ``flush`` to return the
    buffered text unchanged, and a second ``flush`` to return ``None``.
    """
    splitter = SentenceTextChunker()

    emitted = splitter.feed("还没有句号")
    assert emitted == []

    first_flush = splitter.flush()
    assert first_flush == "还没有句号"

    # Once drained, there is nothing left to hand back.
    second_flush = splitter.flush()
    assert second_flush is None
def test_decimal_point_does_not_split_sentence():
    """A ``.`` inside a number must not be treated as a sentence boundary."""
    splitter = SentenceTextChunker()

    emitted = splitter.feed("价格是29.95元。下一句")
    # The single expected chunk runs through "29.95" and ends at the "。".
    assert emitted == ["价格是29.95元。"]

    # The unterminated remainder is only available via flush.
    leftover = splitter.flush()
    assert leftover == "下一句"
def test_soft_break_after_max_chars():
    """With soft breaks enabled, long unterminated text is cut at the comma.

    The chunker is configured with ``min_chars=1``, ``max_chars=12`` and
    ``use_soft_breaks=True``; the input has no sentence terminator, only a
    comma, and the test expects the part up to and including that comma to be
    emitted by ``feed``.
    """
    config = SentenceTextChunkerConfig(
        min_chars=1,
        max_chars=12,
        use_soft_breaks=True,
    )
    splitter = SentenceTextChunker(config)

    emitted = splitter.feed("这是一段比较长的话,需要先切一下继续播放")
    assert emitted == ["这是一段比较长的话,"]

    # Everything after the comma stays buffered until flush.
    leftover = splitter.flush()
    assert leftover == "需要先切一下继续播放"
def test_can_disable_soft_breaks():
    """With ``use_soft_breaks=False`` the comma must not trigger a chunk.

    Same limits as the soft-break test (``min_chars=1``, ``max_chars=12``),
    but here ``feed`` is expected to emit nothing and ``flush`` to return the
    whole input in one piece.
    """
    config = SentenceTextChunkerConfig(
        min_chars=1,
        max_chars=12,
        use_soft_breaks=False,
    )
    splitter = SentenceTextChunker(config)

    emitted = splitter.feed("这是一段比较长的话,需要先切一下继续播放")
    assert emitted == []

    whole_text = splitter.flush()
    assert whole_text == "这是一段比较长的话,需要先切一下继续播放"