- Moved SentenceTextChunker and SentenceTextChunkerConfig to the utils module for better organization. - Updated pytest.ini to include the current directory in the Python path. - Added a new utils module with shared utility helpers. - Adjusted import paths in the test files to reflect the new location of text chunking classes.
58 lines
1.6 KiB
Python
58 lines
1.6 KiB
Python
from src.utils.text_chunker import SentenceTextChunker, SentenceTextChunkerConfig
|
|
|
|
|
|
def test_chinese_sentence_chunks_wait_for_lookahead():
    """Stream tokens one by one and check which sentences are emitted.

    After feeding two complete sentences token by token, only the first
    one has been returned by ``feed``; the second is obtained from
    ``flush``.
    """
    splitter = SentenceTextChunker()
    streamed_tokens = ("你好", "世界", "。", "下一", "句话", "。")

    # Flatten everything each feed() call hands back, in call order.
    emitted = [
        piece
        for token in streamed_tokens
        for piece in splitter.feed(token)
    ]

    assert emitted == ["你好世界。"]

    remainder = splitter.flush()
    assert remainder == "下一句话。"
|
|
|
|
|
|
def test_flush_returns_pending_text():
    """Check that ``flush`` hands back buffered text once, then ``None``.

    The fed text has no sentence terminator, so ``feed`` returns an empty
    list; the first ``flush`` returns the text and a second ``flush``
    returns ``None``.
    """
    splitter = SentenceTextChunker()
    unterminated = "还没有句号"

    fed_result = splitter.feed(unterminated)
    assert fed_result == []

    first_flush = splitter.flush()
    assert first_flush == unterminated

    # Nothing is left after the first flush.
    second_flush = splitter.flush()
    assert second_flush is None
|
|
|
|
|
|
def test_decimal_point_does_not_split_sentence():
    """Check that the ``.`` inside ``29.95`` is not treated as a sentence end.

    The sentence containing the decimal number is emitted whole by
    ``feed``; the trailing unterminated text comes back from ``flush``.
    """
    splitter = SentenceTextChunker()
    text = "价格是29.95元。下一句"
    expected_sentences = ["价格是29.95元。"]
    expected_tail = "下一句"

    emitted = splitter.feed(text)

    assert emitted == expected_sentences

    tail = splitter.flush()
    assert tail == expected_tail
|
|
|
|
|
|
def test_soft_break_after_max_chars():
    """Check that a long text is cut at the comma when soft breaks are on.

    With ``max_chars=12`` and ``use_soft_breaks=True``, the 20-character
    input without a sentence terminator yields the part up to and
    including the comma from ``feed``; the rest comes back from ``flush``.
    """
    config = SentenceTextChunkerConfig(
        min_chars=1,
        max_chars=12,
        use_soft_breaks=True,
    )
    splitter = SentenceTextChunker(config)

    emitted = splitter.feed("这是一段比较长的话，需要先切一下继续播放")

    assert emitted == ["这是一段比较长的话，"]

    tail = splitter.flush()
    assert tail == "需要先切一下继续播放"
|
|
|
|
|
|
def test_can_disable_soft_breaks():
    """Check that no cut is made at the comma when soft breaks are off.

    With ``use_soft_breaks=False`` the same over-long input produces no
    chunks from ``feed`` and is returned in full by ``flush``.
    """
    config = SentenceTextChunkerConfig(
        min_chars=1,
        max_chars=12,
        use_soft_breaks=False,
    )
    splitter = SentenceTextChunker(config)
    long_text = "这是一段比较长的话，需要先切一下继续播放"

    emitted = splitter.feed(long_text)
    assert emitted == []

    flushed = splitter.flush()
    assert flushed == long_text
|