Files
pipecat/tests/test_word_timestamp_utils.py
filipi87 81bb81c1d0 test: add automated tests for word tracking, frame sequencing, and Cartesia TTS
Adds tests for AggregatedFrameSequencer, WordCompletionTracker, and
word_timestamp_utils (including CJK language scenarios). Updates existing
Cartesia TTS and TTS frame ordering tests to cover the new behaviours.
2026-05-20 10:03:26 -03:00

91 lines
3.3 KiB
Python

#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import unittest
from pipecat.utils.text.word_timestamp_utils import merge_punct_tokens
class TestMergePunctTokens(unittest.TestCase):
    """Pin the expected output of ``merge_punct_tokens`` for ``(text, timestamp)`` token lists.

    Each test feeds a raw token list to ``merge_punct_tokens`` and compares the
    returned list against a literal expectation.
    """

    def test_empty_list(self):
        """An empty token list yields an empty result."""
        self.assertEqual(merge_punct_tokens([]), [])

    def test_all_alnum_words_pass_through(self):
        """Tokens that are plain words come back unchanged."""
        tokens = [("hello", 0.0), ("world", 1.0)]
        self.assertEqual(merge_punct_tokens(tokens), [("hello", 0.0), ("world", 1.0)])

    def test_trailing_space_merged_and_stripped(self):
        """A whitespace token following a word leaves only the word."""
        tokens = [("I", 0.0), (" ", 0.2)]
        self.assertEqual(merge_punct_tokens(tokens), [("I", 0.0)])

    def test_comma_space_merged_and_stripped(self):
        """A ", " token is attached to the preceding word without the trailing space."""
        tokens = [("questions", 1.0), (", ", 1.2), ("explain", 1.4)]
        self.assertEqual(merge_punct_tokens(tokens), [("questions,", 1.0), ("explain", 1.4)])

    def test_leading_space_with_no_preceding_word_discarded(self):
        """A whitespace token with no word before it is dropped."""
        tokens = [(" ", 0.0), ("hello", 0.5)]
        self.assertEqual(merge_punct_tokens(tokens), [("hello", 0.5)])

    def test_leading_empty_string_discarded(self):
        """An empty-string token with no word before it is dropped."""
        tokens = [("", 0.0), ("hello", 0.5)]
        self.assertEqual(merge_punct_tokens(tokens), [("hello", 0.5)])

    def test_multiple_consecutive_punct_tokens_merged_and_stripped(self):
        """Back-to-back punct/space tokens all fold into the same preceding word."""
        tokens = [("word", 0.0), (",", 0.1), (" ", 0.2), ("next", 0.3)]
        self.assertEqual(merge_punct_tokens(tokens), [("word,", 0.0), ("next", 0.3)])

    def test_timestamp_of_preceding_word_is_kept(self):
        """Merged punct tokens adopt the preceding word's timestamp."""
        tokens = [("hello", 2.5), (",", 2.7)]
        result = merge_punct_tokens(tokens)
        self.assertEqual(result, [("hello,", 2.5)])

    def test_xml_tag_only_token_is_treated_as_punct(self):
        """A token that is only an XML tag (no alnum chars) merges into the preceding word."""
        tokens = [("word", 0.0), ("<break/>", 0.1), ("next", 0.3)]
        self.assertEqual(merge_punct_tokens(tokens), [("word<break/>", 0.0), ("next", 0.3)])

    def test_xml_tag_with_alnum_content_passes_through(self):
        """A token like '<spell>123</spell>' has alnum chars after stripping tags."""
        tokens = [("<spell>123</spell>", 0.0), ("and", 0.5)]
        self.assertEqual(
            merge_punct_tokens(tokens),
            [("<spell>123</spell>", 0.0), ("and", 0.5)],
        )

    def test_inworld_style_full_stream(self):
        """Full Inworld-style raw stream produces expected merged and stripped output."""
        raw = [
            ("", 0.0),
            ("I", 0.1),
            (" ", 0.2),
            ("can", 0.3),
            (" ", 0.4),
            ("answer", 0.5),
            (" ", 0.6),
            ("questions", 0.7),
            (", ", 0.8),
            ("explain", 0.9),
            (" ", 1.0),
            ("things", 1.1),
            (".", 1.2),
        ]
        expected = [
            ("I", 0.1),
            ("can", 0.3),
            ("answer", 0.5),
            ("questions,", 0.7),
            ("explain", 0.9),
            ("things.", 1.1),
        ]
        self.assertEqual(merge_punct_tokens(raw), expected)

    def test_only_punct_tokens_returns_empty(self):
        """A list containing only punct/space tokens produces an empty result."""
        tokens = [(" ", 0.0), (",", 0.1), (".", 0.2)]
        self.assertEqual(merge_punct_tokens(tokens), [])
# Run the unittest runner only when this file is executed as a script,
# not when it is imported by another module or a test collector.
if __name__ == "__main__":
    unittest.main()