# # Copyright (c) 2024-2025 Daily # # SPDX-License-Identifier: BSD 2-Clause License # import unittest from pipecat.utils.string import match_endofsentence, parse_start_end_tags class TestUtilsString(unittest.IsolatedAsyncioTestCase): async def test_endofsentence(self): assert match_endofsentence("This is a sentence.") == 19 assert match_endofsentence("This is a sentence!") == 19 assert match_endofsentence("This is a sentence?") == 19 assert match_endofsentence("This is a sentence;") == 19 assert match_endofsentence("This is a sentence...") == 21 assert match_endofsentence("This is a sentence . . .") == 24 assert match_endofsentence("This is a sentence. ..") == 22 assert match_endofsentence("This is for Mr. and Mrs. Jones.") == 31 assert match_endofsentence("U.S.A and U.S.A..") == 17 assert match_endofsentence("My emails are foo@pipecat.ai and bar@pipecat.ai.") == 48 assert match_endofsentence("My email is foo.bar@pipecat.ai.") == 31 assert match_endofsentence("My email is spell(foo.bar@pipecat.ai).") == 38 assert match_endofsentence("My email is foo.bar@pipecat.ai.") == 46 assert match_endofsentence("The number pi is 3.14159.") == 25 assert match_endofsentence("Valid scientific notation 1.23e4.") == 33 assert match_endofsentence("Valid scientific notation 0.e4.") == 31 assert not match_endofsentence("This is not a sentence") assert not match_endofsentence("This is not a sentence,") assert not match_endofsentence("This is not a sentence, ") assert not match_endofsentence("Ok, Mr. Smith let's ") assert not match_endofsentence("Dr. Walker, I presume ") assert not match_endofsentence("Prof. Walker, I presume ") assert not match_endofsentence("zweitens, und 3.") assert not match_endofsentence("Heute ist Dienstag, der 3.") # 3. Juli 2024 assert not match_endofsentence("America, or the U.") # U.S.A. assert not match_endofsentence("It still early, it's 3:00 a.") # 3:00 a.m. assert not match_endofsentence("My emails are foo@pipecat.ai and bar@pipecat.ai") assert not match_endofsentence("The number pi is 3.14159") async def test_endofsentence_zh(self): chinese_sentences = [ "你好。", "你好!", "吃了吗?", "安全第一;", ] for i in chinese_sentences: assert match_endofsentence(i) assert not match_endofsentence("你好,") async def test_endofsentence_hi(self): hindi_sentences = [ "हैलो।", "हैलो!", "आप खाये हैं?", "सुरक्षा पहले।", ] for i in hindi_sentences: assert match_endofsentence(i) assert not match_endofsentence("हैलो,") class TestStartEndTags(unittest.IsolatedAsyncioTestCase): async def test_empty(self): assert parse_start_end_tags("", [], None, 0) == (None, 0) assert parse_start_end_tags("Hello from Pipecat!", [], None, 0) == (None, 0) async def test_simple(self): # (, ) assert parse_start_end_tags("Hello from Pipecat!", [("", "")], None, 0) == ( None, 26, ) assert parse_start_end_tags("Hello from Pipecat", [("", "")], None, 0) == ( ("", ""), 21, ) assert parse_start_end_tags("Hello from Pipecat", [("", "")], None, 6) == ( ("", ""), 21, ) # (spell(, )) assert parse_start_end_tags("Hello from spell(Pipecat)!", [("spell(", ")")], None, 0) == ( None, 26, ) assert parse_start_end_tags("Hello from spell(Pipecat", [("spell(", ")")], None, 0) == ( ("spell(", ")"), 24, ) async def test_multiple(self): # (, ) assert parse_start_end_tags( "Hello from Pipecat! Hello World!", [("", "")], None, 0 ) == ( None, 46, ) assert parse_start_end_tags( "Hello from Pipecat! Hello World", [("", "")], None, 0 ) == ( ("", ""), 41, )