Add full-width punctuation marks as end-of-sentence markers

This commit is contained in:
duyalei
2024-09-23 16:34:31 +08:00
parent 9a4e749c7c
commit 4533ed014f
2 changed files with 15 additions and 1 deletions

View File

@@ -14,7 +14,8 @@ ENDOFSENTENCE_PATTERN_STR = r"""
(?<!Mr|Ms|Dr) # Negative lookbehind: not preceded by Mr, Ms, Dr (combined bc. length is the same)
(?<!Mrs) # Negative lookbehind: not preceded by "Mrs"
(?<!Prof) # Negative lookbehind: not preceded by "Prof"
[\.\?\!:] # Match a period, question mark, exclamation point, or colon
[\.\?\!:;]| # Match a period, question mark, exclamation point, colon, or semicolon
[。?!:;] # the full-width version (mainly used in East Asian languages such as Chinese)
$ # End of string
"""
# Compile the end-of-sentence pattern once so callers can reuse the compiled
# object. re.VERBOSE is required: the pattern string is laid out over several
# lines with inline "#" comments, which the regex engine must ignore.
# NOTE(review): the "|" between the ASCII and full-width character classes is
# not wrapped in a group, so it splits the whole pattern into two alternatives:
# "$" then anchors only the full-width class, and the negative lookbehinds
# guard only the ASCII class. Wrapping both classes in "(?: ... )" looks like
# the intent -- confirm against the complete pattern and its tests.
ENDOFSENTENCE_PATTERN = re.compile(ENDOFSENTENCE_PATTERN_STR, re.VERBOSE)

View File

@@ -32,6 +32,7 @@ class TestBaseAIService(unittest.IsolatedAsyncioTestCase):
assert match_endofsentence("This is a sentence! ")
assert match_endofsentence("This is a sentence?")
assert match_endofsentence("This is a sentence:")
assert match_endofsentence("This is a sentence;")
assert not match_endofsentence("This is not a sentence")
assert not match_endofsentence("This is not a sentence,")
assert not match_endofsentence("This is not a sentence, ")
@@ -43,6 +44,18 @@ class TestBaseAIService(unittest.IsolatedAsyncioTestCase):
assert not match_endofsentence("America, or the U.") # U.S.A.
assert not match_endofsentence("It still early, it's 3:00 a.") # 3:00 a.m.
async def test_endofsentence_zh(self):
    """Check end-of-sentence detection on Chinese text with full-width punctuation."""
    # One sample per full-width terminator: period, exclamation mark,
    # question mark, semicolon and colon.
    terminated = ("你好。", "你好!", "吃了吗?", "安全第一;", "他说:")
    for sentence in terminated:
        assert match_endofsentence(sentence)

    # A trailing full-width comma must not be reported as a sentence end.
    unterminated = "你好,"
    assert not match_endofsentence(unterminated)
# Script entry point: hand control to unittest's runner when this file is
# executed directly rather than imported.
if __name__ == "__main__":
unittest.main()