Add full-width punctuation marks as end-of-sentence markers
This commit is contained in:
@@ -14,7 +14,8 @@ ENDOFSENTENCE_PATTERN_STR = r"""
|
||||
(?<!Mr|Ms|Dr) # Negative lookbehind: not preceded by Mr, Ms, Dr (combined bc. length is the same)
|
||||
(?<!Mrs) # Negative lookbehind: not preceded by "Mrs"
|
||||
(?<!Prof) # Negative lookbehind: not preceded by "Prof"
|
||||
[\.\?\!:] # Match a period, question mark, exclamation point, or colon
|
||||
[\.\?\!:;]| # Match a period, question mark, exclamation point, colon, or semicolon
|
||||
[。?!:;] # the full-width version (mainly used in East Asian languages such as Chinese)
|
||||
$ # End of string
|
||||
"""
|
||||
ENDOFSENTENCE_PATTERN = re.compile(ENDOFSENTENCE_PATTERN_STR, re.VERBOSE)
|
||||
|
||||
@@ -32,6 +32,7 @@ class TestBaseAIService(unittest.IsolatedAsyncioTestCase):
|
||||
assert match_endofsentence("This is a sentence! ")
|
||||
assert match_endofsentence("This is a sentence?")
|
||||
assert match_endofsentence("This is a sentence:")
|
||||
assert match_endofsentence("This is a sentence;")
|
||||
assert not match_endofsentence("This is not a sentence")
|
||||
assert not match_endofsentence("This is not a sentence,")
|
||||
assert not match_endofsentence("This is not a sentence, ")
|
||||
@@ -43,6 +44,18 @@ class TestBaseAIService(unittest.IsolatedAsyncioTestCase):
|
||||
assert not match_endofsentence("America, or the U.") # U.S.A.
|
||||
assert not match_endofsentence("It still early, it's 3:00 a.") # 3:00 a.m.
|
||||
|
||||
async def test_endofsentence_zh(self):
    """Check end-of-sentence matching on text ending in full-width punctuation."""
    # Samples ending in a full-width period, exclamation mark, question
    # mark, semicolon, and colon respectively; each must be matched.
    terminated = ("你好。", "你好!", "吃了吗?", "安全第一;", "他说:")
    for sentence in terminated:
        assert match_endofsentence(sentence)

    # A trailing full-width comma must not be treated as a sentence end.
    unterminated = "你好,"
    assert not match_endofsentence(unterminated)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user