Add the Hindi danda symbol (।) as an end-of-sentence marker
This commit is contained in:
@@ -14,7 +14,7 @@ ENDOFSENTENCE_PATTERN_STR = r"""
|
||||
(?<!Mrs) # Negative lookbehind: not preceded by "Mrs"
|
||||
(?<!Prof) # Negative lookbehind: not preceded by "Prof"
|
||||
[\.\?\!:;]| # Match a period, question mark, exclamation point, colon, or semicolon
|
||||
[。?!:;] # the full-width version (mainly used in East Asian languages such as Chinese)
|
||||
[。?!:;।] # the full-width version (mainly used in East Asian languages such as Chinese) and the Devanagari danda (used in Hindi)
|
||||
$ # End of string
|
||||
"""
|
||||
ENDOFSENTENCE_PATTERN = re.compile(ENDOFSENTENCE_PATTERN_STR, re.VERBOSE)
|
||||
|
||||
@@ -38,3 +38,14 @@ class TestUtilsString(unittest.IsolatedAsyncioTestCase):
|
||||
for i in chinese_sentences:
|
||||
assert match_endofsentence(i)
|
||||
assert not match_endofsentence("你好,")
|
||||
|
||||
async def test_endofsentence_hi(self):
    """Check end-of-sentence detection on Hindi text.

    Sentences ending in a danda, an exclamation mark, or a question mark
    are expected to match; a sentence ending in a comma is not.
    """
    complete_sentences = (
        "हैलो।",
        "हैलो!",
        "आप खाये हैं?",
        "सुरक्षा पहले।",
    )
    for sentence in complete_sentences:
        assert match_endofsentence(sentence)

    # A trailing comma must not be treated as a sentence terminator.
    assert not match_endofsentence("हैलो,")
|
||||
|
||||
Reference in New Issue
Block a user