Add the Hindi danda symbol (।) as an end-of-sentence marker

This commit is contained in:
vengadanathan srinivasan
2025-01-25 09:48:24 +05:30
parent b881dd57b3
commit 7a0cfc8d3d
2 changed files with 12 additions and 1 deletions

View File

@@ -14,7 +14,7 @@ ENDOFSENTENCE_PATTERN_STR = r"""
(?<!Mrs) # Negative lookbehind: not preceded by "Mrs"
(?<!Prof) # Negative lookbehind: not preceded by "Prof"
[\.\?\!:;]| # Match a period, question mark, exclamation point, colon, or semicolon
[。?!:;] # the full-width version (mainly used in East Asian languages such as Chinese)
[。?!:;।] # full-width punctuation (East Asian languages such as Chinese) and the Devanagari danda (Hindi)
$ # End of string
"""
ENDOFSENTENCE_PATTERN = re.compile(ENDOFSENTENCE_PATTERN_STR, re.VERBOSE)

View File

@@ -38,3 +38,14 @@ class TestUtilsString(unittest.IsolatedAsyncioTestCase):
for i in chinese_sentences:
assert match_endofsentence(i)
assert not match_endofsentence("你好,")
async def test_endofsentence_hi(self):
    """Check end-of-sentence detection on Hindi text.

    Sentences ending in a danda, "!" or "?" must be reported as complete;
    text ending in a comma must not.
    """
    terminated = (
        "हैलो।",
        "हैलो!",
        "आप खाये हैं?",
        "सुरक्षा पहले।",
    )
    for sentence in terminated:
        assert match_endofsentence(sentence)
    # A trailing comma is not a sentence terminator.
    assert not match_endofsentence("हैलो,")