-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
55 lines (48 loc) · 1.6 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from botok.utils.lenient_normalization import remove_affixes, normalize_graphical, normalize_punctuation
from botok import normalize_unicode
from botok import tokenize_in_stacks
import re
from fontTools import unicodedata
import pyewts
# Module-level pyewts converter, created once at import time and reused by
# normalize_ewts and tokenize_ewts_base.
EWTSCONVERTER = pyewts.pyewts()
def get_main_script_tag(text: str):
    """
    Return the ISO 15924 tag of the script that dominates ``text``.

    Each character is classified with ``unicodedata.script`` and a running
    per-script tally is kept. When several scripts end up with the same
    count, the one that reached that count first is reported. An empty
    string yields "Zyyy".
    """
    majority = len(text) // 2
    tally = {}
    best_tag = "Zyyy"
    best_count = 0
    for char in text:
        tag = unicodedata.script(char)
        count = tally.get(tag, 0) + 1
        tally[tag] = count
        if count <= best_count:
            # Not a new leader: nothing to update.
            continue
        best_tag, best_count = tag, count
        if best_count > majority:
            # More than half of the characters already belong to this
            # script, so no other script can overtake it.
            return best_tag
    return best_tag
def normalize_bo(s):
    """
    Normalize a Tibetan string.

    Strings shorter than two characters are returned untouched. Longer
    strings have every character outside the U+0F00-U+0FDA range (other
    than the space) removed, then go through botok's graphical and
    punctuation normalization, and are finally stripped of surrounding
    whitespace.
    """
    # Very short inputs trigger a bug in botok, so they bypass normalization
    # entirely; see https://github.com/OpenPecha/Botok/commits/master/
    if len(s) < 2:
        return s
    # NOTE: botok's normalize_unicode step is intentionally not applied here.
    tibetan_only = re.sub(r"[^ༀ-࿚ ]+", "", s)
    graphical = normalize_graphical(tibetan_only)
    return normalize_punctuation(graphical).strip()
def make_lenient_bo(s):
    """
    Return ``s`` unchanged.

    Currently an identity function: no lenient transformation is applied.
    """
    unchanged = s
    return unchanged
# unnecessary
#EWTS_PARTS = re.compile(r"[bcdghjklmnprstvwyzBCDGHJKLMNPRSTVWYZ]*[-aeiouAIU*_ ]*")
def normalize_ewts(s):
    """
    Run ``s`` through the shared converter's ``normalizeSloppyWylie``
    and return the result.
    """
    normalized = EWTSCONVERTER.normalizeSloppyWylie(s)
    return normalized
def tokenize_ewts_base(query_s):
    """
    Split ``query_s`` into tokens with the shared converter's
    ``splitIntoTokens`` and return the result.
    """
    # TODO: remove non-EWTS content from the query before tokenizing.
    tokens = EWTSCONVERTER.splitIntoTokens(query_s)
    return tokens