# parser.py

import re
from importlib import import_module
import unicodedata

SENTENCE_SEPARATORS = [".", ","]
SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru']
RE_BUG_LANGUAGES = ['hi']
NUMERAL_SYSTEMS = ('decimal', 'roman')
ROMAN_REGEX_EXPRESSION = "(?i)^(m{0,3})(cm|cd|d?c{0,4})(xc|xl|l?x{0,4})(ix|iv|v?i{0,4})$"
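# The anchored pattern above matches one whole Roman numeral, capturing the
# thousands, hundreds, tens and units parts as separate groups; the embedded
# (?i) flag makes the match case-insensitive, e.g. 'xiv' -> ('', '', 'x', 'iv').
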
class LanguageData:
    """Main language class to populate the requisite language-specific variables."""
    unit_numbers = {}
    direct_numbers = {}
    tens = {}
    hundreds = {}
    big_powers_of_ten = {}
    skip_tokens = []
    all_numbers = {}
    unit_and_direct_numbers = {}

    def __init__(self, language):
        if language not in SUPPORTED_LANGUAGES:
            raise ValueError(f'"{language}" is not a supported language')
        language_info = getattr(import_module('number_parser.data.' + language), 'info')
        self.unit_numbers = _normalize_dict(language_info["UNIT_NUMBERS"])
        self.direct_numbers = _normalize_dict(language_info["DIRECT_NUMBERS"])
        self.tens = _normalize_dict(language_info["TENS"])
        self.hundreds = _normalize_dict(language_info["HUNDREDS"])
        self.big_powers_of_ten = _normalize_dict(language_info["BIG_POWERS_OF_TEN"])
        self.skip_tokens = language_info["SKIP_TOKENS"]
        self.all_numbers = {**self.unit_numbers, **self.direct_numbers, **self.tens,
                            **self.hundreds, **self.big_powers_of_ten}
        self.unit_and_direct_numbers = {**self.unit_numbers, **self.direct_numbers}
        self.maximum_group_value = 10000 if language_info["USE_LONG_SCALE"] else 100

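# Illustrative usage (hedged: the actual vocabulary lives in the bundled
# number_parser.data modules, not in this file):
#   lang_data = LanguageData('en')
#   lang_data.all_numbers['twenty']   # expected: 20
#   'and' in lang_data.skip_tokens    # expected: True for English
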
def _check_validity(current_token, previous_token, previous_power_of_10, total_value, current_grp_value, lang_data):
    """Identifies whether the new token can continue building the previous number."""
    if previous_token is None:
        return True
    if current_token in lang_data.unit_and_direct_numbers and previous_token in lang_data.unit_and_direct_numbers:
        # both tokens are "units" or "direct numbers"
        return False
    elif current_token in lang_data.direct_numbers and previous_token in lang_data.tens:
        # current token in "direct numbers" and previous token in "tens"
        return False
    elif current_token in lang_data.tens \
            and (previous_token in lang_data.tens or previous_token in lang_data.unit_and_direct_numbers):
        # current token in "tens" and previous token in "tens" or it's a "unit" or "direct number"
        return False
    elif current_token in lang_data.hundreds and previous_token not in lang_data.big_powers_of_ten:
        # current token in "hundreds" and previous token is not a "big power of ten"
        return False
    elif current_token in lang_data.big_powers_of_ten:
        # current token is a "big power of ten"
        power_of_ten = lang_data.big_powers_of_ten[current_token]
        if power_of_ten < current_grp_value:
            return False
        if total_value != 0 and previous_power_of_10 and power_of_ten >= previous_power_of_10:
            return False
    return True

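# For example, with the English tables: 'twenty' followed by 'five' is a valid
# continuation (25), while 'five' followed by 'three' is not, so the caller
# flushes '5' and starts a new number for '3'.
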
def _check_large_multiplier(current_token, total_value, current_grp_value, lang_data):
    """Checks if the current token (power of ten) is larger than the total value formed so far."""
    combined_value = total_value + current_grp_value
    if combined_value and current_token in lang_data.big_powers_of_ten:
        large_value = lang_data.big_powers_of_ten[current_token]
        if large_value > combined_value and large_value != 100:
            return True
    return False

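# For example, after 'two hundred' has built 200, the token 'thousand'
# (1000 > 200, and not the special case 100) multiplies everything built so
# far, giving 200 * 1000 = 200000.
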
def _build_number(token_list, lang_data):
    """Incrementally builds a number from the list of tokens."""
    total_value = 0
    current_grp_value = 0
    previous_token = None
    previous_power_of_10 = None
    value_list = []
    used_skip_tokens = []
    for token in token_list:
        if not token.strip():
            continue
        if token in lang_data.skip_tokens:
            used_skip_tokens.append(token)
            continue
        is_large_multiplier = _check_large_multiplier(token, total_value, current_grp_value, lang_data)
        if is_large_multiplier:
            combined_value = total_value + current_grp_value
            total_value = combined_value * lang_data.big_powers_of_ten[token]
            previous_token = token
            current_grp_value = 0
            used_skip_tokens = []
            previous_power_of_10 = lang_data.big_powers_of_ten[token]
            continue
        valid = _check_validity(token, previous_token, previous_power_of_10, total_value, current_grp_value, lang_data)
        if not valid:
            total_value += current_grp_value
            value_list.append(str(total_value))
            total_value = 0
            current_grp_value = 0
            for skip_token in used_skip_tokens:
                value_list.append(skip_token)
            previous_power_of_10 = None
        if token in lang_data.unit_and_direct_numbers:
            current_grp_value += lang_data.unit_and_direct_numbers[token]
        elif token in lang_data.tens:
            current_grp_value += lang_data.tens[token]
        elif token in lang_data.hundreds:
            current_grp_value += lang_data.hundreds[token]
        elif token in lang_data.big_powers_of_ten:
            power_of_ten = lang_data.big_powers_of_ten[token]
            if current_grp_value == 0:
                current_grp_value = 1
            current_grp_value *= power_of_ten
            if power_of_ten > lang_data.maximum_group_value:
                total_value += current_grp_value
                current_grp_value = 0
                previous_power_of_10 = power_of_ten
        previous_token = token
        used_skip_tokens = []
    total_value += current_grp_value
    value_list.append(str(total_value))
    return value_list

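# Worked examples (hedged: token values come from the English data tables):
#   _build_number(['two', 'hundred', 'and', 'five'], LanguageData('en'))
#   # -> ['205']
#   _build_number(['five', 'three'], LanguageData('en'))
#   # -> ['5', '3']  (invalid continuation, so the number is split)
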
def _tokenize(input_string, language):
    """Breaks string on any non-word character."""
    input_string = input_string.replace('\xad', '')
    if language in RE_BUG_LANGUAGES:
        return re.split(r'(\s+)', input_string)
    return re.split(r'(\W)', input_string)

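# For example, re.split(r'(\W)', 'twenty-two cats') keeps the separators as
# tokens: ['twenty', '-', 'two', ' ', 'cats'].
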
def _strip_accents(word):
    """Removes accents from the input word."""
    return ''.join(char for char in unicodedata.normalize('NFD', word) if unicodedata.category(char) != 'Mn')

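# For example, _strip_accents('veintidós') -> 'veintidos': NFD decomposition
# splits 'ó' into 'o' plus a combining accent (category 'Mn'), which is dropped.
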
def _normalize_tokens(token_list):
    """Converts all tokens to lowercase then removes accents."""
    return [_strip_accents(token.lower()) for token in token_list]


def _normalize_dict(lang_data):
    """Removes accents from each key of the input dictionary."""
    return {_strip_accents(word): number for word, number in lang_data.items()}

def _is_cardinal_token(token, lang_data):
    """Checks if the given token is a cardinal number; returns the token or None."""
    if token in lang_data.all_numbers:
        return token
    return None


def _is_ordinal_token(token, lang_data):
    """Checks if the given token is an ordinal number; returns its cardinal form or None."""
    if _is_cardinal_token(token, lang_data) is None:
        return _is_number_token(token, lang_data)
    return None


def _is_number_token(token, lang_data):
    """
    Checks if the given token belongs to either cardinal or ordinal numbers
    and returns the cardinal form.
    """
    token = _apply_cardinal_conversion(token, lang_data)
    return _is_cardinal_token(token, lang_data)


def _is_skip_token(token, lang_data):
    return token in lang_data.skip_tokens

def _apply_cardinal_conversion(token, lang_data):  # currently only for the English language
    """Converts ordinal tokens to cardinal while leaving other tokens unchanged."""
    CARDINAL_DIRECT_NUMBERS = {'first': 'one', 'second': 'two', 'third': 'three', 'fifth': 'five',
                               'eighth': 'eight', 'ninth': 'nine', 'twelfth': 'twelve'}
    for word, number in CARDINAL_DIRECT_NUMBERS.items():
        token = token.replace(word, number)
    token_cardinal_form_1 = re.sub(r'ieth$', 'y', token)
    if _is_cardinal_token(token_cardinal_form_1, lang_data) is not None:
        return token_cardinal_form_1
    token_cardinal_form_2 = re.sub(r'th$', '', token)
    if _is_cardinal_token(token_cardinal_form_2, lang_data) is not None:
        return token_cardinal_form_2
    return token

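# For example: 'third' -> 'three' (direct mapping), 'twentieth' -> 'twenty'
# ('ieth' -> 'y'), and 'fourth' -> 'four' (trailing 'th' stripped).
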
def _valid_tokens_by_language(input_string):
    language_matches = {}
    for language in SUPPORTED_LANGUAGES:
        lang_data = LanguageData(language)
        tokens = _tokenize(input_string, language)
        normalized_tokens = _normalize_tokens(tokens)
        valid_list = [
            _is_number_token(token, lang_data) is not None or _is_skip_token(token, lang_data)
            for token in normalized_tokens
        ]
        cnt_valid_words = valid_list.count(True)
        language_matches[language] = cnt_valid_words
    best_language = max(language_matches, key=language_matches.get)
    if language_matches[best_language] == 0:  # fall back to English if no words matched
        return 'en'
    return best_language

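# Detection is a simple vote: the supported language whose tables recognise
# the most tokens wins, and English is the fallback when nothing matches.
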
def parse_ordinal(input_string, language=None):
    """Converts a single number in ordinal or cardinal form to its numeric equivalent."""
    if language is None:
        language = _valid_tokens_by_language(input_string)
    lang_data = LanguageData(language)
    tokens = _tokenize(input_string, language)
    normalized_tokens = _normalize_tokens(tokens)
    processed_tokens = [_apply_cardinal_conversion(token, lang_data) for token in normalized_tokens]
    output_string = ' '.join(processed_tokens)
    return parse_number(output_string, language)

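# Illustrative calls (hedged: assumes the English data tables):
#   parse_ordinal('twenty third')   # expected: 23
#   parse_ordinal('thirty first')   # expected: 31
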
def _is_roman(search_string):
    return re.search(ROMAN_REGEX_EXPRESSION, search_string, re.IGNORECASE)

def parse_number(input_string, language=None, numeral_systems=None):
    """Converts a single number written in natural language to a numeric type."""
    if not input_string.strip():
        return None
    if input_string.strip().isnumeric():
        return int(input_string)
    if language is None:
        language = _valid_tokens_by_language(input_string)
    if numeral_systems is None:
        numeral_systems = ['roman'] if _is_roman(input_string) else ['decimal']
    for numeral_system in numeral_systems:
        if numeral_system == 'decimal':
            lang_data = LanguageData(language)
            tokens = _tokenize(input_string, language)
            normalized_tokens = _normalize_tokens(tokens)
            for index, token in enumerate(normalized_tokens):
                if _is_cardinal_token(token, lang_data) or not token.strip():
                    continue
                if _is_skip_token(token, lang_data) and index != 0:
                    continue
                return None
            number_built = _build_number(normalized_tokens, lang_data)
            if len(number_built) == 1:
                return int(number_built[0])
            return None
        elif numeral_system == 'roman':
            return int(_parse_roman(input_string))
        else:
            raise ValueError(f'{numeral_system!r} is not a supported numeral system')

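# Illustrative calls (hedged: assumes the English data tables):
#   parse_number('two hundred and five')   # expected: 205
#   parse_number('XIV')                    # expected: 14 (Roman path)
#   parse_number('hello')                  # expected: None
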
def parse_fraction(input_string, language=None):
    """Converts a single number written as a fraction to a numeric type."""
    if not input_string.strip():
        return None
    if language is None:
        language = _valid_tokens_by_language(input_string)
    FRACTION_SEPARATORS = ["divided by", "over", "by", "/"]
    for separator in FRACTION_SEPARATORS:
        position_of_separator = input_string.find(separator)
        if position_of_separator == -1:
            continue
        string_before_separator = input_string[:position_of_separator]
        string_after_separator = input_string[position_of_separator + len(separator):]
        number_before_separator = parse_number(string_before_separator, language)
        number_after_separator = parse_number(string_after_separator, language)
        if number_before_separator is None or number_after_separator is None:
            return None
        return f'{number_before_separator}/{number_after_separator}'
    return None

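# Illustrative call (hedged): parse_fraction('forty two over six') is expected
# to return '42/6'; both sides are parsed independently and joined with '/'.
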
def parse(input_string, language=None, numeral_systems=None):
    """
    Converts all the numbers in a sentence written in natural language to their numeric type while keeping
    the other words unchanged. Returns the transformed string.
    """
    if numeral_systems is None:
        numeral_systems = NUMERAL_SYSTEMS
    if language is None:
        language = _valid_tokens_by_language(input_string)
    result = input_string
    for numeral_system in numeral_systems:
        if numeral_system == 'decimal':
            result = _parse_decimal(result, language)
        elif numeral_system == 'roman':
            result = _parse_roman(result)
        else:
            raise ValueError(f'"{numeral_system}" is not a supported numeral system')
    return result

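# Illustrative call (hedged: assumes the English data tables):
#   parse('he has twenty two cats and three dogs')
#   # expected: 'he has 22 cats and 3 dogs'
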
def _parse_decimal(input_string, language):
    lang_data = LanguageData(language)
    tokens = _tokenize(input_string, language)
    final_sentence = []
    current_sentence = []
    tokens_taken = []

    def _build_and_add_number(pop_last_space=False):
        if tokens_taken:
            result = _build_number(tokens_taken, lang_data)
            tokens_taken.clear()
            for number in result:
                current_sentence.extend([number, " "])
            if pop_last_space:
                current_sentence.pop()

    for token in tokens:
        compare_token = _strip_accents(token.lower())
        ordinal_number = _is_ordinal_token(compare_token, lang_data)
        if not compare_token.strip():
            if not tokens_taken:
                current_sentence.append(token)
            continue
        if compare_token in SENTENCE_SEPARATORS:
            _build_and_add_number(pop_last_space=True)
            current_sentence.append(token)
            final_sentence.extend(current_sentence)
            current_sentence = []
            continue
        if ordinal_number:
            tokens_taken.append(ordinal_number)
            _build_and_add_number(pop_last_space=True)
        elif (
            _is_cardinal_token(compare_token, lang_data)
            or (_is_skip_token(compare_token, lang_data) and len(tokens_taken) != 0)
        ):
            tokens_taken.append(compare_token)
        else:
            if tokens_taken and _is_skip_token(tokens_taken[-1], lang_data):
                # the number ends with a skip token: keep it as an ordinary word
                skip_token = tokens_taken[-1]
                tokens_taken.pop()
                _build_and_add_number()
                current_sentence.extend([skip_token, " "])
            _build_and_add_number()
            current_sentence.append(token)
    _build_and_add_number()
    final_sentence.extend(current_sentence)
    return ''.join(final_sentence).strip()

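# Sentence separators ('.' and ',') flush any number under construction, so
# 'four, five' is expected to become '4, 5' rather than a single number.
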
def _parse_roman(input_string):
    tokens = _tokenize(input_string, None)
    tokens = [item for item in tokens if item != '']
    for index, token in enumerate(tokens):
        if _is_roman(token):
            tokens[index] = str(_build_roman(token))
    final_sentence = ''.join(tokens)
    return final_sentence

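# Illustrative call: _parse_roman('Chapter xiv') is expected to return
# 'Chapter 14'; 'Chapter' itself does not match the anchored Roman pattern.
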
def _build_roman(roman_number):
    roman = {'i': 1, 'v': 5, 'x': 10, 'l': 50, 'c': 100, 'd': 500, 'm': 1000}
    # NOTE: re.split's third positional argument is maxsplit, not flags;
    # the pattern already embeds (?i), so no flag argument is needed here.
    num_tokens = re.split(ROMAN_REGEX_EXPRESSION, roman_number)
    num_tokens = [item for item in num_tokens if item != '']
    built_num = 0
    for num_token in num_tokens:
        if re.search('iv|ix|xl|xc|cd|cm', num_token, re.IGNORECASE):
            built_num += roman[num_token[1].lower()] - roman[num_token[0].lower()]
        elif re.search('[XLVD][IXC]{1,4}', num_token, re.IGNORECASE):
            built_num += roman[num_token[0].lower()] + (roman[num_token[1].lower()] * (len(num_token) - 1))
        elif re.search('[ixcm]{1,4}|[vld]{1}', num_token, re.IGNORECASE):
            built_num += roman[num_token[0].lower()] * len(num_token)
    return built_num

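# Worked example: _build_roman('xiv') splits into ['x', 'iv']; 'x' contributes
# 10 and the subtractive pair 'iv' contributes 5 - 1 = 4, giving 14.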