Skip to content
44 changes: 44 additions & 0 deletions number_parser/data/rom.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Lookup tables for lower-case Roman numeral tokens, grouped by decimal place.
# Each key is the Roman spelling of one decimal digit position (units, tens,
# hundreds, thousands); each value is the integer it represents.
info = {
    # 1-9. "iv" is 4 and "v" is 5; every value from 1 to 9 must be present
    # because the units group of a Roman numeral can be any of them.
    "UNIT_NUMBERS": {
        "i": 1,
        "ii": 2,
        "iii": 3,
        "iv": 4,
        "v": 5,
        "vi": 6,
        "vii": 7,
        "viii": 8,
        "ix": 9,
    },
    "DIRECT_NUMBERS": {
        "x": 10,
    },
    # 20-90 ("x" itself lives in DIRECT_NUMBERS above).
    "TENS": {
        "xx": 20,
        "xxx": 30,
        "xl": 40,
        "l": 50,
        "lx": 60,
        "lxx": 70,
        "lxxx": 80,
        "xc": 90,
    },
    # 100-900.
    "HUNDREDS": {
        "c": 100,
        "cc": 200,
        "ccc": 300,
        "cd": 400,
        "d": 500,
        "dc": 600,
        "dcc": 700,
        "dccc": 800,
        "cm": 900,
    },
    # 1000-3000.
    "BIG_POWERS_OF_TEN": {
        "m": 1000,
        "mm": 2000,
        "mmm": 3000,
    },
    "SKIP_TOKENS": [],
    "USE_LONG_SCALE": False,
}
12 changes: 11 additions & 1 deletion number_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from importlib import import_module
import unicodedata
# Characters listed as sentence separators.
SENTENCE_SEPARATORS = [".", ","]
# Language codes accepted by the parser; 'rom' selects Roman numerals.
SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru', 'rom']
# Languages that _tokenize splits on whitespace runs only.
RE_BUG_LANGUAGES = ['hi']


Expand Down Expand Up @@ -141,6 +141,8 @@ def _build_number(token_list, lang_data):

def _tokenize(input_string, language):
"""Breaks string on any non-word character."""
if language == 'rom':
return re.split("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", input_string.lower())
input_string = input_string.replace('\xad', '')
if language in RE_BUG_LANGUAGES:
return re.split(r'(\s+)', input_string)
Expand Down Expand Up @@ -310,6 +312,14 @@ def parse(input_string, language=None):

tokens = _tokenize(input_string, language)

if language == 'rom':
tokens = _tokenize(input_string, language=None)
for token in tokens:
if re.search("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", token.lower()):
tokens[tokens.index(token)] = str(parse_number(token, language='rom'))
final_sentance = ''.join(tokens)
return final_sentance

final_sentence = []
current_sentence = []
tokens_taken = []
Expand Down