diff --git a/lingvodoc/utils/doc_parser.py b/lingvodoc/utils/doc_parser.py index 90c09954..1c05826a 100644 --- a/lingvodoc/utils/doc_parser.py +++ b/lingvodoc/utils/doc_parser.py @@ -24,7 +24,7 @@ def print_to_str(*args, **kwargs): span_id_counter = 0 -def generate_html_wrap(word, ana_tag_list): +def generate_html_wrap(word, ana_tag_list, lang=""): json_list = list() for ana_tag in ana_tag_list: @@ -45,10 +45,16 @@ def generate_html_wrap(word, ana_tag_list): span_id_counter += 1 encoded_attrs = ((json.dumps(attr_json, ensure_ascii=False)).encode('utf8')).decode() wrap += "" + encoded_attrs + "" + + if lang == 'udm' and 'nom' in encoded_attrs: + flag = True + span_id_counter += 1 + wrap += "" + encoded_attrs.replace('nom', 'acc0') + "" + wrap += word + "" return wrap -def insert_parser_output_to_text(text, parser_output): +def insert_parser_output_to_text(text, parser_output, lang=""): ESC_PAT = "$id$" soup = bs4.BeautifulSoup(parser_output, 'html.parser') @@ -63,7 +69,7 @@ def insert_parser_output_to_text(text, parser_output): continue result_list.append(text[search_start_index:match_index]) if (len(w_tag.contents) > 1): - result_list.append(generate_html_wrap(word, w_tag.contents[0:-1])) + result_list.append(generate_html_wrap(word, w_tag.contents[0:-1], lang=lang)) search_start_index = match_index + len(word) result_list.append(text[search_start_index:]) result = "".join(result_list) @@ -115,7 +121,7 @@ def timarkh_uniparser(dedoc_output, lang, has_disamb=False, disambiguate=False): parser_output = analyzer.analyze_words(wordlist, format="xml") parser_output_str = print_to_str(parser_output) - return insert_parser_output_to_text(dedoc_output, parser_output_str) + return insert_parser_output_to_text(dedoc_output, parser_output_str, lang=lang) def apertium_parser(dedoc_output, apertium_path, lang): @@ -338,7 +344,7 @@ def trans(elem): parser_output = reformat(morph_filename=morph_filename) os.remove(morph_filename) - return insert_parser_output_to_text(dedoc_output, parser_output) + return insert_parser_output_to_text(dedoc_output, parser_output, lang=lang) def timarkh_udm(dedoc_output):