From 68caf3a7aedda4d241e959db937773a669264e1b Mon Sep 17 00:00:00 2001 From: vmonakhov Date: Wed, 19 Feb 2025 23:03:10 +0300 Subject: [PATCH] 'Only orphans option' and keeping suggestions state -- https://github.com/ispras/lingvodoc-react/issues/1182 --- lingvodoc/schema/gql_cognate.py | 77 ++++++++++++++++++++------- lingvodoc/schema/query.py | 26 +++++++-- lingvodoc/scripts/list_cognates.py | 8 +-- lingvodoc/utils/neuro_cognates/app.py | 50 ++++++++++++----- 4 files changed, 120 insertions(+), 41 deletions(-) diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py index dd88526d..ac454bb9 100644 --- a/lingvodoc/schema/gql_cognate.py +++ b/lingvodoc/schema/gql_cognate.py @@ -5630,15 +5630,14 @@ class Arguments: base_language_id = LingvodocID() input_pairs = ObjectVal() truth_threshold = graphene.Float() + only_orphans_flag = graphene.Boolean() + group_field_id = LingvodocID() debug_flag = graphene.Boolean() + intermediate_flag = graphene.Boolean() triumph = graphene.Boolean() - - suggestion_list = ObjectVal() message = graphene.String() - perspective_name_list = graphene.List(graphene.String) - transcription_count = graphene.Int() @staticmethod def neuro_cognate_statistics( @@ -5649,10 +5648,12 @@ def neuro_cognate_statistics( locale_id, user_id, truth_threshold, + only_orphans_flag, + group_field_id, storage, host_url, cache_kwargs, - debug_flag = False): + debug_flag): input_pairs_list = input_pairs or [] compare_pairs_list = [] @@ -5673,13 +5674,14 @@ def neuro_cognate_statistics( lex_id, xcript_text, xlat_text, - _ - ) in entities_getter(perspective_id, xcript_fid, xlat_fid, get_linked_group=False): + linked_group + ) in entities_getter(perspective_id, xcript_fid, xlat_fid, only_orphans_flag, group_field_id): if not xcript_text or not xlat_text: continue - current_pairs_list.extend(list(itertools.product(xcript_text, xlat_text, [lex_id]))) + # Gathering each-to-each combinations of transcriptions and translations + current_pairs_list.extend(list(itertools.product(xcript_text, xlat_text, [lex_id], [linked_group]))) if perspective_id != source_perspective_id: compare_pairs_list.append(current_pairs_list[:]) @@ -5695,14 +5697,13 @@ def neuro_cognate_statistics( perspective_name = perspective.get_translation(locale_id) dictionary_name = perspective.parent.get_translation(locale_id) - dictionary_name_list.append(dictionary_name) perspective_name_list.append(f"{perspective_name} - {dictionary_name}") + dictionary_name_list.append(f"{idx + 1}. {dictionary_name}") message = "" triumph = True input_len = len(input_pairs_list) compare_len = sum(map(len, compare_pairs_list)) - dictionaries = [] if not input_len or not compare_len: triumph = False @@ -5711,12 +5712,6 @@ def neuro_cognate_statistics( triumph = False message = f"Too many words to compare: {compare_len}" else: - for i, d in enumerate(dictionary_name_list, 1): - dictionaries.append(f"{i}. {d}") - - task = TaskStatus(user_id, 'Neuro cognates computation', '\n\n'.join(dictionaries), input_len) - task.set(1, 0, "first words processing...", "") - NeuroCognatesEngine = NeuroCognates( compare_pairs_list, input_index, @@ -5726,9 +5721,14 @@ def neuro_cognate_statistics( host_url, cache_kwargs, match_translations, - truth_threshold) + truth_threshold, + only_orphans_flag + ) - NeuroCognatesEngine.index(input_pairs_list, task) + NeuroCognatesEngine.index( + input_pairs_list, + TaskStatus(user_id, 'Neuro cognates computation', '\n\n'.join(dictionary_name_list), input_len) + ) result_dict = ( dict( @@ -5746,8 +5746,11 @@ def mutate( match_translations, base_language_id, truth_threshold=0.97, + only_orphans_flag=True, + group_field_id=(66, 25), input_pairs=None, - debug_flag=False): + debug_flag=False, + intermediate_flag=False): # Administrator / perspective author / editing permission check. error_str = ( @@ -5842,6 +5845,8 @@ def mutate( locale_id, user.id, truth_threshold, + only_orphans_flag, + group_field_id, storage, host_url, cache_kwargs, @@ -7253,3 +7258,37 @@ def mutate(root, info, **args): print(result) + +class SaveSuggestionsState(graphene.Mutation): + + class Arguments: + result_file = graphene.String(required=True) + suggestions_state = ObjectVal() + debug_flag = graphene.Boolean() + + triumph = graphene.Boolean() + + @staticmethod + def mutate( + root, + info, + result_file, + suggestions_state=None, + debug_flag=False): + + storage = info.context.request.registry.settings['storage'] + storage_dir = os.path.join(storage['path'], 'neuro_cognates') + pickle_path = os.path.join(storage_dir, f'{result_file}_sg') + os.makedirs(storage_dir, exist_ok=True) + + if suggestions_state is None: + suggestions_state = dict( + sg_select_list=None, + sg_state_list=None, + sg_count=None, + sg_entry_map=None) + + with gzip.open(pickle_path, 'wb') as suggestions_state_file: + pickle.dump(suggestions_state, suggestions_state_file) + + return SaveSuggestionsState(triumph=True) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 0ff00003..b919f0c0 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -149,7 +149,8 @@ NeuroCognateAnalysis, PhonemicAnalysis, SwadeshAnalysis, - XlsxBulkDisconnect) + XlsxBulkDisconnect, + SaveSuggestionsState) from lingvodoc.schema.gql_column import ( Column, @@ -5334,16 +5335,30 @@ def resolve_result_suggestions(self, storage = ( info.context.request.registry.settings['storage']) - pickle_path = os.path.join(storage['path'], 'neuro_cognates', result_file) + result_path = os.path.join(storage['path'], 'neuro_cognates', result_file) try: - with gzip.open(pickle_path, 'rb') as pickle_file: + with gzip.open(result_path, 'rb') as pickle_file: result_dict = pickle.load(pickle_file) + except: + return ResponseError(f'Cannot access file \'{result_path}\'.') + + # We're trying to get file with current user changes + sg_state_path = os.path.join(storage['path'], 'neuro_cognates', f'{result_file}_sg') + + sg_state_dict = dict( + sg_select_list=None, + sg_state_list=None, + sg_count=None, + sg_entry_map=None) + try: + with gzip.open(sg_state_path, 'rb') as pickle_file: + sg_state_dict = pickle.load(pickle_file) except: - return ResponseError(f'Cannot access file \'{pickle_path}\'.') + pass - return result_dict + return {**result_dict, **sg_state_dict} class PerspectivesAndFields(graphene.InputObjectType): @@ -9249,6 +9264,7 @@ class MyMutations(graphene.ObjectType): delete_markup_group = DeleteMarkupGroup.Field() save_markup_groups = SaveMarkupGroups.Field() stop_mutation = StopMutation.Field() + save_suggestions_state = SaveSuggestionsState.Field() schema = graphene.Schema( query=Query, diff --git a/lingvodoc/scripts/list_cognates.py b/lingvodoc/scripts/list_cognates.py index 33ece2c4..4b94462c 100644 --- a/lingvodoc/scripts/list_cognates.py +++ b/lingvodoc/scripts/list_cognates.py @@ -566,7 +566,7 @@ def has_word(word, text): yield None -def entities_getter(perspective_id, xcript_fid, xlat_fid, get_linked_group=True): +def entities_getter(perspective_id, xcript_fid, xlat_fid, get_linked_group=True, group_field_id=(66, 25)): xcript_text = None xlat_text = None @@ -607,14 +607,16 @@ def entities_getter(perspective_id, xcript_fid, xlat_fid, get_linked_group=True) elif field_id == xlat_fid: xlat_text = field_text - linked_group = None + linked_group = [] if get_linked_group: linked_group = ( DBSession .execute( - f'select * from linked_group(66, 25, {lex_id[0]}, {lex_id[1]})') + f'select * from linked_group' + f'({group_field_id[0]}, {group_field_id[1]}, {lex_id[0]}, {lex_id[1]})' + ) .fetchall()) # Preparing of linked_group for json-serialization diff --git a/lingvodoc/utils/neuro_cognates/app.py b/lingvodoc/utils/neuro_cognates/app.py index 1dcdd365..85e775d4 100644 --- a/lingvodoc/utils/neuro_cognates/app.py +++ b/lingvodoc/utils/neuro_cognates/app.py @@ -34,7 +34,8 @@ def __init__(self, host_url, cache_kwargs, four_tensors=False, - truth_threshold=0.97): + truth_threshold=0.97, + only_orphans_flag=True): self.compare_lists = compare_lists self.input_index = input_index @@ -45,6 +46,7 @@ def __init__(self, self.storage = storage self.host_url = host_url self.cache_kwargs = cache_kwargs + self.only_orphans_flag = only_orphans_flag project_dir = os.path.abspath(os.getcwd()) script_path = os.path.abspath(__file__) @@ -119,10 +121,11 @@ def split_items(items): return ( list(map(lambda x: x[0], items)), list(map(lambda x: x[1], items)), - list(map(lambda x: x[2], items))) + list(map(lambda x: x[2], items)), + list(map(lambda x: x[3], items))) # Разделяем входные пары на слова и переводы - input_words, input_translations, input_lex_ids = split_items(word_pairs) + input_words, input_translations, input_lex_ids, input_linked_groups = split_items(word_pairs) # Токенизация и паддинг входных данных seq_input_words = [tokenizer.texts_to_sequences([word]) for word in input_words] @@ -140,7 +143,7 @@ def split_items(items): # Проход по каждому списку для сравнения for compare_list in self.compare_lists: - compare_words, compare_translations, compare_lex_ids = split_items(compare_list) + compare_words, compare_translations, _, _ = split_items(compare_list) # Токенизация и паддинг данных для сравнения seq_compare_words = [tokenizer.texts_to_sequences([word]) for word in compare_words] @@ -157,7 +160,7 @@ def split_items(items): stamp_file = os.path.join(self.storage['path'], 'lingvodoc_stamps', str(task.id)) # Calculate prediction - def get_prediction(input_word, input_trans, input_id, X_word, X_trans, event): + def get_prediction(input_word, input_trans, input_id, input_links, X_word, X_trans, event): if event.is_set(): return None @@ -165,18 +168,29 @@ def get_prediction(input_word, input_trans, input_id, X_word, X_trans, event): similarities = [] result = [] + count = 0 + links = 0 + # Проход по каждому списку для сравнения for i, compare_list in enumerate(self.compare_lists): if not compare_list: continue - compare_words, compare_translations, compare_lex_ids = split_items(compare_list) + compare_words, compare_translations, compare_lex_ids, compare_linked_groups = split_items(compare_list) + + for compare_word, compare_trans, compare_id, compare_links, X_comp_word, X_comp_trans in ( + itertools.zip_longest( + compare_words, + compare_translations, + compare_lex_ids, + compare_linked_groups, + X_compare_words[i], + X_compare_translations[i])): - count = 0 - for compare_word, compare_trans, compare_id, X_comp_word, X_comp_trans in itertools.zip_longest( - compare_words, compare_translations, compare_lex_ids, X_compare_words[i], - X_compare_translations[i]): + if set(input_links) & set(compare_links): + links += 1 + continue # Checking stamp-to-stop every hundred comparings count += 1 @@ -205,10 +219,11 @@ def get_prediction(input_word, input_trans, input_id, X_word, X_trans, event): event.set() return None - return result + return result, links start_time = now() results = [] + group_count = 0 current_stage = 0 flushed = 0 result_link = "" @@ -219,10 +234,16 @@ def get_prediction(input_word, input_trans, input_id, X_word, X_trans, event): def add_result(res): + nonlocal current_stage, flushed, result_link, group_count + if res is None: return - nonlocal current_stage, flushed, result_link + result, links = res + + results.extend(result) + group_count += links + current_stage += 1 finished = (current_stage == input_len) passed = now() - start_time @@ -235,8 +256,6 @@ def add_result(res): progress = 100 if finished else int(current_stage / input_len * 100) status = "Finished" if finished else f"~ {days}d:{hours}h:{minutes}m left ~" - results.extend(res) - if passed - flushed > 300 or finished: flushed = passed @@ -245,6 +264,7 @@ def add_result(res): suggestion_list=results, perspective_name_list=self.perspective_name_list, transcription_count=compare_len * current_stage, + group_count=f"{group_count} filtered" if self.only_orphans_flag else "non-filtered", source_perspective_id=self.source_perspective_id)) storage_dir = os.path.join(self.storage['path'], 'neuro_cognates') @@ -261,11 +281,13 @@ def add_result(res): with multiprocess.Pool(multiprocess.cpu_count() // 2) as p: event = multiprocess.Manager().Event() + task.set(1, 0, "first words processing...") for args in itertools.zip_longest( input_words, input_translations, input_lex_ids, + input_linked_groups, X_input_words, X_input_translations, [event] * input_len