From 68caf3a7aedda4d241e959db937773a669264e1b Mon Sep 17 00:00:00 2001
From: vmonakhov <vmonakhov@ispras.ru>
Date: Wed, 19 Feb 2025 23:03:10 +0300
Subject: [PATCH] 'Only orphans option' and keeping suggestions state --
 https://github.com/ispras/lingvodoc-react/issues/1182

---
 lingvodoc/schema/gql_cognate.py       | 77 ++++++++++++++++++++-------
 lingvodoc/schema/query.py             | 26 +++++++--
 lingvodoc/scripts/list_cognates.py    |  8 +--
 lingvodoc/utils/neuro_cognates/app.py | 50 ++++++++++++-----
 4 files changed, 120 insertions(+), 41 deletions(-)

diff --git a/lingvodoc/schema/gql_cognate.py b/lingvodoc/schema/gql_cognate.py
index dd88526d..ac454bb9 100644
--- a/lingvodoc/schema/gql_cognate.py
+++ b/lingvodoc/schema/gql_cognate.py
@@ -5630,15 +5630,14 @@ class Arguments:
         base_language_id = LingvodocID()
         input_pairs = ObjectVal()
         truth_threshold = graphene.Float()
+        only_orphans_flag = graphene.Boolean()
+        group_field_id = LingvodocID()
 
         debug_flag = graphene.Boolean()
+        intermediate_flag = graphene.Boolean()
 
     triumph = graphene.Boolean()
-
-    suggestion_list = ObjectVal()
     message = graphene.String()
-    perspective_name_list = graphene.List(graphene.String)
-    transcription_count = graphene.Int()
 
     @staticmethod
     def neuro_cognate_statistics(
@@ -5649,10 +5648,12 @@ def neuro_cognate_statistics(
             locale_id,
             user_id,
             truth_threshold,
+            only_orphans_flag,
+            group_field_id,
             storage,
             host_url,
             cache_kwargs,
-            debug_flag = False):
+            debug_flag):
 
         input_pairs_list = input_pairs or []
         compare_pairs_list = []
@@ -5673,13 +5674,14 @@ def neuro_cognate_statistics(
                 lex_id,
                 xcript_text,
                 xlat_text,
-                _
-            ) in entities_getter(perspective_id, xcript_fid, xlat_fid, get_linked_group=False):
+                linked_group
+            ) in entities_getter(perspective_id, xcript_fid, xlat_fid, only_orphans_flag, group_field_id):
 
                 if not xcript_text or not xlat_text:
                     continue
 
-                current_pairs_list.extend(list(itertools.product(xcript_text, xlat_text, [lex_id])))
+                # Gathering each-to-each combinations of transcriptions and translations
+                current_pairs_list.extend(list(itertools.product(xcript_text, xlat_text, [lex_id], [linked_group])))
 
             if perspective_id != source_perspective_id:
                 compare_pairs_list.append(current_pairs_list[:])
@@ -5695,14 +5697,13 @@ def neuro_cognate_statistics(
             perspective_name = perspective.get_translation(locale_id)
             dictionary_name = perspective.parent.get_translation(locale_id)
 
-            dictionary_name_list.append(dictionary_name)
             perspective_name_list.append(f"{perspective_name} - {dictionary_name}")
+            dictionary_name_list.append(f"{idx + 1}. {dictionary_name}")
 
         message = ""
         triumph = True
         input_len = len(input_pairs_list)
         compare_len = sum(map(len, compare_pairs_list))
-        dictionaries = []
 
         if not input_len or not compare_len:
             triumph = False
@@ -5711,12 +5712,6 @@ def neuro_cognate_statistics(
             triumph = False
             message = f"Too many words to compare: {compare_len}"
         else:
-            for i, d in enumerate(dictionary_name_list, 1):
-                dictionaries.append(f"{i}. {d}")
-
-            task = TaskStatus(user_id, 'Neuro cognates computation', '\n\n'.join(dictionaries), input_len)
-            task.set(1, 0, "first words processing...", "")
-
             NeuroCognatesEngine = NeuroCognates(
                 compare_pairs_list,
                 input_index,
@@ -5726,9 +5721,14 @@ def neuro_cognate_statistics(
                 host_url,
                 cache_kwargs,
                 match_translations,
-                truth_threshold)
+                truth_threshold,
+                only_orphans_flag
+            )
 
-            NeuroCognatesEngine.index(input_pairs_list, task)
+            NeuroCognatesEngine.index(
+                input_pairs_list,
+                TaskStatus(user_id, 'Neuro cognates computation', '\n\n'.join(dictionary_name_list), input_len)
+            )
 
         result_dict = (
             dict(
@@ -5746,8 +5746,11 @@ def mutate(
         match_translations,
         base_language_id,
         truth_threshold=0.97,
+        only_orphans_flag=True,
+        group_field_id=(66, 25),
         input_pairs=None,
-        debug_flag=False):
+        debug_flag=False,
+        intermediate_flag=False):
 
         # Administrator / perspective author / editing permission check.
         error_str = (
@@ -5842,6 +5845,8 @@ def mutate(
                 locale_id,
                 user.id,
                 truth_threshold,
+                only_orphans_flag,
+                group_field_id,
                 storage,
                 host_url,
                 cache_kwargs,
@@ -7253,3 +7258,37 @@ def mutate(root, info, **args):
 
     print(result)
 
+
+class SaveSuggestionsState(graphene.Mutation):
+    """
+    Stores the client-side suggestions state alongside a neuro cognates result file.
+
+    The state is gzip-pickled to '<storage path>/neuro_cognates/<result_file>_sg'.
+    """
+
+    class Arguments:
+        result_file = graphene.String(required=True)
+        suggestions_state = ObjectVal()
+        debug_flag = graphene.Boolean()
+
+    triumph = graphene.Boolean()
+
+    @staticmethod
+    def mutate(root, info, result_file, suggestions_state=None, debug_flag=False):
+        """Saves `suggestions_state` (defaults to all-None fields); triumph is False on a bad name."""
+        # 'result_file' comes from the client, so accept only a bare file name: a path
+        # separator or an absolute path would let os.path.join() escape the storage dir.
+        if not result_file or os.path.basename(result_file) != result_file:
+            return SaveSuggestionsState(triumph=False)
+
+        storage = info.context.request.registry.settings['storage']
+        storage_dir = os.path.join(storage['path'], 'neuro_cognates')
+        os.makedirs(storage_dir, exist_ok=True)
+
+        if suggestions_state is None:
+            suggestions_state = dict(sg_select_list=None, sg_state_list=None, sg_count=None, sg_entry_map=None)
+
+        with gzip.open(os.path.join(storage_dir, f'{result_file}_sg'), 'wb') as suggestions_state_file:
+            pickle.dump(suggestions_state, suggestions_state_file)
+
+        return SaveSuggestionsState(triumph=True)
diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py
index 0ff00003..b919f0c0 100644
--- a/lingvodoc/schema/query.py
+++ b/lingvodoc/schema/query.py
@@ -149,7 +149,8 @@
     NeuroCognateAnalysis,
     PhonemicAnalysis,
     SwadeshAnalysis,
-    XlsxBulkDisconnect)
+    XlsxBulkDisconnect,
+    SaveSuggestionsState)
 
 from lingvodoc.schema.gql_column import (
     Column,
@@ -5334,16 +5335,30 @@ def resolve_result_suggestions(self,
         storage = (
             info.context.request.registry.settings['storage'])
 
-        pickle_path = os.path.join(storage['path'], 'neuro_cognates', result_file)
+        result_path = os.path.join(storage['path'], 'neuro_cognates', result_file)
 
         try:
-            with gzip.open(pickle_path, 'rb') as pickle_file:
+            with gzip.open(result_path, 'rb') as pickle_file:
                 result_dict = pickle.load(pickle_file)
+        except:
+            return ResponseError(f'Cannot access file \'{result_path}\'.')
+
+        # Try to load the file with the current user's changes; the defaults below are used if it can't be read
+        sg_state_path = os.path.join(storage['path'], 'neuro_cognates', f'{result_file}_sg')
+
+        sg_state_dict = dict(
+            sg_select_list=None,
+            sg_state_list=None,
+            sg_count=None,
+            sg_entry_map=None)
 
+        try:
+            with gzip.open(sg_state_path, 'rb') as pickle_file:
+                sg_state_dict = pickle.load(pickle_file)
         except:
-            return ResponseError(f'Cannot access file \'{pickle_path}\'.')
+            pass
 
-        return result_dict
+        return {**result_dict, **sg_state_dict}
 
 
 class PerspectivesAndFields(graphene.InputObjectType):
@@ -9249,6 +9264,7 @@ class MyMutations(graphene.ObjectType):
     delete_markup_group = DeleteMarkupGroup.Field()
     save_markup_groups = SaveMarkupGroups.Field()
     stop_mutation = StopMutation.Field()
+    save_suggestions_state = SaveSuggestionsState.Field()
 
 schema = graphene.Schema(
     query=Query,
diff --git a/lingvodoc/scripts/list_cognates.py b/lingvodoc/scripts/list_cognates.py
index 33ece2c4..4b94462c 100644
--- a/lingvodoc/scripts/list_cognates.py
+++ b/lingvodoc/scripts/list_cognates.py
@@ -566,7 +566,7 @@ def has_word(word, text):
             yield None
 
 
-def entities_getter(perspective_id, xcript_fid, xlat_fid, get_linked_group=True):
+def entities_getter(perspective_id, xcript_fid, xlat_fid, get_linked_group=True, group_field_id=(66, 25)):
 
     xcript_text = None
     xlat_text = None
@@ -607,14 +607,16 @@ def entities_getter(perspective_id, xcript_fid, xlat_fid, get_linked_group=True)
             elif field_id == xlat_fid:
                 xlat_text = field_text
 
-        linked_group = None
+        linked_group = []
 
         if get_linked_group:
 
             linked_group = (
                 DBSession
                     .execute(
-                        f'select * from linked_group(66, 25, {lex_id[0]}, {lex_id[1]})')
+                        f'select * from linked_group'
+                        f'({group_field_id[0]}, {group_field_id[1]}, {lex_id[0]}, {lex_id[1]})'
+                    )
                     .fetchall())
 
             # Preparing of linked_group for json-serialization
diff --git a/lingvodoc/utils/neuro_cognates/app.py b/lingvodoc/utils/neuro_cognates/app.py
index 1dcdd365..85e775d4 100644
--- a/lingvodoc/utils/neuro_cognates/app.py
+++ b/lingvodoc/utils/neuro_cognates/app.py
@@ -34,7 +34,8 @@ def __init__(self,
                  host_url,
                  cache_kwargs,
                  four_tensors=False,
-                 truth_threshold=0.97):
+                 truth_threshold=0.97,
+                 only_orphans_flag=True):
 
         self.compare_lists = compare_lists
         self.input_index = input_index
@@ -45,6 +46,7 @@ def __init__(self,
         self.storage = storage
         self.host_url = host_url
         self.cache_kwargs = cache_kwargs
+        self.only_orphans_flag = only_orphans_flag
 
         project_dir = os.path.abspath(os.getcwd())
         script_path = os.path.abspath(__file__)
@@ -119,10 +121,11 @@ def split_items(items):
             return (
                 list(map(lambda x: x[0], items)),
                 list(map(lambda x: x[1], items)),
-                list(map(lambda x: x[2], items)))
+                list(map(lambda x: x[2], items)),
+                list(map(lambda x: x[3], items)))
 
         # Разделяем входные пары на слова и переводы
-        input_words, input_translations, input_lex_ids = split_items(word_pairs)
+        input_words, input_translations, input_lex_ids, input_linked_groups = split_items(word_pairs)
 
         # Токенизация и паддинг входных данных
         seq_input_words = [tokenizer.texts_to_sequences([word]) for word in input_words]
@@ -140,7 +143,7 @@ def split_items(items):
         # Проход по каждому списку для сравнения
         for compare_list in self.compare_lists:
 
-            compare_words, compare_translations, compare_lex_ids = split_items(compare_list)
+            compare_words, compare_translations, _, _ = split_items(compare_list)
 
             # Токенизация и паддинг данных для сравнения
             seq_compare_words = [tokenizer.texts_to_sequences([word]) for word in compare_words]
@@ -157,7 +160,7 @@ def split_items(items):
         stamp_file = os.path.join(self.storage['path'], 'lingvodoc_stamps', str(task.id))
 
         # Calculate prediction
-        def get_prediction(input_word, input_trans, input_id, X_word, X_trans, event):
+        def get_prediction(input_word, input_trans, input_id, input_links, X_word, X_trans, event):
 
             if event.is_set():
                 return None
@@ -165,18 +168,29 @@ def get_prediction(input_word, input_trans, input_id, X_word, X_trans, event):
             similarities = []
             result = []
 
+            count = 0
+            links = 0
+
             # Проход по каждому списку для сравнения
             for i, compare_list in enumerate(self.compare_lists):
 
                 if not compare_list:
                     continue
 
-                compare_words, compare_translations, compare_lex_ids = split_items(compare_list)
+                compare_words, compare_translations, compare_lex_ids, compare_linked_groups = split_items(compare_list)
+
+                for compare_word, compare_trans, compare_id, compare_links, X_comp_word, X_comp_trans in (
+                        itertools.zip_longest(
+                            compare_words,
+                            compare_translations,
+                            compare_lex_ids,
+                            compare_linked_groups,
+                            X_compare_words[i],
+                            X_compare_translations[i])):
 
-                count = 0
-                for compare_word, compare_trans, compare_id, X_comp_word, X_comp_trans in itertools.zip_longest(
-                        compare_words, compare_translations, compare_lex_ids, X_compare_words[i],
-                        X_compare_translations[i]):
+                    if set(input_links) & set(compare_links):
+                        links += 1
+                        continue
 
                     # Checking stamp-to-stop every hundred comparings
                     count += 1
@@ -205,10 +219,11 @@ def get_prediction(input_word, input_trans, input_id, X_word, X_trans, event):
                 event.set()
                 return None
 
-            return result
+            return result, links
 
         start_time = now()
         results = []
+        group_count = 0
         current_stage = 0
         flushed = 0
         result_link = ""
@@ -219,10 +234,16 @@ def get_prediction(input_word, input_trans, input_id, X_word, X_trans, event):
 
         def add_result(res):
 
+            nonlocal current_stage, flushed, result_link, group_count
+
             if res is None:
                 return
 
-            nonlocal current_stage, flushed, result_link
+            result, links = res
+
+            results.extend(result)
+            group_count += links
+
             current_stage += 1
             finished = (current_stage == input_len)
             passed = now() - start_time
@@ -235,8 +256,6 @@ def add_result(res):
             progress = 100 if finished else int(current_stage / input_len * 100)
             status = "Finished" if finished else f"~ {days}d:{hours}h:{minutes}m left ~"
 
-            results.extend(res)
-
             if passed - flushed > 300 or finished:
                 flushed = passed
 
@@ -245,6 +264,7 @@ def add_result(res):
                         suggestion_list=results,
                         perspective_name_list=self.perspective_name_list,
                         transcription_count=compare_len * current_stage,
+                        group_count=f"{group_count} filtered" if self.only_orphans_flag else "non-filtered",
                         source_perspective_id=self.source_perspective_id))
 
                 storage_dir = os.path.join(self.storage['path'], 'neuro_cognates')
@@ -261,11 +281,13 @@ def add_result(res):
         with multiprocess.Pool(multiprocess.cpu_count() // 2) as p:
 
             event = multiprocess.Manager().Event()
+            task.set(1, 0, "first words processing...")
 
             for args in itertools.zip_longest(
                     input_words,
                     input_translations,
                     input_lex_ids,
+                    input_linked_groups,
                     X_input_words,
                     X_input_translations,
                     [event] * input_len