Skip to content

Commit

Permalink
'Only orphans option' and keeping suggestions state -- ispras/lingvod…
Browse files Browse the repository at this point in the history
  • Loading branch information
vmonakhov committed Feb 20, 2025
1 parent ff0726a commit 68caf3a
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 41 deletions.
77 changes: 58 additions & 19 deletions lingvodoc/schema/gql_cognate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5630,15 +5630,14 @@ class Arguments:
base_language_id = LingvodocID()
input_pairs = ObjectVal()
truth_threshold = graphene.Float()
only_orphans_flag = graphene.Boolean()
group_field_id = LingvodocID()

debug_flag = graphene.Boolean()
intermediate_flag = graphene.Boolean()

triumph = graphene.Boolean()

suggestion_list = ObjectVal()
message = graphene.String()
perspective_name_list = graphene.List(graphene.String)
transcription_count = graphene.Int()

@staticmethod
def neuro_cognate_statistics(
Expand All @@ -5649,10 +5648,12 @@ def neuro_cognate_statistics(
locale_id,
user_id,
truth_threshold,
only_orphans_flag,
group_field_id,
storage,
host_url,
cache_kwargs,
debug_flag = False):
debug_flag):

input_pairs_list = input_pairs or []
compare_pairs_list = []
Expand All @@ -5673,13 +5674,14 @@ def neuro_cognate_statistics(
lex_id,
xcript_text,
xlat_text,
_
) in entities_getter(perspective_id, xcript_fid, xlat_fid, get_linked_group=False):
linked_group
) in entities_getter(perspective_id, xcript_fid, xlat_fid, only_orphans_flag, group_field_id):

if not xcript_text or not xlat_text:
continue

current_pairs_list.extend(list(itertools.product(xcript_text, xlat_text, [lex_id])))
# Gathering each-to-each combinations of transcriptions and translations
current_pairs_list.extend(list(itertools.product(xcript_text, xlat_text, [lex_id], [linked_group])))

if perspective_id != source_perspective_id:
compare_pairs_list.append(current_pairs_list[:])
Expand All @@ -5695,14 +5697,13 @@ def neuro_cognate_statistics(
perspective_name = perspective.get_translation(locale_id)
dictionary_name = perspective.parent.get_translation(locale_id)

dictionary_name_list.append(dictionary_name)
perspective_name_list.append(f"{perspective_name} - {dictionary_name}")
dictionary_name_list.append(f"{idx + 1}. {dictionary_name}")

message = ""
triumph = True
input_len = len(input_pairs_list)
compare_len = sum(map(len, compare_pairs_list))
dictionaries = []

if not input_len or not compare_len:
triumph = False
Expand All @@ -5711,12 +5712,6 @@ def neuro_cognate_statistics(
triumph = False
message = f"Too many words to compare: {compare_len}"
else:
for i, d in enumerate(dictionary_name_list, 1):
dictionaries.append(f"{i}. {d}")

task = TaskStatus(user_id, 'Neuro cognates computation', '\n\n'.join(dictionaries), input_len)
task.set(1, 0, "first words processing...", "")

NeuroCognatesEngine = NeuroCognates(
compare_pairs_list,
input_index,
Expand All @@ -5726,9 +5721,14 @@ def neuro_cognate_statistics(
host_url,
cache_kwargs,
match_translations,
truth_threshold)
truth_threshold,
only_orphans_flag
)

NeuroCognatesEngine.index(input_pairs_list, task)
NeuroCognatesEngine.index(
input_pairs_list,
TaskStatus(user_id, 'Neuro cognates computation', '\n\n'.join(dictionary_name_list), input_len)
)

result_dict = (
dict(
Expand All @@ -5746,8 +5746,11 @@ def mutate(
match_translations,
base_language_id,
truth_threshold=0.97,
only_orphans_flag=True,
group_field_id=(66, 25),
input_pairs=None,
debug_flag=False):
debug_flag=False,
intermediate_flag=False):

# Administrator / perspective author / editing permission check.
error_str = (
Expand Down Expand Up @@ -5842,6 +5845,8 @@ def mutate(
locale_id,
user.id,
truth_threshold,
only_orphans_flag,
group_field_id,
storage,
host_url,
cache_kwargs,
Expand Down Expand Up @@ -7253,3 +7258,37 @@ def mutate(root, info, **args):

print(result)


class SaveSuggestionsState(graphene.Mutation):
    """
    Persists the user's cognate-suggestions state (selection lists, counters
    and the entry map) as a gzip-compressed pickle in the 'neuro_cognates'
    storage directory, keyed by the result file name.
    """

    class Arguments:
        result_file = graphene.String(required=True)
        suggestions_state = ObjectVal()
        debug_flag = graphene.Boolean()

    triumph = graphene.Boolean()

    @staticmethod
    def mutate(
        root,
        info,
        result_file,
        suggestions_state=None,
        debug_flag=False):

        # Storage settings come from the Pyramid registry of this request.
        settings_storage = info.context.request.registry.settings['storage']

        state_dir = os.path.join(settings_storage['path'], 'neuro_cognates')
        os.makedirs(state_dir, exist_ok=True)

        # The state file shares its name with the result file, '_sg' appended.
        state_path = os.path.join(state_dir, f'{result_file}_sg')

        # When no state is supplied, write an all-None placeholder state,
        # effectively resetting anything saved previously.
        if suggestions_state is None:
            suggestions_state = {
                'sg_select_list': None,
                'sg_state_list': None,
                'sg_count': None,
                'sg_entry_map': None}

        with gzip.open(state_path, 'wb') as state_file:
            pickle.dump(suggestions_state, state_file)

        return SaveSuggestionsState(triumph=True)
26 changes: 21 additions & 5 deletions lingvodoc/schema/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,8 @@
NeuroCognateAnalysis,
PhonemicAnalysis,
SwadeshAnalysis,
XlsxBulkDisconnect)
XlsxBulkDisconnect,
SaveSuggestionsState)

from lingvodoc.schema.gql_column import (
Column,
Expand Down Expand Up @@ -5334,16 +5335,30 @@ def resolve_result_suggestions(self,
storage = (
info.context.request.registry.settings['storage'])

pickle_path = os.path.join(storage['path'], 'neuro_cognates', result_file)
result_path = os.path.join(storage['path'], 'neuro_cognates', result_file)

try:
with gzip.open(pickle_path, 'rb') as pickle_file:
with gzip.open(result_path, 'rb') as pickle_file:
result_dict = pickle.load(pickle_file)
except:
return ResponseError(f'Cannot access file \'{result_path}\'.')

# We're trying to get file with current user changes
sg_state_path = os.path.join(storage['path'], 'neuro_cognates', f'{result_file}_sg')

sg_state_dict = dict(
sg_select_list=None,
sg_state_list=None,
sg_count=None,
sg_entry_map=None)

try:
with gzip.open(sg_state_path, 'rb') as pickle_file:
sg_state_dict = pickle.load(pickle_file)
except:
return ResponseError(f'Cannot access file \'{pickle_path}\'.')
pass

return result_dict
return {**result_dict, **sg_state_dict}


class PerspectivesAndFields(graphene.InputObjectType):
Expand Down Expand Up @@ -9249,6 +9264,7 @@ class MyMutations(graphene.ObjectType):
delete_markup_group = DeleteMarkupGroup.Field()
save_markup_groups = SaveMarkupGroups.Field()
stop_mutation = StopMutation.Field()
save_suggestions_state = SaveSuggestionsState.Field()

schema = graphene.Schema(
query=Query,
Expand Down
8 changes: 5 additions & 3 deletions lingvodoc/scripts/list_cognates.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ def has_word(word, text):
yield None


def entities_getter(perspective_id, xcript_fid, xlat_fid, get_linked_group=True):
def entities_getter(perspective_id, xcript_fid, xlat_fid, get_linked_group=True, group_field_id=(66, 25)):

xcript_text = None
xlat_text = None
Expand Down Expand Up @@ -607,14 +607,16 @@ def entities_getter(perspective_id, xcript_fid, xlat_fid, get_linked_group=True)
elif field_id == xlat_fid:
xlat_text = field_text

linked_group = None
linked_group = []

if get_linked_group:

linked_group = (
DBSession
.execute(
f'select * from linked_group(66, 25, {lex_id[0]}, {lex_id[1]})')
f'select * from linked_group'
f'({group_field_id[0]}, {group_field_id[1]}, {lex_id[0]}, {lex_id[1]})'
)
.fetchall())

# Preparing of linked_group for json-serialization
Expand Down
50 changes: 36 additions & 14 deletions lingvodoc/utils/neuro_cognates/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ def __init__(self,
host_url,
cache_kwargs,
four_tensors=False,
truth_threshold=0.97):
truth_threshold=0.97,
only_orphans_flag=True):

self.compare_lists = compare_lists
self.input_index = input_index
Expand All @@ -45,6 +46,7 @@ def __init__(self,
self.storage = storage
self.host_url = host_url
self.cache_kwargs = cache_kwargs
self.only_orphans_flag = only_orphans_flag

project_dir = os.path.abspath(os.getcwd())
script_path = os.path.abspath(__file__)
Expand Down Expand Up @@ -119,10 +121,11 @@ def split_items(items):
return (
list(map(lambda x: x[0], items)),
list(map(lambda x: x[1], items)),
list(map(lambda x: x[2], items)))
list(map(lambda x: x[2], items)),
list(map(lambda x: x[3], items)))

# Разделяем входные пары на слова и переводы
input_words, input_translations, input_lex_ids = split_items(word_pairs)
input_words, input_translations, input_lex_ids, input_linked_groups = split_items(word_pairs)

# Токенизация и паддинг входных данных
seq_input_words = [tokenizer.texts_to_sequences([word]) for word in input_words]
Expand All @@ -140,7 +143,7 @@ def split_items(items):
# Проход по каждому списку для сравнения
for compare_list in self.compare_lists:

compare_words, compare_translations, compare_lex_ids = split_items(compare_list)
compare_words, compare_translations, _, _ = split_items(compare_list)

# Токенизация и паддинг данных для сравнения
seq_compare_words = [tokenizer.texts_to_sequences([word]) for word in compare_words]
Expand All @@ -157,26 +160,37 @@ def split_items(items):
stamp_file = os.path.join(self.storage['path'], 'lingvodoc_stamps', str(task.id))

# Calculate prediction
def get_prediction(input_word, input_trans, input_id, X_word, X_trans, event):
def get_prediction(input_word, input_trans, input_id, input_links, X_word, X_trans, event):

if event.is_set():
return None

similarities = []
result = []

count = 0
links = 0

# Проход по каждому списку для сравнения
for i, compare_list in enumerate(self.compare_lists):

if not compare_list:
continue

compare_words, compare_translations, compare_lex_ids = split_items(compare_list)
compare_words, compare_translations, compare_lex_ids, compare_linked_groups = split_items(compare_list)

for compare_word, compare_trans, compare_id, compare_links, X_comp_word, X_comp_trans in (
itertools.zip_longest(
compare_words,
compare_translations,
compare_lex_ids,
compare_linked_groups,
X_compare_words[i],
X_compare_translations[i])):

count = 0
for compare_word, compare_trans, compare_id, X_comp_word, X_comp_trans in itertools.zip_longest(
compare_words, compare_translations, compare_lex_ids, X_compare_words[i],
X_compare_translations[i]):
if set(input_links) & set(compare_links):
links += 1
continue

# Checking stamp-to-stop every hundred comparings
count += 1
Expand Down Expand Up @@ -205,10 +219,11 @@ def get_prediction(input_word, input_trans, input_id, X_word, X_trans, event):
event.set()
return None

return result
return result, links

start_time = now()
results = []
group_count = 0
current_stage = 0
flushed = 0
result_link = ""
Expand All @@ -219,10 +234,16 @@ def get_prediction(input_word, input_trans, input_id, X_word, X_trans, event):

def add_result(res):

nonlocal current_stage, flushed, result_link, group_count

if res is None:
return

nonlocal current_stage, flushed, result_link
result, links = res

results.extend(result)
group_count += links

current_stage += 1
finished = (current_stage == input_len)
passed = now() - start_time
Expand All @@ -235,8 +256,6 @@ def add_result(res):
progress = 100 if finished else int(current_stage / input_len * 100)
status = "Finished" if finished else f"~ {days}d:{hours}h:{minutes}m left ~"

results.extend(res)

if passed - flushed > 300 or finished:
flushed = passed

Expand All @@ -245,6 +264,7 @@ def add_result(res):
suggestion_list=results,
perspective_name_list=self.perspective_name_list,
transcription_count=compare_len * current_stage,
group_count=f"{group_count} filtered" if self.only_orphans_flag else "non-filtered",
source_perspective_id=self.source_perspective_id))

storage_dir = os.path.join(self.storage['path'], 'neuro_cognates')
Expand All @@ -261,11 +281,13 @@ def add_result(res):
with multiprocess.Pool(multiprocess.cpu_count() // 2) as p:

event = multiprocess.Manager().Event()
task.set(1, 0, "first words processing...")

for args in itertools.zip_longest(
input_words,
input_translations,
input_lex_ids,
input_linked_groups,
X_input_words,
X_input_translations,
[event] * input_len
Expand Down

0 comments on commit 68caf3a

Please sign in to comment.