Skip to content

Commit

Permalink
Neuro cognates (#1526)
Browse files Browse the repository at this point in the history
* First working version

* Cleanup

* Refactoring

* Compare one-by-one

* Minor

* Multiprocessing

* Optimization

* Structured for existent code

* Structured for existent code

Forwarded matchTranslationsFlag, fixed requirements

Hotfix

Used Python 3.9
  • Loading branch information
vmonakhov committed Feb 4, 2025
1 parent 65885fe commit e18dbbd
Show file tree
Hide file tree
Showing 12 changed files with 504 additions and 19 deletions.
6 changes: 5 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,17 @@ RUN apt-get update && apt install -y python3-dev python3-setuptools \
postgresql-server-dev-13 postgresql-client-13 libpq-dev \
fonts-sil-gentium fonts-sil-gentium-basic fonts-sil-gentiumplus \
fonts-sil-gentiumplus-compact libfreetype6-dev libxft-dev \
ffmpeg libxml2-dev libxslt-dev
ffmpeg libxml2-dev libxslt-dev python3.9 python3.9-dev
RUN \
wget https://github.com/ispras/lingvodoc-ext-oslon/archive/master.zip -O /tmp/master.zip && \
unzip /tmp/master.zip -d /tmp/ && \
g++ -O2 -fPIC -shared -Wl,-soname,liboslon.so -Wno-write-strings -o /usr/lib/liboslon.so /tmp/lingvodoc-ext-oslon-master/analysis.cpp && \
ldconfig
RUN \
git config --global http.postBuffer 500M && \
git config --global http.maxRequestBuffer 100M && \
git config --global core.compression 0 && \
ln -sf $(which python3.9) /usr/bin/python3 && \
pip3 install pip==20.0.2 && \
pip3 install --upgrade setuptools==44.0 && \
pip3 install -r server-requirements-1.txt && \
Expand Down
224 changes: 224 additions & 0 deletions lingvodoc/schema/gql_cognate.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@
from lingvodoc.views.v2.phonology import process_sound_markup, PickleCache

from lingvodoc.views.v2.utils import anonymous_userid
from lingvodoc.scripts.list_cognates import entities_getter
from lingvodoc.utils.neuro_cognates.app import NeuroCognates

from pdb import set_trace as A


Expand Down Expand Up @@ -5618,6 +5621,227 @@ def mutate(
'Exception:\n' + traceback_string)


class NeuroCognateAnalysis(graphene.Mutation):
class Arguments:

source_perspective_id = LingvodocID(required=True)
perspective_info_list = graphene.List(graphene.List(LingvodocID), required=True)
match_translations = graphene.Boolean()
base_language_id = LingvodocID()
input_pairs = ObjectVal()

debug_flag = graphene.Boolean()

triumph = graphene.Boolean()

suggestion_list = ObjectVal()
message = graphene.String()
perspective_name_list = graphene.List(graphene.String)
transcription_count = graphene.Int()

@staticmethod
def neuro_cognate_statistics(
#language_str,
#base_language_id,
#base_language_name,
perspective_info_list,
source_perspective_id,
match_translations,
input_pairs,
locale_id,
#storage,
debug_flag = False):

input_pairs_list = input_pairs or []
compare_pairs_list = []
total_transcription_count = len(input_pairs) if input_pairs else 0
input_index = None
perspective_name_list = []

for (
idx, (_,
perspective_id,
xcript_fid,
xlat_fid, _)
) in enumerate(perspective_info_list):

current_pairs_list = []

for (
lex_id,
xcript_text,
xlat_text,
_
) in entities_getter(perspective_id, xcript_fid, xlat_fid, get_linked_group=False):

if not xcript_text or not xlat_text:
continue

current_pairs_list.extend(list(itertools.product(xcript_text, xlat_text, [lex_id])))

if perspective_id != source_perspective_id:
compare_pairs_list.append(current_pairs_list[:])
total_transcription_count += len(current_pairs_list)
else:
input_index = idx
compare_pairs_list.append([])
if not input_pairs_list:
input_pairs_list = current_pairs_list[:]
total_transcription_count += len(current_pairs_list)

perspective = DBSession.query(dbPerspective).filter_by(
client_id = perspective_id[0], object_id = perspective_id[1]).first()

perspective_name = perspective.get_translation(locale_id)
dictionary_name = perspective.parent.get_translation(locale_id)

perspective_name_list.append(f"{perspective_name} - {dictionary_name}")

message = ""
triumph = True
prediction = None

if not input_pairs_list or not sum(map(len, compare_pairs_list)):
triumph = False
message = "No input words or words to compare is received!"
else:
NeuroCognatesEngine = NeuroCognates(four_tensors=match_translations)
prediction = NeuroCognatesEngine.index(input_pairs_list, compare_pairs_list, input_index)

result_dict = (
dict(
triumph=triumph,
suggestion_list=prediction,
message=message,
perspective_name_list=perspective_name_list,
transcription_count=total_transcription_count))

return NeuroCognateAnalysis(**result_dict)

@staticmethod
def mutate(
self,
info,
source_perspective_id,
perspective_info_list,
match_translations,
base_language_id,
input_pairs=None,
debug_flag=False):

# Administrator / perspective author / editing permission check.
error_str = (
'Only administrator, perspective author and users with perspective editing permissions '
'can perform neuro cognate analysis.')

client_id = info.context.client_id

if not client_id:
return ResponseError(error_str)

user = Client.get_user_by_client_id(client_id)

author_client_id_set = (

set(
client_id
for _, (client_id, _), _, _, _ in perspective_info_list))

author_id_check = (

DBSession

.query(

DBSession
.query(literal(1))
.filter(
Client.id.in_(author_client_id_set),
Client.user_id == user.id)
.exists())

.scalar())

if (user.id != 1 and
not author_id_check and
not info.context.acl_check_if('edit', 'perspective', source_perspective_id)):

return ResponseError(error_str)

# Debug mode check.

if debug_flag and user.id != 1:

return (

ResponseError(
message = 'Only administrator can use debug mode.'))

language_str = (
'{0}/{1}, language {2}/{3}'.format(
source_perspective_id[0], source_perspective_id[1],
base_language_id[0], base_language_id[1]))

try:

# Getting base language info.

locale_id = info.context.locale_id

#base_language = DBSession.query(dbLanguage).filter_by(
#client_id = base_language_id[0], object_id = base_language_id[1]).first()

#base_language_name = base_language.get_translation(locale_id)

#request = info.context.request
#storage = request.registry.settings['storage']

# Transforming client/object pair ids from lists to 2-tuples.

#base_language_id = tuple(base_language_id)

source_perspective_id = tuple(source_perspective_id)

perspective_info_list = [

(tuple(language_id),
tuple(perspective_id),
tuple(word_field_id),
tuple(meaning_field_id),
None)

for language_id,
perspective_id,
word_field_id,
meaning_field_id,
_ in perspective_info_list]

return NeuroCognateAnalysis.neuro_cognate_statistics(
#language_str,
#base_language_id,
#base_language_name,
perspective_info_list,
source_perspective_id,
match_translations,
input_pairs,
locale_id,
#storage,
debug_flag)

# Exception occurred while we tried to perform swadesh analysis.
except Exception as exception:

traceback_string = ''.join(
traceback.format_exception(exception, exception, exception.__traceback__))[:-1]

log.warning(
'neuro_cognate_analysis {0}: exception'.format(language_str))

log.warning(traceback_string)

return ResponseError(message='Exception:\n' + traceback_string)


class ComplexDistance(graphene.Mutation):
class Arguments:

Expand Down
34 changes: 34 additions & 0 deletions lingvodoc/schema/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import gzip
import hashlib
import io
import itertools
import json
import logging
import math
Expand Down Expand Up @@ -145,6 +146,7 @@
CognateAnalysis,
ComplexDistance,
MorphCognateAnalysis,
NeuroCognateAnalysis,
PhonemicAnalysis,
SwadeshAnalysis,
XlsxBulkDisconnect)
Expand Down Expand Up @@ -374,6 +376,8 @@

from operator import attrgetter

from lingvodoc.scripts.list_cognates import entities_getter

from pdb import set_trace as A

# Setting up logging.
Expand Down Expand Up @@ -683,6 +687,13 @@ class Query(graphene.ObjectType):
Markup,
perspective_id = LingvodocID(required = True)))

words = (
graphene.Field(
ObjectVal,
perspective_id = LingvodocID(required = True),
xcript_fid = LingvodocID(required = True),
xlat_fid = LingvodocID(required = True)))

def resolve_fill_logs(self, info, worker=1):
# Check if the current user is administrator
client_id = info.context.client_id
Expand Down Expand Up @@ -5289,6 +5300,28 @@ def resolve_markups(self,

return result

def resolve_words(self,
info,
perspective_id,
xcript_fid,
xlat_fid):

result_pairs_list = []

for (
lex_id,
xcript_text,
xlat_text,
_
) in entities_getter(tuple(perspective_id), tuple(xcript_fid), tuple(xlat_fid), False):

if not xcript_text or not xlat_text:
continue

result_pairs_list.extend(list(itertools.product(xcript_text, xlat_text, [lex_id])))

return result_pairs_list

class PerspectivesAndFields(graphene.InputObjectType):
perspective_id = LingvodocID()
field_id = LingvodocID()
Expand Down Expand Up @@ -9163,6 +9196,7 @@ class MyMutations(graphene.ObjectType):
cognate_analysis = CognateAnalysis.Field()
swadesh_analysis = SwadeshAnalysis.Field()
morph_cognate_analysis = MorphCognateAnalysis.Field()
neuro_cognate_analysis = NeuroCognateAnalysis.Field()
phonology = Phonology.Field()
phonological_statistical_distance = PhonologicalStatisticalDistance.Field()
sound_and_markup = SoundAndMarkup.Field()
Expand Down
Loading

0 comments on commit e18dbbd

Please sign in to comment.