18
18
import itertools
19
19
import math
20
20
import pandas as pd
21
+ import gzip
22
+ import glob
21
23
22
24
import colorama
23
25
from colorama import Fore
@@ -64,6 +66,52 @@ def backoff_handler(details):
64
66
"{kwargs}" .format (** details )
65
67
)
66
68
69
# handles reading minified ontologies and performing term/synonym lookups
class MinifiedOntologyReader():
    # class-level store of parsed ontologies, keyed by ontology name;
    # being a class attribute, it is shared by every instance
    parsed_ontologies = {}

    def __init__(self):
        """Parse every *.min.tsv.gz file in the "ontologies" directory next to this module"""
        suffix = ".min.tsv.gz"
        ontology_dir = f"{os.path.dirname(os.path.realpath(__file__))}/ontologies"
        for ontology_file in glob.glob(f"{ontology_dir}/*{suffix}"):
            # basename is separator-agnostic, unlike splitting on '/'
            ontology_name = os.path.basename(ontology_file)[: -len(suffix)]
            self.populate_ontology(ontology_name, ontology_file)

    def ontology_names(self):
        """Names of all ontologies that have been parsed so far
        :return: list of ontology names
        """
        return list(self.parsed_ontologies)

    @staticmethod
    def _parse_line(line):
        """Split one line of a minified ontology file into its ID and entry
        :param line: tab-delimited text of the form "<id>\\t<label>\\t<syn1>||<syn2>..."
        :return: tuple of (ontology_id, {"label": label, "synonyms": [synonym, ...]})
        :raises ValueError: if the line does not contain exactly three tab-delimited fields
        """
        # drop the line terminator up front so neither "\n" nor "\r\n" leaks into the last synonym
        ontology_id, label, raw_syn = line.rstrip("\r\n").split("\t")
        # an empty synonym column must yield no synonyms rather than a single empty string
        synonyms = [syn for syn in raw_syn.split("||") if syn]
        return ontology_id, {"label": label, "synonyms": synonyms}

    def populate_ontology(self, ontology_name, ontology_file):
        """Parses ontology file by name and populates entries into parsed_ontologies for lookup
        :param ontology_name: name of ontology
        :param ontology_file: path to gzipped, tab-delimited ontology file
        :return: parsed ontology dictionary
        """
        dev_logger.debug(f"populating minified ontology {ontology_name} from {ontology_file}")
        ontology = {}
        # explicit encoding so parsing does not depend on the platform default
        with gzip.open(ontology_file, 'rt', encoding='utf-8') as file_gz:
            # iterate lazily instead of loading the whole file with readlines()
            for line in file_gz:
                if not line.strip():
                    # blank lines carry no entry and are not worth an error message
                    continue
                try:
                    ontology_id, entry = self._parse_line(line)
                except (ValueError, TypeError) as e:
                    # malformed lines are reported and skipped; the rest of the file still loads
                    dev_logger.error(f"could not process {line} from {ontology_name}: {e}")
                    continue
                ontology[ontology_id] = entry
        self.parsed_ontologies[ontology_name] = ontology
        return ontology

    def find_ontology_entry(self, ontology_name, identifier, property_name):
        """Find an entry in a parsed ontology by identifier
        :param ontology_name: name of ontology
        :param identifier: ontology ID, e.g. MONDO_0005887
        :param property_name: name of metadata property, e.g. species
        :return: dict with "label" and "synonyms" keys
        :raises ValueError: if the ontology or the identifier is not found
        """
        entry = self.parsed_ontologies.get(ontology_name, {}).get(identifier, {})
        if entry:
            return entry
        msg = f"{property_name}: No match found in EBI OLS for provided ontology ID: {identifier}"
        raise ValueError(msg)
114
+
67
115
68
116
# contains methods for looking up terms in various ontologies,
69
117
# as well as caching results of previous queries to speed up performance
@@ -113,6 +161,10 @@ def retrieve_ontology_term_label_remote(
113
161
if property_name == "organ_region" :
114
162
return self .retrieve_mouse_brain_term (term , property_name )
115
163
else :
164
+ # leave debug statement for QA purposes later
165
+ dev_logger .debug (
166
+ f"Using fallback EBI OLS call with { ontology_urls } , { term } , { property_name } "
167
+ )
116
168
return self .retrieve_ols_term (
117
169
ontology_urls , term , property_name , attribute_type
118
170
)
@@ -328,7 +380,7 @@ def get_ontology_file_location(ontology):
328
380
329
381
# create an OntologyRetriever instance to handle fetching and caching ontology terms
330
382
retriever = OntologyRetriever ()
331
-
383
+ minified_reader = MinifiedOntologyReader ()
332
384
333
385
def validate_schema (json , metadata ):
334
386
"""Check validity of metadata convention as JSON schema.
@@ -416,6 +468,22 @@ def validate_cells_unique(metadata):
416
468
)
417
469
return valid
418
470
471
def retrieve_label_and_synonyms(
    ontology_id, property_name, convention, property_type
):
    """Look up the label and synonyms for a term, using a local minified ontology when one exists
    :param ontology_id: ontology ID, e.g. MONDO_0005887
    :param property_name: name of metadata property, e.g. species
    :param convention: metadata convention being checked against
    :param property_type: attribute type for term (string, array, boolean)
    :return: result of the local reader lookup or of the remote retriever lookup
    """
    # the ontology name is the lowercased prefix before the first "_" or ":" in the ID
    prefix = re.split("[_:]", ontology_id)[0]
    ontology_name = prefix.lower()
    if not ontology_is_local(ontology_name):
        # no local copy of this ontology, so defer to the remote retriever
        return retriever.retrieve_ontology_term_label_and_synonyms(
            ontology_id, property_name, convention, property_type
        )
    return minified_reader.find_ontology_entry(ontology_name, ontology_id, property_name)
419
487
420
488
def insert_array_ontology_label_row_data (
421
489
property_name , row , metadata , required , convention , ontology_label
@@ -437,11 +505,8 @@ def insert_array_ontology_label_row_data(
437
505
for id in row [property_name ]:
438
506
label_lookup = ""
439
507
try :
440
- label_and_synonyms = (
441
- retriever .retrieve_ontology_term_label_and_synonyms (
442
- id , property_name , convention , "array"
443
- )
444
- )
508
+
509
+ label_and_synonyms = retrieve_label_and_synonyms (id , property_name , convention , "array" )
445
510
label_lookup = label_and_synonyms .get ('label' )
446
511
reference_ontology = (
447
512
"EBI OLS lookup"
@@ -494,9 +559,7 @@ def insert_ontology_label_row_data(
494
559
# for optional columns, try to fill it in
495
560
property_type = convention ["properties" ][property_name ]["type" ]
496
561
try :
497
- label_and_synonyms = retriever .retrieve_ontology_term_label_and_synonyms (
498
- id , property_name , convention , property_type
499
- )
562
+ label_and_synonyms = retrieve_label_and_synonyms (id , property_name , convention , property_type )
500
563
label = label_and_synonyms .get ('label' )
501
564
row [ontology_label ] = label
502
565
reference_ontology = (
@@ -1056,6 +1119,12 @@ def is_label_or_synonym(labels, provided_label):
1056
1119
else :
1057
1120
return False
1058
1121
1122
def ontology_is_local(ontology_name):
    """Tell whether an ontology has been loaded by the minified reader, so OLS can be skipped
    :param ontology_name: name of ontology
    :return: Boolean
    """
    if ontology_name is None:
        return False
    return ontology_name in minified_reader.ontology_names()
1059
1128
1060
1129
def validate_collected_ontology_data (metadata , convention ):
1061
1130
"""Evaluate collected ontology_id, ontology_label info in
@@ -1080,15 +1149,10 @@ def validate_collected_ontology_data(metadata, convention):
1080
1149
1081
1150
for ontology_info in metadata .ontology [property_name ].keys ():
1082
1151
ontology_id , ontology_label = ontology_info
1083
-
1084
1152
try :
1085
1153
attribute_type = convention ["properties" ][property_name ]["type" ]
1086
1154
# get actual label along with synonyms for more robust matching
1087
- label_and_synonyms = (
1088
- retriever .retrieve_ontology_term_label_and_synonyms (
1089
- ontology_id , property_name , convention , attribute_type
1090
- )
1091
- )
1155
+ label_and_synonyms = retrieve_label_and_synonyms (ontology_id , property_name , convention , attribute_type )
1092
1156
1093
1157
if not is_label_or_synonym (label_and_synonyms , ontology_label ):
1094
1158
matched_label_for_id = label_and_synonyms .get ("label" )
0 commit comments