Skip to content

Commit d23858a

Browse files
authored
Merge pull request #393 from broadinstitute/development
Release 1.41.2
2 parents 86aaff6 + efcd1f2 commit d23858a

16 files changed

+1444
-116
lines changed

ingest/validation/minify_ontologies.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,17 @@
2121
EFO_URL = 'https://github.com/EBISPOT/efo/releases/latest/download/efo.json'
2222
UBERON_URL = 'https://github.com/obophenotype/uberon/releases/latest/download/uberon.json'
2323
CL_URL = 'https://github.com/obophenotype/cell-ontology/releases/latest/download/cl.json'
24+
UO_URL = 'https://raw.githubusercontent.com/bio-ontology-research-group/unit-ontology/refs/heads/master/uo.json'
25+
HANCESTRO_URL = 'https://raw.githubusercontent.com/EBISPOT/hancestro/refs/heads/main/hancestro.json'
2426

2527
ONTOLOGY_JSON_URLS = {
2628
'disease': [MONDO_URL, PATO_URL],
2729
'species': [NCBITAXON_URL],
2830
'library_preparation_protocol': [EFO_URL],
2931
'organ': [UBERON_URL],
30-
'cell_type': [CL_URL]
32+
'cell_type': [CL_URL],
33+
'ethnicity': [HANCESTRO_URL],
34+
'organism_age__unit': [UO_URL]
3135
}
3236

3337
def fetch(url, use_cache=True):
@@ -71,7 +75,7 @@ def get_synonyms(node, label):
7175
if 'val' not in synonym_node:
7276
# Handles e.g. incomplete EFO synonym nodes
7377
continue
74-
raw_synonym = synonym_node['val']
78+
raw_synonym = synonym_node['val'].strip()
7579
if (
7680
not raw_synonym.startswith('obsolete ') and # Omit obsolete synonyms
7781
raw_synonym != label # Omit synonyms that are redundant with label
-29 Bytes
Binary file not shown.
Binary file not shown.
502 Bytes
Binary file not shown.
5.19 KB
Binary file not shown.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1742404288 # validation cache key
1+
1744144302 # validation cache key

ingest/validation/validate_metadata.py

Lines changed: 79 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
import itertools
1919
import math
2020
import pandas as pd
21+
import gzip
22+
import glob
2123

2224
import colorama
2325
from colorama import Fore
@@ -64,6 +66,52 @@ def backoff_handler(details):
6466
"{kwargs}".format(**details)
6567
)
6668

69+
# handles reading minified ontologies and performing term/synonym lookups
70+
class MinifiedOntologyReader():
    """Read minified ontology files from disk and answer term lookups from memory.

    Every ``*.min.tsv.gz`` file in the ``ontologies`` directory next to this
    module is parsed at construction time and stored under the file's base
    name (file name minus the ``.min.tsv.gz`` suffix).
    """

    # ontology name -> {ontology ID -> {"label": str, "synonyms": [str, ...]}}
    # NOTE: declared on the class, so all instances share (and mutate) one dict
    parsed_ontologies = {}

    def __init__(self):
        """Parse all minified ontology files found alongside this module"""
        ontology_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "ontologies")
        for ontology_file in glob.glob(os.path.join(ontology_dir, "*.min.tsv.gz")):
            # basename() rather than split('/') so the name is also correct when
            # the platform's path separator is not a forward slash
            ontology_name = os.path.basename(ontology_file).replace(".min.tsv.gz", "")
            self.populate_ontology(ontology_name, ontology_file)

    def ontology_names(self):
        """Names of all ontologies that have been parsed
        :return: list of ontology names
        """
        return list(self.parsed_ontologies)

    def populate_ontology(self, ontology_name, ontology_file):
        """Parses ontology file by name and populates entries into parsed_ontologies for lookup
        :param ontology_name: name of ontology
        :param ontology_file: path to gzipped, tab-delimited ontology file; each
            line is expected to be ``<ontology ID>\\t<label>\\t<synonyms joined by "||">``
        :return: parsed ontology dictionary
        """
        dev_logger.debug(f"populating minified ontology {ontology_name} from {ontology_file}")
        ontology = {}
        with gzip.open(ontology_file, 'rt') as file_gz:
            # iterate the handle directly instead of readlines() so the whole
            # file is never held in memory as a list
            for line in file_gz:
                try:
                    ontology_id, label, raw_syn = line.rstrip("\r\n").split("\t")
                except (ValueError, TypeError) as e:
                    # wrong number of columns: report and move on to the next line
                    dev_logger.error(f"could not process {line} from {ontology_name}: {e}")
                    continue
                # an empty synonym column must yield no synonyms, not [""]
                synonyms = [syn for syn in raw_syn.split("||") if syn]
                ontology[ontology_id] = {"label": label, "synonyms": synonyms}
        self.parsed_ontologies[ontology_name] = ontology
        return ontology

    def find_ontology_entry(self, ontology_name, identifier, property_name):
        """Find an entry in a parsed ontology by identifier
        :param ontology_name: name of ontology
        :param identifier: ontology ID, e.g. MONDO_0005887
        :param property_name: name of metadata property, e.g. species
        :return: dict with "label" and "synonyms" keys
        :raises ValueError: if the ontology is unknown or has no entry for identifier
        """
        entry = self.parsed_ontologies.get(ontology_name, {}).get(identifier, {})
        if entry:
            return entry
        # NOTE(review): message mentions EBI OLS although the lookup is local;
        # wording left untouched in case callers or tests match on it
        msg = f"{property_name}: No match found in EBI OLS for provided ontology ID: {identifier}"
        raise ValueError(msg)
114+
67115

68116
# contains methods for looking up terms in various ontologies,
69117
# as well as caching results of previous queries to speed up performance
@@ -113,6 +161,10 @@ def retrieve_ontology_term_label_remote(
113161
if property_name == "organ_region":
114162
return self.retrieve_mouse_brain_term(term, property_name)
115163
else:
164+
# leave debug statement for QA purposes later
165+
dev_logger.debug(
166+
f"Using fallback EBI OLS call with {ontology_urls}, {term}, {property_name}"
167+
)
116168
return self.retrieve_ols_term(
117169
ontology_urls, term, property_name, attribute_type
118170
)
@@ -328,7 +380,7 @@ def get_ontology_file_location(ontology):
328380

329381
# create an OntologyRetriever instance to handle fetching and caching ontology terms
330382
retriever = OntologyRetriever()
331-
383+
minified_reader = MinifiedOntologyReader()
332384

333385
def validate_schema(json, metadata):
334386
"""Check validity of metadata convention as JSON schema.
@@ -416,6 +468,22 @@ def validate_cells_unique(metadata):
416468
)
417469
return valid
418470

471+
def retrieve_label_and_synonyms(
    ontology_id, property_name, convention, property_type
):
    """Fetch label and synonyms for an ontology ID from the local minified copy
    when one is loaded, otherwise through the remote retriever
    :param ontology_id: ontology ID, e.g. MONDO_0005887
    :param property_name: name of metadata property, e.g. species
    :param convention: metadata convention being checked against
    :param property_type: attribute type for term (string, array, boolean)
    :return: result of the local entry lookup or of the remote retriever call
    """
    # ontology name is the lower-cased text before the first "_" or ":" in the ID
    prefix, *_ = re.split("[_:]", ontology_id)
    local_name = prefix.lower()
    if not ontology_is_local(local_name):
        # no minified copy loaded for this ontology, defer to the remote retriever
        return retriever.retrieve_ontology_term_label_and_synonyms(
            ontology_id, property_name, convention, property_type
        )
    return minified_reader.find_ontology_entry(local_name, ontology_id, property_name)
419487

420488
def insert_array_ontology_label_row_data(
421489
property_name, row, metadata, required, convention, ontology_label
@@ -437,11 +505,8 @@ def insert_array_ontology_label_row_data(
437505
for id in row[property_name]:
438506
label_lookup = ""
439507
try:
440-
label_and_synonyms = (
441-
retriever.retrieve_ontology_term_label_and_synonyms(
442-
id, property_name, convention, "array"
443-
)
444-
)
508+
509+
label_and_synonyms = retrieve_label_and_synonyms(id, property_name, convention, "array")
445510
label_lookup = label_and_synonyms.get('label')
446511
reference_ontology = (
447512
"EBI OLS lookup"
@@ -494,9 +559,7 @@ def insert_ontology_label_row_data(
494559
# for optional columns, try to fill it in
495560
property_type = convention["properties"][property_name]["type"]
496561
try:
497-
label_and_synonyms = retriever.retrieve_ontology_term_label_and_synonyms(
498-
id, property_name, convention, property_type
499-
)
562+
label_and_synonyms = retrieve_label_and_synonyms(id, property_name, convention, property_type)
500563
label = label_and_synonyms.get('label')
501564
row[ontology_label] = label
502565
reference_ontology = (
@@ -1056,6 +1119,12 @@ def is_label_or_synonym(labels, provided_label):
10561119
else:
10571120
return False
10581121

1122+
def ontology_is_local(ontology_name):
    """Tell whether an ontology can be validated from the local minified copy
    rather than through OLS
    :param ontology_name: name of ontology
    :return: Boolean
    """
    if ontology_name is None:
        # nothing to look up, so it cannot be local
        return False
    return ontology_name in minified_reader.ontology_names()
10591128

10601129
def validate_collected_ontology_data(metadata, convention):
10611130
"""Evaluate collected ontology_id, ontology_label info in
@@ -1080,15 +1149,10 @@ def validate_collected_ontology_data(metadata, convention):
10801149

10811150
for ontology_info in metadata.ontology[property_name].keys():
10821151
ontology_id, ontology_label = ontology_info
1083-
10841152
try:
10851153
attribute_type = convention["properties"][property_name]["type"]
10861154
# get actual label along with synonyms for more robust matching
1087-
label_and_synonyms = (
1088-
retriever.retrieve_ontology_term_label_and_synonyms(
1089-
ontology_id, property_name, convention, attribute_type
1090-
)
1091-
)
1155+
label_and_synonyms = retrieve_label_and_synonyms(ontology_id, property_name, convention, attribute_type)
10921156

10931157
if not is_label_or_synonym(label_and_synonyms, ontology_label):
10941158
matched_label_for_id = label_and_synonyms.get("label")

schema/alexandria_convention/alexandria_convention_schema.json

Lines changed: 1 addition & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"$id": "https://singlecell.broadinstitute.org/single_cell/api/v1/metadata_schemas/alexandria_convention/2.3.0/json",
2+
"$id": "https://singlecell.broadinstitute.org/single_cell/api/v1/metadata_schemas/alexandria_convention/3.0.0/json",
33
"$schema": "https://json-schema.org/draft-07/schema#",
44
"dependencies": {
55
"cell_type__ontology_label": [
@@ -14,9 +14,6 @@
1414
"culture_duration__unit_label": [
1515
"culture_duration__unit"
1616
],
17-
"development_stage__ontology_label": [
18-
"development_stage"
19-
],
2017
"disease__intracellular_pathogen": [
2118
"disease"
2219
],
@@ -119,16 +116,9 @@
119116
"organism_age__unit_label": [
120117
"organism_age__unit"
121118
],
122-
"race__ontology_label": [
123-
"race"
124-
],
125119
"sequencing_instrument_manufacturer_model__ontology_label": [
126120
"sequencing_instrument_manufacturer_model"
127121
],
128-
"small_molecule_perturbation__concentration": [
129-
"small_molecule_perturbation",
130-
"small_molecule_perturbation__concentration__unit"
131-
],
132122
"small_molecule_perturbation__concentration__unit": [
133123
"small_molecule_perturbation__concentration"
134124
],
@@ -225,9 +215,6 @@
225215
},
226216
"development_stage": {
227217
"description": "A classification of the developmental stage of the organism",
228-
"ontology": "https://www.ebi.ac.uk/ols/api/ontologies/hsapdv,https://www.ebi.ac.uk/ols/api/ontologies/mmusdv",
229-
"ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/hsapdv,https://www.ebi.ac.uk/ols/ontologies/mmusdv",
230-
"pattern": "^[-A-Za-z0-9]+[_:][-A-Za-z0-9]+",
231218
"type": "string"
232219
},
233220
"development_stage__ontology_label": {
@@ -374,11 +361,8 @@
374361
"gene_perturbation": {
375362
"description": "A perturbation to a gene done to a cell culture",
376363
"items": {
377-
"ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/ogg",
378-
"pattern": "^[-A-Za-z0-9]+[_:][-A-Za-z0-9]+",
379364
"type": "string"
380365
},
381-
"ontology": "https://www.ebi.ac.uk/ols/api/ontologies/ogg",
382366
"type": "array"
383367
},
384368
"gene_perturbation__direction": {
@@ -406,9 +390,6 @@
406390
},
407391
"geographical_region": {
408392
"description": "Location where the sample was collected/donated",
409-
"ontology": "https://www.ebi.ac.uk/ols/api/ontologies/gaz",
410-
"ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/gaz",
411-
"pattern": "^[-A-Za-z0-9]+[_:][-A-Za-z0-9]+",
412393
"type": "string"
413394
},
414395
"geographical_region__ontology_label": {
@@ -419,11 +400,8 @@
419400
"dependency_condition": "sample_type in cell line, organoid, cultured primary cells",
420401
"description": "a growth factor added to a cell culture media",
421402
"items": {
422-
"ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/pr",
423-
"pattern": "^[-A-Za-z0-9]+[_:][-A-Za-z0-9]+",
424403
"type": "string"
425404
},
426-
"ontology": "https://www.ebi.ac.uk/ols/api/ontologies/pr",
427405
"type": "array"
428406
},
429407
"growth_factor_perturbation__concentration": {
@@ -494,9 +472,6 @@
494472
"mouse_strain": {
495473
"dependency_condition": "species == NCBITaxon_10090",
496474
"description": "Mouse strain of the donor organism (ex. C57BL/6, BALB/c, 129, undetermined)",
497-
"ontology": "https://www.ebi.ac.uk/ols/api/ontologies/ncit",
498-
"ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/ncit",
499-
"pattern": "^[-A-Za-z0-9]+[_:][-A-Za-z0-9]+",
500475
"type": "string"
501476
},
502477
"mouse_strain__ontology_label": {
@@ -564,14 +539,10 @@
564539
"type": "string"
565540
},
566541
"race": {
567-
"dependency_condition": "species == NCBITaxon_9606",
568542
"description": "An arbitrary classification of a taxonomic group that is a division of a species",
569543
"items": {
570-
"ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/ncit",
571-
"pattern": "^[-A-Za-z0-9]+[_:][-A-Za-z0-9]+",
572544
"type": "string"
573545
},
574-
"ontology": "https://www.ebi.ac.uk/ols/api/ontologies/ncit",
575546
"type": "array"
576547
},
577548
"race__ontology_label": {
@@ -604,11 +575,8 @@
604575
"small_molecule_perturbation": {
605576
"description": "a small molecule added to a cell culture (ex. A drug) growth factor (and if it is recombinant, concentration), gene)",
606577
"items": {
607-
"ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/chebi",
608-
"pattern": "^[-A-Za-z0-9]+[_:][-A-Za-z0-9]+",
609578
"type": "string"
610579
},
611-
"ontology": "https://www.ebi.ac.uk/ols/api/ontologies/chebi",
612580
"type": "array"
613581
},
614582
"small_molecule_perturbation__concentration": {
@@ -677,11 +645,8 @@
677645
"vaccination": {
678646
"description": "Any known vaccines administered to the donor organism. NOT a full vaccine history",
679647
"items": {
680-
"ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/vo",
681-
"pattern": "^[-A-Za-z0-9]+[_:][-A-Za-z0-9]+",
682648
"type": "string"
683649
},
684-
"ontology": "https://www.ebi.ac.uk/ols/api/ontologies/vo",
685650
"type": "array"
686651
},
687652
"vaccination__adjuvants": {

0 commit comments

Comments
 (0)