Skip to content

Release 1.41.2 #393

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Apr 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions ingest/validation/minify_ontologies.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,17 @@
EFO_URL = 'https://github.com/EBISPOT/efo/releases/latest/download/efo.json'
UBERON_URL = 'https://github.com/obophenotype/uberon/releases/latest/download/uberon.json'
CL_URL = 'https://github.com/obophenotype/cell-ontology/releases/latest/download/cl.json'
UO_URL = 'https://raw.githubusercontent.com/bio-ontology-research-group/unit-ontology/refs/heads/master/uo.json'
HANCESTRO_URL = 'https://raw.githubusercontent.com/EBISPOT/hancestro/refs/heads/main/hancestro.json'

ONTOLOGY_JSON_URLS = {
'disease': [MONDO_URL, PATO_URL],
'species': [NCBITAXON_URL],
'library_preparation_protocol': [EFO_URL],
'organ': [UBERON_URL],
'cell_type': [CL_URL]
'cell_type': [CL_URL],
'ethnicity': [HANCESTRO_URL],
'organism_age__unit': [UO_URL]
}

def fetch(url, use_cache=True):
Expand Down Expand Up @@ -71,7 +75,7 @@ def get_synonyms(node, label):
if 'val' not in synonym_node:
# Handles e.g. incomplete EFO synonym nodes
continue
raw_synonym = synonym_node['val']
raw_synonym = synonym_node['val'].strip()
if (
not raw_synonym.startswith('obsolete ') and # Omit obsolete synonyms
raw_synonym != label # Omit synonyms that are redundant with label
Expand Down
Binary file modified ingest/validation/ontologies/efo.min.tsv.gz
Binary file not shown.
Binary file added ingest/validation/ontologies/hancestro.min.tsv.gz
Binary file not shown.
Binary file modified ingest/validation/ontologies/mondo.min.tsv.gz
Binary file not shown.
Binary file added ingest/validation/ontologies/uo.min.tsv.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion ingest/validation/ontologies/version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1742404288 # validation cache key
1744144302 # validation cache key
94 changes: 79 additions & 15 deletions ingest/validation/validate_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
import itertools
import math
import pandas as pd
import gzip
import glob

import colorama
from colorama import Fore
Expand Down Expand Up @@ -64,6 +66,52 @@ def backoff_handler(details):
"{kwargs}".format(**details)
)

# handles reading minified ontologies and performing term/synonym lookups
class MinifiedOntologyReader():
    """Read the bundled minified ontologies and perform term lookups by ontology ID.

    On construction, every ``*.min.tsv.gz`` file found in the ``ontologies``
    directory next to this module is parsed into ``parsed_ontologies``, keyed by
    the file name with the ``.min.tsv.gz`` suffix removed.
    """

    # NOTE: defined on the class, so parsed ontologies are shared by all instances
    parsed_ontologies = {}

    # file name suffix that identifies a minified ontology file
    _ONTOLOGY_SUFFIX = ".min.tsv.gz"

    def __init__(self):
        ontology_dir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "ontologies"
        )
        pattern = os.path.join(ontology_dir, f"*{self._ONTOLOGY_SUFFIX}")
        for ontology_file in glob.glob(pattern):
            # derive the name from the file name, independent of path separator
            file_name = os.path.basename(ontology_file)
            ontology_name = file_name[: -len(self._ONTOLOGY_SUFFIX)]
            self.populate_ontology(ontology_name, ontology_file)

    def ontology_names(self):
        """List the names of all ontologies that have been parsed
        :return: list of ontology names
        """
        return list(self.parsed_ontologies.keys())

    @staticmethod
    def _parse_line(line):
        """Split one line of a minified ontology file into its ID and lookup entry
        :param line: tab-delimited line of the form "<id>\\t<label>\\t<syn1>||<syn2>"
        :return: tuple of (ontology ID, {"label": ..., "synonyms": [...]})
        :raises ValueError: if the line does not have exactly three tab-delimited fields
        """
        ontology_id, label, raw_syn = line.rstrip("\n").split("\t")
        # an empty synonym column yields no synonyms, rather than one empty string
        synonyms = [syn for syn in raw_syn.split("||") if syn]
        return ontology_id, {"label": label, "synonyms": synonyms}

    def populate_ontology(self, ontology_name, ontology_file):
        """Parses ontology file by name and populates entries into parsed_ontologies for lookup
        :param ontology_name: name of ontology
        :param ontology_file: path to gzipped, tab-delimited ontology file
        :return: None; the parsed entries are stored in parsed_ontologies[ontology_name]
        """
        dev_logger.debug(f"populating minified ontology {ontology_name} from {ontology_file}")
        ontology = {}
        with gzip.open(ontology_file, 'rt') as file_gz:
            # stream the file line by line instead of loading it all at once
            for line in file_gz:
                try:
                    ontology_id, entry = self._parse_line(line)
                except (ValueError, TypeError) as e:
                    # malformed lines are logged and skipped, not fatal
                    dev_logger.error(f"could not process {line} from {ontology_name}: {e}")
                    continue
                ontology[ontology_id] = entry
        self.parsed_ontologies[ontology_name] = ontology

    def find_ontology_entry(self, ontology_name, identifier, property_name):
        """Find an entry in a parsed ontology by identifier
        :param ontology_name: name of ontology
        :param identifier: ontology ID, e.g. MONDO_0005887
        :param property_name: name of metadata property, e.g. species
        :return: dict with "label" and "synonyms" keys
        :raises ValueError: if the ontology or the identifier is not found
        """
        entry = self.parsed_ontologies.get(ontology_name, {}).get(identifier, {})
        if entry:
            return entry
        # message text is kept as-is; other code or tests may match on it
        msg = f"{property_name}: No match found in EBI OLS for provided ontology ID: {identifier}"
        raise ValueError(msg)


# contains methods for looking up terms in various ontologies,
# as well as caching results of previous queries to speed up performance
Expand Down Expand Up @@ -113,6 +161,10 @@ def retrieve_ontology_term_label_remote(
if property_name == "organ_region":
return self.retrieve_mouse_brain_term(term, property_name)
else:
# leave debug statement for QA purposes later
dev_logger.debug(
f"Using fallback EBI OLS call with {ontology_urls}, {term}, {property_name}"
)
return self.retrieve_ols_term(
ontology_urls, term, property_name, attribute_type
)
Expand Down Expand Up @@ -328,7 +380,7 @@ def get_ontology_file_location(ontology):

# create an OntologyRetriever instance to handle fetching and caching ontology terms
retriever = OntologyRetriever()

minified_reader = MinifiedOntologyReader()

def validate_schema(json, metadata):
"""Check validity of metadata convention as JSON schema.
Expand Down Expand Up @@ -416,6 +468,22 @@ def validate_cells_unique(metadata):
)
return valid

def retrieve_label_and_synonyms(
    ontology_id, property_name, convention, property_type
):
    """Look up the label and synonyms for a term, using the local minified ontology
    when one is available and the remote retriever otherwise
    :param ontology_id: ontology ID, e.g. MONDO_0005887
    :param property_name: name of metadata property, e.g. species
    :param convention: metadata convention being checked against
    :param property_type: attribute type for term (string, array, boolean)
    :return: result of whichever lookup was used (local entry or remote retrieval)
    """
    # the ontology name is the ID prefix before the first "_" or ":", lowercased
    prefix = re.split("[_:]", ontology_id)[0]
    ontology_name = prefix.lower()

    if not ontology_is_local(ontology_name):
        # no local minified copy, so defer to the remote retriever
        return retriever.retrieve_ontology_term_label_and_synonyms(
            ontology_id, property_name, convention, property_type
        )

    return minified_reader.find_ontology_entry(
        ontology_name, ontology_id, property_name
    )

def insert_array_ontology_label_row_data(
property_name, row, metadata, required, convention, ontology_label
Expand All @@ -437,11 +505,8 @@ def insert_array_ontology_label_row_data(
for id in row[property_name]:
label_lookup = ""
try:
label_and_synonyms = (
retriever.retrieve_ontology_term_label_and_synonyms(
id, property_name, convention, "array"
)
)

label_and_synonyms = retrieve_label_and_synonyms(id, property_name, convention, "array")
label_lookup = label_and_synonyms.get('label')
reference_ontology = (
"EBI OLS lookup"
Expand Down Expand Up @@ -494,9 +559,7 @@ def insert_ontology_label_row_data(
# for optional columns, try to fill it in
property_type = convention["properties"][property_name]["type"]
try:
label_and_synonyms = retriever.retrieve_ontology_term_label_and_synonyms(
id, property_name, convention, property_type
)
label_and_synonyms = retrieve_label_and_synonyms(id, property_name, convention, property_type)
label = label_and_synonyms.get('label')
row[ontology_label] = label
reference_ontology = (
Expand Down Expand Up @@ -1056,6 +1119,12 @@ def is_label_or_synonym(labels, provided_label):
else:
return False

def ontology_is_local(ontology_name):
    """Report whether a term can be validated against a local minified ontology
    rather than via OLS
    :param ontology_name: name of ontology
    :return: Boolean
    """
    if ontology_name is None:
        return False
    local_names = minified_reader.ontology_names()
    return ontology_name in local_names

def validate_collected_ontology_data(metadata, convention):
"""Evaluate collected ontology_id, ontology_label info in
Expand All @@ -1080,15 +1149,10 @@ def validate_collected_ontology_data(metadata, convention):

for ontology_info in metadata.ontology[property_name].keys():
ontology_id, ontology_label = ontology_info

try:
attribute_type = convention["properties"][property_name]["type"]
# get actual label along with synonyms for more robust matching
label_and_synonyms = (
retriever.retrieve_ontology_term_label_and_synonyms(
ontology_id, property_name, convention, attribute_type
)
)
label_and_synonyms = retrieve_label_and_synonyms(ontology_id, property_name, convention, attribute_type)

if not is_label_or_synonym(label_and_synonyms, ontology_label):
matched_label_for_id = label_and_synonyms.get("label")
Expand Down
37 changes: 1 addition & 36 deletions schema/alexandria_convention/alexandria_convention_schema.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"$id": "https://singlecell.broadinstitute.org/single_cell/api/v1/metadata_schemas/alexandria_convention/2.3.0/json",
"$id": "https://singlecell.broadinstitute.org/single_cell/api/v1/metadata_schemas/alexandria_convention/3.0.0/json",
"$schema": "https://json-schema.org/draft-07/schema#",
"dependencies": {
"cell_type__ontology_label": [
Expand All @@ -14,9 +14,6 @@
"culture_duration__unit_label": [
"culture_duration__unit"
],
"development_stage__ontology_label": [
"development_stage"
],
"disease__intracellular_pathogen": [
"disease"
],
Expand Down Expand Up @@ -119,16 +116,9 @@
"organism_age__unit_label": [
"organism_age__unit"
],
"race__ontology_label": [
"race"
],
"sequencing_instrument_manufacturer_model__ontology_label": [
"sequencing_instrument_manufacturer_model"
],
"small_molecule_perturbation__concentration": [
"small_molecule_perturbation",
"small_molecule_perturbation__concentration__unit"
],
"small_molecule_perturbation__concentration__unit": [
"small_molecule_perturbation__concentration"
],
Expand Down Expand Up @@ -225,9 +215,6 @@
},
"development_stage": {
"description": "A classification of the developmental stage of the organism",
"ontology": "https://www.ebi.ac.uk/ols/api/ontologies/hsapdv,https://www.ebi.ac.uk/ols/api/ontologies/mmusdv",
"ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/hsapdv,https://www.ebi.ac.uk/ols/ontologies/mmusdv",
"pattern": "^[-A-Za-z0-9]+[_:][-A-Za-z0-9]+",
"type": "string"
},
"development_stage__ontology_label": {
Expand Down Expand Up @@ -374,11 +361,8 @@
"gene_perturbation": {
"description": "A perturbation to a gene done to a cell culture",
"items": {
"ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/ogg",
"pattern": "^[-A-Za-z0-9]+[_:][-A-Za-z0-9]+",
"type": "string"
},
"ontology": "https://www.ebi.ac.uk/ols/api/ontologies/ogg",
"type": "array"
},
"gene_perturbation__direction": {
Expand Down Expand Up @@ -406,9 +390,6 @@
},
"geographical_region": {
"description": "Location where the sample was collected/donated",
"ontology": "https://www.ebi.ac.uk/ols/api/ontologies/gaz",
"ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/gaz",
"pattern": "^[-A-Za-z0-9]+[_:][-A-Za-z0-9]+",
"type": "string"
},
"geographical_region__ontology_label": {
Expand All @@ -419,11 +400,8 @@
"dependency_condition": "sample_type in cell line, organoid, cultured primary cells",
"description": "a growth factor added to a cell culture media",
"items": {
"ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/pr",
"pattern": "^[-A-Za-z0-9]+[_:][-A-Za-z0-9]+",
"type": "string"
},
"ontology": "https://www.ebi.ac.uk/ols/api/ontologies/pr",
"type": "array"
},
"growth_factor_perturbation__concentration": {
Expand Down Expand Up @@ -494,9 +472,6 @@
"mouse_strain": {
"dependency_condition": "species == NCBITaxon_10090",
"description": "Mouse strain of the donor organism (ex. C57BL/6, BALB/c, 129, undetermined)",
"ontology": "https://www.ebi.ac.uk/ols/api/ontologies/ncit",
"ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/ncit",
"pattern": "^[-A-Za-z0-9]+[_:][-A-Za-z0-9]+",
"type": "string"
},
"mouse_strain__ontology_label": {
Expand Down Expand Up @@ -564,14 +539,10 @@
"type": "string"
},
"race": {
"dependency_condition": "species == NCBITaxon_9606",
"description": "An arbitrary classification of a taxonomic group that is a division of a species",
"items": {
"ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/ncit",
"pattern": "^[-A-Za-z0-9]+[_:][-A-Za-z0-9]+",
"type": "string"
},
"ontology": "https://www.ebi.ac.uk/ols/api/ontologies/ncit",
"type": "array"
},
"race__ontology_label": {
Expand Down Expand Up @@ -604,11 +575,8 @@
"small_molecule_perturbation": {
"description": "a small molecule added to a cell culture (ex. A drug) growth factor (and if it is recombinant, concentration), gene)",
"items": {
"ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/chebi",
"pattern": "^[-A-Za-z0-9]+[_:][-A-Za-z0-9]+",
"type": "string"
},
"ontology": "https://www.ebi.ac.uk/ols/api/ontologies/chebi",
"type": "array"
},
"small_molecule_perturbation__concentration": {
Expand Down Expand Up @@ -677,11 +645,8 @@
"vaccination": {
"description": "Any known vaccines administered to the donor organism. NOT a full vaccine history",
"items": {
"ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/vo",
"pattern": "^[-A-Za-z0-9]+[_:][-A-Za-z0-9]+",
"type": "string"
},
"ontology": "https://www.ebi.ac.uk/ols/api/ontologies/vo",
"type": "array"
},
"vaccination__adjuvants": {
Expand Down
Loading