Skip to content

Commit

Permalink
fix!: use correct representation of Coding object in mappings (#221)
Browse files Browse the repository at this point in the history
close #220

* correct `Coding` representation
  * `system` MUST use `iriReference`, not a free-text label
  * `code` MUST use syntax defined by the `system`
  * `id` will use record `concept_id`
* `NAMESPACE_TO_SYSTEM_URI` is now a `MappingProxyType`
* Removes `SYSTEM_URI_TO_NAMESPACE` mapping (since it's no longer
needed)
* Removes unused `NamespacePrefix` members (if needed, will be added
back in #214):
* `COHD`, `DECIPHER`, `HP`, `HPO`, `ICD9`, `ICD11`, `IDO`, `MF`, `MP`,
`MPATH`, `NIFSTD`, `OBI`, `OGMS`, `PATO`, `SCDO`, `WIKIPEDIA`,
`WIKIDATA`
  • Loading branch information
korikuzma authored Jan 29, 2025
1 parent e5d0db0 commit 81a9fd3
Show file tree
Hide file tree
Showing 5 changed files with 149 additions and 123 deletions.
42 changes: 4 additions & 38 deletions src/disease/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@

from botocore.exceptions import ClientError
from ga4gh.core.models import (
Coding,
ConceptMapping,
Extension,
MappableConcept,
Relation,
Expand All @@ -17,16 +15,14 @@
from disease import NAMESPACE_LOOKUP, PREFIX_LOOKUP, SOURCES_LOWER_LOOKUP, __version__
from disease.database.database import AbstractDatabase
from disease.schemas import (
NAMESPACE_TO_SYSTEM_URI,
SYSTEM_URI_TO_NAMESPACE,
Disease,
MatchType,
NamespacePrefix,
NormalizationService,
RefType,
SearchService,
ServiceMeta,
SourceName,
get_concept_mapping,
)

_logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -305,7 +301,7 @@ def _add_merged_meta(self, response: dict) -> dict:

sources = []
for m in disease.mappings or []:
ns = SYSTEM_URI_TO_NAMESPACE.get(m.coding.system, "").lower()
ns = re.split(r"[:_]", m.coding.id, maxsplit=1)[0].lower()
if ns in PREFIX_LOOKUP:
sources.append(PREFIX_LOOKUP[ns])

Expand All @@ -325,36 +321,6 @@ def _add_disease(
:param match_type: type of match achieved
:return: completed normalized response object ready to return to user
"""

def _create_concept_mapping(
concept_id: str, relation: Relation = Relation.RELATED_MATCH
) -> ConceptMapping:
"""Create concept mapping for identifier
``system`` will use OBO Foundry persistent URL (PURL), source homepage, or
namespace prefix, in that order of preference, if available.
:param concept_id: Concept identifier represented as a curie
:param relation: SKOS mapping relationship, default is relatedMatch
:return: Concept mapping for identifier
"""
source = concept_id.split(":")[0]

try:
source = NamespacePrefix(source)
except ValueError:
try:
source = NamespacePrefix(source.upper())
except ValueError as e:
err_msg = f"Namespace prefix not supported: {source}"
raise ValueError(err_msg) from e

system = NAMESPACE_TO_SYSTEM_URI.get(source, source)

return ConceptMapping(
coding=Coding(code=code(concept_id), system=system), relation=relation
)

disease_obj = MappableConcept(
id=f"normalize.disease.{record['concept_id']}",
primaryCode=code(root=record["concept_id"]),
Expand All @@ -365,13 +331,13 @@ def _create_concept_mapping(

xrefs = [record["concept_id"], *record.get("xrefs", [])]
disease_obj.mappings = [
_create_concept_mapping(xref_id, relation=Relation.EXACT_MATCH)
get_concept_mapping(xref_id, relation=Relation.EXACT_MATCH)
for xref_id in xrefs
]

associated_with = record.get("associated_with", [])
disease_obj.mappings.extend(
_create_concept_mapping(associated_with_id, relation=Relation.RELATED_MATCH)
get_concept_mapping(associated_with_id, relation=Relation.RELATED_MATCH)
for associated_with_id in associated_with
)

Expand Down
146 changes: 87 additions & 59 deletions src/disease/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,16 @@

import datetime
from enum import Enum, IntEnum
from types import MappingProxyType
from typing import Literal

from ga4gh.core.models import MappableConcept
from ga4gh.core.models import (
Coding,
ConceptMapping,
MappableConcept,
Relation,
code,
)
from pydantic import BaseModel, ConfigDict, StrictBool, StrictStr

from disease import __version__
Expand Down Expand Up @@ -55,72 +62,89 @@ class NamespacePrefix(Enum):
OMIM = "MIM"
ONCOTREE = "oncotree"
# external sources
COHD = "cohd"
DECIPHER = "decipher"
EFO = "efo"
GARD = "gard"
HP = "HP"
HPO = HP
ICD9 = "icd9"
ICD9CM = "icd9.cm"
ICD10 = "icd10"
ICD10WHO = ICD10
ICD10CM = "icd10.cm"
ICD11 = "icd11"
ICDO = "icdo"
IDO = "ido"
IMDRF = "imdrf"
KEGG = "kegg.disease"
MEDDRA = "meddra"
MEDGEN = "medgen"
MESH = "mesh"
MF = "mf"
MP = "MP"
MPATH = "mpath"
NIFSTD = "nifstd"
OBI = "obi"
OGMS = "ogms"
ORPHANET = "orphanet"
PATO = "pato"
SCDO = "scdo"
UMLS = "umls"
WIKIPEDIA = "wikipedia.en"
WIKIDATA = "wikidata"


# Source to URI. Will use OBO Foundry persistent URL (PURL) or source homepage
NAMESPACE_TO_SYSTEM_URI: dict[NamespacePrefix, str] = {
NamespacePrefix.NCIT: "http://purl.obolibrary.org/obo/ncit.owl",
NamespacePrefix.MONDO: "http://purl.obolibrary.org/obo/mondo.owl",
NamespacePrefix.DO: "http://purl.obolibrary.org/obo/doid.owl",
NamespacePrefix.DOID: "http://purl.obolibrary.org/obo/doid.owl",
NamespacePrefix.OMIM: "https://www.omim.org",
NamespacePrefix.ONCOTREE: "https://oncotree.mskcc.org",
NamespacePrefix.COHD: "https://cohd.io",
NamespacePrefix.DECIPHER: "https://www.deciphergenomics.org",
NamespacePrefix.EFO: "https://www.ebi.ac.uk/efo/",
NamespacePrefix.GARD: "https://rarediseases.info.nih.gov",
NamespacePrefix.HP: "http://purl.obolibrary.org/obo/hp.owl",
NamespacePrefix.HPO: "http://purl.obolibrary.org/obo/hp.owl",
NamespacePrefix.ICD11: "https://icd.who.int/en/",
NamespacePrefix.ICDO: "https://www.who.int/standards/classifications/other-classifications/international-classification-of-diseases-for-oncology/",
NamespacePrefix.KEGG: "https://www.genome.jp/kegg/disease/",
NamespacePrefix.MEDDRA: "https://www.meddra.org",
NamespacePrefix.MEDGEN: "https://www.ncbi.nlm.nih.gov/medgen/",
NamespacePrefix.MESH: "https://id.nlm.nih.gov/mesh/",
NamespacePrefix.MP: "http://purl.obolibrary.org/obo/mp.owl",
NamespacePrefix.OBI: "http://purl.obolibrary.org/obo/obi.owl",
NamespacePrefix.ORPHANET: "https://www.orpha.net",
NamespacePrefix.PATO: "http://purl.obolibrary.org/obo/pato.owl",
NamespacePrefix.UMLS: "https://www.nlm.nih.gov/research/umls/index.html",
NamespacePrefix.WIKIPEDIA: "https://en.wikipedia.org",
NamespacePrefix.WIKIDATA: "https://www.wikidata.org",
}

# URI to source
SYSTEM_URI_TO_NAMESPACE = {
system_uri: ns.value for ns, system_uri in NAMESPACE_TO_SYSTEM_URI.items()
}


# Read-only lookup from namespace prefix to the URI placed in ``Coding.system``.
# Each value is a system URI prefix, an OBO Foundry persistent URL (PURL), or the
# source homepage. Wrapped in ``MappingProxyType`` so entries cannot be added or
# reassigned through this name.
NAMESPACE_TO_SYSTEM_URI: MappingProxyType[NamespacePrefix, str] = MappingProxyType(
{
NamespacePrefix.NCIT: "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&code=",
NamespacePrefix.MONDO: "https://purl.obolibrary.org/obo/",
NamespacePrefix.DO: "https://disease-ontology.org/?id=",
NamespacePrefix.DOID: "https://disease-ontology.org/?id=",
NamespacePrefix.OMIM: "https://omim.org/MIM:",
NamespacePrefix.ONCOTREE: "https://oncotree.mskcc.org/?version=oncotree_latest_stable&field=CODE&search=",
NamespacePrefix.EFO: "http://www.ebi.ac.uk/efo/EFO_",
NamespacePrefix.GARD: "https://rarediseases.info.nih.gov",
NamespacePrefix.ICD9CM: "https://archive.cdc.gov/www_cdc_gov/nchs/icd/icd9cm.htm",
NamespacePrefix.ICD10: "https://icd.who.int/browse10/2016/en#/",
NamespacePrefix.ICD10CM: "https://www.cdc.gov/nchs/icd/icd-10-cm/index.html",
NamespacePrefix.ICD10WHO: "https://icd.who.int/browse10/2016/en#/",
NamespacePrefix.ICDO: "https://www.who.int/standards/classifications/other-classifications/international-classification-of-diseases-for-oncology/",
NamespacePrefix.IMDRF: "https://www.imdrf.org/",
NamespacePrefix.KEGG: "https://www.genome.jp/kegg/disease/",
NamespacePrefix.MEDDRA: "https://bioportal.bioontology.org/ontologies/MEDDRA?p=classes&conceptid=",
NamespacePrefix.MEDGEN: "https://www.ncbi.nlm.nih.gov/medgen/",
NamespacePrefix.MESH: "https://meshb.nlm.nih.gov/record/ui?ui=",
NamespacePrefix.ORPHANET: "https://www.orpha.net",
NamespacePrefix.UMLS: "https://www.nlm.nih.gov/research/umls/index.html",
}
)


def get_concept_mapping(
    concept_id: str, relation: Relation = Relation.RELATED_MATCH
) -> ConceptMapping:
    """Get concept mapping for CURIE identifier

    ``system`` will use system prefix URL, OBO Foundry persistent URL (PURL), or
    source homepage, in that order of preference.

    :param concept_id: Concept identifier represented as a curie
    :param relation: SKOS mapping relationship, default is relatedMatch
    :raises ValueError: If ``concept_id`` has no ``:`` separator, or if source of
        concept ID is not a valid ``NamespacePrefix``
    :return: Concept mapping for identifier
    """
    # Split on the first colon only: everything after it is the reference, which
    # may itself contain colons. A missing separator is reported explicitly
    # instead of surfacing as a tuple-unpacking error.
    prefix, separator, source_code = concept_id.partition(":")
    if not separator:
        err_msg = f"Concept ID is not a valid CURIE: {concept_id}"
        raise ValueError(err_msg)

    # Try the prefix as given first, then upper-cased, before giving up.
    try:
        source = NamespacePrefix(prefix)
    except ValueError:
        try:
            source = NamespacePrefix(prefix.upper())
        except ValueError as e:
            err_msg = f"Namespace prefix not supported: {prefix}"
            raise ValueError(err_msg) from e

    id_ = concept_id

    if source == NamespacePrefix.MONDO:
        # MONDO: code is the upper-cased CURIE; id swaps the colon for an
        # underscore (e.g. code "MONDO:0004355", id "MONDO_0004355").
        source_code = concept_id.upper()
        id_ = source_code.replace(":", "_")
    elif source == NamespacePrefix.DOID:
        # DOID: code keeps the full CURIE (e.g. "DOID:7757").
        source_code = concept_id

    return ConceptMapping(
        coding=Coding(
            id=id_,
            code=code(source_code),
            system=NAMESPACE_TO_SYSTEM_URI[source],
        ),
        relation=relation,
    )


class SourcePriority(IntEnum):
Expand Down Expand Up @@ -314,28 +338,32 @@ class NormalizationService(BaseModel):
"mappings": [
{
"coding": {
"code": "ncit:C4989",
"system": "https://www.ebi.ac.uk/ols4/ontologies/ncit/classes?short_form=NCIT_",
"id": "ncit:C4989",
"code": "C4989",
"system": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&code=",
},
"relation": "exactMatch",
},
{
"coding": {
"code": "mondo:0004355",
"system": "http://purl.obolibrary.org/obo/mondo.owl",
"id": "MONDO_0004355",
"code": "MONDO:0004355",
"system": "https://purl.obolibrary.org/obo/",
},
"relation": "exactMatch",
},
{
"coding": {
"id": "DOID:7757",
"code": "DOID:7757",
"system": "http://purl.obolibrary.org/obo/doid.owl",
"system": "https://disease-ontology.org/?id=",
},
"relation": "exactMatch",
},
{
"coding": {
"code": "umls:C1332977",
"id": "umls:C1332977",
"code": "C1332977",
"system": "https://www.nlm.nih.gov/research/umls/index.html",
},
"relation": "relatedMatch",
Expand Down
1 change: 1 addition & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def pytest_collection_modifyitems(items):
When creating new test modules, be sure to add them here.
"""
module_order = [
"test_schemas",
"test_mondo",
"test_do",
"test_ncit",
Expand Down
Loading

0 comments on commit 81a9fd3

Please sign in to comment.