Skip to content

Commit 18b8124

Browse files
authored
feat!: use system identifiers from source for normalization (#433)
close #431 * Rather than returning the highest match for a list of queries, use system identifiers for source * Input is now a single query, rather than a list
1 parent ba2cc76 commit 18b8124

File tree

9 files changed

+479
-312
lines changed

9 files changed

+479
-312
lines changed

src/metakb/normalizers.py

Lines changed: 75 additions & 153 deletions
Original file line numberDiff line numberDiff line change
@@ -83,188 +83,75 @@ def __init__(self, db_url: str | None = None) -> None:
8383
self.therapy_query_handler = TherapyQueryHandler(create_therapy_db(db_url))
8484

8585
async def normalize_variation(
86-
self, queries: list[str]
86+
self, query: str
8787
) -> Allele | CopyNumberChange | CopyNumberCount | None:
88-
"""Normalize variation queries.
88+
"""Attempt to normalize a variation query
8989
90-
:param queries: Candidate query strings to attempt to normalize. Should be
91-
provided in order of preference, as the result of the first one to normalize
92-
successfully will be returned. Use in the event that a prioritized MANE
93-
transcript is unavailable and multiple possible candidates are known.
90+
:param query: Variation query to normalize
9491
:raises TokenRetrievalError: If AWS credentials are expired
9592
:return: A normalized variation, if available.
9693
"""
97-
for query in queries:
98-
if not query:
99-
continue
100-
try:
101-
variation_norm_resp = (
102-
await self.variation_normalizer.normalize_handler.normalize(query)
103-
)
104-
if variation_norm_resp and variation_norm_resp.variation:
105-
return variation_norm_resp.variation
106-
except TokenRetrievalError as e:
107-
_logger.error(e)
108-
raise e
109-
except Exception as e:
110-
_logger.error(
111-
"Variation Normalizer raised an exception using query %s: %s",
112-
query,
113-
e,
114-
)
94+
try:
95+
variation_norm_resp = (
96+
await self.variation_normalizer.normalize_handler.normalize(query)
97+
)
98+
if variation_norm_resp and variation_norm_resp.variation:
99+
return variation_norm_resp.variation
100+
except TokenRetrievalError as e:
101+
_logger.error(e)
102+
raise e
103+
except Exception as e:
104+
_logger.error(
105+
"Variation Normalizer raised an exception using query %s: %s",
106+
query,
107+
e,
108+
)
115109
return None
116110

117-
def normalize_gene(
118-
self, queries: list[str]
119-
) -> tuple[NormalizedGene | None, str | None]:
120-
"""Normalize gene queries.
121-
122-
Given a collection of terms, return the normalized concept with the highest
123-
match (see the
124-
`Gene Normalizer docs <https://gene-normalizer.readthedocs.io/latest/usage.html#match-types>`_ for
125-
more details on match types, and how queries are resolved).
111+
def normalize_gene(self, query: str) -> tuple[NormalizedGene, str | None]:
112+
"""Attempt to normalize a gene query
126113
127114
>>> from metakb.normalizers import ViccNormalizers
128115
>>> v = ViccNormalizers()
129-
>>> gene_terms = [
130-
... "gibberish", # won't match
131-
... "NETS", # alias
132-
... "hgnc:1097", # HGNC identifier for BRAF
133-
... "MARCH3", # previous symbol
134-
... ]
135-
>>> v.normalize_gene(gene_terms)[0].normalized_id
116+
>>> v.normalize_gene("BRAF")[1]
136117
'hgnc:1097'
137118
138-
:param queries: A list of possible gene terms to normalize. Order is irrelevant,
139-
except for breaking ties (choose earlier if equal).
119+
:param query: Gene query to normalize
140120
:raises TokenRetrievalError: If AWS credentials are expired
141-
:return: The highest matched gene's normalized response and ID
121+
:return: Gene normalization response and normalized gene ID, if available.
142122
"""
143-
gene_norm_resp = None
144-
normalized_gene_id = None
145-
highest_match = 0
146-
for query_str in queries:
147-
if not query_str:
148-
continue
123+
return self._normalize_concept(query, self.gene_query_handler, "gene")
149124

150-
try:
151-
gene_norm_resp = self.gene_query_handler.normalize(query_str)
152-
except TokenRetrievalError as e:
153-
_logger.error(e)
154-
raise e
155-
except Exception as e:
156-
_logger.error(
157-
"Gene Normalizer raised an exception using query %s: %s",
158-
query_str,
159-
e,
160-
)
161-
else:
162-
if gene_norm_resp.match_type > highest_match:
163-
highest_match = gene_norm_resp.match_type
164-
normalized_gene_id = gene_norm_resp.gene.primaryCode.root
165-
if highest_match == 100:
166-
break
167-
return gene_norm_resp, normalized_gene_id
168-
169-
def normalize_disease(
170-
self, queries: list[str]
171-
) -> tuple[NormalizedDisease | None, str | None]:
172-
"""Normalize disease queries.
125+
def normalize_disease(self, query: str) -> tuple[NormalizedDisease, str | None]:
126+
"""Attempt to normalize a disease query
173127
174128
Given a single term, return the normalized concept that it
175129
matches.
176130
177131
>>> from metakb.normalizers import ViccNormalizers
178132
>>> v = ViccNormalizers()
179-
>>> disease_terms = [
180-
... "AML", # alias
181-
... "von hippel-lindau syndrome", # alias
182-
... "ncit:C9384", # concept ID
183-
... ]
184-
>>> v.normalize_disease(disease_terms)[0].normalized_id
185-
'ncit:C9384'
186-
187-
:param queries: Disease queries to normalize. Order is irrelevant, except for
188-
breaking ties (choose earlier if equal).
133+
>>> v.normalize_disease("von hippel-lindau syndrome")[1]
134+
'ncit:C3105'
135+
136+
:param query: Disease query to normalize
189137
:raises TokenRetrievalError: If AWS credentials are expired
190-
:return: The highest matched disease's normalized response and ID
138+
:return: Disease normalization response and normalized disease ID, if available.
191139
"""
192-
highest_match = 0
193-
normalized_disease_id = None
194-
disease_norm_resp = None
195-
196-
for query in queries:
197-
if not query:
198-
continue
199-
200-
try:
201-
disease_norm_resp = self.disease_query_handler.normalize(query)
202-
except TokenRetrievalError as e:
203-
_logger.error(e)
204-
raise e
205-
except Exception as e:
206-
_logger.error(
207-
"Disease Normalizer raised an exception using query %s: %s",
208-
query,
209-
e,
210-
)
211-
else:
212-
if disease_norm_resp.match_type > highest_match:
213-
highest_match = disease_norm_resp.match_type
214-
normalized_disease_id = disease_norm_resp.disease.primaryCode.root
215-
if highest_match == 100:
216-
break
217-
return disease_norm_resp, normalized_disease_id
218-
219-
def normalize_therapy(
220-
self, queries: list[str]
221-
) -> tuple[NormalizedTherapy | None, str | None]:
222-
"""Normalize therapy queries
140+
return self._normalize_concept(query, self.disease_query_handler, "disease")
223141

224-
Given a collection of terms, return the normalized concept with the highest
225-
match.
142+
def normalize_therapy(self, query: str) -> tuple[NormalizedTherapy, str | None]:
143+
"""Attempt to normalize a therapy query
226144
227145
>>> from metakb.normalizers import ViccNormalizers
228146
>>> v = ViccNormalizers()
229-
>>> therapy_terms = [
230-
... "VAZALORE", # trade name
231-
... "RHUMAB HER2", # alias
232-
... "rxcui:5032", # concept ID
233-
... ]
234-
>>> v.normalize_therapy(therapy_terms)[0].normalized_id
235-
'rxcui:5032'
236-
237-
:param queries: Therapy queries to normalize. Order is irrelevant, except for
238-
breaking ties (choose earlier term if equal).
147+
>>> v.normalize_therapy("VAZALORE")[1]
148+
'rxcui:1191'
149+
150+
:param query: Therapy query to normalize
239151
:raises TokenRetrievalError: If AWS credentials are expired
240-
:return: The highest matched therapy's normalized response and ID
152+
:return: Therapy normalization response and normalized therapy ID, if available.
241153
"""
242-
highest_match = 0
243-
normalized_therapy_id = None
244-
therapy_norm_resp = None
245-
246-
for query in queries:
247-
if not query:
248-
continue
249-
250-
try:
251-
therapy_norm_resp = self.therapy_query_handler.normalize(query)
252-
except TokenRetrievalError as e:
253-
_logger.error(e)
254-
raise e
255-
except Exception as e:
256-
_logger.error(
257-
"Therapy Normalizer raised an exception using query %s: %s",
258-
query,
259-
e,
260-
)
261-
else:
262-
if therapy_norm_resp.match_type > highest_match:
263-
highest_match = therapy_norm_resp.match_type
264-
normalized_therapy_id = therapy_norm_resp.therapy.primaryCode.root
265-
if highest_match == 100:
266-
break
267-
return therapy_norm_resp, normalized_therapy_id
154+
return self._normalize_concept(query, self.therapy_query_handler, "therapy")
268155

269156
@staticmethod
270157
def get_regulatory_approval_extension(
@@ -331,6 +218,41 @@ def get_regulatory_approval_extension(
331218

332219
return regulatory_approval_extension
333220

221+
@staticmethod
222+
def _normalize_concept(
223+
query: str,
224+
query_handler: GeneQueryHandler | DiseaseQueryHandler | TherapyQueryHandler,
225+
concept_name: str,
226+
) -> tuple[NormalizedGene | NormalizedDisease | NormalizedTherapy, str | None]:
227+
"""Attempt to normalize a concept
228+
229+
:param query: Query to normalize
230+
:param query_handler: Query handler for normalizer
231+
:param concept_name: Name of concept (gene, disease, therapy)
232+
:raises TokenRetrievalError: If AWS credentials are expired
233+
:return: Normalizer response and normalized ID, if available.
234+
"""
235+
normalizer_resp = None
236+
normalized_id = None
237+
238+
try:
239+
normalizer_resp = query_handler.normalize(query)
240+
except TokenRetrievalError as e:
241+
_logger.error(e)
242+
raise e
243+
except Exception as e:
244+
_logger.error(
245+
"%s Normalizer raised an exception using query %s: %s",
246+
concept_name.capitalize(),
247+
query,
248+
e,
249+
)
250+
else:
251+
if normalizer_resp.match_type:
252+
normalized_id = getattr(normalizer_resp, concept_name).primaryCode.root
253+
254+
return normalizer_resp, normalized_id
255+
334256

335257
class NormalizerName(str, Enum):
336258
"""Constrain normalizer CLI options."""

src/metakb/query.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ def _get_normalized_therapy(self, therapy: str, warnings: list[str]) -> str | No
326326
:param warnings: A list of warnings for the search query
327327
:return: A normalized therapy concept if it exists
328328
"""
329-
_, normalized_therapy_id = self.vicc_normalizers.normalize_therapy([therapy])
329+
_, normalized_therapy_id = self.vicc_normalizers.normalize_therapy(therapy)
330330

331331
if not normalized_therapy_id:
332332
warnings.append(f"Therapy Normalizer unable to normalize: " f"{therapy}")
@@ -339,7 +339,7 @@ def _get_normalized_disease(self, disease: str, warnings: list[str]) -> str | No
339339
:param warnings: A list of warnings for the search query
340340
:return: A normalized disease concept if it exists
341341
"""
342-
_, normalized_disease_id = self.vicc_normalizers.normalize_disease([disease])
342+
_, normalized_disease_id = self.vicc_normalizers.normalize_disease(disease)
343343

344344
if not normalized_disease_id:
345345
warnings.append(f"Disease Normalizer unable to normalize: " f"{disease}")
@@ -354,7 +354,7 @@ async def _get_normalized_variation(
354354
:param warnings: A list of warnings for the search query
355355
:return: A normalized variant concept if it exists
356356
"""
357-
variant_norm_resp = await self.vicc_normalizers.normalize_variation([variation])
357+
variant_norm_resp = await self.vicc_normalizers.normalize_variation(variation)
358358
normalized_variation = variant_norm_resp.id if variant_norm_resp else None
359359

360360
if not normalized_variation:
@@ -374,7 +374,7 @@ def _get_normalized_gene(self, gene: str, warnings: list[str]) -> str | None:
374374
:param warnings: A list of warnings for the search query.
375375
:return: A normalized gene concept if it exists
376376
"""
377-
_, normalized_gene_id = self.vicc_normalizers.normalize_gene([gene])
377+
_, normalized_gene_id = self.vicc_normalizers.normalize_gene(gene)
378378
if not normalized_gene_id:
379379
warnings.append(f"Gene Normalizer unable to normalize: {gene}")
380380
return normalized_gene_id

src/metakb/transformers/base.py

Lines changed: 48 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,12 @@
3939
NormalizeService as NormalizedGene,
4040
)
4141
from pydantic import BaseModel, Field, StrictStr, ValidationError
42-
from therapy.schemas import NormalizationService as NormalizedTherapy
42+
from therapy.schemas import (
43+
NamespacePrefix as TherapyNamespacePrefix,
44+
)
45+
from therapy.schemas import (
46+
NormalizationService as NormalizedTherapy,
47+
)
4348

4449
from metakb import APP_ROOT, DATE_FMT
4550
from metakb.harvesters.base import _HarvestedData
@@ -603,33 +608,63 @@ def _update_mapping(
603608
normalizer_label = normalizer_resp_obj.label
604609
is_disease = isinstance(normalizer_resp, NormalizedDisease)
605610
is_gene = isinstance(normalizer_resp, NormalizedGene)
611+
is_therapy = isinstance(normalizer_resp, NormalizedTherapy)
606612

607613
normalizer_mappings = normalizer_resp_obj.mappings or []
608614
for mapping in normalizer_mappings:
609615
if normalized_id == mapping.coding.id:
610616
mappings.append(
611-
_update_mapping(mapping, normalized_id, normalizer_label)
617+
_update_mapping(
618+
mapping,
619+
normalized_id,
620+
normalizer_label,
621+
match_on_coding_id=True,
622+
)
612623
)
613624
else:
614-
if (
615-
is_disease
616-
and mapping.coding.code.root.lower().startswith(
617-
DiseaseNamespacePrefix.MONDO.value
618-
)
619-
) or (
620-
is_gene
621-
and mapping.coding.id.startswith(
622-
(GeneNamespacePrefix.NCBI.value, GeneNamespacePrefix.HGNC.value)
623-
)
625+
if is_disease and mapping.coding.code.root.lower().startswith(
626+
DiseaseNamespacePrefix.MONDO.value
624627
):
625628
mappings.append(
626629
_update_mapping(
627630
mapping,
628631
normalized_id,
629632
normalizer_label,
630-
match_on_coding_id=is_gene,
633+
match_on_coding_id=False,
631634
)
632635
)
636+
else:
637+
if (
638+
(
639+
is_gene
640+
and mapping.coding.id.startswith(
641+
(
642+
GeneNamespacePrefix.NCBI.value,
643+
GeneNamespacePrefix.HGNC.value,
644+
)
645+
)
646+
)
647+
or (
648+
is_disease
649+
and mapping.coding.id.startswith(
650+
DiseaseNamespacePrefix.DOID.value
651+
)
652+
)
653+
or (
654+
is_therapy
655+
and mapping.coding.id.startswith(
656+
TherapyNamespacePrefix.NCIT.value
657+
)
658+
)
659+
):
660+
mappings.append(
661+
_update_mapping(
662+
mapping,
663+
normalized_id,
664+
normalizer_label,
665+
match_on_coding_id=True,
666+
)
667+
)
633668
return mappings
634669

635670
def create_json(self, cdm_filepath: Path | None = None) -> None:

0 commit comments

Comments
 (0)