Skip to content

Commit 2479dcc

Browse files
committed
Catalogue concept ES modifications
1 parent 74cd56f commit 2479dcc

File tree

4 files changed

+72
-10
lines changed

4 files changed

+72
-10
lines changed

catalogue_graph/src/ingestor_loader.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,13 @@ def extract_data(start_offset: int, end_index: int, is_local: bool) -> list[dict
3636

3737
open_cypher_match_query = f"""
3838
MATCH (concept:Concept)
39-
OPTIONAL MATCH (concept)-[:HAS_SOURCE_CONCEPT]->(linked_source_concept)-[:SAME_AS*0..]->(source_concept)
40-
RETURN concept, collect(DISTINCT source_concept) AS source_concepts
41-
ORDER BY concept.id
39+
WITH concept ORDER BY concept.id
4240
SKIP {start_offset} LIMIT {limit}
41+
OPTIONAL MATCH (concept)-[:HAS_SOURCE_CONCEPT]->(linked_source_concept)-[:SAME_AS*0..]->(source_concept)
42+
RETURN
43+
concept,
44+
collect(DISTINCT linked_source_concept) AS linked_source_concepts,
45+
collect(DISTINCT source_concept) AS source_concepts
4346
"""
4447

4548
print("Running query:")

catalogue_graph/src/models/catalogue_concept.py

+17-6
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
def get_priority_source_concept_value(values: dict) -> Any:
88
# Sources sorted by priority
9-
for source in ["nlm-mesh", "lc-names", "lc-subjects", "wikidata"]:
9+
for source in ["nlm-mesh", "lc-names", "lc-subjects", "wikidata", "label-derived"]:
1010
if (value := values.get(source)) is not None:
1111
return value
1212

@@ -26,15 +26,20 @@ class CatalogueConcept(BaseModel):
2626

2727
@classmethod
2828
def from_neptune_result(cls, data: dict) -> "CatalogueConcept":
29+
labels = {}
2930
descriptions = {}
3031
identifiers = []
3132
alternative_labels = []
32-
for source_concept in data["source_concepts"]:
33-
properties = source_concept["~properties"]
34-
source = properties["source"]
3533

36-
descriptions[source] = properties.get("description")
34+
labels["label-derived"] = data["concept"]["~properties"].get("label", "")
3735

36+
# For now, extract labels and alternative labels only from source concepts which are explicitly linked
37+
# to the concept via HAS_SOURCE_CONCEPT edges
38+
for source_concept in data["linked_source_concepts"]:
39+
properties = source_concept["~properties"]
40+
source = properties["source"]
41+
42+
labels[source] = properties.get("label")
3843
identifiers.append(
3944
CatalogueConceptIdentifier(
4045
value=properties["id"],
@@ -46,10 +51,16 @@ def from_neptune_result(cls, data: dict) -> "CatalogueConcept":
4651
if len(label) > 0:
4752
alternative_labels.append(label)
4853

54+
# Extract descriptions from _all_ source concepts (utilising both HAS_SOURCE_CONCEPT and SAME_AS edges)
55+
for source_concept in data["source_concepts"]:
56+
properties = source_concept["~properties"]
57+
source = properties["source"]
58+
descriptions[source] = properties.get("description")
59+
4960
return CatalogueConcept(
5061
id=data["concept"]["~properties"]["id"],
51-
label=data["concept"]["~properties"].get("label", ""),
5262
type=data["concept"]["~properties"]["type"],
63+
label=get_priority_source_concept_value(labels),
5364
alternativeLabels=alternative_labels,
5465
description=get_priority_source_concept_value(descriptions),
5566
identifiers=identifiers,

catalogue_graph/tests/models/test_catalogue_concept.py

+30-1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,26 @@ def test_catalogue_concept_from_neptune_result() -> None:
1111
"source": "lc-subjects",
1212
"alternative_labels": "alternativeLabels||moreAlternativeLabels",
1313
"description": "description",
14+
"label": "Priority label"
15+
}
16+
},
17+
{
18+
"~properties": {
19+
"id": "id",
20+
"source": "wikidata",
21+
"alternative_labels": "invisibleAlternativeLabel",
22+
"description": "Non-priority description",
23+
}
24+
}
25+
],
26+
"linked_source_concepts": [
27+
{
28+
"~properties": {
29+
"id": "id",
30+
"source": "lc-subjects",
31+
"alternative_labels": "alternativeLabels||moreAlternativeLabels",
32+
"description": "description",
33+
"label": "Priority label"
1434
}
1535
}
1636
],
@@ -21,7 +41,7 @@ def test_catalogue_concept_from_neptune_result() -> None:
2141
identifiers=[
2242
CatalogueConceptIdentifier(value="id", identifierType="lc-subjects")
2343
],
24-
label="label",
44+
label="Priority label",
2545
alternativeLabels=["alternativeLabels", "moreAlternativeLabels"],
2646
description="description",
2747
type="type",
@@ -46,6 +66,15 @@ def test_catalogue_concept_from_neptune_result_without_alternative_labels() -> N
4666
}
4767
}
4868
],
69+
"linked_source_concepts": [
70+
{
71+
"~properties": {
72+
"id": "id",
73+
"source": "nlm-mesh",
74+
"description": "description",
75+
}
76+
}
77+
],
4978
}
5079

5180
assert CatalogueConcept.from_neptune_result(neptune_result) == CatalogueConcept(

catalogue_graph/tests/test_ingestor_loader.py

+19
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,16 @@ def build_test_matrix() -> list[tuple]:
4141
}
4242
}
4343
],
44+
"linked_source_concepts": [
45+
{
46+
"~properties": {
47+
"id": "456",
48+
"source": "lc-names",
49+
"alternative_labels": "alternative_label||another_alternative_label",
50+
"description": "description",
51+
}
52+
}
53+
],
4454
}
4555
]
4656
},
@@ -92,6 +102,15 @@ def build_test_matrix() -> list[tuple]:
92102
}
93103
}
94104
],
105+
"linked_source_concepts": [
106+
{
107+
"~properties": {
108+
"id": "456",
109+
"source": "lc-names",
110+
"description": "description",
111+
}
112+
}
113+
],
95114
}
96115
]
97116
},

0 commit comments

Comments
 (0)