Skip to content

Commit f5bb4cb

Browse files
authored
Refactor sanitization to NCName‑safe method for RDF properties and label lookups (#96)
* Replace over‑sanitizing remove_special_characters with ncname_safe
* Reformatting
* Update assertion statements
1 parent 919ac99 commit f5bb4cb

File tree

4 files changed

+36
-14
lines changed

4 files changed

+36
-14
lines changed

pandasaurus_cxg/graph_generator/graph_generator.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
generate_subgraph,
2525
get_cxg_dataset_url,
2626
parse_citation_field_into_dict,
27-
remove_special_characters,
27+
ncname_safe,
2828
select_node_with_property,
2929
)
3030
from pandasaurus_cxg.graph_generator.graph_namespaces import prefixes
@@ -115,13 +115,13 @@ def generate_rdf_graph(self, merge: bool = False):
115115
self.graph.add(
116116
(
117117
dataset_class,
118-
URIRef(self.ns[remove_special_characters(citation_key)]),
118+
URIRef(self.ns[ncname_safe(citation_key)]),
119119
Literal(citation_value),
120120
)
121121
)
122122

123123
self.graph.add(
124-
(dataset_class, URIRef(self.ns[remove_special_characters(key)]), Literal(value))
124+
(dataset_class, URIRef(self.ns[ncname_safe(key)]), Literal(value))
125125
)
126126
has_source = URIRef(HAS_SOURCE["iri"])
127127
self.graph.add((has_source, RDFS.label, Literal(HAS_SOURCE["label"])))
@@ -174,7 +174,7 @@ def generate_rdf_graph(self, merge: bool = False):
174174
for k, v in inner_dict.items():
175175
if k in {"subcluster_of", "cluster_matches"}:
176176
continue
177-
self.graph.add((resource, self.ns[remove_special_characters(k)], Literal(v)))
177+
self.graph.add((resource, self.ns[ncname_safe(k)], Literal(v)))
178178

179179
# add relationship between each resource based on their predicate in the co_annotation_report
180180
subcluster = URIRef(SUBCLUSTER_OF.get("iri"))
@@ -187,7 +187,7 @@ def generate_rdf_graph(self, merge: bool = False):
187187
# Iterate through the key-predicate map and apply the same logic for both keys
188188
for key, predicate_object in key_predicate_map.items():
189189
for ik, iv in inner_dict.get(key, {}).items():
190-
predicate = self.ns[remove_special_characters(ik)]
190+
predicate = self.ns[ncname_safe(ik)]
191191
for s, _, _ in self.graph.triples((None, predicate, Literal(iv))):
192192
self.graph.add((resource, predicate_object, s))
193193

pandasaurus_cxg/graph_generator/graph_generator_utils.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,28 @@ def select_node_with_property(graph: Graph, _property: str, value: str):
127127
return [str(s) for s in graph.subjects(predicate=ns[_property], object=Literal(value))]
128128

129129

130-
def remove_special_characters(input_string: str) -> str:
131-
return re.sub(r"[^a-zA-Z0-9_]", "", input_string.replace(" ", "_"))
130+
def ncname_safe(term: str) -> str:
    """Convert ``term`` into a string usable as the local part of an XML QName.

    The conversion runs in three passes:

    * every space character is turned into an underscore;
    * any leading run of characters other than ASCII letters or ``_`` is
      dropped, because a local name may not begin with them;
    * each remaining character outside ASCII letters, digits, ``_``, ``-``
      and ``.`` is turned into an underscore.

    Args:
        term: The raw string to convert.

    Returns:
        The converted string. It is empty when, after the space replacement,
        the input contains no ASCII letter or underscore at all.

    """
    # Splitting on the literal space and re-joining swaps each space for "_".
    underscored = "_".join(term.split(" "))
    # Discard everything in front of the first ASCII letter or underscore.
    without_bad_prefix = re.sub(r'^[^A-Za-z_]+', '', underscored)
    # Neutralise whatever is still outside the permitted character set.
    return re.sub(r'[^A-Za-z0-9_\-\.]', '_', without_bad_prefix)
132152

133153

134154
def parse_citation_field_into_dict(value: str) -> Dict[str, str]:

test/graph_generator/test_graph_generator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,7 @@ def test_enrich_rdf_graph_with_merge(graph_generator_instance_for_kidney):
274274

275275
graph_generator.enrich_rdf_graph()
276276

277-
assert len(graph_generator.graph) == 1246
277+
assert len(graph_generator.graph) == 1242
278278
assert (
279279
URIRef(CONSIST_OF.get("iri")),
280280
RDFS.label,
@@ -303,7 +303,7 @@ def test_enrich_rdf_graph_without_merge(graph_generator_instance_for_kidney):
303303

304304
graph_generator.enrich_rdf_graph()
305305

306-
assert len(graph_generator.graph) == 2676
306+
assert len(graph_generator.graph) == 2674
307307

308308

309309
def test_save_rdf_graph(graph_generator_instance_for_kidney):

test/graph_generator/test_graph_generator_utils.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
add_outgoing_edges_to_subgraph,
99
find_and_rotate_center_layout,
1010
generate_subgraph,
11-
remove_special_characters,
11+
ncname_safe,
1212
select_node_with_property,
1313
)
1414
from pandasaurus_cxg.graph_generator.graph_predicates import (
@@ -190,12 +190,14 @@ def test_select_node_with_property_predicate():
190190
@pytest.mark.parametrize(
191191
"input_string, expected_output",
192192
[
193-
("Hello World!", "Hello_World"),
194-
("123abc$%^", "123abc"),
193+
("Hello World!", "Hello_World_"),
194+
("123abc$%^", "abc___"),
195195
("!@#$%^&*()_", "_"),
196196
("_This_is_a_test_", "_This_is_a_test_"),
197197
("", ""),
198+
("author.cell_type", "author.cell_type"),
199+
("-._bad:key", "_bad_key"),
198200
],
199201
)
200-
def test_remove_special_characters(input_string, expected_output):
201-
assert remove_special_characters(input_string) == expected_output
202+
def test_ncname_safe(input_string, expected_output):
203+
assert ncname_safe(input_string) == expected_output

0 commit comments

Comments
 (0)