Refactor sanitization to NCName‑safe method for RDF properties and label lookups (#96)

ubyndr · web-flow · commit f5bb4cbb6933 · 2025-08-19T10:02:25.000+01:00
* Replace over‑sanitizing remove_special_characters with ncname_safe

* Reformatting

* Update assertion statements
diff --git a/pandasaurus_cxg/graph_generator/graph_generator.py b/pandasaurus_cxg/graph_generator/graph_generator.py
@@ -24,7 +24,7 @@
     generate_subgraph,
     get_cxg_dataset_url,
     parse_citation_field_into_dict,
-    remove_special_characters,
+    ncname_safe,
     select_node_with_property,
 )
 from pandasaurus_cxg.graph_generator.graph_namespaces import prefixes
@@ -115,13 +115,13 @@ def generate_rdf_graph(self, merge: bool = False):
                     self.graph.add(
                         (
                             dataset_class,
-                            URIRef(self.ns[remove_special_characters(citation_key)]),
+                            URIRef(self.ns[ncname_safe(citation_key)]),
                             Literal(citation_value),
                         )
                     )
 
             self.graph.add(
-                (dataset_class, URIRef(self.ns[remove_special_characters(key)]), Literal(value))
+                (dataset_class, URIRef(self.ns[ncname_safe(key)]), Literal(value))
             )
         has_source = URIRef(HAS_SOURCE["iri"])
         self.graph.add((has_source, RDFS.label, Literal(HAS_SOURCE["label"])))
@@ -174,7 +174,7 @@ def generate_rdf_graph(self, merge: bool = False):
             for k, v in inner_dict.items():
                 if k in {"subcluster_of", "cluster_matches"}:
                     continue
-                self.graph.add((resource, self.ns[remove_special_characters(k)], Literal(v)))
+                self.graph.add((resource, self.ns[ncname_safe(k)], Literal(v)))
 
         # add relationship between each resource based on their predicate in the co_annotation_report
         subcluster = URIRef(SUBCLUSTER_OF.get("iri"))
@@ -187,7 +187,7 @@ def generate_rdf_graph(self, merge: bool = False):
             # Iterate through the key-predicate map and apply the same logic for both keys
             for key, predicate_object in key_predicate_map.items():
                 for ik, iv in inner_dict.get(key, {}).items():
-                    predicate = self.ns[remove_special_characters(ik)]
+                    predicate = self.ns[ncname_safe(ik)]
                     for s, _, _ in self.graph.triples((None, predicate, Literal(iv))):
                         self.graph.add((resource, predicate_object, s))
 
diff --git a/pandasaurus_cxg/graph_generator/graph_generator_utils.py b/pandasaurus_cxg/graph_generator/graph_generator_utils.py
@@ -127,8 +127,28 @@ def select_node_with_property(graph: Graph, _property: str, value: str):
         return [str(s) for s in graph.subjects(predicate=ns[_property], object=Literal(value))]
 
 
-def remove_special_characters(input_string: str) -> str:
-    return re.sub(r"[^a-zA-Z0-9_]", "", input_string.replace(" ", "_"))
+def ncname_safe(term: str) -> str:
+    """Sanitize a string to be a valid XML NCName local name.
+
+    This function ensures that the input term conforms to the XML NCName
+    specification for use as the local part of a QName:
+      1. Spaces are replaced with underscores.
+      2. Leading characters other than an ASCII letter or underscore are stripped, since
+         an NCName cannot start with a digit, hyphen, or period.
+      3. All remaining characters that are not ASCII letters, digits, underscores, hyphens,
+         or periods are replaced with underscores.
+
+    Args:
+        term: The original string to sanitize.
+
+    Returns:
+        The sanitized local name. It is empty when ``term`` contains no ASCII
+        letter, underscore, or space; an empty string is not a valid NCName.
+
+    """
+    term = term.replace(" ", "_")
+    term = re.sub(r'^[^A-Za-z_]+', '', term)
+    return re.sub(r'[^A-Za-z0-9_\-\.]', '_', term)
 
 
 def parse_citation_field_into_dict(value: str) -> Dict[str, str]:
diff --git a/test/graph_generator/test_graph_generator.py b/test/graph_generator/test_graph_generator.py
@@ -274,7 +274,7 @@ def test_enrich_rdf_graph_with_merge(graph_generator_instance_for_kidney):
 
     graph_generator.enrich_rdf_graph()
 
-    assert len(graph_generator.graph) == 1246
+    assert len(graph_generator.graph) == 1242
     assert (
         URIRef(CONSIST_OF.get("iri")),
         RDFS.label,
@@ -303,7 +303,7 @@ def test_enrich_rdf_graph_without_merge(graph_generator_instance_for_kidney):
 
     graph_generator.enrich_rdf_graph()
 
-    assert len(graph_generator.graph) == 2676
+    assert len(graph_generator.graph) == 2674
 
 
 def test_save_rdf_graph(graph_generator_instance_for_kidney):
diff --git a/test/graph_generator/test_graph_generator_utils.py b/test/graph_generator/test_graph_generator_utils.py
@@ -8,7 +8,7 @@
     add_outgoing_edges_to_subgraph,
     find_and_rotate_center_layout,
     generate_subgraph,
-    remove_special_characters,
+    ncname_safe,
     select_node_with_property,
 )
 from pandasaurus_cxg.graph_generator.graph_predicates import (
@@ -190,12 +190,14 @@ def test_select_node_with_property_predicate():
 @pytest.mark.parametrize(
     "input_string, expected_output",
     [
-        ("Hello World!", "Hello_World"),
-        ("123abc$%^", "123abc"),
+        ("Hello World!", "Hello_World_"),
+        ("123abc$%^", "abc___"),
         ("!@#$%^&*()_", "_"),
         ("_This_is_a_test_", "_This_is_a_test_"),
         ("", ""),
+        ("author.cell_type", "author.cell_type"),
+        ("-._bad:key", "_bad_key"),
     ],
 )
-def test_remove_special_characters(input_string, expected_output):
-    assert remove_special_characters(input_string) == expected_output
+def test_ncname_safe(input_string, expected_output):
+    assert ncname_safe(input_string) == expected_output