NHSDigital
diff --git a/‎src/dve/core_engine/backends/implementations/duckdb/readers/xml.py‎
Lines changed: 15 additions & 1 deletion b/‎src/dve/core_engine/backends/implementations/duckdb/readers/xml.py‎
Lines changed: 15 additions & 1 deletion
diff --git a/‎src/dve/core_engine/backends/implementations/spark/readers/xml.py‎
Lines changed: 30 additions & 8 deletions b/‎src/dve/core_engine/backends/implementations/spark/readers/xml.py‎
Lines changed: 30 additions & 8 deletions
diff --git a/‎src/dve/core_engine/backends/readers/xml.py‎
Lines changed: 17 additions & 0 deletions b/‎src/dve/core_engine/backends/readers/xml.py‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎src/dve/core_engine/backends/readers/xml_linting.py‎
Lines changed: 146 additions & 0 deletions b/‎src/dve/core_engine/backends/readers/xml_linting.py‎
Lines changed: 146 additions & 0 deletions
@@ -12,19 +12,33 @@
 from dve.core_engine.backends.readers.xml import XMLStreamReader
 from dve.core_engine.backends.utilities import get_polars_type_from_annotation, stringify_model
 from dve.core_engine.type_hints import URI
+from dve.parser.file_handling.service import get_parent
+from dve.pipeline.utils import dump_errors
 
 
 @duckdb_write_parquet
 class DuckDBXMLStreamReader(XMLStreamReader):
     """A reader for XML files"""
 
-    def __init__(self, ddb_connection: Optional[DuckDBPyConnection] = None, **kwargs):
+    def __init__(self,
+                 ddb_connection: Optional[DuckDBPyConnection] = None,
+                 **kwargs):
         self.ddb_connection = ddb_connection if ddb_connection else default_connection
         super().__init__(**kwargs)
 
     @read_function(DuckDBPyRelation)
     def read_to_relation(self, resource: URI, entity_name: str, schema: Type[BaseModel]):
         """Returns a relation object from the source xml"""
+        if self.xsd_location:
+            msg = self._run_xmllint(file_uri=resource)
+            if msg:
+                working_folder = get_parent(resource)
+                dump_errors(
+                    working_folder=working_folder,
+                    step_name="file_transformation",
+                    messages=[msg]
+                    )
+                
         polars_schema: Dict[str, pl.DataType] = {  # type: ignore
             fld.name: get_polars_type_from_annotation(fld.annotation)
             for fld in stringify_model(schema).__fields__.values()
 
@@ -18,10 +18,12 @@
     get_type_from_annotation,
     spark_write_parquet,
 )
-from dve.core_engine.backends.readers.xml import XMLStreamReader
+from dve.core_engine.backends.readers.xml import BasicXMLFileReader, XMLStreamReader
+from dve.core_engine.backends.readers.xml_linting import run_xmllint
 from dve.core_engine.type_hints import URI, EntityName
-from dve.parser.file_handling import get_content_length
+from dve.parser.file_handling import get_content_length, get_parent
 from dve.parser.file_handling.service import open_stream
+from dve.pipeline.utils import dump_errors
 
 SparkXMLMode = Literal["PERMISSIVE", "FAILFAST", "DROPMALFORMED"]
 """The mode to use when parsing XML files with Spark."""
@@ -51,7 +53,7 @@ def read_to_dataframe(
 
 
 @spark_write_parquet
-class SparkXMLReader(BaseFileReader):  # pylint: disable=too-many-instance-attributes
+class SparkXMLReader(BasicXMLFileReader):  # pylint: disable=too-many-instance-attributes
     """A reader for XML files built atop Spark-XML."""
 
     def __init__(
@@ -69,21 +71,31 @@ def __init__(
         sanitise_multiline: bool = True,
         namespace=None,
         trim_cells=True,
+        xsd_location: Optional[URI] = None,
+        xsd_error_code: Optional[str] = None,
+        xsd_error_message: Optional[str] = None,
         **_,
     ) -> None:
-        self.record_tag = record_tag
+        # Hand the shared XML options, including the optional XSD validation
+        # settings, to the base reader rather than assigning them here.
+        super().__init__(
+            record_tag=record_tag,
+            root_tag=root_tag,
+            trim_cells=trim_cells,
+            null_values=null_values,
+            sanitise_multiline=sanitise_multiline,
+            xsd_location=xsd_location,
+            xsd_error_code=xsd_error_code,
+            xsd_error_message=xsd_error_message,
+        )
+
         self.spark_session = spark_session or SparkSession.builder.getOrCreate()
         self.sampling_ratio = sampling_ratio
         self.exclude_attribute = exclude_attribute
         self.mode = mode
         self.infer_schema = infer_schema
         self.ignore_namespace = ignore_namespace
-        self.root_tag = root_tag
         self.sanitise_multiline = sanitise_multiline
-        self.null_values = null_values
         self.namespace = namespace
-        self.trim_cells = trim_cells
-        super().__init__()
 
     def read_to_py_iterator(
         self, resource: URI, entity_name: EntityName, schema: Type[BaseModel]
@@ -104,6 +116,16 @@ def read_to_dataframe(
         """
         if get_content_length(resource) == 0:
             raise EmptyFileError(f"File at {resource} is empty.")
+        
+        if self.xsd_location:
+            msg = self._run_xmllint(file_uri=resource)
+            if msg:
+                working_folder = get_parent(resource)
+                dump_errors(
+                    working_folder=working_folder,
+                    step_name="file_transformation",
+                    messages=[msg]
+                    )
 
         spark_schema: StructType = get_type_from_annotation(schema)
         kwargs = {
 
@@ -11,8 +11,10 @@
 
 from dve.core_engine.backends.base.reader import BaseFileReader
 from dve.core_engine.backends.exceptions import EmptyFileError
+from dve.core_engine.backends.readers.xml_linting import run_xmllint
 from dve.core_engine.backends.utilities import get_polars_type_from_annotation, stringify_model
 from dve.core_engine.loggers import get_logger
+from dve.core_engine.message import FeedbackMessage
 from dve.core_engine.type_hints import URI, EntityName
 from dve.parser.file_handling import NonClosingTextIOWrapper, get_content_length, open_stream
 from dve.parser.file_handling.implementations.file import (
@@ -114,6 +116,9 @@ def __init__(
         sanitise_multiline: bool = True,
         encoding: str = "utf-8-sig",
         n_records_to_read: Optional[int] = None,
+        xsd_location: Optional[URI] = None,
+        xsd_error_code: Optional[str] = None,
+        xsd_error_message: Optional[str] = None,
         **_,
     ):
         """Init function for the base XML reader.
@@ -148,6 +153,12 @@ def __init__(
         """Encoding of the XML file."""
         self.n_records_to_read = n_records_to_read
         """The maximum number of records to read from a document."""
+        self.xsd_location = xsd_location
+        """The relative URI of the xsd file if wishing to perform xsd validation"""
+        self.xsd_error_code = xsd_error_code
+        """The error code to be reported if xsd validation fails (if xsd)"""
+        self.xsd_error_message = xsd_error_message
+        """The error message to be reported if xsd validation fails"""
         super().__init__()
         self._logger = get_logger(__name__)
 
@@ -260,6 +271,12 @@ def _parse_xml(
 
         for element in elements:
             yield self._parse_element(element, template_row)
+    
+    def _run_xmllint(self, file_uri: URI) -> Optional[FeedbackMessage]:
+        """Validate the file at `file_uri` against the reader's configured XSD.
+
+        Delegates to `run_xmllint`, using `self.xsd_location` as the schema and
+        `self.xsd_error_code` / `self.xsd_error_message` for the feedback.
+
+        Returns `None` when linting passes, otherwise a `FeedbackMessage`.
+
+        """
+        return run_xmllint(
+            file_uri=file_uri,
+            schema_uri=self.xsd_location,
+            error_code=self.xsd_error_code,
+            error_message=self.xsd_error_message,
+        )
 
     def read_to_py_iterator(
         self,
 
@@ -0,0 +1,146 @@
+"""Implement XML linting for files."""
+
+import shutil
+import tempfile
+from contextlib import ExitStack
+from pathlib import Path
+from subprocess import PIPE, STDOUT, Popen
+from typing import Sequence, Union
+from uuid import uuid4
+
+from dve.core_engine.message import FeedbackMessage
+from dve.parser.file_handling import (
+    copy_resource,
+    get_file_name,
+    get_resource_exists,
+    open_stream,
+)
+from dve.parser.file_handling.implementations.file import file_uri_to_local_path
+from dve.parser.type_hints import URI
+
+ErrorMessage = str
+"""Error message for xml issues"""
+ErrorCode = str
+"""Error code for xml feedback errors"""
+
+FIVE_MEBIBYTES = 5 * (1024**2)
+"""The size of 5 binary megabytes, in bytes."""
+
+
+def _ensure_schema_and_resources(
+    schema_uri: URI, schema_resources: Sequence[URI], temp_dir: Path
+) -> Path:
+    """Make the schema and its supporting resources readable from one local folder.
+
+    If every URI is a `file:` URI and all of them already share a parent
+    directory, nothing is copied and the schema's existing local path is
+    returned. Otherwise every resource is copied into `temp_dir` and the path
+    of the schema's copy is returned.
+
+    Raises `IOError` if the schema, or any additional resource, is not accessible.
+
+    """
+    if not get_resource_exists(schema_uri):
+        raise IOError(f"No resource accessible at schema URI {schema_uri!r}")
+
+    not_found = [uri for uri in schema_resources if not get_resource_exists(uri)]
+    if not_found:
+        raise IOError(f"Some schema resources missing: {not_found!r}")
+
+    uris = [schema_uri, *schema_resources]
+
+    if all(uri.startswith("file:") for uri in uris):
+        local_paths = [file_uri_to_local_path(uri) for uri in uris]
+        parent_dirs = {local_path.parent for local_path in local_paths}
+        if len(parent_dirs) == 1:
+            # Already co-located on disk: use the schema where it is.
+            return local_paths[0]
+
+    # Remote or scattered resources: gather copies in the temp dir.
+    for uri in uris:
+        destination = temp_dir.joinpath(get_file_name(uri))
+        copy_resource(uri, destination.as_uri())
+
+    return temp_dir.joinpath(get_file_name(schema_uri))
+
+
+def run_xmllint(
+    file_uri: URI,
+    schema_uri: URI,
+    *schema_resources: URI,
+    error_code: ErrorCode,
+    error_message: ErrorMessage,
+) -> Union[None, FeedbackMessage]:
+    """Run `xmllint`, given a file and information about the schemas to apply.
+
+    The schema and associated resources will be copied to a temporary directory
+    for validation, unless they are all already in the same local folder.
+
+    Args:
+     - `file_uri`: the URI of the file to be streamed into `xmllint`
+     - `schema_uri`: the URI of the XSD schema for the file.
+     - `*schema_resources`: URIs for additional XSD files required by the schema.
+     - `error_code`: The error_code to use in FeedbackMessage if the linting fails.
+     - `error_message`: The error_message to use in FeedbackMessage if the linting fails.
+
+    Returns `None` if `xmllint` exits with code 0, otherwise a single
+    whole-file `FeedbackMessage` carrying `error_code` and `error_message`.
+
+    Raises:
+     - `OSError`: if the `xmllint` binary cannot be found.
+     - `IOError`: if the file, the schema or a schema resource is not accessible.
+
+    """
+    if not shutil.which("xmllint"):
+        raise OSError("Unable to find `xmllint` binary")
+
+    if not get_resource_exists(file_uri):
+        raise IOError(f"No resource accessible at file URI {file_uri!r}")
+
+    # Ensure the schema and resources are local file paths so they can be
+    # read by xmllint.
+    # Lots of resources to manage here.
+    with tempfile.TemporaryDirectory() as temp_dir_str:
+        temp_dir = Path(temp_dir_str)
+        schema_path = _ensure_schema_and_resources(schema_uri, schema_resources, temp_dir)
+        message_file_path = temp_dir.joinpath(uuid4().hex)
+
+        with ExitStack() as linting_context:
+            # Need to write lint output to a file to avoid deadlock. Kinder to mem this way anyway.
+            # NOTE: the output is not read back here; it is removed with the temp dir.
+            message_file_bytes = linting_context.enter_context(message_file_path.open("wb"))
+
+            # Open an `xmllint` process to pipe into.
+            command = ["xmllint", "--stream", "--schema", str(schema_path), "-"]
+            process = linting_context.enter_context(
+                Popen(command, stdin=PIPE, stdout=message_file_bytes, stderr=STDOUT)
+            )
+            # This should never trigger, bad typing in stdlib.
+            if process.stdin is None:
+                raise ValueError("Unable to pipe file into subprocess")
+
+            # Pipe the XML file contents into xmllint.
+            try:
+                with open_stream(file_uri, "rb") as byte_stream:
+                    while True:
+                        block = byte_stream.read(FIVE_MEBIBYTES)
+                        if not block:
+                            break
+                        process.stdin.write(block)
+            except BrokenPipeError:
+                # xmllint stopped reading early; its exit code is checked below.
+                pass
+            finally:
+                # Close the input stream and await the response code.
+                # Output will be written to the message file.
+                try:
+                    process.stdin.close()
+                except BrokenPipeError:
+                    # Closing flushes any buffered input, which fails the same
+                    # way if xmllint has already exited. Still wait for the code.
+                    pass
+                # TODO: Identify an appropriate timeout.
+                return_code = process.wait()
+
+        if return_code == 0:
+            return None
+
+        return FeedbackMessage(
+            entity="xsd_validation",
+            record={},
+            failure_type="submission",
+            is_informational=False,
+            error_type="xsd check",
+            error_location="Whole File",
+            error_message=error_message,
+            error_code=error_code,
+        )