|
| 1 | +"""Implement XML linting for files.""" |
| 2 | + |
| 3 | +import shutil |
| 4 | +import tempfile |
| 5 | +from contextlib import ExitStack |
| 6 | +from pathlib import Path |
| 7 | +from subprocess import PIPE, STDOUT, Popen |
| 8 | +from typing import Sequence, Union |
| 9 | +from uuid import uuid4 |
| 10 | + |
| 11 | +from dve.core_engine.message import FeedbackMessage |
| 12 | +from dve.parser.file_handling import ( |
| 13 | + copy_resource, |
| 14 | + get_file_name, |
| 15 | + get_resource_exists, |
| 16 | + open_stream, |
| 17 | +) |
| 18 | +from dve.parser.file_handling.implementations.file import file_uri_to_local_path |
| 19 | +from dve.parser.type_hints import URI |
| 20 | + |
ErrorMessage = str
"""Alias for the human-readable message attached to XML feedback."""
ErrorCode = str
"""Alias for the error code attached to XML feedback."""

FIVE_MEBIBYTES = 5 * 1024 * 1024
"""Five mebibytes (5 * 2**20), expressed in bytes."""
| 28 | + |
| 29 | + |
def _ensure_schema_and_resources(
    schema_uri: URI, schema_resources: Sequence[URI], temp_dir: Path
) -> Path:
    """Make the schema and its supporting resources available side by side locally.

    When every URI is a `file:` URI and all of the resulting local paths share
    one parent directory, nothing is copied and the schema's existing local
    path is returned. In every other case the schema and each resource are
    copied into `temp_dir` (keyed by file name) and the path of the copied
    schema is returned.

    Raises `IOError` if the schema, or any of the schema resources, does not
    exist.

    """
    if not get_resource_exists(schema_uri):
        raise IOError(f"No resource accessible at schema URI {schema_uri!r}")

    absent = [uri for uri in schema_resources if not get_resource_exists(uri)]
    if absent:
        raise IOError(f"Some schema resources missing: {absent!r}")

    # The schema always comes first, so index 0 below is the schema itself.
    uris = [schema_uri, *schema_resources]

    if all(uri.startswith("file:") for uri in uris):
        local_paths = [file_uri_to_local_path(uri) for uri in uris]
        parent_dirs = {path.parent for path in local_paths}
        if len(parent_dirs) == 1:
            # Already co-located on the local filesystem: use them in place.
            return local_paths[0]

    # Remote or scattered resources: gather everything into the temp dir.
    for uri in uris:
        target = temp_dir / get_file_name(uri)
        copy_resource(uri, target.as_uri())

    return temp_dir / get_file_name(schema_uri)
| 65 | + |
| 66 | + |
def run_xmllint(
    file_uri: URI,
    schema_uri: URI,
    *schema_resources: URI,
    error_code: ErrorCode,
    error_message: ErrorMessage,
) -> Union[None, FeedbackMessage]:
    """Run `xmllint`, given a file and information about the schemas to apply.

    The schema and associated resources will be copied to a temporary directory
    for validation, unless they are all already in the same local folder.

    Args:
     - `file_uri`: the URI of the file to be streamed into `xmllint`
     - `schema_uri`: the URI of the XSD schema for the file.
     - `*schema_resources`: URIs for additional XSD files required by the schema.
     - `error_code`: The error_code to use in FeedbackMessage if the linting fails.
     - `error_message`: The error_message to use in FeedbackMessage if the linting fails.

    Returns `None` if `xmllint` exits with return code 0. Otherwise returns a
    single `FeedbackMessage` built from `error_code` and `error_message`.

    Raises:
     - `OSError`: if the `xmllint` binary cannot be found.
     - `IOError`: if there is no resource at `file_uri`, or the schema or one
       of its resources is missing.
     - `ValueError`: if the subprocess was created without a stdin pipe.

    """
    if not shutil.which("xmllint"):
        raise OSError("Unable to find `xmllint` binary")

    if not get_resource_exists(file_uri):
        raise IOError(f"No resource accessible at file URI {file_uri!r}")

    # Ensure the schema and resources are local file paths so they can be
    # read by xmllint.
    # Lots of resources to manage here.
    with tempfile.TemporaryDirectory() as temp_dir_str:
        temp_dir = Path(temp_dir_str)
        schema_path = _ensure_schema_and_resources(schema_uri, schema_resources, temp_dir)
        message_file_path = temp_dir.joinpath(uuid4().hex)

        with ExitStack() as linting_context:
            # Need to write lint output to a file to avoid deadlock. Kinder to mem this way anyway.
            message_file_bytes = linting_context.enter_context(message_file_path.open("wb"))

            # Open an `xmllint` process to pipe into.
            command = ["xmllint", "--stream", "--schema", str(schema_path), "-"]
            process = linting_context.enter_context(
                Popen(command, stdin=PIPE, stdout=message_file_bytes, stderr=STDOUT)
            )
            # This should never trigger, bad typing in stdlib.
            if process.stdin is None:
                raise ValueError("Unable to pipe file into subprocess")

            # Pipe the XML file contents into xmllint.
            try:
                with open_stream(file_uri, "rb") as byte_stream:
                    while True:
                        block = byte_stream.read(FIVE_MEBIBYTES)
                        if not block:
                            break
                        process.stdin.write(block)
            except BrokenPipeError:
                # xmllint stopped reading before the whole file was sent; its
                # return code (collected below) decides the outcome.
                pass
            finally:
                # Close the input stream and await the response code.
                # Output will be written to the message file.
                try:
                    # Closing flushes any bytes still buffered, which can itself
                    # hit a broken pipe if xmllint has already exited. Treat that
                    # the same way as a broken pipe during the writes above.
                    process.stdin.close()
                except BrokenPipeError:
                    pass
                # TODO: Identify an appropriate timeout.
                return_code = process.wait()

    if return_code == 0:
        return None

    return FeedbackMessage(
        entity="xsd_validation",
        record={},
        failure_type="submission",
        is_informational=False,
        error_type="xsd check",
        error_location="Whole File",
        error_message=error_message,
        error_code=error_code,
    )
0 commit comments