| 1 | +"""Utilities to support reporting""" |
| 2 | + |
| 3 | +import datetime as dt |
| 4 | +import json |
| 5 | +import logging |
| 6 | +from collections.abc import Iterable |
| 7 | +from itertools import chain |
| 8 | +from multiprocessing import Queue |
| 9 | +from threading import Thread |
| 10 | +from typing import Optional, Union |
| 11 | + |
| 12 | +import dve.parser.file_handling as fh |
| 13 | +from dve.core_engine.exceptions import CriticalProcessingError |
| 14 | +from dve.core_engine.loggers import get_logger |
| 15 | +from dve.core_engine.message import UserMessage |
| 16 | +from dve.core_engine.type_hints import URI, DVEStageName, Messages |
| 17 | + |
| 18 | + |
def get_feedback_errors_uri(working_folder: URI, step_name: DVEStageName) -> URI:
    """Determine the location of the JSON Lines file containing all errors generated in a step."""
    return fh.joinuri(working_folder, "errors", f"{step_name}_errors.jsonl")


def get_processing_errors_uri(working_folder: URI) -> URI:
    """Determine the location of the JSON Lines file containing all processing
    errors generated from a DVE run."""
    return fh.joinuri(working_folder, "processing_errors", "processing_errors.jsonl")


def dump_feedback_errors(
    working_folder: URI,
    step_name: DVEStageName,
    messages: Messages,
    key_fields: Optional[dict[str, list[str]]] = None,
) -> URI:
    """Write out captured feedback error messages."""
    if not working_folder:
        raise AttributeError("processed files path not passed")

    if not key_fields:
        key_fields = {}

    error_file = get_feedback_errors_uri(working_folder, step_name)
    processed = []

    for message in messages:
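        # Use the key fields configured for the message's original entity when
        # available, otherwise fall back to its current entity.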
        if message.original_entity is not None:
            primary_keys = key_fields.get(message.original_entity, [])
        elif message.entity is not None:
            primary_keys = key_fields.get(message.entity, [])
        else:
            primary_keys = []

        error = message.to_dict(
            key_field=primary_keys,
            value_separator=" -- ",
            max_number_of_values=10,
            record_converter=None,
        )
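        # Flatten the "Key" value into a single display string (see conditional_cast below).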
        error["Key"] = conditional_cast(error["Key"], primary_keys, value_separator=" -- ")
        processed.append(error)

    with fh.open_stream(error_file, "a") as f:
        f.write("\n".join([json.dumps(rec, default=str) for rec in processed]) + "\n")
    return error_file


def dump_processing_errors(
    working_folder: URI, step_name: str, errors: list[CriticalProcessingError]
) -> URI:
    """Write out critical processing errors."""
    if not working_folder:
        raise AttributeError("processed files path not passed")
    if not step_name:
        raise AttributeError("step name not passed")
    if not errors:
        raise AttributeError("errors list not passed")

    error_file: URI = get_processing_errors_uri(working_folder)
    processed = []

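    # Each critical error becomes a single JSON Lines record tagged with the step name.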
    for error in errors:
        processed.append(
            {
                "step_name": step_name,
                "error_location": "processing",
                "error_level": "integrity",
                "error_message": error.error_message,
                "error_traceback": error.messages,
            }
        )

    with fh.open_stream(error_file, "a") as f:
        f.write("\n".join([json.dumps(rec, default=str) for rec in processed]) + "\n")
    return error_file


def load_feedback_messages(feedback_messages_uri: URI) -> Iterable[UserMessage]:
    """Load user messages from a JSON Lines file."""
    if not fh.get_resource_exists(feedback_messages_uri):
        return
    with fh.open_stream(feedback_messages_uri) as errs:
        # Skip blank lines so a stray trailing newline does not break json.loads.
        yield from (UserMessage(**json.loads(line)) for line in errs if line.strip())


def load_all_error_messages(error_directory_uri: URI) -> Iterable[UserMessage]:
    """Load user messages from all JSON Lines files in a directory."""
    return chain.from_iterable(
        [
            load_feedback_messages(err_file)
            for err_file, _ in fh.iter_prefix(error_directory_uri)
            if err_file.endswith(".jsonl")
        ]
    )


class BackgroundMessageWriter:
    """Controls batched writes to error JSON Lines files."""

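    # Illustrative usage (a sketch only; `working_dir`, `stage`, and `batch` are
    # hypothetical values, not names defined in this module):
    #
    #     with BackgroundMessageWriter(working_dir, stage) as writer:
    #         writer.write_queue.put(batch)  # each item is a batch of messages
    #
    # Putting None on the queue is reserved for shutdown; __exit__ does this
    # automatically when the context manager closes.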
    def __init__(
        self,
        working_directory: URI,
        dve_stage: DVEStageName,
        key_fields: Optional[dict[str, list[str]]] = None,
        logger: Optional[logging.Logger] = None,
    ):
        self._working_directory = working_directory
        self._dve_stage = dve_stage
        self._feedback_message_uri = get_feedback_errors_uri(
            self._working_directory, self._dve_stage
        )
        self._key_fields = key_fields
        self.logger = logger or get_logger(type(self).__name__)
        self._write_thread: Optional[Thread] = None
        self._queue: Queue = Queue()

    @property
    def write_queue(self) -> Queue:  # type: ignore
        """Queue for storing batches of messages to be written."""
        return self._queue

    @property
    def write_thread(self) -> Thread:  # type: ignore
        """Thread that writes batches of messages to the JSON Lines file."""
        if not self._write_thread:
            self._write_thread = Thread(target=self._write_process_wrapper)
        return self._write_thread

    def _write_process_wrapper(self):
        """Wrapper around dump_feedback_errors, run in the background writer thread."""
        # The writer thread blocks here until a batch (or the shutdown sentinel) arrives.
        while True:
            if msgs := self.write_queue.get():
                dump_feedback_errors(
                    self._working_directory, self._dve_stage, msgs, self._key_fields
                )
            else:
                break

    def __enter__(self) -> "BackgroundMessageWriter":
        self.write_thread.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if exc_type:
            self.logger.exception(
                "Issue occurred during background write process:",
                exc_info=(exc_type, exc_value, traceback),
            )
        # A None value in the queue triggers the break in the writer thread's target.
        self.write_queue.put(None)
        self.write_thread.join()


def conditional_cast(value, primary_keys: list[str], value_separator: str) -> str:
    """Cast a value coming back from an error record to a display string."""
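    # Lists are joined into "<key>: <value>" pairs using the separator, dates become
    # ISO strings, dicts are blanked, and anything else is cast with str().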
    if isinstance(value, list):
        casts = [
            conditional_cast(val, primary_keys, value_separator) for val in value
        ]
        return value_separator.join(
            [f"{pk}: {val}" if pk else "" for pk, val in zip(primary_keys, casts)]
        )
    if isinstance(value, dt.date):
        return value.isoformat()
    if isinstance(value, dict):
        return ""
    return str(value)