|
| 1 | +import boto3 |
| 2 | +import smart_open |
| 3 | +from pydantic import BaseModel, typing |
| 4 | + |
| 5 | +from clients.metric_reporter import MetricReporter |
| 6 | +from config import INGESTOR_S3_BUCKET, INGESTOR_S3_PREFIX |
| 7 | +from ingestor_indexer import IngestorIndexerLambdaEvent |
| 8 | +from models.step_events import IngestorMonitorStepEvent |
| 9 | + |
| 10 | + |
class IngestorLoaderMonitorLambdaEvent(IngestorMonitorStepEvent):
    """Input event for the loader monitor: the collected loader step outputs."""

    # Output events produced by the ingestor_loader step; run_check requires
    # all of them to share the same pipeline_date and job_id.
    events: list[IngestorIndexerLambdaEvent]
| 13 | + |
| 14 | + |
class IngestorLoaderMonitorConfig(IngestorMonitorStepEvent):
    """Static configuration for the loader monitor step."""

    # S3 location where loader reports are read from and written to.
    loader_s3_bucket: str = INGESTOR_S3_BUCKET
    loader_s3_prefix: str = INGESTOR_S3_PREFIX
    # Maximum allowed fractional change in total file size versus the
    # latest stored report before the check fails (0.1 == 10%).
    percentage_threshold: float = 0.1

    # NOTE(review): is_local is not read anywhere in this file — confirm it
    # is consumed by callers or shared tooling before removing.
    is_local: bool = False
| 21 | + |
| 22 | + |
class LoaderReport(BaseModel):
    """Summary of one loader run, persisted to S3 as JSON by run_check."""

    pipeline_date: str
    job_id: str
    # Sum of record_count across all loader output objects.
    record_count: int
    # Sum of content_length (bytes) across all loader output objects.
    total_file_size: int
| 28 | + |
| 29 | + |
def run_check(
    event: IngestorLoaderMonitorLambdaEvent, config: IngestorLoaderMonitorConfig
) -> LoaderReport:
    """Validate the loader output events and compare them to the previous run.

    Sums record counts and file sizes across all events, fails (unless
    force_pass is set) when the total file size drifts from the latest
    stored report by more than ``config.percentage_threshold``, then writes
    the new report to S3 both under the job id and as the new "latest".

    Args:
        event: Aggregated output events from the ingestor_loader step.
        config: S3 location and threshold configuration.

    Returns:
        The report computed for the current job.

    Raises:
        ValueError: on empty or inconsistent input events, or when the size
            change exceeds the threshold and force_pass is not set.
    """
    # Validate with explicit raises rather than assert, which is silently
    # stripped when Python runs with -O.
    if not event.events:
        raise ValueError("No events to check! Stopping.")

    pipeline_date = event.events[0].pipeline_date or "dev"
    if not all((e.pipeline_date or "dev") == pipeline_date for e in event.events):
        raise ValueError("pipeline_date mismatch! Stopping.")

    job_id = event.events[0].job_id
    if not all(e.job_id == job_id for e in event.events):
        raise ValueError("job_id mismatch! Stopping.")

    # Either the incoming event or the static config may force the check to pass.
    force_pass = config.force_pass or event.force_pass

    print(
        f"Checking loader events for pipeline_date: {pipeline_date}:{job_id}, force_pass: {force_pass} ..."
    )

    # Reject events with missing or zero content lengths before summing.
    if not all(e.object_to_index.content_length for e in event.events):
        raise ValueError("Empty content length found! Stopping.")
    sum_file_size = sum(e.object_to_index.content_length or 0 for e in event.events)

    # Reject events with missing or zero record counts before summing.
    if not all(e.object_to_index.record_count for e in event.events):
        raise ValueError("Empty record count found! Stopping.")
    sum_record_count = sum(e.object_to_index.record_count or 0 for e in event.events)

    current_report = LoaderReport(
        pipeline_date=pipeline_date,
        job_id=job_id or "dev",
        record_count=sum_record_count,
        total_file_size=sum_file_size,
    )

    s3_report_name = "report.loader.json"
    s3_url_current_job = f"s3://{config.loader_s3_bucket}/{config.loader_s3_prefix}/{pipeline_date}/{job_id}/{s3_report_name}"
    s3_url_latest = f"s3://{config.loader_s3_bucket}/{config.loader_s3_prefix}/{pipeline_date}/{s3_report_name}"

    # Use one explicit S3 client for both the read and the writes (the
    # original read relied on smart_open's implicit default session).
    transport_params = {"client": boto3.client("s3")}

    # Load the report from the previous run, if one exists; a missing file
    # is expected on the first run and is not an error.
    latest_report = None
    try:
        with smart_open.open(
            s3_url_latest, "r", transport_params=transport_params
        ) as f:
            latest_report = LoaderReport.model_validate_json(f.read())
    except (OSError, KeyError) as e:
        print(f"No latest report found: {e}")

    if latest_report is not None:
        # Check if the sum file size has changed by more than the threshold;
        # the record count is ignored for now, as it matches the trigger step.
        delta = current_report.total_file_size - latest_report.total_file_size
        if latest_report.total_file_size > 0:
            percentage = abs(delta) / latest_report.total_file_size
        else:
            # A previous total of zero would otherwise raise
            # ZeroDivisionError; treat any change as an infinite jump.
            percentage = 0.0 if delta == 0 else float("inf")

        if percentage > config.percentage_threshold:
            error_message = f"Percentage change {percentage} exceeds threshold {config.percentage_threshold}!"
            if force_pass:
                print(f"Force pass enabled: {error_message}, but continuing.")
            else:
                raise ValueError(error_message)
        else:
            print(
                f"Percentage change {percentage} ({delta}/{latest_report.total_file_size}) is within threshold {config.percentage_threshold}."
            )

    # Write the current report to s3 as latest.
    with smart_open.open(s3_url_latest, "w", transport_params=transport_params) as f:
        f.write(current_report.model_dump_json())

    # Write the current report to s3 under the job_id.
    with smart_open.open(
        s3_url_current_job, "w", transport_params=transport_params
    ) as f:
        f.write(current_report.model_dump_json())

    return current_report
| 110 | + |
| 111 | + |
def report_results(
    report: LoaderReport,
    send_report: bool,
) -> None:
    """Publish the report's total_file_size metric, or skip when disabled.

    Args:
        report: The loader report produced by run_check.
        send_report: When False, only log and skip metric publication.
    """
    dimensions = {
        "pipeline_date": report.pipeline_date,
        "step": "ingestor_loader_monitor",
        "job_id": report.job_id,
    }

    print(f"Reporting results {report}, {dimensions} ...")

    # Guard clause: bail out early when metric reporting is disabled.
    if not send_report:
        print("Skipping sending report metrics.")
        return

    MetricReporter("catalogue_graph_ingestor").put_metric_data(
        metric_name="total_file_size",
        value=report.total_file_size,
        dimensions=dimensions,
    )
| 134 | + |
| 135 | + |
def handler(
    event: IngestorLoaderMonitorLambdaEvent, config: IngestorLoaderMonitorConfig
) -> None:
    """Run the loader check and report metrics, logging any check failure.

    Args:
        event: Aggregated output events from the ingestor_loader step.
        config: Static configuration for the monitor step.

    Raises:
        ValueError: propagated from run_check when the check fails.
    """
    print("Checking output of ingestor_loader ...")
    # Reporting can be enabled by either the incoming event or static config.
    send_report = event.report_results or config.report_results

    try:
        report = run_check(event, config)
        report_results(report, send_report)
    except ValueError as e:
        print(f"Check failed: {e}")
        # Bare raise preserves the original traceback; `raise e` would
        # re-raise through an extra frame.
        raise

    print("Check complete.")
| 151 | + |
| 152 | + |
def lambda_handler(
    event: list[IngestorIndexerLambdaEvent] | IngestorLoaderMonitorLambdaEvent,
    context: typing.Any,
) -> list[dict]:
    """AWS Lambda entry point.

    Accepts either a raw list of indexer events or an already-wrapped
    monitor event, runs the check with default config, and returns the
    serialized events for the next step.
    """
    # Normalise the two accepted input shapes into a single wrapped event.
    handler_event = (
        IngestorLoaderMonitorLambdaEvent(events=event)
        if isinstance(event, list)
        else event
    )

    handler(
        event=handler_event,
        config=IngestorLoaderMonitorConfig(),
    )

    return [e.model_dump() for e in handler_event.events]
0 commit comments