Commit 0a20234

Merge pull request #147 from MITLibraries/TIMX-496-establish-migrations-and-backfill-migration
TIMX 496 - add migrations folder and run_timestamp migration
2 parents 926d630 + 243c7e1 commit 0a20234

File tree

3 files changed: +282 -0 lines changed

migrations/001_2025_05_30_backfill_run_timestamp_column.py
migrations/README.md
migrations/__init__.py
migrations/001_2025_05_30_backfill_run_timestamp_column.py

Lines changed: 217 additions & 0 deletions

@@ -0,0 +1,217 @@
# ruff: noqa: BLE001, D212, TRY300, TRY400
"""
Date: 2025-05-30

Description:

After the creation of a new run_timestamp column as part of Jira ticket TIMX-496, there
was a need to backfill a run timestamp for all parquet files in the dataset.

This migration performs the following:
1. retrieves all parquet files from the dataset
2. for each parquet file:
    a. if the run_timestamp column already exists, skip
    b. retrieve the file creation date of the parquet file; this becomes the run_timestamp
    c. rewrite the parquet file with a new run_timestamp column

Side effects:

1. Loss of "Last Modified" date in S3

This migration uses the original "Last Modified" date in S3 that was minted when the
parquet file was written. It stores that value in a `run_timestamp` column, where it
will persist, but the parquet file itself will LOSE this "Last Modified" date when it is
recreated.

Usage:

pipenv run python migrations/001_2025_05_30_backfill_run_timestamp_column.py \
<DATASET_LOCATION> \
--dry-run
"""

import argparse
import json
import time
from datetime import UTC, datetime

import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
from pyarrow import fs

from timdex_dataset_api.config import configure_dev_logger, configure_logger
from timdex_dataset_api.dataset import TIMDEX_DATASET_SCHEMA, TIMDEXDataset

configure_dev_logger()

logger = configure_logger(__name__)


def backfill_dataset(location: str, *, dry_run: bool = False) -> None:
    """Main entrypoint for backfill script.

    Loop through all parquet files in the dataset and, if the run_timestamp column does
    not exist, create it using the S3 object creation date.
    """
    start_time = time.perf_counter()
    td = TIMDEXDataset(location)
    td.load()

    parquet_files = td.dataset.files  # type: ignore[attr-defined]
    logger.info(f"Found {len(parquet_files)} parquet files in dataset.")

    success_count = 0
    skip_count = 0
    error_count = 0

    for i, parquet_file in enumerate(parquet_files):
        logger.info(
            f"Working on parquet file {i + 1}/{len(parquet_files)}: {parquet_file}"
        )

        success, result = backfill_parquet_file(parquet_file, td.dataset, dry_run=dry_run)

        if success:
            if result and "skipped" in result:
                skip_count += 1
            else:
                success_count += 1
        else:
            error_count += 1

        logger.info(json.dumps(result))

    logger.info(
        f"Backfill complete. Elapsed: {time.perf_counter() - start_time}, "
        f"Success: {success_count}, Skipped: {skip_count}, Errors: {error_count}"
    )


def backfill_parquet_file(
    parquet_filepath: str,
    dataset: ds.Dataset,
    *,
    dry_run: bool = False,
) -> tuple[bool, dict]:
    """Backfill a single parquet file with run_timestamp column.

    Args:
        parquet_filepath: Path to the parquet file
        dataset: PyArrow dataset instance
        dry_run: If True, don't actually write changes

    Returns:
        Tuple of (success: bool, result: dict)
    """
    start_time = time.perf_counter()
    try:
        parquet_file = pq.ParquetFile(parquet_filepath, filesystem=dataset.filesystem)  # type: ignore[attr-defined]

        # Check if run_timestamp column already exists
        if "run_timestamp" in parquet_file.schema.names:
            logger.info(
                f"Parquet already has 'run_timestamp', skipping: {parquet_filepath}"
            )
            return True, {"file_path": parquet_filepath, "skipped": True}

        # Read all rows from the parquet file into a pyarrow Table
        # NOTE: memory intensive for very large parquet files, though suitable for one-time
        # migration work.
        table = parquet_file.read()

        # Get S3 object creation date
        creation_date = get_s3_object_creation_date(parquet_filepath, dataset.filesystem)  # type: ignore[attr-defined]

        # Create run_timestamp column using the exact schema definition
        num_rows = len(table)
        run_timestamp_field = TIMDEX_DATASET_SCHEMA.field("run_timestamp")
        run_timestamp_array = pa.array(
            [creation_date] * num_rows, type=run_timestamp_field.type
        )

        # Add the run_timestamp column to the table
        table_with_timestamp = table.append_column("run_timestamp", run_timestamp_array)

        # Write the updated table back to the same file
        if not dry_run:
            pq.write_table(
                table_with_timestamp,  # type: ignore[attr-defined]
                parquet_filepath,
                filesystem=dataset.filesystem,  # type: ignore[attr-defined]
            )
            logger.info(f"Successfully updated file: {parquet_filepath}")
        else:
            logger.info(f"DRY RUN: Would update file: {parquet_filepath}")

        update_details = {
            "file_path": parquet_filepath,
            "rows_updated": num_rows,
            "run_timestamp_added": creation_date.isoformat(),
            "elapsed": time.perf_counter() - start_time,
            "dry_run": dry_run,
        }

        return True, update_details

    except Exception as e:
        logger.error(f"Error processing parquet file {parquet_filepath}: {e}")
        return False, {
            "file_path": parquet_filepath,
            "error": str(e),
            "elapsed": time.perf_counter() - start_time,
            "dry_run": dry_run,
        }


def get_s3_object_creation_date(file_path: str, filesystem: fs.FileSystem) -> datetime:
    """Get the creation date of an S3 object.

    This function assumes that all datetimes coming back are from the same source and
    are formatted consistently, meaning either all values are timezone-aware or none
    are.

    Args:
        file_path: Path to the S3 object
        filesystem: PyArrow S3 filesystem instance

    Returns:
        datetime: Creation date of the S3 object in UTC
    """
    try:
        # Get creation date of S3 object (its "Last Modified" timestamp)
        file_info = filesystem.get_file_info(file_path)
        creation_date: datetime = file_info.mtime  # type: ignore[assignment]

        # Ensure it's timezone-aware and in UTC
        if creation_date.tzinfo is None:
            creation_date = creation_date.replace(tzinfo=UTC)
        elif creation_date.tzinfo != UTC:
            creation_date = creation_date.astimezone(UTC)

        return creation_date

    except Exception as e:
        logger.error(f"Error getting S3 object creation date for {file_path}: {e}")
        raise


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=(
            "Backfill run_timestamp column in TIMDEX parquet files "
            "using S3 creation dates"
        )
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Scan files and report what would be done without making changes",
    )
    parser.add_argument(
        "dataset_location", help="Path to the dataset (local path or s3://bucket/path)"
    )

    args = parser.parse_args()

    backfill_dataset(args.dataset_location, dry_run=args.dry_run)
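
The script above logs a JSON result per file but does not include a standalone verification step. A minimal sketch of one possible post-migration check follows (not part of this commit; `DATASET_LOCATION` is a placeholder, and an `s3://` location would additionally need a pyarrow filesystem passed to the read calls):

```python
# Sketch of a post-migration check: confirm every parquet file now has a populated
# run_timestamp column. Assumes a local dataset path.
import pyarrow.dataset as ds
import pyarrow.parquet as pq

DATASET_LOCATION = "path/to/timdex-dataset"  # placeholder path

dataset = ds.dataset(DATASET_LOCATION, format="parquet")
for parquet_filepath in dataset.files:
    schema = pq.read_schema(parquet_filepath)
    if "run_timestamp" not in schema.names:
        print(f"MISSING run_timestamp column: {parquet_filepath}")
        continue
    table = pq.read_table(parquet_filepath, columns=["run_timestamp"])
    print(
        f"{parquet_filepath}: rows={table.num_rows}, "
        f"null run_timestamp values={table['run_timestamp'].null_count}"
    )
```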

migrations/README.md

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
# TIMDEX Dataset Migrations

This directory stores data and/or schema modifications that were made to the TIMDEX parquet dataset. Consider them like ["migrations"](https://en.wikipedia.org/wiki/Schema_migration) for a SQL database, but -- at least at the time of this writing -- considerably more informal and ad-hoc.

Unless otherwise noted, it is assumed that these migrations:

* were run manually by a developer, either on a local machine or via some cloud operation
* have already been performed and should not be performed again
* do not contain a way to roll back the changes

## Structure

Each migration is either a single Python file or a dedicated directory that follows this naming convention:

- `###_`: incrementing migration sequence number
- `YYYY_MM_DD_`: approximate date of migration creation and run
- `short_name.py` (file) or `short_name` (directory): short migration name

Examples:

- `001_2025_05_30_backfill_run_timestamp_column.py` --> single file
- `002_2025_06_15_remove_errant_parquet_files` --> directory that contains 1+ files

Files inside a migration directory like `002_2025_06_15_remove_errant_parquet_files` are _not_ expected to follow any particular format (though a `README.md` is encouraged to inform future developers how it was performed!).

The entrypoint for each migration should contain a docstring at the root of the file with a structure like:

```python
"""
Date: YYYY-MM-DD

Description:

Description here about the nature of the migration...

Usage:

Explanation here for how to run it...
"""
```

Example:
```python
"""
Date: 2025-05-30

Description:

After the creation of a new run_timestamp column as part of Jira ticket TIMX-496, there
was a need to backfill a run timestamp for all parquet files in the dataset.

This migration performs the following:
1. retrieves all parquet files from the dataset
2. for each parquet file:
    a. if the run_timestamp column already exists, skip
    b. retrieve the file creation date of the parquet file; this becomes the run_timestamp
    c. rewrite the parquet file with a new run_timestamp column

Usage:
PYTHONPATH=. \
pipenv run python migrations/001_2025_05_30_backfill_run_timestamp_column.py \
<DATASET_LOCATION> \
--dry-run
"""
```

migrations/__init__.py

Whitespace-only changes.

0 commit comments