Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 176 additions & 1 deletion dcpy/lifecycle/scripts/version_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,163 @@
from dcpy.lifecycle import product_metadata
import pandas as pd

from datetime import datetime
from dateutil.parser import parse as dateutil_parse
import re


class FuzzyVersion:
"""A version string that supports fuzzy comparison including with various date formats."""

def __init__(self, version_string):
self.original = version_string
self.normalized = self._normalize() if version_string else version_string

def probably_equals(self, other: "str | FuzzyVersion"):
fuzzy_other = FuzzyVersion(other) if isinstance(other, str) else other

if not (self.normalized and fuzzy_other.normalized):
return False

return self.normalized == fuzzy_other.normalized

def _normalize(self):
"""
Convert various date formats to a standardized form (YYYYMM).

Returns:
str: Normalized version string in YYYYMM format, or original if no pattern matches
"""
if not self.original:
return self.original

version = self.original.lower().strip()

# Handle quarter notation (e.g., "25q1", "24Q2")
quarter_match = re.match(r"^(\d{2})q([1-4])$", version)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can this be q|Q? I feel like we see both

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry! We've lowered at this point

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

indeed - though latest commit adds an explicit test case for that (there was something close before, but not quite) and changes the comment

if quarter_match:
year_suffix = quarter_match.group(1)
quarter = int(quarter_match.group(2))
# Convert 2-digit year to 4-digit (assuming 20XX)
year = 2000 + int(year_suffix)
# Quarter to month mapping: Q1=March, Q2=June, Q3=September, Q4=December
month = quarter * 3
return f"{year:04d}{month:02d}"

# Handle YYYYMMDD format
if re.match(r"^\d{8}$", version):
return version[:6] # Take first 6 digits (YYYYMM)

# Handle YYYYMM format (already in target format)
if re.match(r"^\d{6}$", version):
return version

# Handle month name + year using dateutil, but be selective
# Only try to parse if it contains month names or reasonable date patterns
if any(
month in version
for month in [
"january",
"february",
"march",
"april",
"may",
"june",
"july",
"august",
"september",
"october",
"november",
"december",
"jan",
"feb",
"mar",
"apr",
"may",
"jun",
"jul",
"aug",
"sep",
"oct",
"nov",
"dec",
]
):
try:
parsed_date = dateutil_parse(
version, fuzzy=True, default=datetime(2000, 1, 1)
)
# Only return if the parsed date seems reasonable (not the default year)
if parsed_date.year >= 2000:
return f"{parsed_date.year:04d}{parsed_date.month:02d}"
except (ValueError, TypeError):
pass

# Return original if no pattern matches
return version

def __str__(self):
return self.original or ""

def __repr__(self):
return f"FuzzyVersion({self.original!r})"

def __eq__(self, other):
"""Strict equality - delegates to probably_equals for fuzzy comparison."""
if isinstance(other, FuzzyVersion):
return self.original == other.original
return False

def __hash__(self):
return hash(self.original)


def sort_by_outdated_products(df):
"""
Sort dataframe to show products with outdated datasets first.
Products with any outdated datasets appear at the top.
Also prioritizes products with open_data_versions over those with all blank versions.
"""
# Create a summary of outdated status by product
product_status = (
df.groupby("product")["up_to_date"].agg(["all", "sum", "count"]).reset_index()
)
product_status["has_outdated"] = ~product_status["all"]
product_status["outdated_count"] = product_status["count"] - product_status["sum"]

# Add flag for products that have any open_data_versions (not all blank/missing)
product_has_data = (
df.groupby("product")["open_data_versions"]
.apply(
lambda x: x.apply(
lambda v: bool(v and (v != [] if isinstance(v, list) else True))
).any()
)
.reset_index()
)
product_has_data.columns = ["product", "has_open_data"]
product_status = product_status.merge(product_has_data, on="product")

# Sort products:
# 1. Those with outdated datasets first
# 2. Those with open data first
# 3. Then by number of outdated datasets
product_order = product_status.sort_values(
["has_outdated", "has_open_data", "outdated_count"],
ascending=[False, False, False],
)["product"].tolist()

# Reorder the dataframe based on product order
df_sorted = df.reset_index()
df_sorted["product_order"] = df_sorted["product"].map(
{prod: i for i, prod in enumerate(product_order)}
)
df_sorted = df_sorted.sort_values(["product_order", "product", "dataset"]).drop(
"product_order", axis=1
)

return df_sorted.set_index(["product", "dataset"])


def get_all_open_data_keys():
"""retrieve all product.dataset.destination_ids"""
Expand Down Expand Up @@ -40,21 +197,39 @@ def make_comparison_dataframe(bytes_versions, open_data_versions):
product, dataset, destination_id = key.split(".")
bytes_version = bytes_versions.get(f"{product}.{dataset}")
open_data_vers = open_data_versions.get(key, [])

# Determine if versions are up to date using fuzzy comparison
up_to_date = False
try:
up_to_date = FuzzyVersion(bytes_version).probably_equals(
FuzzyVersion(open_data_vers)
)
except Exception:
pass

rows.append(
{
"product": product,
"dataset": dataset,
"destination_id": destination_id,
"bytes_version": bytes_version,
"open_data_versions": open_data_vers,
"up_to_date": up_to_date,
}
)
df = pd.DataFrame(rows).set_index(["product", "dataset"]).sort_index()

# Add product-level up-to-date flag
# A product is up-to-date if ALL its datasets are up-to-date
product_status = df.groupby("product")["up_to_date"].all()
df["product_up_to_date"] = df.index.get_level_values("product").map(product_status)

return df


def run():
all_keys = get_all_open_data_keys()
open_data_versions = get_open_data_versions(all_keys)
bytes_versions = get_bytes_versions(all_keys)
return make_comparison_dataframe(bytes_versions, open_data_versions)
df = make_comparison_dataframe(bytes_versions, open_data_versions)
return sort_by_outdated_products(df)
Loading