Skip to content

Commit c4e9c7c

Browse files
authored
Add Fuzzy Versions, to compare bytes <> socrata (#2061)
* Add Fuzzy Versions, to compare bytes <> socrata
* post-review: remove the blub in `probably_equals`
* make quarter notation a little more explicit
* Fix ruff issue
1 parent 5ebd86b commit c4e9c7c

File tree

2 files changed

+442
-1
lines changed

2 files changed

+442
-1
lines changed

dcpy/lifecycle/scripts/version_compare.py

Lines changed: 176 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,163 @@
22
from dcpy.lifecycle import product_metadata
33
import pandas as pd
44

5+
from datetime import datetime
6+
from dateutil.parser import parse as dateutil_parse
7+
import re
8+
9+
10+
class FuzzyVersion:
    """A version string that supports fuzzy comparison across date-like formats.

    The raw value is kept in ``original``; ``normalized`` holds a ``YYYYMM``
    string when the value matches one of the recognised patterns (quarter
    notation, ``YYYYMMDD``, ``YYYYMM``, or text containing a month name), and
    otherwise the lower-cased, stripped text. Falsy inputs (``None``, ``""``)
    are stored unchanged in both attributes.

    ``==`` and ``hash`` are strict and use ``original`` only; use
    ``probably_equals`` for the fuzzy comparison.
    """

    # Patterns are applied to the lower-cased, stripped version text.
    _QUARTER_RE = re.compile(r"(\d{2})q([1-4])")
    _YYYYMMDD_RE = re.compile(r"\d{8}")
    _YYYYMM_RE = re.compile(r"\d{6}")
    # A month name counts only as a standalone alphabetic token, so words that
    # merely contain one ("primary", "summary", "decade") are not sent to the
    # fuzzy date parser. Digits and punctuation may touch the token ("jan2024").
    _MONTH_TOKEN_RE = re.compile(
        r"(?<![a-z])"
        r"(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?"
        r"|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?"
        r"|dec(?:ember)?)"
        r"(?![a-z])"
    )

    def __init__(self, version_string: "str | None"):
        self.original = version_string
        self.normalized = self._normalize() if version_string else version_string

    def probably_equals(self, other: "str | FuzzyVersion | None") -> bool:
        """Return True when both versions normalize to the same non-empty value.

        Args:
            other: A raw version string, another ``FuzzyVersion``, or ``None``.

        Returns:
            ``False`` if either side is missing/blank, otherwise whether the
            normalized forms are equal.
        """
        if other is None:
            return False

        fuzzy_other = FuzzyVersion(other) if isinstance(other, str) else other

        # A blank version never matches anything, including another blank.
        if not (self.normalized and fuzzy_other.normalized):
            return False

        return self.normalized == fuzzy_other.normalized

    def _normalize(self):
        """
        Convert various date formats to a standardized form (YYYYMM).

        Returns:
            str: Normalized version string in YYYYMM format, or the
            lower-cased, stripped input if no pattern matches. A falsy
            ``original`` is returned unchanged.
        """
        if not self.original:
            return self.original

        version = self.original.lower().strip()

        # Handle quarter notation (e.g., "25q1", "24Q2")
        quarter_match = self._QUARTER_RE.fullmatch(version)
        if quarter_match:
            # Convert 2-digit year to 4-digit (assuming 20XX)
            year = 2000 + int(quarter_match.group(1))
            # Quarter to month mapping: Q1=March, Q2=June, Q3=September, Q4=December
            month = int(quarter_match.group(2)) * 3
            return f"{year:04d}{month:02d}"

        # Handle YYYYMMDD format: keep the first 6 digits (YYYYMM)
        if self._YYYYMMDD_RE.fullmatch(version):
            return version[:6]

        # Handle YYYYMM format (already in target format)
        if self._YYYYMM_RE.fullmatch(version):
            return version

        # Handle month name + year using dateutil, but only when a month name
        # is actually present as its own token.
        if self._MONTH_TOKEN_RE.search(version):
            try:
                parsed_date = dateutil_parse(
                    version, fuzzy=True, default=datetime(2000, 1, 1)
                )
            except (ValueError, TypeError, OverflowError):
                # Unparseable text falls through to the pass-through return.
                parsed_date = None

            # The default supplies any missing fields, so text without a year
            # resolves to year 2000; years before 2000 are not normalized.
            if parsed_date is not None and parsed_date.year >= 2000:
                return f"{parsed_date.year:04d}{parsed_date.month:02d}"

        # No pattern matched: return the cleaned-up text
        return version

    def __str__(self):
        return self.original or ""

    def __repr__(self):
        return f"FuzzyVersion({self.original!r})"

    def __eq__(self, other):
        """Strict equality on ``original``; use ``probably_equals`` for fuzzy matching."""
        if not isinstance(other, FuzzyVersion):
            return NotImplemented
        return self.original == other.original

    def __hash__(self):
        return hash(self.original)
114+
115+
116+
def sort_by_outdated_products(df):
    """
    Reorder a comparison dataframe so products needing attention come first.

    Products are ranked by, in order: having at least one dataset that is not
    up to date, having at least one non-blank ``open_data_versions`` entry, and
    the number of outdated datasets (most first). Rows are then ordered by that
    product rank, product and dataset, and the result is returned indexed by
    ``["product", "dataset"]``.
    """

    def _is_populated(value):
        # Plain truthiness: None, "" and an empty list all count as blank.
        if not value:
            return False
        if isinstance(value, list):
            return value != []
        return True

    by_product = df.groupby("product")

    # Per-product roll-up of the up_to_date flags
    summary = by_product["up_to_date"].agg(["all", "sum", "count"]).reset_index()
    summary["has_outdated"] = ~summary["all"]
    summary["outdated_count"] = summary["count"] - summary["sum"]

    # Flag products where at least one row carries an open data version
    open_data_flags = (
        by_product["open_data_versions"]
        .apply(lambda versions: versions.apply(_is_populated).any())
        .reset_index()
    )
    open_data_flags.columns = ["product", "has_open_data"]
    summary = summary.merge(open_data_flags, on="product")

    # Rank products: outdated first, then those with open data, then by
    # how many datasets are outdated
    ranked = summary.sort_values(
        by=["has_outdated", "has_open_data", "outdated_count"],
        ascending=[False, False, False],
    )
    rank_of = {
        name: position for position, name in enumerate(ranked["product"].tolist())
    }

    # Apply the product ranking to the rows, then drop the helper column
    flat = df.reset_index()
    flat["product_order"] = flat["product"].map(rank_of)
    flat = flat.sort_values(["product_order", "product", "dataset"])
    flat = flat.drop(columns="product_order")

    return flat.set_index(["product", "dataset"])
161+
5162

6163
def get_all_open_data_keys():
7164
"""retrieve all product.dataset.destination_ids"""
@@ -40,21 +197,39 @@ def make_comparison_dataframe(bytes_versions, open_data_versions):
40197
product, dataset, destination_id = key.split(".")
41198
bytes_version = bytes_versions.get(f"{product}.{dataset}")
42199
open_data_vers = open_data_versions.get(key, [])
200+
201+
# Determine if versions are up to date using fuzzy comparison
202+
up_to_date = False
203+
try:
204+
up_to_date = FuzzyVersion(bytes_version).probably_equals(
205+
FuzzyVersion(open_data_vers)
206+
)
207+
except Exception:
208+
pass
209+
43210
rows.append(
44211
{
45212
"product": product,
46213
"dataset": dataset,
47214
"destination_id": destination_id,
48215
"bytes_version": bytes_version,
49216
"open_data_versions": open_data_vers,
217+
"up_to_date": up_to_date,
50218
}
51219
)
52220
df = pd.DataFrame(rows).set_index(["product", "dataset"]).sort_index()
221+
222+
# Add product-level up-to-date flag
223+
# A product is up-to-date if ALL its datasets are up-to-date
224+
product_status = df.groupby("product")["up_to_date"].all()
225+
df["product_up_to_date"] = df.index.get_level_values("product").map(product_status)
226+
53227
return df
54228

55229

56230
def run():
    """Gather open-data and bytes versions for every key and return the
    comparison dataframe ordered by ``sort_by_outdated_products``."""
    keys = get_all_open_data_keys()
    # Fetch order is kept: open data versions first, then bytes versions.
    open_data = get_open_data_versions(keys)
    bytes_ = get_bytes_versions(keys)
    return sort_by_outdated_products(make_comparison_dataframe(bytes_, open_data))

0 commit comments

Comments
 (0)