Skip to content

Commit dacb0ed

Browse files
committed
Add Fuzzy Versions, to compare bytes <> socrata
1 parent 751229e commit dacb0ed

File tree

2 files changed

+447
-1
lines changed

2 files changed

+447
-1
lines changed

dcpy/lifecycle/scripts/version_compare.py

Lines changed: 182 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,169 @@
22
from dcpy.lifecycle import product_metadata
33
import pandas as pd
44

5+
from datetime import datetime
6+
from dateutil.parser import parse as dateutil_parse
7+
import re
8+
9+
10+
class FuzzyVersion:
    """A version string that supports fuzzy comparison across date-like formats.

    ``original`` keeps the value exactly as passed in. ``normalized`` holds a
    ``YYYYMM`` rendering when the input matches one of the recognized
    patterns (two-digit-year quarter notation, ``YYYYMMDD``, ``YYYYMM``, or a
    month name with a year), and the lower-cased, stripped input otherwise.
    For falsy input (``None``, ``""``) ``normalized`` is the input itself.

    ``==`` and ``hash()`` are strict and only look at ``original``; use
    :meth:`probably_equals` for the lenient comparison.
    """

    # Three-letter abbreviations are enough for the substring pre-filter:
    # every full English month name contains its own abbreviation.
    _MONTH_TOKENS = (
        "jan",
        "feb",
        "mar",
        "apr",
        "may",
        "jun",
        "jul",
        "aug",
        "sep",
        "oct",
        "nov",
        "dec",
    )

    # Quarter notation with a two-digit year, e.g. "25q1".
    _QUARTER_RE = re.compile(r"^(\d{2})q([1-4])$")

    # Earliest year accepted from a month-name parse.
    _MIN_YEAR = 2000
    # Year handed to dateutil as the fill-in for strings that carry no year.
    # It must be below _MIN_YEAR so that "no year in the string" is rejected
    # instead of being mistaken for a real year.
    _MISSING_YEAR = 1900

    def __init__(self, version_string):
        """Store ``version_string`` and precompute its normalized form.

        Args:
            version_string: The raw version. Expected to be a string or a
                falsy placeholder such as ``None``; truthy non-string values
                fail in ``_normalize`` because ``.lower()`` is called on them.
        """
        self.original = version_string
        self.normalized = self._normalize() if version_string else version_string

    def probably_equals(self, other) -> bool:
        """Return True when both versions likely denote the same release.

        Two versions match when their lower-cased, stripped text is identical
        or when their normalized forms are identical. A missing or blank
        (whitespace-only) version never matches anything, including another
        blank version.

        Raises:
            TypeError: If ``other`` is not a ``FuzzyVersion``.
        """
        if not isinstance(other, FuzzyVersion):
            raise TypeError("Can only compare with another FuzzyVersion")

        if not self.original or not other.original:
            return False

        mine = self.original.lower().strip()
        theirs = other.original.lower().strip()

        # Whitespace-only strings carry no version information; without this
        # guard two blank versions would compare as equal.
        if not mine or not theirs:
            return False

        # Direct string comparison (handles case and padding differences)
        if mine == theirs:
            return True

        # Compare normalized versions
        return self.normalized == other.normalized

    def _normalize(self):
        """Convert recognized date-like formats to ``YYYYMM``.

        Returns:
            The ``YYYYMM`` string when a pattern matches; otherwise the
            lower-cased, stripped input. Falsy input is returned unchanged.
        """
        if not self.original:
            return self.original

        version = self.original.lower().strip()

        # Quarter notation (e.g. "25q1", "24q2"): the two-digit year is
        # assumed to be 20XX and each quarter maps to its last month
        # (Q1=03, Q2=06, Q3=09, Q4=12).
        quarter_match = self._QUARTER_RE.match(version)
        if quarter_match:
            year = 2000 + int(quarter_match.group(1))
            month = int(quarter_match.group(2)) * 3
            return f"{year:04d}{month:02d}"

        # YYYYMMDD: keep the leading YYYYMM.
        if re.match(r"^\d{8}$", version):
            return version[:6]

        # YYYYMM: already in the target format.
        if re.match(r"^\d{6}$", version):
            return version

        # Month name + year. The substring test is only a cheap pre-filter;
        # dateutil decides whether the text actually contains a date.
        if any(token in version for token in self._MONTH_TOKENS):
            try:
                parsed_date = dateutil_parse(
                    version,
                    fuzzy=True,
                    default=datetime(self._MISSING_YEAR, 1, 1),
                )
            except (ValueError, TypeError, OverflowError):
                # dateutil raises ParserError (a ValueError) for unparseable
                # text and OverflowError for out-of-range numbers.
                parsed_date = None

            # A year below _MIN_YEAR means the string had no usable year
            # (the fill-in year was used) or an implausibly old one.
            if parsed_date is not None and parsed_date.year >= self._MIN_YEAR:
                return f"{parsed_date.year:04d}{parsed_date.month:02d}"

        # No pattern matched: fall back to the cleaned-up input.
        return version

    def __str__(self) -> str:
        return self.original or ""

    def __repr__(self) -> str:
        return f"FuzzyVersion({self.original!r})"

    def __eq__(self, other):
        """Strict equality on ``original``; use ``probably_equals`` for fuzzy matching."""
        if isinstance(other, FuzzyVersion):
            return self.original == other.original
        return False

    def __hash__(self) -> int:
        return hash(self.original)
120+
121+
122+
def sort_by_outdated_products(df):
    """
    Reorder the comparison dataframe so products needing attention come first.

    Products are ranked by, in order: having at least one dataset that is not
    up to date, having at least one non-empty ``open_data_versions`` value,
    and the number of outdated datasets (largest first). Rows within a
    product are ordered by dataset.

    Reads the ``up_to_date`` and ``open_data_versions`` columns and the
    ``product`` / ``dataset`` index levels; returns a new dataframe indexed
    by ``["product", "dataset"]``.
    """
    by_product = df.groupby("product")

    # Per-product roll-up of the up_to_date flags.
    summary = by_product["up_to_date"].agg(["all", "sum", "count"]).reset_index()
    summary["has_outdated"] = ~summary["all"]
    summary["outdated_count"] = summary["count"] - summary["sum"]

    # True when any row of the product carries a truthy open_data_versions
    # value (an empty list, empty string or None does not count).
    def _any_published(versions):
        return versions.apply(bool).any()

    published = by_product["open_data_versions"].apply(_any_published).reset_index()
    published.columns = ["product", "has_open_data"]
    summary = summary.merge(published, on="product")

    # Rank products: outdated first, then those with open data, then by how
    # many datasets are outdated.
    ranked = summary.sort_values(
        by=["has_outdated", "has_open_data", "outdated_count"],
        ascending=[False, False, False],
    )
    rank_of = {name: position for position, name in enumerate(ranked["product"].tolist())}

    # Apply the ranking to the flattened rows, then restore the index.
    flat = df.reset_index()
    flat["product_order"] = flat["product"].map(rank_of)
    flat = flat.sort_values(["product_order", "product", "dataset"])
    flat = flat.drop(columns="product_order")

    return flat.set_index(["product", "dataset"])
167+
5168

6169
def get_all_open_data_keys():
7170
"""retrieve all product.dataset.destination_ids"""
@@ -40,21 +203,39 @@ def make_comparison_dataframe(bytes_versions, open_data_versions):
40203
product, dataset, destination_id = key.split(".")
41204
bytes_version = bytes_versions.get(f"{product}.{dataset}")
42205
open_data_vers = open_data_versions.get(key, [])
206+
207+
# Determine if versions are up to date using fuzzy comparison
208+
up_to_date = False
209+
try:
210+
up_to_date = FuzzyVersion(bytes_version).probably_equals(
211+
FuzzyVersion(open_data_vers)
212+
)
213+
except Exception:
214+
pass
215+
43216
rows.append(
44217
{
45218
"product": product,
46219
"dataset": dataset,
47220
"destination_id": destination_id,
48221
"bytes_version": bytes_version,
49222
"open_data_versions": open_data_vers,
223+
"up_to_date": up_to_date,
50224
}
51225
)
52226
df = pd.DataFrame(rows).set_index(["product", "dataset"]).sort_index()
227+
228+
# Add product-level up-to-date flag
229+
# A product is up-to-date if ALL its datasets are up-to-date
230+
product_status = df.groupby("product")["up_to_date"].all()
231+
df["product_up_to_date"] = df.index.get_level_values("product").map(product_status)
232+
53233
return df
54234

55235

56236
def run():
    """Fetch versions for every open-data key and return the sorted comparison.

    Looks up all open-data keys, retrieves the open-data versions and then the
    bytes versions for them, builds the comparison dataframe, and returns it
    ordered by ``sort_by_outdated_products``.
    """
    keys = get_all_open_data_keys()
    published = get_open_data_versions(keys)
    internal = get_bytes_versions(keys)
    return sort_by_outdated_products(
        make_comparison_dataframe(internal, published)
    )

0 commit comments

Comments
 (0)