Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/702.fixed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Aligned SPM threshold recalculation with published Census thresholds and tenure-specific geographic adjustments, and kept zero-weight clone priors near zero during enhanced CPS reweighting.
38 changes: 24 additions & 14 deletions policyengine_us_data/calibration/calibration_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@
import pandas as pd
from scipy import sparse

from spm_calculator import SPMCalculator, spm_equivalence_scale
from spm_calculator.geoadj import calculate_geoadj_from_rent

from policyengine_us_data.utils.spm import TENURE_CODE_MAP
from policyengine_us_data.utils.spm import (
TENURE_CODE_MAP,
calculate_geoadj_from_rent,
get_spm_reference_thresholds,
spm_equivalence_scale,
)
from policyengine_us.variables.household.demographic.geographic.state_name import (
StateName,
)
Expand Down Expand Up @@ -541,7 +543,7 @@ def load_geo_labels(path) -> List[str]:

def load_cd_geoadj_values(
cds_to_calibrate: List[str],
) -> Dict[str, float]:
) -> Dict[str, Dict[str, float]]:
"""
Load geographic adjustment factors from rent data CSV.
Uses median 2BR rent by CD vs national median to compute geoadj.
Expand All @@ -557,14 +559,19 @@ def load_cd_geoadj_values(
# Convert zero-padded cd_id to match code format (e.g., "0101" -> "101")
rent_df["cd_geoid"] = rent_df["cd_id"].apply(lambda x: str(int(x)))

tenure_keys = tuple(TENURE_CODE_MAP.values())

# Build lookup from rent data
rent_lookup = {}
for _, row in rent_df.iterrows():
geoadj = calculate_geoadj_from_rent(
local_rent=row["median_2br_rent"],
national_rent=row["national_median_2br_rent"],
)
rent_lookup[row["cd_geoid"]] = geoadj
rent_lookup[row["cd_geoid"]] = {
tenure: calculate_geoadj_from_rent(
local_rent=row["median_2br_rent"],
national_rent=row["national_median_2br_rent"],
tenure=tenure,
)
for tenure in tenure_keys
}

geoadj_dict = {}
for cd in cds_to_calibrate:
Expand All @@ -586,7 +593,7 @@ def load_cd_geoadj_values(
if cd in geoadj_dict:
continue
print(f"Warning: No rent data for CD {cd}, using geoadj=1.0")
geoadj_dict[cd] = 1.0
geoadj_dict[cd] = {tenure: 1.0 for tenure in tenure_keys}

return geoadj_dict

Expand Down Expand Up @@ -617,6 +624,11 @@ def calculate_spm_thresholds_vectorized(
Returns:
Float32 array of SPM thresholds, one per SPM unit.
"""
person_ages = np.asarray(person_ages)
person_spm_unit_ids = np.asarray(person_spm_unit_ids)
spm_unit_tenure_types = np.asarray(spm_unit_tenure_types)
spm_unit_geoadj = np.asarray(spm_unit_geoadj, dtype=np.float64)

n_units = len(spm_unit_tenure_types)

# Count adults and children per SPM unit
Expand All @@ -637,9 +649,7 @@ def calculate_spm_thresholds_vectorized(
mask = spm_unit_tenure_types == tenure_str
tenure_codes[mask] = code

# Look up base thresholds
calc = SPMCalculator(year=year)
base_thresholds = calc.get_base_thresholds()
base_thresholds = get_spm_reference_thresholds(year)

thresholds = np.zeros(n_units, dtype=np.float32)
for i in range(n_units):
Expand Down
16 changes: 12 additions & 4 deletions policyengine_us_data/calibration/entity_clone.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
calculate_spm_thresholds_vectorized,
load_cd_geoadj_values,
)
from policyengine_us_data.utils.spm import geoadj_for_tenure
from policyengine_us_data.utils.takeup import (
_sum_person_values_to_tax_units,
apply_block_takeup_to_arrays,
Expand Down Expand Up @@ -383,10 +384,6 @@ def materialize_clone_household_chunk(
np.arange(n_clones, dtype=np.int64),
entities_per_clone["spm_unit"],
)
spm_unit_geoadj = np.array(
[cd_geoadj_values[active_cd_geoids[c]] for c in spm_clone_ids],
dtype=np.float64,
)

person_ages = sim.calculate("age", map_to="person").values[person_clone_idx]
spm_tenure_holder = sim.get_holder("spm_unit_tenure_type")
Expand All @@ -405,6 +402,17 @@ def materialize_clone_household_chunk(
dtype="S30",
)

spm_unit_geoadj = np.array(
[
geoadj_for_tenure(
cd_geoadj_values[active_cd_geoids[clone_id]],
spm_tenure_cloned[spm_unit_index],
)
for spm_unit_index, clone_id in enumerate(spm_clone_ids)
],
dtype=np.float64,
)

data["spm_unit_spm_threshold"] = {
time_period: calculate_spm_thresholds_vectorized(
person_ages=person_ages,
Expand Down
18 changes: 13 additions & 5 deletions policyengine_us_data/calibration/publish_local_area.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from policyengine_us_data.calibration.block_assignment import (
derive_geography_from_blocks,
)
from policyengine_us_data.utils.spm import geoadj_for_tenure
from policyengine_us_data.utils.takeup import (
SIMPLE_TAKEUP_VARS,
_sum_person_values_to_tax_units,
Expand Down Expand Up @@ -653,15 +654,10 @@ def build_h5(
print("Recalculating SPM thresholds...")
unique_cds_list = sorted(set(active_clone_cds))
cd_geoadj_values = load_cd_geoadj_values(unique_cds_list)
# Build per-SPM-unit geoadj from clone's CD
spm_clone_ids = np.repeat(
np.arange(n_clones, dtype=np.int64),
entities_per_clone["spm_unit"],
)
spm_unit_geoadj = np.array(
[cd_geoadj_values[active_clone_cds[c]] for c in spm_clone_ids],
dtype=np.float64,
)

# Get cloned person ages and SPM tenure types
person_ages = sim.calculate("age", map_to="person").values[person_clone_idx]
Expand All @@ -682,6 +678,18 @@ def build_h5(
dtype="S30",
)

# Build per-SPM-unit geoadj from the clone's CD and SPM unit tenure.
spm_unit_geoadj = np.array(
[
geoadj_for_tenure(
cd_geoadj_values[active_clone_cds[clone_id]],
spm_tenure_cloned[spm_unit_index],
)
for spm_unit_index, clone_id in enumerate(spm_clone_ids)
],
dtype=np.float64,
)

new_spm_thresholds = calculate_spm_thresholds_vectorized(
person_ages=person_ages,
person_spm_unit_ids=new_person_entity_ids["spm_unit"],
Expand Down
40 changes: 25 additions & 15 deletions policyengine_us_data/datasets/cps/enhanced_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,29 @@
torch = None


def initialize_weight_priors(
    original_weights: np.ndarray,
    seed: int = 1456,
    epsilon: float = 1e-6,
) -> np.ndarray:
    """Return strictly positive starting weights for sparse reweighting.

    Weights that are already positive are passed through unchanged.  Weights
    equal to zero are replaced by a tiny jitter drawn uniformly from
    ``[epsilon, 2 * epsilon)`` with a fixed-seed generator, so repeated calls
    with the same inputs produce identical priors.

    Args:
        original_weights: Non-negative weights (anything ``np.asarray`` accepts).
        seed: Seed for the jitter generator; fixed default keeps runs reproducible.
        epsilon: Scale of the near-zero prior assigned to zero-weight entries.

    Returns:
        float64 array of the same shape with every entry strictly positive
        (assuming no NaNs in the input).

    Raises:
        ValueError: If any input weight is negative.
    """
    weight_array = np.asarray(original_weights, dtype=np.float64)
    if (weight_array < 0).any():
        raise ValueError("original_weights must be non-negative")

    is_positive = weight_array > 0
    priors = np.where(is_positive, weight_array, 0.0)

    needs_jitter = ~is_positive
    n_zero = int(needs_jitter.sum())
    if n_zero:
        # Deterministic near-zero priors for the zero-weight entries.
        generator = np.random.default_rng(seed)
        priors[needs_jitter] = epsilon * generator.uniform(1.0, 2.0, size=n_zero)

    return priors


def _to_numpy(value) -> np.ndarray:
return np.asarray(getattr(value, "values", value))

Expand Down Expand Up @@ -612,15 +635,7 @@ def generate(self):
base_year = int(sim.default_calculation_period)
data["household_weight"] = {}
original_weights = sim.calculate("household_weight")
# Seed before the initial weight jitter so the L0 optimizer's
# starting point is reproducible across runs. `reweight()` re-seeds
# inside, but that happens AFTER this perturbation, so without
# this call the jitter (and hence the final calibrated weights)
# differ run-to-run.
set_seeds(1456)
original_weights = original_weights.values + np.random.normal(
1, 0.1, len(original_weights)
)
original_weights = initialize_weight_priors(original_weights.values)

bad_targets = [
"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household",
Expand Down Expand Up @@ -800,12 +815,7 @@ def generate(self):
sim = Microsimulation(dataset=self.input_dataset)
data = sim.dataset.load_dataset()
original_weights = sim.calculate("household_weight")
# Seed before the jitter so the starting weights (and the final
# reweighted result) are reproducible across runs.
set_seeds(1456)
original_weights = original_weights.values + np.random.normal(
1, 0.1, len(original_weights)
)
original_weights = initialize_weight_priors(original_weights.values)
for year in [2024]:
loss_matrix, targets_array = build_loss_matrix(self.input_dataset, year)
optimised_weights = reweight(original_weights, loss_matrix, targets_array)
Expand Down
22 changes: 16 additions & 6 deletions policyengine_us_data/datasets/cps/extended_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ def _calculate_spm_thresholds_from_assigned_geography(
load_cd_geoadj_values,
)
from policyengine_us_data.utils.spm import (
TENURE_CODE_MAP,
calculate_spm_thresholds_with_geoadj,
geoadj_for_tenure,
)

spm_unit_ids = data["spm_unit_id"][time_period]
Expand All @@ -89,11 +91,7 @@ def _calculate_spm_thresholds_from_assigned_geography(
)

cd_geoadj_values = load_cd_geoadj_values(sorted(set(cd_geoids)))
household_geoadj = np.array(
[cd_geoadj_values.get(cd, 1.0) for cd in cd_geoids],
dtype=float,
)
geoadj_by_household = dict(zip(household_ids, household_geoadj))
cd_by_household = dict(zip(household_ids, cd_geoids))

person_df = pd.DataFrame(
{
Expand Down Expand Up @@ -130,7 +128,19 @@ def _calculate_spm_thresholds_from_assigned_geography(
.values
)

geoadj = spm_df["household_id"].map(geoadj_by_household).fillna(1.0).values
geoadj = np.array(
[
geoadj_for_tenure(
cd_geoadj_values.get(cd_by_household.get(household_id), 1.0),
TENURE_CODE_MAP.get(int(tenure_code), "renter"),
)
for household_id, tenure_code in zip(
spm_df["household_id"].values,
tenure_codes,
)
],
dtype=float,
)
return calculate_spm_thresholds_with_geoadj(
num_adults=spm_df["num_adults"].fillna(0).values,
num_children=spm_df["num_children"].fillna(0).values,
Expand Down
Loading
Loading