Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/spm-geographic-adjustment.changed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Store SPM geographic adjustment inputs instead of materialized CPS SPM thresholds.
72 changes: 0 additions & 72 deletions policyengine_us_data/calibration/calibration_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@
from policyengine_us_data.utils.spm import (
TENURE_CODE_MAP,
calculate_geoadj_from_rent,
get_spm_reference_thresholds,
spm_equivalence_scale,
)
from policyengine_us.variables.household.demographic.geographic.state_name import (
StateName,
Expand Down Expand Up @@ -184,13 +182,6 @@
56: StateCode.WY,
}

# SPM tenure type mappings: tenure-type label -> integer tenure code.
# calculate_spm_thresholds_vectorized (in this module) compares each SPM
# unit's tenure label against these keys and assigns the paired code; units
# whose label matches no key keep that function's default code of 3, which is
# the value paired with "RENTER" here.
SPM_TENURE_STRING_TO_CODE = {
"OWNER_WITH_MORTGAGE": 1,
"OWNER_WITHOUT_MORTGAGE": 2,
"RENTER": 3,
}


def get_calculated_variables(sim) -> List[str]:
"""
Expand Down Expand Up @@ -596,66 +587,3 @@ def load_cd_geoadj_values(
geoadj_dict[cd] = {tenure: 1.0 for tenure in tenure_keys}

return geoadj_dict


def _tenure_labels_as_str(tenure_types: np.ndarray) -> np.ndarray:
    """Return tenure labels as a unicode array, decoding any bytes values.

    Normalizing once up front avoids comparing a unicode array against a
    bytes scalar (or vice versa), whose result depends on the NumPy version.
    """
    kind = tenure_types.dtype.kind
    if kind == "S":
        # Fixed-width bytes array (e.g. read back from HDF5).
        return np.char.decode(tenure_types, "utf-8", "replace")
    if kind == "O":
        # Object arrays may mix bytes and str elements; handle each value.
        decoded = [
            value.decode("utf-8", "replace")
            if isinstance(value, bytes)
            else str(value)
            for value in tenure_types.ravel()
        ]
        return np.array(decoded, dtype=str).reshape(tenure_types.shape)
    return tenure_types.astype(str)


def calculate_spm_thresholds_vectorized(
    person_ages: np.ndarray,
    person_spm_unit_ids: np.ndarray,
    spm_unit_tenure_types: np.ndarray,
    spm_unit_geoadj: np.ndarray,
    year: int,
) -> np.ndarray:
    """Calculate SPM thresholds for cloned SPM units from raw arrays.

    Works without a Microsimulation instance. Counts adults/children
    per SPM unit from person-level arrays, then computes
    base_threshold * equivalence_scale * geoadj for each unit.

    Args:
        person_ages: Age per cloned person. People aged 18 or over
            count as adults; everyone else counts as a child.
        person_spm_unit_ids: New SPM unit ID per cloned person
            (0-based contiguous); used as an index into the per-unit
            arrays.
        spm_unit_tenure_types: Tenure type label per cloned SPM
            unit, as bytes or str (e.g. b"RENTER",
            "OWNER_WITH_MORTGAGE"). Labels not present in
            SPM_TENURE_STRING_TO_CODE are treated as code 3.
        spm_unit_geoadj: Geographic adjustment factor per cloned
            SPM unit.
        year: Tax year for base threshold lookup.

    Returns:
        Float32 array of SPM thresholds, one per SPM unit.

    Raises:
        ValueError: If ``spm_unit_geoadj`` does not hold exactly one
            value per SPM unit.
    """
    person_ages = np.asarray(person_ages)
    person_spm_unit_ids = np.asarray(person_spm_unit_ids)
    spm_unit_tenure_types = np.asarray(spm_unit_tenure_types)
    spm_unit_geoadj = np.asarray(spm_unit_geoadj, dtype=np.float64)

    n_units = len(spm_unit_tenure_types)
    if spm_unit_geoadj.shape != (n_units,):
        raise ValueError(
            "spm_unit_geoadj must have one value per SPM unit: expected "
            f"shape ({n_units},), got {spm_unit_geoadj.shape}"
        )

    # Count adults and children per SPM unit. np.add.at accumulates
    # correctly when the same unit ID appears for several people.
    is_adult = person_ages >= 18
    num_adults = np.zeros(n_units, dtype=np.int32)
    num_children = np.zeros(n_units, dtype=np.int32)
    np.add.at(num_adults, person_spm_unit_ids, is_adult.astype(np.int32))
    np.add.at(num_children, person_spm_unit_ids, (~is_adult).astype(np.int32))

    # Map tenure labels to codes; 3 is the default for unmatched labels.
    tenure_labels = _tenure_labels_as_str(spm_unit_tenure_types)
    tenure_codes = np.full(n_units, 3, dtype=np.int32)
    for tenure_str, code in SPM_TENURE_STRING_TO_CODE.items():
        tenure_codes[tenure_labels == tenure_str] = code

    base_thresholds = get_spm_reference_thresholds(year)

    # Base threshold per unit. Only codes that actually occur are looked
    # up, so an unused tenure key is never required to be present.
    base_per_unit = np.zeros(n_units, dtype=np.float64)
    for code in np.unique(tenure_codes):
        tenure_key = TENURE_CODE_MAP.get(int(code), "renter")
        base_per_unit[tenure_codes == code] = base_thresholds[tenure_key]

    # Evaluate the equivalence scale once per distinct household
    # composition instead of once per unit; many units share a composition.
    compositions = list(zip(num_adults.tolist(), num_children.tolist()))
    scale_by_composition = {
        composition: spm_equivalence_scale(*composition)
        for composition in set(compositions)
    }
    equiv_scales = np.array(
        [scale_by_composition[composition] for composition in compositions],
        dtype=np.float64,
    )

    # Same multiplication order as the scalar formula, in float64, with a
    # single cast to float32 at the end.
    thresholds = base_per_unit * equiv_scales * spm_unit_geoadj
    return thresholds.astype(np.float32)
15 changes: 4 additions & 11 deletions policyengine_us_data/calibration/entity_clone.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
derive_geography_from_blocks,
)
from policyengine_us_data.calibration.calibration_utils import (
calculate_spm_thresholds_vectorized,
load_cd_geoadj_values,
)
from policyengine_us_data.utils.spm import geoadj_for_tenure
Expand Down Expand Up @@ -270,8 +269,9 @@ def materialize_clone_household_chunk(
clone_geo = {k: v[block_inv] for k, v in unique_geo.items()}

vars_to_save = set(sim.input_variables)
vars_to_save.discard("spm_unit_spm_threshold")
vars_to_save.add("county")
vars_to_save.add("spm_unit_spm_threshold")
vars_to_save.add("spm_unit_geographic_adjustment")
vars_to_save.add("congressional_district_geoid")
for geo_var in [
"block_geoid",
Expand Down Expand Up @@ -385,7 +385,6 @@ def materialize_clone_household_chunk(
entities_per_clone["spm_unit"],
)

person_ages = sim.calculate("age", map_to="person").values[person_clone_idx]
spm_tenure_holder = sim.get_holder("spm_unit_tenure_type")
spm_tenure_periods = spm_tenure_holder.get_known_periods()
if spm_tenure_periods:
Expand Down Expand Up @@ -413,14 +412,8 @@ def materialize_clone_household_chunk(
dtype=np.float64,
)

data["spm_unit_spm_threshold"] = {
time_period: calculate_spm_thresholds_vectorized(
person_ages=person_ages,
person_spm_unit_ids=new_person_entity_ids["spm_unit"],
spm_unit_tenure_types=spm_tenure_cloned,
spm_unit_geoadj=spm_unit_geoadj,
year=time_period,
),
data["spm_unit_geographic_adjustment"] = {
time_period: spm_unit_geoadj.astype(np.float32),
}

if apply_takeup:
Expand Down
22 changes: 6 additions & 16 deletions policyengine_us_data/calibration/publish_local_area.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
from policyengine_us_data.calibration.calibration_utils import (
STATE_CODES,
load_cd_geoadj_values,
calculate_spm_thresholds_vectorized,
)
from policyengine_us_data.calibration.block_assignment import (
derive_geography_from_blocks,
Expand Down Expand Up @@ -515,8 +514,9 @@ def build_h5(

# === Determine variables to save ===
vars_to_save = set(sim.input_variables)
vars_to_save.discard("spm_unit_spm_threshold")
vars_to_save.add("county")
vars_to_save.add("spm_unit_spm_threshold")
vars_to_save.add("spm_unit_geographic_adjustment")
vars_to_save.add("congressional_district_geoid")
for gv in [
"block_geoid",
Expand Down Expand Up @@ -650,18 +650,15 @@ def build_h5(
time_period: clone_cd_geoids,
}

# === SPM threshold recalculation ===
print("Recalculating SPM thresholds...")
# === SPM geographic adjustment assignment ===
print("Assigning SPM geographic adjustments...")
unique_cds_list = sorted(set(active_clone_cds))
cd_geoadj_values = load_cd_geoadj_values(unique_cds_list)
spm_clone_ids = np.repeat(
np.arange(n_clones, dtype=np.int64),
entities_per_clone["spm_unit"],
)

# Get cloned person ages and SPM tenure types
person_ages = sim.calculate("age", map_to="person").values[person_clone_idx]

spm_tenure_holder = sim.get_holder("spm_unit_tenure_type")
spm_tenure_periods = spm_tenure_holder.get_known_periods()
if spm_tenure_periods:
Expand Down Expand Up @@ -690,15 +687,8 @@ def build_h5(
dtype=np.float64,
)

new_spm_thresholds = calculate_spm_thresholds_vectorized(
person_ages=person_ages,
person_spm_unit_ids=new_person_entity_ids["spm_unit"],
spm_unit_tenure_types=spm_tenure_cloned,
spm_unit_geoadj=spm_unit_geoadj,
year=time_period,
)
data["spm_unit_spm_threshold"] = {
time_period: new_spm_thresholds,
data["spm_unit_geographic_adjustment"] = {
time_period: spm_unit_geoadj.astype(np.float32),
}

# === Apply calibration takeup draws ===
Expand Down
17 changes: 3 additions & 14 deletions policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -1385,7 +1385,7 @@ def add_personal_income_variables(cps: h5py.File, person: DataFrame, year: int):
id="add_spm_variables",
label="Add SPM Variables",
node_type="library",
description="Populate CPS supplemental poverty measure variables and thresholds.",
description="Populate CPS supplemental poverty measure variables and geographic adjustments.",
source_file="policyengine_us_data/datasets/cps/cps.py",
status="current",
stability="moving",
Expand All @@ -1394,10 +1394,6 @@ def add_personal_income_variables(cps: h5py.File, person: DataFrame, year: int):
)
)
def add_spm_variables(self, cps: h5py.File, spm_unit: DataFrame) -> None:
from policyengine_us_data.utils.spm import (
calculate_spm_thresholds_with_geoadj,
)

SPM_RENAMES = dict(
spm_unit_total_income_reported="SPM_TOTVAL",
snap_reported="SPM_SNAPSUB",
Expand All @@ -1419,15 +1415,8 @@ def add_spm_variables(self, cps: h5py.File, spm_unit: DataFrame) -> None:
if asec_variable in spm_unit.columns:
cps[openfisca_variable] = spm_unit[asec_variable]

# Calculate SPM thresholds using spm-calculator with Census-provided
# geographic adjustment factors (SPM_GEOADJ)
cps["spm_unit_spm_threshold"] = calculate_spm_thresholds_with_geoadj(
num_adults=spm_unit["SPM_NUMADULTS"].values,
num_children=spm_unit["SPM_NUMKIDS"].values,
tenure_codes=spm_unit["SPM_TENMORTSTATUS"].values,
geoadj=spm_unit["SPM_GEOADJ"].values,
year=self.time_period,
)
if "SPM_GEOADJ" in spm_unit.columns:
cps["spm_unit_geographic_adjustment"] = spm_unit["SPM_GEOADJ"].values

if "SPM_TENMORTSTATUS" in spm_unit.columns:
tenure_map = {
Expand Down
48 changes: 11 additions & 37 deletions policyengine_us_data/datasets/cps/extended_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,24 +68,21 @@ def _supports_structural_mortgage_inputs() -> bool:
return has_policyengine_us_variables(*STRUCTURAL_MORTGAGE_VARIABLES)


def _calculate_spm_thresholds_from_assigned_geography(
def _calculate_spm_geographic_adjustments_from_assigned_geography(
data: dict[str, dict[int, np.ndarray]],
time_period: int,
) -> np.ndarray:
from policyengine_us_data.calibration.calibration_utils import (
load_cd_geoadj_values,
)
from policyengine_us_data.utils.spm import (
TENURE_CODE_MAP,
calculate_spm_thresholds_with_geoadj,
geoadj_for_tenure,
)

spm_unit_ids = data["spm_unit_id"][time_period]
person_spm_unit_ids = data["person_spm_unit_id"][time_period]
person_household_ids = data["person_household_id"][time_period]
household_ids = data["household_id"][time_period]
ages = data["age"][time_period]
cd_geoids = np.asarray(data["congressional_district_geoid"][time_period]).astype(
str
)
Expand All @@ -97,57 +94,34 @@ def _calculate_spm_thresholds_from_assigned_geography(
{
"spm_unit_id": person_spm_unit_ids,
"household_id": person_household_ids,
"is_adult": ages >= 18,
"is_child": ages < 18,
}
)
spm_df = person_df.groupby("spm_unit_id").agg(
num_adults=("is_adult", "sum"),
num_children=("is_child", "sum"),
household_id=("household_id", "first"),
)
spm_df = spm_df.reindex(spm_unit_ids)

tenure = data.get("spm_unit_tenure_type", {}).get(time_period)
tenure_codes = np.full(len(spm_unit_ids), 3, dtype=int)
tenure_values = np.full(len(spm_unit_ids), "RENTER", dtype="U30")
if tenure is not None:
tenure_values = np.asarray(tenure)
if np.issubdtype(tenure_values.dtype, np.bytes_):
tenure_values = np.char.decode(tenure_values, "utf-8")
tenure_codes = (
pd.Series(tenure_values)
.map(
{
"OWNER_WITH_MORTGAGE": 1,
"OWNER_WITHOUT_MORTGAGE": 2,
"RENTER": 3,
}
)
.fillna(3)
.astype(int)
.values
)
tenure_values = pd.Series(tenure_values).fillna("RENTER").astype(str).values

geoadj = np.array(
return np.array(
[
geoadj_for_tenure(
cd_geoadj_values.get(cd_by_household.get(household_id), 1.0),
TENURE_CODE_MAP.get(int(tenure_code), "renter"),
tenure_type,
)
for household_id, tenure_code in zip(
for household_id, tenure_type in zip(
spm_df["household_id"].values,
tenure_codes,
tenure_values,
)
],
dtype=float,
)
return calculate_spm_thresholds_with_geoadj(
num_adults=spm_df["num_adults"].fillna(0).values,
num_children=spm_df["num_children"].fillna(0).values,
tenure_codes=tenure_codes,
geoadj=geoadj,
year=time_period,
)


# CPS-only categorical features to donor-impute onto the PUF clone half.
Expand Down Expand Up @@ -965,9 +939,9 @@ def generate(self):
self.time_period,
had_positive_mortgage_input,
)
logger.info("Calculating SPM thresholds from assigned geography")
new_data["spm_unit_spm_threshold"] = {
self.time_period: _calculate_spm_thresholds_from_assigned_geography(
logger.info("Calculating SPM geographic adjustments from assigned geography")
new_data["spm_unit_geographic_adjustment"] = {
self.time_period: _calculate_spm_geographic_adjustments_from_assigned_geography(
new_data,
self.time_period,
)
Expand Down Expand Up @@ -1243,8 +1217,8 @@ def _validate_structural_mortgage_conversion(
# due to entity shape mismatch.
_KEEP_FORMULA_VARS = {
"person_id",
"spm_unit_spm_threshold",
"weeks_worked",
"spm_unit_geographic_adjustment",
"self_employed_pension_contribution_ald",
"self_employed_health_insurance_ald",
}
Expand Down
4 changes: 0 additions & 4 deletions policyengine_us_data/datasets/cps/small_enhanced_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,10 +148,6 @@ def create_sparse_ecps():
dataset=EnhancedCPS_2024,
)
template_sim.set_input("household_weight", time_period, sparse_weights)
# Preserve the base-year SPM threshold when round-tripping through a
# dataframe. It is a formula variable in policyengine-us, but poverty
# projections need the input value as the geographic-adjusted anchor.
template_sim.calculate("spm_unit_spm_threshold", time_period)

df = template_sim.to_input_dataframe()
del template_sim
Expand Down
Loading
Loading