Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/702.fixed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Aligned SPM threshold recalculation with published Census thresholds and tenure-specific geographic adjustments, and kept zero-weight clone priors near zero during enhanced CPS reweighting.
38 changes: 24 additions & 14 deletions policyengine_us_data/calibration/calibration_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@
import pandas as pd
from scipy import sparse

from spm_calculator import SPMCalculator, spm_equivalence_scale
from spm_calculator.geoadj import calculate_geoadj_from_rent

from policyengine_us_data.utils.spm import TENURE_CODE_MAP
from policyengine_us_data.utils.spm import (
TENURE_CODE_MAP,
calculate_geoadj_from_rent,
get_spm_reference_thresholds,
spm_equivalence_scale,
)
from policyengine_us.variables.household.demographic.geographic.state_name import (
StateName,
)
Expand Down Expand Up @@ -541,7 +543,7 @@ def load_geo_labels(path) -> List[str]:

def load_cd_geoadj_values(
cds_to_calibrate: List[str],
) -> Dict[str, float]:
) -> Dict[str, Dict[str, float]]:
"""
Load geographic adjustment factors from rent data CSV.
Uses median 2BR rent by CD vs national median to compute geoadj.
Expand All @@ -557,14 +559,19 @@ def load_cd_geoadj_values(
# Convert zero-padded cd_id to match code format (e.g., "0101" -> "101")
rent_df["cd_geoid"] = rent_df["cd_id"].apply(lambda x: str(int(x)))

tenure_keys = tuple(TENURE_CODE_MAP.values())

# Build lookup from rent data
rent_lookup = {}
for _, row in rent_df.iterrows():
geoadj = calculate_geoadj_from_rent(
local_rent=row["median_2br_rent"],
national_rent=row["national_median_2br_rent"],
)
rent_lookup[row["cd_geoid"]] = geoadj
rent_lookup[row["cd_geoid"]] = {
tenure: calculate_geoadj_from_rent(
local_rent=row["median_2br_rent"],
national_rent=row["national_median_2br_rent"],
tenure=tenure,
)
for tenure in tenure_keys
}

geoadj_dict = {}
for cd in cds_to_calibrate:
Expand All @@ -586,7 +593,7 @@ def load_cd_geoadj_values(
if cd in geoadj_dict:
continue
print(f"Warning: No rent data for CD {cd}, using geoadj=1.0")
geoadj_dict[cd] = 1.0
geoadj_dict[cd] = {tenure: 1.0 for tenure in tenure_keys}

return geoadj_dict

Expand Down Expand Up @@ -617,6 +624,11 @@ def calculate_spm_thresholds_vectorized(
Returns:
Float32 array of SPM thresholds, one per SPM unit.
"""
person_ages = np.asarray(person_ages)
person_spm_unit_ids = np.asarray(person_spm_unit_ids)
spm_unit_tenure_types = np.asarray(spm_unit_tenure_types)
spm_unit_geoadj = np.asarray(spm_unit_geoadj, dtype=np.float64)

n_units = len(spm_unit_tenure_types)

# Count adults and children per SPM unit
Expand All @@ -637,9 +649,7 @@ def calculate_spm_thresholds_vectorized(
mask = spm_unit_tenure_types == tenure_str
tenure_codes[mask] = code

# Look up base thresholds
calc = SPMCalculator(year=year)
base_thresholds = calc.get_base_thresholds()
base_thresholds = get_spm_reference_thresholds(year)

thresholds = np.zeros(n_units, dtype=np.float32)
for i in range(n_units):
Expand Down
16 changes: 12 additions & 4 deletions policyengine_us_data/calibration/entity_clone.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
calculate_spm_thresholds_vectorized,
load_cd_geoadj_values,
)
from policyengine_us_data.utils.spm import geoadj_for_tenure
from policyengine_us_data.utils.takeup import (
_sum_person_values_to_tax_units,
apply_block_takeup_to_arrays,
Expand Down Expand Up @@ -383,10 +384,6 @@ def materialize_clone_household_chunk(
np.arange(n_clones, dtype=np.int64),
entities_per_clone["spm_unit"],
)
spm_unit_geoadj = np.array(
[cd_geoadj_values[active_cd_geoids[c]] for c in spm_clone_ids],
dtype=np.float64,
)

person_ages = sim.calculate("age", map_to="person").values[person_clone_idx]
spm_tenure_holder = sim.get_holder("spm_unit_tenure_type")
Expand All @@ -405,6 +402,17 @@ def materialize_clone_household_chunk(
dtype="S30",
)

spm_unit_geoadj = np.array(
[
geoadj_for_tenure(
cd_geoadj_values[active_cd_geoids[clone_id]],
spm_tenure_cloned[spm_unit_index],
)
for spm_unit_index, clone_id in enumerate(spm_clone_ids)
],
dtype=np.float64,
)

data["spm_unit_spm_threshold"] = {
time_period: calculate_spm_thresholds_vectorized(
person_ages=person_ages,
Expand Down
18 changes: 13 additions & 5 deletions policyengine_us_data/calibration/publish_local_area.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from policyengine_us_data.calibration.block_assignment import (
derive_geography_from_blocks,
)
from policyengine_us_data.utils.spm import geoadj_for_tenure
from policyengine_us_data.utils.takeup import (
SIMPLE_TAKEUP_VARS,
_sum_person_values_to_tax_units,
Expand Down Expand Up @@ -653,15 +654,10 @@ def build_h5(
print("Recalculating SPM thresholds...")
unique_cds_list = sorted(set(active_clone_cds))
cd_geoadj_values = load_cd_geoadj_values(unique_cds_list)
# Build per-SPM-unit geoadj from clone's CD
spm_clone_ids = np.repeat(
np.arange(n_clones, dtype=np.int64),
entities_per_clone["spm_unit"],
)
spm_unit_geoadj = np.array(
[cd_geoadj_values[active_clone_cds[c]] for c in spm_clone_ids],
dtype=np.float64,
)

# Get cloned person ages and SPM tenure types
person_ages = sim.calculate("age", map_to="person").values[person_clone_idx]
Expand All @@ -682,6 +678,18 @@ def build_h5(
dtype="S30",
)

# Build per-SPM-unit geoadj from the clone's CD and SPM unit tenure.
spm_unit_geoadj = np.array(
[
geoadj_for_tenure(
cd_geoadj_values[active_clone_cds[clone_id]],
spm_tenure_cloned[spm_unit_index],
)
for spm_unit_index, clone_id in enumerate(spm_clone_ids)
],
dtype=np.float64,
)

new_spm_thresholds = calculate_spm_thresholds_vectorized(
person_ages=person_ages,
person_spm_unit_ids=new_person_entity_ids["spm_unit"],
Expand Down
40 changes: 25 additions & 15 deletions policyengine_us_data/datasets/cps/enhanced_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,29 @@
torch = None


def initialize_weight_priors(
    original_weights: np.ndarray,
    seed: int = 1456,
    epsilon: float = 1e-6,
) -> np.ndarray:
    """Return strictly positive starting weights for sparse reweighting.

    Weights that are already positive are passed through unchanged.  Weights
    equal to zero are replaced by a tiny jitter drawn uniformly from
    ``[epsilon, 2 * epsilon)`` with a fixed-seed generator, so repeated calls
    with the same inputs produce identical priors.

    Args:
        original_weights: Non-negative weights (anything ``np.asarray`` accepts).
        seed: Seed for the jitter generator; fixed default keeps runs reproducible.
        epsilon: Scale of the near-zero prior assigned to zero-weight entries.

    Returns:
        float64 array of the same shape with every entry strictly positive
        (assuming no NaNs in the input).

    Raises:
        ValueError: If any input weight is negative.
    """
    weight_array = np.asarray(original_weights, dtype=np.float64)
    if (weight_array < 0).any():
        raise ValueError("original_weights must be non-negative")

    is_positive = weight_array > 0
    priors = np.where(is_positive, weight_array, 0.0)

    needs_jitter = ~is_positive
    n_zero = int(needs_jitter.sum())
    if n_zero:
        # Deterministic near-zero priors for the zero-weight entries.
        generator = np.random.default_rng(seed)
        priors[needs_jitter] = epsilon * generator.uniform(1.0, 2.0, size=n_zero)

    return priors


def _to_numpy(value) -> np.ndarray:
return np.asarray(getattr(value, "values", value))

Expand Down Expand Up @@ -612,15 +635,7 @@ def generate(self):
base_year = int(sim.default_calculation_period)
data["household_weight"] = {}
original_weights = sim.calculate("household_weight")
# Seed before the initial weight jitter so the L0 optimizer's
# starting point is reproducible across runs. `reweight()` re-seeds
# inside, but that happens AFTER this perturbation, so without
# this call the jitter (and hence the final calibrated weights)
# differ run-to-run.
set_seeds(1456)
original_weights = original_weights.values + np.random.normal(
1, 0.1, len(original_weights)
)
original_weights = initialize_weight_priors(original_weights.values)

bad_targets = [
"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household",
Expand Down Expand Up @@ -800,12 +815,7 @@ def generate(self):
sim = Microsimulation(dataset=self.input_dataset)
data = sim.dataset.load_dataset()
original_weights = sim.calculate("household_weight")
# Seed before the jitter so the starting weights (and the final
# reweighted result) are reproducible across runs.
set_seeds(1456)
original_weights = original_weights.values + np.random.normal(
1, 0.1, len(original_weights)
)
original_weights = initialize_weight_priors(original_weights.values)
for year in [2024]:
loss_matrix, targets_array = build_loss_matrix(self.input_dataset, year)
optimised_weights = reweight(original_weights, loss_matrix, targets_array)
Expand Down
22 changes: 16 additions & 6 deletions policyengine_us_data/datasets/cps/extended_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ def _calculate_spm_thresholds_from_assigned_geography(
load_cd_geoadj_values,
)
from policyengine_us_data.utils.spm import (
TENURE_CODE_MAP,
calculate_spm_thresholds_with_geoadj,
geoadj_for_tenure,
)

spm_unit_ids = data["spm_unit_id"][time_period]
Expand All @@ -89,11 +91,7 @@ def _calculate_spm_thresholds_from_assigned_geography(
)

cd_geoadj_values = load_cd_geoadj_values(sorted(set(cd_geoids)))
household_geoadj = np.array(
[cd_geoadj_values.get(cd, 1.0) for cd in cd_geoids],
dtype=float,
)
geoadj_by_household = dict(zip(household_ids, household_geoadj))
cd_by_household = dict(zip(household_ids, cd_geoids))

person_df = pd.DataFrame(
{
Expand Down Expand Up @@ -130,7 +128,19 @@ def _calculate_spm_thresholds_from_assigned_geography(
.values
)

geoadj = spm_df["household_id"].map(geoadj_by_household).fillna(1.0).values
geoadj = np.array(
[
geoadj_for_tenure(
cd_geoadj_values.get(cd_by_household.get(household_id), 1.0),
TENURE_CODE_MAP.get(int(tenure_code), "renter"),
)
for household_id, tenure_code in zip(
spm_df["household_id"].values,
tenure_codes,
)
],
dtype=float,
)
return calculate_spm_thresholds_with_geoadj(
num_adults=spm_df["num_adults"].fillna(0).values,
num_children=spm_df["num_children"].fillna(0).values,
Expand Down
Loading
Loading