Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions policyengine_uk_data/targets/sources.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ obr:
vintage: "march_2026"

hmrc:
spi_collated: "https://assets.publishing.service.gov.uk/media/67cabb37ade26736dbf9ffe5/Collated_Tables_3_1_to_3_17_2223.ods"
spi_geography: "https://assets.publishing.service.gov.uk/media/67cabb7f8c1076c796a45bec/Collated_Tables_3_12_to_3_15a_2223.ods"
spi_collated: "https://assets.publishing.service.gov.uk/media/69f1f12d2fae53a03709682f/Collated_Tables_3_1_to_3_11_2324.ods"
spi_geography: "https://assets.publishing.service.gov.uk/media/69f1f17cc42061e837e3ac3b/Collated_Tables_3_12_to_3_15a_2324.ods"
income_tax_liabilities: "https://www.gov.uk/government/statistics/income-tax-liabilities-statistics-tax-year-2022-to-2023-to-tax-year-2025-to-2026"
salary_sacrifice_table_6: "https://assets.publishing.service.gov.uk/media/687a294e312ee8a5f0806b6d/Tables_6_1_and_6_2.csv"

Expand Down
6 changes: 3 additions & 3 deletions policyengine_uk_data/targets/sources/hmrc_spi.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""HMRC Survey of Personal Incomes targets.

Downloads and parses the SPI ODS (Tables 3.6 and 3.7) to get income
distributions by total income band and income type for 2022-23.
distributions by total income band and income type for 2023-24.

For future year projections, the microsimulation uprates these base
year distributions forward using PolicyEngine's uprating factors.
Expand Down Expand Up @@ -54,8 +54,8 @@
]
_BAND_UPPER = _BAND_LOWER[1:] + [float("inf")]

# SPI year: the ODS is for tax year 2022-23, mapped to calendar 2023
_SPI_YEAR = 2023
# SPI year: the ODS is for tax year 2023-24, mapped to calendar 2024
_SPI_YEAR = 2024

# HMRC Property Rental Income Statistics show ~1.9x more property income
# than the SPI (£46.68bn vs £24.5bn for 2020-21), because SPI only covers
Expand Down
42 changes: 32 additions & 10 deletions policyengine_uk_data/targets/sources/ons_demographics.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""ONS population projections and demographic targets.

Downloads the ONS 2022-based principal population projection for the
Downloads the ONS 2024-based principal population projection for the
UK to extract total population and gender × age band targets.

For regional age breakdowns (12 regions × 9 age bands), reads the
Expand Down Expand Up @@ -36,7 +36,7 @@
_UK_ZIP_URL = (
"https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/"
"populationandmigration/populationprojections/datasets/"
"z1zippedpopulationprojectionsdatafilesuk/2022based/uk.zip"
"z1zippedpopulationprojectionsdatafilesuk/2024based/uk.zip"
)

_REF_REGION = (
Expand Down Expand Up @@ -81,7 +81,8 @@ def _download_uk_projection() -> pd.DataFrame:
r = requests.get(_UK_ZIP_URL, headers=HEADERS, allow_redirects=True, timeout=120)
r.raise_for_status()
z = zipfile.ZipFile(io.BytesIO(r.content))
with z.open("uk/uk_ppp_machine_readable.xlsx") as f:
projection_member = _find_projection_member(z.namelist())
with z.open(projection_member) as f:
df = pd.read_excel(
io.BytesIO(f.read()),
sheet_name="Population",
Expand All @@ -90,31 +91,52 @@ def _download_uk_projection() -> pd.DataFrame:
return df


def _find_projection_member(names: list[str]) -> str:
"""Find the UK principal projection workbook inside the ONS zip."""
for name in names:
if name.endswith("uk_ppp_machine_readable.xlsx"):
return name
raise RuntimeError(
"ONS UK projection zip did not contain uk_ppp_machine_readable.xlsx"
)


def _aggregate_ages(
    df: pd.DataFrame, sex: str, low: int, high: int, years: list[int]
) -> dict[int, float]:
    """Sum population for a sex and age range across years.

    Args:
        df: Projection table with ``Sex`` and ``Age`` columns plus one
            column per projection year.
        sex: ``"female"`` selects rows labelled ``Females``; any other
            value selects rows labelled ``Males``.
        low: Lowest age included in the band.
        high: Highest age included in the band.
        years: Years to total.

    Returns:
        Mapping of year to summed population. Years with no matching
        column in ``df`` are left out.
    """
    sex_filter = "Females" if sex == "female" else "Males"
    # Coerce instead of type-checking: ages stored as strings still
    # count, and non-numeric labels become NaN, which no band contains.
    ages = pd.to_numeric(df["Age"], errors="coerce")
    subset = df[(df["Sex"] == sex_filter) & ages.between(low, high)]
    result = {}
    for y in years:
        # The year header may be an int or a string; _year_column
        # resolves whichever label the workbook actually uses.
        column = _year_column(subset, y)
        if column is not None:
            result[y] = float(subset[column].sum())
    return result


def _year_column(df: pd.DataFrame, year: int) -> int | str | None:
"""Return the workbook column for a year across ONS vintages."""
if year in df.columns:
return year
string_year = str(year)
if string_year in df.columns:
return string_year
return None


def _parse_uk_totals(df: pd.DataFrame) -> list[Target]:
"""Extract UK total population and gender × age bands."""
targets = []

# UK total
uk_pop = {}
for y in _YEARS:
if y in df.columns:
uk_pop[y] = float(df[y].sum())
column = _year_column(df, y)
if column is not None:
uk_pop[y] = float(df[column].sum())
if uk_pop:
targets.append(
Target(
Expand Down
2 changes: 1 addition & 1 deletion policyengine_uk_data/targets/sources/ons_households.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/"
"birthsdeathsandmarriages/families/datasets/"
"familiesandhouseholdsfamiliesandhouseholds/"
"current/familiesandhouseholdsuk2024.xlsx"
"current/familiesandhouseholdsuk2025.xlsx"
)
_REF = (
"https://www.ons.gov.uk/peoplepopulationandcommunity/"
Expand Down
24 changes: 24 additions & 0 deletions policyengine_uk_data/tests/test_ons_demographics_targets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import pandas as pd
import pytest

from policyengine_uk_data.targets.sources.ons_demographics import (
_aggregate_ages,
_find_projection_member,
)


def test_aggregate_ages_accepts_string_age_values():
    """Ages stored as strings are still matched against the age band."""
    df = pd.DataFrame(
        {
            "Sex": ["Females", "Females", "Females", "Males"],
            "Age": ["14", "15", "90", "15"],
            2025: [1, 2, 4, 8],
        }
    )

    # Only the female rows aged 15 and 90 fall inside [15, 90]: 2 + 4.
    assert _aggregate_ages(df, "female", 15, 90, [2025]) == {2025: 6.0}


def test_find_projection_member_fails_loudly():
    """An archive without the workbook raises an error naming the file."""
    with pytest.raises(RuntimeError, match="uk_ppp_machine_readable"):
        _find_projection_member(["uk/readme.txt"])
2 changes: 1 addition & 1 deletion policyengine_uk_data/tests/test_population.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
def test_population(baseline):
population = baseline.calculate("people", 2025).sum() / 1e6
POPULATION_TARGET = 69.5 # ONS 2022-based projection for 2025, millions: https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationprojections/bulletins/nationalpopulationprojections/2022based
POPULATION_TARGET = 69.5 # ONS 2024-based projection for 2025, millions: https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationprojections/bulletins/nationalpopulationprojections/2024based
# Tightened from 7% to 4% after data-pipeline improvements in April 2026
# (stage-2 QRF imputation #362, TFC target refresh #363, reported-anchor
# takeup #359) pulled the weighted UK population down from ~74M (+6.5%)
Expand Down
2 changes: 1 addition & 1 deletion policyengine_uk_data/tests/test_population_fidelity.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

import numpy as np

POPULATION_TARGET = 69.5 # ONS 2022-based projection for 2025, millions
POPULATION_TARGET = 69.5 # ONS 2024-based projection for 2025, millions
TOLERANCE = 0.04 # 4% — covers ~1.6%-3.3% stochastic calibration variance


Expand Down
16 changes: 10 additions & 6 deletions policyengine_uk_data/tests/test_property_income_targets.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,20 @@
def test_property_income_targets_scaled():
"""Property income targets should be ~1.9x the raw SPI values.

Raw SPI 2022-23 total is ~£27bn. After scaling, targets for the
base year should be ~£52bn (matching HMRC rental income stats).
Raw SPI 2023-24 total is scaled up to better match HMRC rental
income statistics, which cover more landlords than SPI.
"""
targets = get_all_targets(year=2023)
base_year = 2024
targets = get_all_targets(year=base_year)
total = sum(
t.values[2023]
t.values[base_year]
for t in targets
if "property_income" in t.name and "count" not in t.name and 2023 in t.values
if "property_income" in t.name
and "count" not in t.name
and base_year in t.values
)
# Raw SPI gives ~£27bn, scaled by 1.9x should give ~£52bn
# Raw SPI gives roughly half of all landlord income; scaling should
# leave the current base-year target in this broad administrative range.
assert total > 45e9, (
f"Property income target total £{total / 1e9:.1f}bn is below £45bn. "
"Scaling factor may not be applied."
Expand Down