Skip to content

Fix #143: Improve GrimAge error messages for missing metadata #160

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 29 additions & 2 deletions biolearn/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -850,8 +850,27 @@ def from_definition(cls, clock_definition):
)

def predict(self, geo_data):
if "sex" not in geo_data.metadata or "age" not in geo_data.metadata:
raise ValueError("metadata must contain 'sex' and 'age' columns")
if "sex" not in geo_data.metadata:
raise ValueError(
"GrimAge requires 'sex' column in metadata. "
"If sex is unknown, you can predict it using the SexEstimation model:\n"
" from biolearn.model_gallery import ModelGallery\n"
" gallery = ModelGallery()\n"
" sex_pred = gallery.get('SexEstimation').predict(your_data)\n"
" # Then add sex to metadata based on predictions"
)

if "age" not in geo_data.metadata:
raise ValueError("GrimAge requires 'age' column in metadata (numeric age in years)")

# Check for NaN sex values
sex_values = geo_data.metadata["sex"]
if sex_values.isna().any():
nan_samples = sex_values[sex_values.isna()].index.tolist()
raise ValueError(
f"GrimAge cannot process samples with unknown sex: {nan_samples[:3]}.\n"
f"Either exclude these samples or predict sex using SexEstimation model."
)

df = geo_data.dnam

Expand Down Expand Up @@ -921,6 +940,14 @@ def calculate_sub_clock(self, df, coefficients):
.sum()
)

# Check for NaN results which indicate missing data
if result.isna().any():
nan_samples = result[result.isna()].index.tolist()
raise ValueError(
f"Missing methylation data for required CpG sites in samples: {nan_samples[:3]}. "
f"Ensure all required methylation sites are present in your data."
)

return result

def rename_columns(self, data, old_names, new_names):
Expand Down
66 changes: 66 additions & 0 deletions biolearn/test/test_grimage_missing_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import pytest
import pandas as pd
import numpy as np
from biolearn.model_gallery import ModelGallery
from biolearn.data_library import GeoData
from biolearn.util import load_test_data_file


def test_grimage_missing_sex_column():
    """GrimAgeV2 must raise a ValueError that names the missing 'sex' column.

    The error text is also expected to mention the SexEstimation model.
    """
    methylation = load_test_data_file("external/DNAmTestSet.csv")
    metadata = load_test_data_file("external/testset_metadata.csv")

    # Build the model input from metadata that lacks the 'sex' column.
    geo_data = GeoData(metadata.drop(columns=['sex']), methylation)
    model = ModelGallery().get('GrimAgeV2')

    with pytest.raises(ValueError) as raised:
        model.predict(geo_data)

    message = str(raised.value)
    for expected in ("GrimAge requires 'sex' column", "SexEstimation"):
        assert expected in message


def test_grimage_missing_age_column():
    """GrimAgeV2 must raise a ValueError that names the missing 'age' column."""
    methylation = load_test_data_file("external/DNAmTestSet.csv")
    metadata = load_test_data_file("external/testset_metadata.csv")

    # Build the model input from metadata that lacks the 'age' column.
    geo_data = GeoData(metadata.drop(columns=['age']), methylation)
    model = ModelGallery().get('GrimAgeV2')

    with pytest.raises(ValueError) as raised:
        model.predict(geo_data)

    assert "GrimAge requires 'age' column" in str(raised.value)


def test_grimage_nan_sex_values():
    """GrimAgeV2 must raise a ValueError when a sample's sex is NaN.

    The error text is expected to say the samples cannot be processed and
    to mention the SexEstimation model.
    """
    methylation = load_test_data_file("external/DNAmTestSet.csv")
    metadata = load_test_data_file("external/testset_metadata.csv")

    # Work on a copy so the loaded frame is not mutated, then blank out
    # the sex of the first sample only.
    patched = metadata.copy()
    first_sample = patched.index[0]
    patched.loc[first_sample, 'sex'] = np.nan
    geo_data = GeoData(patched, methylation)

    model = ModelGallery().get('GrimAgeV2')

    with pytest.raises(ValueError) as raised:
        model.predict(geo_data)

    message = str(raised.value)
    for expected in ("cannot process samples with unknown sex", "SexEstimation"):
        assert expected in message
Loading