Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Gracefully handle user providing phenotypic file as .csv #212

Merged
merged 13 commits into from
Sep 26, 2023
3 changes: 1 addition & 2 deletions bagel/cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import json
from pathlib import Path

import pandas as pd
import typer
from bids import BIDSLayout
from pydantic import ValidationError
Expand Down Expand Up @@ -62,7 +61,7 @@ def pheno(
You can upload this .jsonld file to the Neurobagel graph.
"""
data_dictionary = load_json(dictionary)
pheno_df = pd.read_csv(pheno, sep="\t", keep_default_na=False, dtype=str)
pheno_df = putil.load_pheno(pheno)
putil.validate_inputs(data_dictionary, pheno_df)

# Display validated input paths to user
Expand Down
37 changes: 37 additions & 0 deletions bagel/pheno_utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
from __future__ import annotations

import warnings
from collections import defaultdict
from pathlib import Path
from typing import Optional, Union

import isodate
import jsonschema
import pandas as pd
import pydantic
from pandas import DataFrame
from typer import BadParameter

from bagel import dictionary_models, mappings, models
Expand Down Expand Up @@ -35,6 +39,39 @@ def validate_portal_uri(portal: str) -> Optional[str]:
return portal


def load_pheno(input_p: Path) -> pd.DataFrame | None:
"""Load a .tsv pheno file and do some basic validation."""
if input_p.suffix == ".tsv":
pheno_df: DataFrame = pd.read_csv(
input_p, sep="\t", keep_default_na=False, dtype=str
)

if pheno_df.shape[1] > 1:
return pheno_df
elif len(pheno_df.columns[0].split(",")) > 1:
# We only extracted one column, this might mean that we have a sneaky .csv
raise ValueError(
f"Your phenotypic input file {input_p} has only one column "
f"and is therefore not valid as a neurobagel phenotypic file. "
f"Note that your phenotypic input file also looks like a .csv file "
f"as it contains several ',' commas. It is possible that "
f"you have accidentally renamed a .csv file as a .tsv"
surchs marked this conversation as resolved.
Show resolved Hide resolved
" Please provide a valid .tsv pheno file!"
)
else:
warnings.warn(
f"Your phenotypic input file {input_p} has only one column."
" Such a file is not a valid Neurobagel phenotypic data file."
" Please check your input again!"
)
return pheno_df
surchs marked this conversation as resolved.
Show resolved Hide resolved

raise ValueError(
f"Your ({input_p}) is not a .tsv file."
" Please provide a valid .tsv pheno file!"
)


def generate_context():
# Direct copy of the dandi-schema context generation function
# https://github.com/dandi/dandi-schema/blob/c616d87eaae8869770df0cb5405c24afdb9db096/dandischema/metadata.py
Expand Down
37 changes: 19 additions & 18 deletions bagel/tests/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,25 @@

Example inputs to the CLI

| Example name | `.tsv` | `.json` | Expect |
| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------ | ------------------ |
| 1 | invalid, non-unique combinations of `participant` and `session` IDs | valid, has `IsAbout` annotations for `participant` and `session` ID columns | fail |
| 2 | valid, unique `participant` and `session` IDs | same as example 1 | pass |
| 3 | same as example 2 | valid BIDS data dictionary, BUT: does not contain Neurobagel `"Annotations"` key | fail |
| 4 | valid, has additional columns not described in `.json` | same as example 1 | pass |
| 5 | valid, has additional unique value, not documented in `.json` | same as example 1 | fail |
| 6 | valid, same as example 5. has annotation tool columns | valid, contains `"MissingValues"` attribute for categorical variable | pass |
| invalid | valid, only exists to be used together with the (invalid) .json | invalid, missing the `"TermURL"` attribute for identifiers | fail |
| 7 | has fewer columns than are annotated in `.json` | same as example 1 | fail |
| 8 | valid, based on ex2 has multiple participant_id columns | valid, based on ex2 multiple participant_id column annotations | fail* |
| 9 | invalid, based on example 6 but contains an unannotated value for `group` | valid, based on example 6 | fail |
| 10 | valid, same as example 6 | valid, based on example 6 but contains extra `"MissingValues"` not found in the .tsv | pass, with warning |
| 11 | invalid, ex 6 with missing entries in `participant_id` and `session_id` columns | valid, based on example 6 | fail |
| 12 | Valid, same as example 2 | Valid, based on example 2 but missing BIDS "Levels" attribute for group column | Pass, with warning |
| 13 | Valid, same as example_synthetic | Valid, based on example_synthetic but with mismatched levels for group column | Pass, with warning |
| 14 | Valid, same as example 2 | Valid, based on example 2, but with an extra column annotation without Neurobagel | Pass |
| 15 | Valid, same as example 2 | Invalid, based on example 2, but participant ID column lacks Neurobagel annotations | Fail |
| Example name | `.tsv` | `.json` | Expect |
| ------------ | ------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------ | ------------------ |
| 1 | invalid, non-unique combinations of `participant` and `session` IDs | valid, has `IsAbout` annotations for `participant` and `session` ID columns | fail |
| 2 | valid, unique `participant` and `session` IDs | same as example 1 | pass |
| 3 | same as example 2 | valid BIDS data dictionary, BUT: does not contain Neurobagel `"Annotations"` key | fail |
| 4 | valid, has additional columns not described in `.json` | same as example 1 | pass |
| 5 | valid, has additional unique value, not documented in `.json` | same as example 1 | fail |
| 6 | valid, same as example 5. has annotation tool columns | valid, contains `"MissingValues"` attribute for categorical variable | pass |
| invalid | valid, only exists to be used together with the (invalid) .json | invalid, missing the `"TermURL"` attribute for identifiers | fail |
| 7 | has fewer columns than are annotated in `.json` | same as example 1 | fail |
| 8 | valid, based on ex2 has multiple participant_id columns | valid, based on ex2 multiple participant_id column annotations | fail* |
| 9 | invalid, based on example 6 but contains an unannotated value for `group` | valid, based on example 6 | fail |
| 10 | valid, same as example 6 | valid, based on example 6 but contains extra `"MissingValues"` not found in the .tsv | pass, with warning |
| 11 | invalid, ex 6 with missing entries in `participant_id` and `session_id` columns | valid, based on example 6 | fail |
| 12 | Valid, same as example 2 | Valid, based on example 2 but missing BIDS "Levels" attribute for group column | Pass, with warning |
| 13 | Valid, same as example_synthetic | Valid, based on example_synthetic but with mismatched levels for group column | Pass, with warning |
| 14 | Valid, same as example 2 | Valid, based on example 2, but with an extra column annotation without Neurobagel | Pass |
| 15 | Valid, same as example 2 | Invalid, based on example 2, but participant ID column lacks Neurobagel annotations | Fail |
| 16 | Invalid, same as example2.csv, but with a sneaky .tsv file ending | Valid, same as example2 | fail |
surchs marked this conversation as resolved.
Show resolved Hide resolved

`* this is expected to fail until we enable multiple participant_ID handling`.

Expand Down
81 changes: 81 additions & 0 deletions bagel/tests/data/example16.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
{
"participant_id": {
"Description": "A participant ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:ParticipantID",
"Label": "Unique participant identifier"
},
"Identifies": "participant"
}
},
"session_id": {
"Description": "A session ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:SessionID",
"Label": "Unique session identifier"
},
"Identifies": "session"
}
},
"group": {
"Description": "Group variable",
"Levels": {
"PAT": "Patient",
"CTRL": "Control subject"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Diagnosis",
"Label": "Diagnosis"
},
"Levels": {
"PAT": {
"TermURL": "snomed:49049000",
"Label": "Parkinson's disease"
},
"CTRL": {
"TermURL": "ncit:C94342",
"Label": "Healthy Control"
}
}
}
},
"sex": {
"Description": "Sex variable",
"Levels": {
"M": "Male",
"F": "Female"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Sex",
"Label": "Sex"
},
"Levels": {
"M": {
"TermURL": "snomed:248153007",
"Label": "Male"
},
"F": {
"TermURL": "snomed:248152002",
"Label": "Female"
}
}
}
},
"participant_age": {
"Description": "Age of the participant",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Age",
"Label": "Chronological age"
},
"Transformation": {
"TermURL": "nb:iso8601",
"Label": "A period of time defined according to the ISO8601 standard"
}
}
}
}
5 changes: 5 additions & 0 deletions bagel/tests/data/example16.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
participant_id,session_id,group,sex,participant_age
sub-01,ses-01,PAT,M,"P20Y6M"
sub-01,ses-02,PAT,M,"P20Y8M"
sub-02,ses-01,CTRL,F,"P25Y8M"
sub-02,ses-02,CTRL,F,"P26Y4M"
5 changes: 5 additions & 0 deletions bagel/tests/data/example2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
participant_id,session_id,group,sex,participant_age
sub-01,ses-01,PAT,M,"P20Y6M"
sub-01,ses-02,PAT,M,"P20Y8M"
sub-02,ses-01,CTRL,F,"P25Y8M"
sub-02,ses-02,CTRL,F,"P26Y4M"
5 changes: 5 additions & 0 deletions bagel/tests/data/example2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
participant_id,session_id,group,sex,participant_age
sub-01,ses-01,PAT,M,"P20Y6M"
sub-01,ses-02,PAT,M,"P20Y8M"
sub-02,ses-01,CTRL,F,"P25Y8M"
sub-02,ses-02,CTRL,F,"P26Y4M"
36 changes: 36 additions & 0 deletions bagel/tests/test_cli_pheno.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,42 @@ def test_unused_missing_values_raises_warning(
assert warn_substring in str(w[0].message.args[0])


@pytest.mark.parametrize(
"pheno_file,dictionary_file",
[
("example2.csv", "example2.json"),
("example16.tsv", "example16.json"),
("example2.txt", "example2.json"),
],
)
def test_providing_csv_file_raises_error(
pheno_file,
dictionary_file,
runner,
test_data,
tmp_path,
):
"""Providing a .csv file or a file with .tsv extension but incorrect encoding should be handled with an informative error."""
with pytest.raises(ValueError) as e:
runner.invoke(
bagel,
[
"pheno",
"--pheno",
test_data / pheno_file,
"--dictionary",
test_data / dictionary_file,
"--output",
tmp_path,
"--name",
"testing dataset",
],
catch_exceptions=False,
)

assert "Please provide a valid .tsv pheno file" in str(e.value)


def test_that_output_file_contains_dataset_level_attributes(
runner, test_data, tmp_path, load_test_json
):
Expand Down