[FIX] Gracefully handle user providing phenotypic file as .csv (#212)
* Set up basic pheno.tsv validation

For now, this only checks the file ending of the pheno file.

* Also ensure that fake .tsv files get flagged

Someone sharing a .tsv file that is really a .csv will also get flagged.

* Use __future__ annotations for Python 3.9 support

Co-authored-by: Alyssa Dai <[email protected]>

* Disallow all file endings other than .tsv

* Saner error handling and clearer descriptions

Co-authored-by: Alyssa Dai <[email protected]>

* Error out on single-column input

* Make new output API happy

---------

Co-authored-by: Alyssa Dai <[email protected]>
surchs and alyssadai authored Sep 26, 2023
1 parent 4434745 commit 3ae6fcc
Showing 8 changed files with 189 additions and 20 deletions.
3 changes: 1 addition & 2 deletions bagel/cli.py
@@ -1,7 +1,6 @@
import json
from pathlib import Path

import pandas as pd
import typer
from bids import BIDSLayout
from pydantic import ValidationError
@@ -69,7 +68,7 @@ def pheno(
check_overwrite(output, overwrite)

data_dictionary = load_json(dictionary)
pheno_df = pd.read_csv(pheno, sep="\t", keep_default_na=False, dtype=str)
pheno_df = putil.load_pheno(pheno)
putil.validate_inputs(data_dictionary, pheno_df)

# Display validated input paths to user
35 changes: 35 additions & 0 deletions bagel/pheno_utils.py
@@ -1,11 +1,15 @@
from __future__ import annotations

import warnings
from collections import defaultdict
from pathlib import Path
from typing import Optional, Union

import isodate
import jsonschema
import pandas as pd
import pydantic
from pandas import DataFrame
from typer import BadParameter

from bagel import dictionary_models, mappings, models
@@ -35,6 +39,37 @@ def validate_portal_uri(portal: str) -> Optional[str]:
return portal


def load_pheno(input_p: Path) -> pd.DataFrame | None:
"""Load a .tsv pheno file and do some basic validation."""
if input_p.suffix == ".tsv":
pheno_df: DataFrame = pd.read_csv(
input_p, sep="\t", keep_default_na=False, dtype=str
)

if pheno_df.shape[1] > 1:
return pheno_df

# If we have only one column, but splitting by ',' gives us several elements
# then there is a good chance the user accidentally renamed a .csv into .tsv
# and we should give them some extra info with our error message to fix this.
note_misnamed_csv = (
"Note that your phenotypic input file also looks like a .csv file "
"as it contains several ',' commas. It is possible that "
"you have accidentally renamed a .csv file as a .tsv."
)
raise ValueError(
f"Your phenotypic input file {input_p} has only one column "
f"and is therefore not valid as a Neurobagel phenotypic file. "
" Please provide a valid .tsv pheno file!"
f"\n\n{note_misnamed_csv if len(pheno_df.columns[0].split(',')) > 1 else ''}"
)

raise ValueError(
f"Your ({input_p}) is not a .tsv file."
" Please provide a valid .tsv pheno file!"
)


def generate_context():
# Direct copy of the dandi-schema context generation function
# https://github.com/dandi/dandi-schema/blob/c616d87eaae8869770df0cb5405c24afdb9db096/dandischema/metadata.py
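A quick usage sketch of the new helper (not part of the commit), assuming `pheno_utils` is importable from the `bagel` package as the `putil` alias in `cli.py` suggests; the file paths below are illustrative:

```python
from pathlib import Path

from bagel import pheno_utils as putil

# A genuinely tab-separated file parses into several columns and is returned as a DataFrame.
df = putil.load_pheno(Path("participants.tsv"))

# Any extension other than .tsv is rejected outright.
try:
    putil.load_pheno(Path("participants.csv"))
except ValueError as err:
    print(err)  # "... is not a .tsv file. Please provide a valid .tsv pheno file!"

# A comma-separated file renamed to .tsv parses into a single column, so the
# single-column error (including the misnamed-.csv hint) is raised instead.
try:
    putil.load_pheno(Path("renamed_csv.tsv"))
except ValueError as err:
    print(err)  # says the file has only one column and looks like a .csv
```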
37 changes: 19 additions & 18 deletions bagel/tests/data/README.md
@@ -2,24 +2,25 @@

Example inputs to the CLI

| Example name | `.tsv` | `.json` | Expect |
| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------ | ------------------ |
| 1 | invalid, non-unique combinations of `participant` and `session` IDs | valid, has `IsAbout` annotations for `participant` and `session` ID columns | fail |
| 2 | valid, unique `participant` and `session` IDs | same as example 1 | pass |
| 3 | same as example 2 | valid BIDS data dictionary, BUT: does not contain Neurobagel `"Annotations"` key | fail |
| 4 | valid, has additional columns not described in `.json` | same as example 1 | pass |
| 5 | valid, has additional unique value, not documented in `.json` | same as example 1 | fail |
| 6 | valid, same as example 5. has annotation tool columns | valid, contains `"MissingValues"` attribute for categorical variable | pass |
| invalid | valid, only exists to be used together with the (invalid) .json | invalid, missing the `"TermURL"` attribute for identifiers | fail |
| 7 | has fewer columns than are annotated in `.json` | same as example 1 | fail |
| 8 | valid, based on ex2 has multiple participant_id columns | valid, based on ex2 multiple participant_id column annotations | fail* |
| 9 | invalid, based on example 6 but contains an unannotated value for `group` | valid, based on example 6 | fail |
| 10 | valid, same as example 6 | valid, based on example 6 but contains extra `"MissingValues"` not found in the .tsv | pass, with warning |
| 11 | invalid, ex 6 with missing entries in `participant_id` and `session_id` columns | valid, based on example 6 | fail |
| 12 | Valid, same as example 2 | Valid, based on example 2 but missing BIDS "Levels" attribute for group column | Pass, with warning |
| 13 | Valid, same as example_synthetic | Valid, based on example_synthetic but with mismatched levels for group column | Pass, with warning |
| 14 | Valid, same as example 2 | Valid, based on example 2, but with an extra column annotation without Neurobagel | Pass |
| 15 | Valid, same as example 2 | Invalid, based on example 2, but participant ID column lacks Neurobagel annotations | Fail |
| Example name | `.tsv` | `.json` | Expect |
| ------------ | ------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------ | ------------------ |
| 1 | invalid, non-unique combinations of `participant` and `session` IDs | valid, has `IsAbout` annotations for `participant` and `session` ID columns | fail |
| 2 | valid, unique `participant` and `session` IDs | same as example 1 | pass |
| 3 | same as example 2 | valid BIDS data dictionary, BUT: does not contain Neurobagel `"Annotations"` key | fail |
| 4 | valid, has additional columns not described in `.json` | same as example 1 | pass |
| 5 | valid, has additional unique value, not documented in `.json` | same as example 1 | fail |
| 6 | valid, same as example 5. has annotation tool columns | valid, contains `"MissingValues"` attribute for categorical variable | pass |
| invalid | valid, only exists to be used together with the (invalid) .json | invalid, missing the `"TermURL"` attribute for identifiers | fail |
| 7 | has fewer columns than are annotated in `.json` | same as example 1 | fail |
| 8 | valid, based on ex2 has multiple participant_id columns | valid, based on ex2 multiple participant_id column annotations | fail* |
| 9 | invalid, based on example 6 but contains an unannotated value for `group` | valid, based on example 6 | fail |
| 10 | valid, same as example 6 | valid, based on example 6 but contains extra `"MissingValues"` not found in the .tsv | pass, with warning |
| 11 | invalid, ex 6 with missing entries in `participant_id` and `session_id` columns | valid, based on example 6 | fail |
| 12 | Valid, same as example 2 | Valid, based on example 2 but missing BIDS "Levels" attribute for group column | Pass, with warning |
| 13 | Valid, same as example_synthetic | Valid, based on example_synthetic but with mismatched levels for group column | Pass, with warning |
| 14 | Valid, same as example 2 | Valid, based on example 2, but with an extra column annotation without Neurobagel | Pass |
| 15 | Valid, same as example 2 | Invalid, based on example 2, but participant ID column lacks Neurobagel annotations | Fail |
| 16 | Invalid, same as example2.csv, but with a sneaky .tsv file ending | Valid, same as example2 | fail |

`* this is expected to fail until we enable multiple participant_ID handling`.

81 changes: 81 additions & 0 deletions bagel/tests/data/example16.json
@@ -0,0 +1,81 @@
{
"participant_id": {
"Description": "A participant ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:ParticipantID",
"Label": "Unique participant identifier"
},
"Identifies": "participant"
}
},
"session_id": {
"Description": "A session ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:SessionID",
"Label": "Unique session identifier"
},
"Identifies": "session"
}
},
"group": {
"Description": "Group variable",
"Levels": {
"PAT": "Patient",
"CTRL": "Control subject"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Diagnosis",
"Label": "Diagnosis"
},
"Levels": {
"PAT": {
"TermURL": "snomed:49049000",
"Label": "Parkinson's disease"
},
"CTRL": {
"TermURL": "ncit:C94342",
"Label": "Healthy Control"
}
}
}
},
"sex": {
"Description": "Sex variable",
"Levels": {
"M": "Male",
"F": "Female"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Sex",
"Label": "Sex"
},
"Levels": {
"M": {
"TermURL": "snomed:248153007",
"Label": "Male"
},
"F": {
"TermURL": "snomed:248152002",
"Label": "Female"
}
}
}
},
"participant_age": {
"Description": "Age of the participant",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Age",
"Label": "Chronological age"
},
"Transformation": {
"TermURL": "nb:iso8601",
"Label": "A period of time defined according to the ISO8601 standard"
}
}
}
}
5 changes: 5 additions & 0 deletions bagel/tests/data/example16.tsv
@@ -0,0 +1,5 @@
participant_id,session_id,group,sex,participant_age
sub-01,ses-01,PAT,M,"P20Y6M"
sub-01,ses-02,PAT,M,"P20Y8M"
sub-02,ses-01,CTRL,F,"P25Y8M"
sub-02,ses-02,CTRL,F,"P26Y4M"
5 changes: 5 additions & 0 deletions bagel/tests/data/example2.csv
@@ -0,0 +1,5 @@
participant_id,session_id,group,sex,participant_age
sub-01,ses-01,PAT,M,"P20Y6M"
sub-01,ses-02,PAT,M,"P20Y8M"
sub-02,ses-01,CTRL,F,"P25Y8M"
sub-02,ses-02,CTRL,F,"P26Y4M"
5 changes: 5 additions & 0 deletions bagel/tests/data/example2.txt
@@ -0,0 +1,5 @@
participant_id,session_id,group,sex,participant_age
sub-01,ses-01,PAT,M,"P20Y6M"
sub-01,ses-02,PAT,M,"P20Y8M"
sub-02,ses-01,CTRL,F,"P25Y8M"
sub-02,ses-02,CTRL,F,"P26Y4M"
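All three fixtures above contain the same comma-separated rows despite their different extensions. A short sketch of why the comma heuristic in `load_pheno` catches the misnamed `.tsv` (assuming pandas is installed and the fixture path from the diff):

```python
import pandas as pd

# Reading a comma-separated file with a tab separator collapses everything into
# a single column whose name is the entire header row.
df = pd.read_csv(
    "bagel/tests/data/example16.tsv", sep="\t", keep_default_na=False, dtype=str
)

print(df.shape[1])                    # 1 -> fails the "more than one column" check
print(df.columns[0])                  # participant_id,session_id,group,sex,participant_age
print(len(df.columns[0].split(",")))  # 5 -> triggers the "looks like a .csv" hint
```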
38 changes: 38 additions & 0 deletions bagel/tests/test_cli_pheno.py
Expand Up @@ -272,6 +272,44 @@ def test_unused_missing_values_raises_warning(
assert warn_substring in str(w[0].message.args[0])


@pytest.mark.parametrize(
"pheno_file,dictionary_file",
[
("example2.csv", "example2.json"),
("example16.tsv", "example16.json"),
("example2.txt", "example2.json"),
],
)
def test_providing_csv_file_raises_error(
pheno_file,
dictionary_file,
runner,
test_data,
tmp_path,
default_pheno_output_path,
):
"""Providing a .csv file or a file with .tsv extension but incorrect encoding should be handled with an
informative error."""
with pytest.raises(ValueError) as e:
runner.invoke(
bagel,
[
"pheno",
"--pheno",
test_data / pheno_file,
"--dictionary",
test_data / dictionary_file,
"--output",
default_pheno_output_path,
"--name",
"testing dataset",
],
catch_exceptions=False,
)

assert "Please provide a valid .tsv pheno file" in str(e.value)


def test_that_output_file_contains_dataset_level_attributes(
runner, test_data, default_pheno_output_path, load_test_json
):
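In the test above, `catch_exceptions=False` makes the Typer test runner re-raise the `ValueError` so that `pytest.raises` can inspect it directly. A sketch of an alternative (not part of the commit) that leaves exception handling to the runner and asserts on the returned `Result` instead, reusing the fixtures shown above:

```python
def test_csv_input_reported_via_result(
    runner, test_data, default_pheno_output_path
):
    """Sketch only: same invocation, but with the runner's default exception handling."""
    result = runner.invoke(
        bagel,
        [
            "pheno",
            "--pheno",
            test_data / "example2.csv",
            "--dictionary",
            test_data / "example2.json",
            "--output",
            default_pheno_output_path,
            "--name",
            "testing dataset",
        ],
    )

    assert result.exit_code != 0
    assert isinstance(result.exception, ValueError)
    assert "Please provide a valid .tsv pheno file" in str(result.exception)
```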
