[FIX] Gracefully handle user providing phenotypic file as .csv (#212)
* Set up basic pheno.tsv validation

For now, this only checks the file ending of the pheno file.

* Also ensure that fake .tsv files get flagged

Someone sharing a .tsv file that is really a .csv will also get flagged.

* Use __future__ annotations for Python 3.9 support

Co-authored-by: Alyssa Dai <[email protected]>

* Disallow all file endings other than .tsv

* Saner error handling and clearer descriptions

Co-authored-by: Alyssa Dai <[email protected]>

* Error out on single-column input

* Make new output API happy

---------

Co-authored-by: Alyssa Dai <[email protected]>
surchs and alyssadai authored Sep 26, 2023
1 parent 4434745 commit 3ae6fcc
Showing 8 changed files with 189 additions and 20 deletions.
3 changes: 1 addition & 2 deletions bagel/cli.py
@@ -1,7 +1,6 @@
import json
from pathlib import Path

import pandas as pd
import typer
from bids import BIDSLayout
from pydantic import ValidationError
@@ -69,7 +68,7 @@ def pheno(
check_overwrite(output, overwrite)

data_dictionary = load_json(dictionary)
pheno_df = pd.read_csv(pheno, sep="\t", keep_default_na=False, dtype=str)
pheno_df = putil.load_pheno(pheno)
putil.validate_inputs(data_dictionary, pheno_df)

# Display validated input paths to user
35 changes: 35 additions & 0 deletions bagel/pheno_utils.py
@@ -1,11 +1,15 @@
from __future__ import annotations

import warnings
from collections import defaultdict
from pathlib import Path
from typing import Optional, Union

import isodate
import jsonschema
import pandas as pd
import pydantic
from pandas import DataFrame
from typer import BadParameter

from bagel import dictionary_models, mappings, models
@@ -35,6 +39,37 @@ def validate_portal_uri(portal: str) -> Optional[str]:
return portal


def load_pheno(input_p: Path) -> pd.DataFrame | None:
"""Load a .tsv pheno file and do some basic validation."""
if input_p.suffix == ".tsv":
pheno_df: DataFrame = pd.read_csv(
input_p, sep="\t", keep_default_na=False, dtype=str
)

if pheno_df.shape[1] > 1:
return pheno_df

# If we have only one column, but splitting by ',' gives us several elements
# then there is a good chance the user accidentally renamed a .csv into .tsv
# and we should give them some extra info with our error message to fix this.
note_misnamed_csv = (
"Note that your phenotypic input file also looks like a .csv file "
"as it contains several ',' commas. It is possible that "
"you have accidentally renamed a .csv file as a .tsv."
)
raise ValueError(
f"Your phenotypic input file {input_p} has only one column "
f"and is therefore not valid as a Neurobagel phenotypic file. "
" Please provide a valid .tsv pheno file!"
f"\n\n{note_misnamed_csv if len(pheno_df.columns[0].split(',')) > 1 else ''}"
)

raise ValueError(
f"Your ({input_p}) is not a .tsv file."
" Please provide a valid .tsv pheno file!"
)


def generate_context():
# Direct copy of the dandi-schema context generation function
# https://github.com/dandi/dandi-schema/blob/c616d87eaae8869770df0cb5405c24afdb9db096/dandischema/metadata.py
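A quick usage sketch of the new helper (not part of the commit), assuming `pheno_utils` is importable from the `bagel` package as the `putil` alias in `cli.py` suggests; the file paths below are illustrative:

```python
from pathlib import Path

from bagel import pheno_utils as putil

# A genuinely tab-separated file parses into several columns and is returned as a DataFrame.
df = putil.load_pheno(Path("participants.tsv"))

# Any extension other than .tsv is rejected outright.
try:
    putil.load_pheno(Path("participants.csv"))
except ValueError as err:
    print(err)  # "... is not a .tsv file. Please provide a valid .tsv pheno file!"

# A comma-separated file renamed to .tsv parses into a single column, so the
# single-column error (including the misnamed-.csv hint) is raised instead.
try:
    putil.load_pheno(Path("renamed_csv.tsv"))
except ValueError as err:
    print(err)  # says the file has only one column and looks like a .csv
```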
37 changes: 19 additions & 18 deletions bagel/tests/data/README.md
@@ -2,24 +2,25 @@

Example inputs to the CLI

| Example name | `.tsv` | `.json` | Expect |
| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------ | ------------------ |
| 1 | invalid, non-unique combinations of `participant` and `session` IDs | valid, has `IsAbout` annotations for `participant` and `session` ID columns | fail |
| 2 | valid, unique `participant` and `session` IDs | same as example 1 | pass |
| 3 | same as example 2 | valid BIDS data dictionary, BUT: does not contain Neurobagel `"Annotations"` key | fail |
| 4 | valid, has additional columns not described in `.json` | same as example 1 | pass |
| 5 | valid, has additional unique value, not documented in `.json` | same as example 1 | fail |
| 6 | valid, same as example 5. has annotation tool columns | valid, contains `"MissingValues"` attribute for categorical variable | pass |
| invalid | valid, only exists to be used together with the (invalid) .json | invalid, missing the `"TermURL"` attribute for identifiers | fail |
| 7 | has fewer columns than are annotated in `.json` | same as example 1 | fail |
| 8 | valid, based on ex2 has multiple participant_id columns | valid, based on ex2 multiple participant_id column annotations | fail* |
| 9 | invalid, based on example 6 but contains an unannotated value for `group` | valid, based on example 6 | fail |
| 10 | valid, same as example 6 | valid, based on example 6 but contains extra `"MissingValues"` not found in the .tsv | pass, with warning |
| 11 | invalid, ex 6 with missing entries in `participant_id` and `session_id` columns | valid, based on example 6 | fail |
| 12 | Valid, same as example 2 | Valid, based on example 2 but missing BIDS "Levels" attribute for group column | Pass, with warning |
| 13 | Valid, same as example_synthetic | Valid, based on example_synthetic but with mismatched levels for group column | Pass, with warning |
| 14 | Valid, same as example 2 | Valid, based on example 2, but with an extra column annotation without Neurobagel | Pass |
| 15 | Valid, same as example 2 | Invalid, based on example 2, but participant ID column lacks Neurobagel annotations | Fail |
| Example name | `.tsv` | `.json` | Expect |
| ------------ | ------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------ | ------------------ |
| 1 | invalid, non-unique combinations of `participant` and `session` IDs | valid, has `IsAbout` annotations for `participant` and `session` ID columns | fail |
| 2 | valid, unique `participant` and `session` IDs | same as example 1 | pass |
| 3 | same as example 2 | valid BIDS data dictionary, BUT: does not contain Neurobagel `"Annotations"` key | fail |
| 4 | valid, has additional columns not described in `.json` | same as example 1 | pass |
| 5 | valid, has additional unique value, not documented in `.json` | same as example 1 | fail |
| 6 | valid, same as example 5. has annotation tool columns | valid, contains `"MissingValues"` attribute for categorical variable | pass |
| invalid | valid, only exists to be used together with the (invalid) .json | invalid, missing the `"TermURL"` attribute for identifiers | fail |
| 7 | has fewer columns than are annotated in `.json` | same as example 1 | fail |
| 8 | valid, based on ex2 has multiple participant_id columns | valid, based on ex2 multiple participant_id column annotations | fail* |
| 9 | invalid, based on example 6 but contains an unannotated value for `group` | valid, based on example 6 | fail |
| 10 | valid, same as example 6 | valid, based on example 6 but contains extra `"MissingValues"` not found in the .tsv | pass, with warning |
| 11 | invalid, ex 6 with missing entries in `participant_id` and `session_id` columns | valid, based on example 6 | fail |
| 12 | Valid, same as example 2 | Valid, based on example 2 but missing BIDS "Levels" attribute for group column | Pass, with warning |
| 13 | Valid, same as example_synthetic | Valid, based on example_synthetic but with mismatched levels for group column | Pass, with warning |
| 14 | Valid, same as example 2 | Valid, based on example 2, but with an extra column annotation without Neurobagel | Pass |
| 15 | Valid, same as example 2 | Invalid, based on example 2, but participant ID column lacks Neurobagel annotations | Fail |
| 16 | Invalid, same as example2.csv, but with a sneaky .tsv file ending | Valid, same as example2 | fail |

`* this is expected to fail until we enable multiple participant_ID handling`.

81 changes: 81 additions & 0 deletions bagel/tests/data/example16.json
@@ -0,0 +1,81 @@
{
"participant_id": {
"Description": "A participant ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:ParticipantID",
"Label": "Unique participant identifier"
},
"Identifies": "participant"
}
},
"session_id": {
"Description": "A session ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:SessionID",
"Label": "Unique session identifier"
},
"Identifies": "session"
}
},
"group": {
"Description": "Group variable",
"Levels": {
"PAT": "Patient",
"CTRL": "Control subject"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Diagnosis",
"Label": "Diagnosis"
},
"Levels": {
"PAT": {
"TermURL": "snomed:49049000",
"Label": "Parkinson's disease"
},
"CTRL": {
"TermURL": "ncit:C94342",
"Label": "Healthy Control"
}
}
}
},
"sex": {
"Description": "Sex variable",
"Levels": {
"M": "Male",
"F": "Female"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Sex",
"Label": "Sex"
},
"Levels": {
"M": {
"TermURL": "snomed:248153007",
"Label": "Male"
},
"F": {
"TermURL": "snomed:248152002",
"Label": "Female"
}
}
}
},
"participant_age": {
"Description": "Age of the participant",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Age",
"Label": "Chronological age"
},
"Transformation": {
"TermURL": "nb:iso8601",
"Label": "A period of time defined according to the ISO8601 standard"
}
}
}
}
5 changes: 5 additions & 0 deletions bagel/tests/data/example16.tsv
@@ -0,0 +1,5 @@
participant_id,session_id,group,sex,participant_age
sub-01,ses-01,PAT,M,"P20Y6M"
sub-01,ses-02,PAT,M,"P20Y8M"
sub-02,ses-01,CTRL,F,"P25Y8M"
sub-02,ses-02,CTRL,F,"P26Y4M"
5 changes: 5 additions & 0 deletions bagel/tests/data/example2.csv
@@ -0,0 +1,5 @@
participant_id,session_id,group,sex,participant_age
sub-01,ses-01,PAT,M,"P20Y6M"
sub-01,ses-02,PAT,M,"P20Y8M"
sub-02,ses-01,CTRL,F,"P25Y8M"
sub-02,ses-02,CTRL,F,"P26Y4M"
5 changes: 5 additions & 0 deletions bagel/tests/data/example2.txt
@@ -0,0 +1,5 @@
participant_id,session_id,group,sex,participant_age
sub-01,ses-01,PAT,M,"P20Y6M"
sub-01,ses-02,PAT,M,"P20Y8M"
sub-02,ses-01,CTRL,F,"P25Y8M"
sub-02,ses-02,CTRL,F,"P26Y4M"
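All three fixtures above contain the same comma-separated rows despite their different extensions. A short sketch of why the comma heuristic in `load_pheno` catches the misnamed `.tsv` (assuming pandas is installed and the fixture path from the diff):

```python
import pandas as pd

# Reading a comma-separated file with a tab separator collapses everything into
# a single column whose name is the entire header row.
df = pd.read_csv(
    "bagel/tests/data/example16.tsv", sep="\t", keep_default_na=False, dtype=str
)

print(df.shape[1])                    # 1 -> fails the "more than one column" check
print(df.columns[0])                  # participant_id,session_id,group,sex,participant_age
print(len(df.columns[0].split(",")))  # 5 -> triggers the "looks like a .csv" hint
```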
38 changes: 38 additions & 0 deletions bagel/tests/test_cli_pheno.py
Expand Up @@ -272,6 +272,44 @@ def test_unused_missing_values_raises_warning(
assert warn_substring in str(w[0].message.args[0])


@pytest.mark.parametrize(
"pheno_file,dictionary_file",
[
("example2.csv", "example2.json"),
("example16.tsv", "example16.json"),
("example2.txt", "example2.json"),
],
)
def test_providing_csv_file_raises_error(
pheno_file,
dictionary_file,
runner,
test_data,
tmp_path,
default_pheno_output_path,
):
"""Providing a .csv file or a file with .tsv extension but incorrect encoding should be handled with an
informative error."""
with pytest.raises(ValueError) as e:
runner.invoke(
bagel,
[
"pheno",
"--pheno",
test_data / pheno_file,
"--dictionary",
test_data / dictionary_file,
"--output",
default_pheno_output_path,
"--name",
"testing dataset",
],
catch_exceptions=False,
)

assert "Please provide a valid .tsv pheno file" in str(e.value)


def test_that_output_file_contains_dataset_level_attributes(
runner, test_data, default_pheno_output_path, load_test_json
):
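In the test above, `catch_exceptions=False` makes the Typer test runner re-raise the `ValueError` so that `pytest.raises` can inspect it directly. A sketch of an alternative (not part of the commit) that leaves exception handling to the runner and asserts on the returned `Result` instead, reusing the fixtures shown above:

```python
def test_csv_input_reported_via_result(
    runner, test_data, default_pheno_output_path
):
    """Sketch only: same invocation, but with the runner's default exception handling."""
    result = runner.invoke(
        bagel,
        [
            "pheno",
            "--pheno",
            test_data / "example2.csv",
            "--dictionary",
            test_data / "example2.json",
            "--output",
            default_pheno_output_path,
            "--name",
            "testing dataset",
        ],
    )

    assert result.exit_code != 0
    assert isinstance(result.exception, ValueError)
    assert "Please provide a valid .tsv pheno file" in str(result.exception)
```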
