ci: add pre-commit-update and related updates #507

Status: Open. Wants to merge 5 commits into base: main.
Changes from 2 commits
51 changes: 29 additions & 22 deletions .pre-commit-config.yaml
@@ -1,33 +1,40 @@
 repos:
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: "v0.3.4"
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: "v0.9.4"
     hooks:
-      - id: ruff
+      - id: ruff
         exclude: tutorials/nbconverted/
-      - id: ruff-format
+      - id: ruff-format
         exclude: tutorials/nbconverted/
-  - repo: https://github.com/pre-commit/mirrors-prettier
-    rev: "v2.7.1"
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: "v3.1.0"
     hooks:
-      - id: prettier
-  - repo: https://github.com/python-poetry/poetry
-    rev: "1.5.1"
+      - id: prettier
+        exclude: .pre-commit-config.yaml
+  - repo: https://github.com/python-poetry/poetry
+    rev: "2.0.1"
     hooks:
-      - id: poetry-check
-  - repo: https://github.com/rhysd/actionlint
-    rev: v1.6.26
+      - id: poetry-check
+  - repo: https://github.com/rhysd/actionlint
+    rev: v1.7.7
     hooks:
-      - id: actionlint
-  - repo: https://github.com/hadolint/hadolint
-    rev: v2.12.1-beta
+      - id: actionlint
+  - repo: https://github.com/hadolint/hadolint
+    rev: v2.12.0
     hooks:
-      - id: hadolint-docker
-  - repo: https://github.com/tox-dev/pyproject-fmt
-    rev: "2.2.4"
+      - id: hadolint-docker
+  - repo: https://github.com/tox-dev/pyproject-fmt
+    rev: "v2.5.0"
     hooks:
-      - id: pyproject-fmt
-  # validates CITATION.cff file formatting expectations
-  - repo: https://github.com/citation-file-format/cffconvert
+      - id: pyproject-fmt
+  # validates CITATION.cff file formatting expectations
+  - repo: https://github.com/citation-file-format/cffconvert
     rev: b6045d78aac9e02b039703b030588d54d53262ac
     hooks:
-      - id: validate-cff
+      - id: validate-cff
+  # update pre-commit hooks if necessary
+  - repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update
+    rev: v0.6.0
+    hooks:
+      - id: pre-commit-update
+        args: ["--keep", "pre-commit-update", "--keep", "cffconvert"]
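The new pre-commit-update hook rewrites the `rev` pins of the other hooks to their latest releases when it runs; the `--keep` arguments exclude pre-commit-update itself and cffconvert (the latter is pinned to a commit SHA rather than a release tag) from being bumped. Presumably it can also be run on demand with something like `pre-commit run pre-commit-update --all-files`, though that exact invocation is an assumption and not part of this diff.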
23 changes: 11 additions & 12 deletions pycytominer/cyto_utils/cell_locations.py
@@ -171,20 +171,19 @@ def _download_s3(self, uri: str):

         bucket, key = self._parse_s3_path(uri)

-        tmp_file = tempfile.NamedTemporaryFile(
+        with tempfile.NamedTemporaryFile(
             delete=False, suffix=pathlib.Path(key).name
-        )
-
-        self.s3.download_file(bucket, key, tmp_file.name)
+        ) as tmp_file:
+            self.s3.download_file(bucket, key, tmp_file.name)

-        # Check if the downloaded file exists and has a size greater than 0
-        tmp_file_path = pathlib.Path(tmp_file.name)
-        if tmp_file_path.exists() and tmp_file_path.stat().st_size > 0:
-            return tmp_file.name
-        else:
-            raise ValueError(
-                f"Downloaded file '{tmp_file.name}' is empty or does not exist."
-            )
+            # Check if the downloaded file exists and has a size greater than 0
+            tmp_file_path = pathlib.Path(tmp_file.name)
+            if tmp_file_path.exists() and tmp_file_path.stat().st_size > 0:
+                return tmp_file.name
+            else:
+                raise ValueError(
+                    f"Downloaded file '{tmp_file.name}' is empty or does not exist."
+                )

     def _load_metadata(self):
         """Load the metadata into a Pandas DataFrame
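For context, a minimal sketch of the pattern this change adopts (illustrative only, not pycytominer code): the `with` block guarantees the handle is closed as soon as the work finishes, while `delete=False` keeps the file on disk so its path can still be returned to the caller, who is then responsible for removing it.

    import pathlib
    import tempfile

    def fetch_to_tempfile(data: bytes, suffix: str = ".parquet") -> str:
        # handle closes when the block exits; delete=False keeps the file on disk
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            tmp_file.write(data)  # stand-in for s3.download_file(bucket, key, tmp_file.name)
            tmp_file.flush()

            tmp_file_path = pathlib.Path(tmp_file.name)
            if tmp_file_path.exists() and tmp_file_path.stat().st_size > 0:
                return tmp_file.name
            raise ValueError(f"File '{tmp_file.name}' is empty or does not exist.")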
4 changes: 3 additions & 1 deletion pycytominer/cyto_utils/cells.py
@@ -624,7 +624,9 @@ def _compartment_df_generator(

         assert (  # noqa: S101
             n_aggregation_memory_strata > 0
-        ), "Number of strata to pull into memory at once (n_aggregation_memory_strata) must be > 0"
+        ), (
+            "Number of strata to pull into memory at once (n_aggregation_memory_strata) must be > 0"
+        )

         # Obtain data types of all columns of the compartment table
         cols = "*"
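The assertion rewrites in this and the following hunks all follow the same shape, which appears to be the output of the newer ruff-format pinned above: the long message moves into its own parenthesized block instead of trailing the condition's closing parenthesis. A generic sketch of the resulting style (hypothetical names and message):

    n_items = 4  # hypothetical value

    assert (  # condition stays wrapped, as in the hunks in this PR
        n_items > 0
    ), (
        "n_items must be > 0 so that at least one batch can be processed"
    )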
8 changes: 3 additions & 5 deletions pycytominer/cyto_utils/load.py
@@ -38,11 +38,9 @@ def is_path_a_parquet_file(file: Union[str, pathlib.PurePath]) -> bool:
     except FileNotFoundError as e:
         print("load_profiles() didn't find the path.", e, sep="\n")

-    # Check if file path is a parquet file
-    if file.suffix.lower() == ".parquet":
-        return True
-
-    return False
+    # return boolean based on whether
+    # file path is a parquet file
+    return file.suffix.lower() == ".parquet"


 def infer_delim(file: str):
4 changes: 3 additions & 1 deletion pycytominer/cyto_utils/single_cell_ingest_utils.py
@@ -71,7 +71,9 @@ def assert_linking_cols_complete(linking_cols="default", compartments="default")
     diff_column = set(compartments).difference(unique_linking_cols)
     assert (  # noqa: S101
         unique_linking_cols == sorted(compartments)
-    ), f"All compartments must be specified in the linking_cols, {diff_column} is missing"
+    ), (
+        f"All compartments must be specified in the linking_cols, {diff_column} is missing"
+    )


 def provide_linking_cols_feature_name_update(linking_cols="default"):
4 changes: 3 additions & 1 deletion pycytominer/cyto_utils/util.py
@@ -159,7 +159,9 @@ def check_consensus_operation(operation):
     except AssertionError:
         assert (  # noqa: S101
             operation in avail_ops
-        ), f"operation {operation} not supported, select one of {avail_ops} or see aggregate.py"
+        ), (
+            f"operation {operation} not supported, select one of {avail_ops} or see aggregate.py"
+        )

     return operation

4 changes: 3 additions & 1 deletion pycytominer/cyto_utils/write_gct.py
@@ -81,7 +81,9 @@ def write_gct(
         nrow_metadata = feature_metadata.shape[1]
         assert (  # noqa: S101
             "id" in feature_metadata.index.tolist()
-        ), "make sure feature metadata has row named 'id' that stores feature metadata names!"
+        ), (
+            "make sure feature metadata has row named 'id' that stores feature metadata names!"
+        )
         full_df = feature_metadata.merge(
             full_df, how="right", left_index=True, right_index=True
         )
2 changes: 1 addition & 1 deletion pycytominer/operations/noise_removal.py
@@ -60,7 +60,7 @@ def noise_removal(
         # Check if the column exists
         if noise_removal_perturb_groups not in population_df.columns:
             raise ValueError(
-                'f"{perturb} not found. Are you sure it is a ' "metadata column?"
+                'f"{perturb} not found. Are you sure it is a metadata column?'
             )
         # Assign the group info to the specified column
         group_info = population_df[noise_removal_perturb_groups]
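Worth noting (and not changed by this diff): the merged literal still begins with the characters f" inside the single quotes, so it remains a plain string rather than an interpolated f-string, just as it was before the implicit concatenation was collapsed. If interpolation were ever the intent, a hypothetical form inside noise_removal might read:

    raise ValueError(
        f"{noise_removal_perturb_groups} not found. Are you sure it is a metadata column?"
    )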
4 changes: 2 additions & 2 deletions tests/test_aggregate.py
@@ -324,8 +324,8 @@ def test_output_type():
     parquet_df = pd.read_parquet(test_output_file_parquet)

     # check to make sure the files were read in corrrectly as a pd.Dataframe
-    assert type(csv_df) == pd.DataFrame
-    assert type(parquet_df) == pd.DataFrame
+    assert isinstance(csv_df, pd.DataFrame)
+    assert isinstance(parquet_df, pd.DataFrame)

     # check to make sure both dataframes are the same regardless of the output_type
     pd.testing.assert_frame_equal(csv_df, parquet_df)
4 changes: 2 additions & 2 deletions tests/test_annotate.py
@@ -203,8 +203,8 @@ def test_output_type():
     parquet_df = pd.read_parquet(OUTPUT_FILE_PARQUET)

     # check to make sure the files were read in corrrectly as a pd.Dataframe
-    assert type(csv_df) == pd.DataFrame
-    assert type(parquet_df) == pd.DataFrame
+    assert isinstance(csv_df, pd.DataFrame)
+    assert isinstance(parquet_df, pd.DataFrame)

     # check to make sure both dataframes are the same regardless of the output_type
     pd.testing.assert_frame_equal(csv_df, parquet_df)
4 changes: 2 additions & 2 deletions tests/test_consensus.py
@@ -119,8 +119,8 @@ def test_output_type():
     parquet_df = pd.read_parquet(output_test_file_parquet)

     # check to make sure the files were read in corrrectly as a pd.Dataframe
-    assert type(csv_df) == pd.DataFrame
-    assert type(parquet_df) == pd.DataFrame
+    assert isinstance(csv_df, pd.DataFrame)
+    assert isinstance(parquet_df, pd.DataFrame)

     # check to make sure both dataframes are the same regardless of the output_type
     pd.testing.assert_frame_equal(csv_df, parquet_df)
2 changes: 1 addition & 1 deletion tests/test_cyto_utils/conftest.py
@@ -46,7 +46,7 @@ def fixture_metadata_input_file_s3() -> str:
"""
Provide a metadata input file for cell_locations test data
"""
return "s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum.parquet"
return "s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum.csv"


@pytest.fixture(name="single_cell_input_file_s3")
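This fixture now points at the CSV flavor of the same load-data file; presumably this pairs with the CSV-reading branch added to get_metadata_input_dataframe in the test file below, so the non-parquet path gets exercised.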
51 changes: 36 additions & 15 deletions tests/test_cyto_utils/test_cell_locations.py
@@ -13,18 +13,35 @@ def get_metadata_input_dataframe(cell_loc: CellLocation) -> pd.DataFrame:
     from a CellLocation object.
     """

-    return (
-        pd.read_parquet(
-            cell_loc.metadata_input,
-            # set storage options if we have an s3 path
-            storage_options={"anon": True}
-            if isinstance(cell_loc.metadata_input, str)
-            and cell_loc.metadata_input.startswith("s3://")
-            else None,
-        )
-        if isinstance(cell_loc.metadata_input, str)
-        else cell_loc.metadata_input
-    )
+    # return a dataframe if it is already a dataframe
+    if isinstance(cell_loc.metadata_input, pd.DataFrame):
+        return cell_loc.metadata_input
+
+    # try to process a string-based path
+    if isinstance(cell_loc.metadata_input, str):
+        storage_opts = (
+            {"anon": True} if cell_loc.metadata_input.startswith("s3://") else None
+        )
+        return (
+            # read from parquet if we have a parquet object path
+            pd.read_parquet(
+                path=cell_loc.metadata_input,
+                # set storage options if we have an s3 path
+                storage_options=storage_opts,
+            )
+            if cell_loc.metadata_input.endswith(".parquet")
+            # read from csv if we have a csv object path
+            else (
+                pd.read_csv(
+                    filepath_or_buffer=cell_loc.metadata_input,
+                    # set storage options if we have an s3 path
+                    storage_options=storage_opts,
+                )
+            )
+        )
+    else:
+        # otherwise raise an error as we don't have a supported format
+        raise ValueError("Unsupported metadata_input type")


 @pytest.mark.parametrize(
@@ -80,11 +97,15 @@ def test_output_value_correctness(
     cell_loc = cls_cell_loc.add_cell_location()
     metadata_input_dataframe = get_metadata_input_dataframe(cell_loc=cls_cell_loc)

+    # Cast cell_loc columns to the data types of metadata_input_dataframe columns
+    # (observed metadata_site as having different types)
+    for col in metadata_input_dataframe.columns:
+        cell_loc[col] = cell_loc[col].astype(metadata_input_dataframe[col].dtype)
+
     # if we restrict the columns of cell_loc to the ones in metadata_input_dataframe, we should get the same dataframe
-    assert (
-        cell_loc[metadata_input_dataframe.columns]
-        .reset_index(drop=True)
-        .equals(metadata_input_dataframe.reset_index(drop=True))
+    pd.testing.assert_frame_equal(
+        cell_loc[metadata_input_dataframe.columns].reset_index(drop=True),
+        metadata_input_dataframe.reset_index(drop=True),
     )

     # gather an engine from the cell_loc class
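A small illustration (made-up data, not from the test suite) of why pd.testing.assert_frame_equal is a friendlier test assertion than chaining .equals() inside a bare assert: on mismatch it raises with the offending column, dtype, and values, which also motivates the dtype-aligning loop added above.

    import pandas as pd

    left = pd.DataFrame({"Metadata_Site": [1, 2]})
    right = pd.DataFrame({"Metadata_Site": ["1", "2"]})  # same values, different dtype

    print(left.equals(right))  # False, with no hint about what differs
    pd.testing.assert_frame_equal(left, right)  # raises and names the dtype mismatch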
8 changes: 5 additions & 3 deletions tests/test_cyto_utils/test_cells.py
@@ -305,9 +305,11 @@ def test_load_compartment():
         if pd.api.types.is_float(CELLS_DF[colname].dtype)
         # check for columns which are of 'int64' type
         # note: pd.api.types.is_integer sometimes is unable to detect int64
-        or CELLS_DF[colname].dtype == "int64"
-        # avoid recasting the metadata_types
-        and colname not in metadata_types
+        or (
+            CELLS_DF[colname].dtype == "int64"
+            # avoid recasting the metadata_types
+            and colname not in metadata_types
+        )
     }

     # create deep copy of CELLS_DF with manually re-typed float columns as float32
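The added parentheses here make the existing grouping explicit rather than changing it: in Python, `and` binds more tightly than `or`, so the old condition already evaluated as "is_float(...) or (int64 check and not in metadata_types)". A quick standalone refresher:

    A, B, C = True, True, False

    # "and" binds tighter than "or", so A or B and C means A or (B and C)
    assert (A or B and C) == (A or (B and C))
    assert (A or B and C) != ((A or B) and C)  # grouping the other way flips the result here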
2 changes: 1 addition & 1 deletion tests/test_cyto_utils/test_util.py
@@ -76,7 +76,7 @@ def test_check_compartments_not_valid():

 def test_get_default_compartments():
     default_comparments = get_default_compartments()
-    assert ["cells", "cytoplasm", "nuclei"] == default_comparments
+    assert default_comparments == ["cells", "cytoplasm", "nuclei"]


 def test_load_known_metadata_dictionary():
16 changes: 8 additions & 8 deletions tests/test_feature_select.py
@@ -489,8 +489,8 @@ def test_output_type():
     parquet_df = pd.read_parquet(output_test_file_parquet)

     # check to make sure the files were read in corrrectly as a pd.Dataframe
-    assert type(csv_df) == pd.DataFrame
-    assert type(parquet_df) == pd.DataFrame
+    assert isinstance(csv_df, pd.DataFrame)
+    assert isinstance(parquet_df, pd.DataFrame)

     # check to make sure both dataframes are the same regardless of the output_type
     pd.testing.assert_frame_equal(csv_df, parquet_df)
@@ -537,9 +537,9 @@ def test_samples_parameter_in_feature_select():
         )

         # checking if no rows were not removed
-        assert (
-            results6a.shape[0] == data_unique_test_df3.shape[0]
-        ), f"Row counts do not match: {results6a[0]} != {data_unique_test_df3.shape[0]} in operation: {operation}"
+        assert results6a.shape[0] == data_unique_test_df3.shape[0], (
+            f"Row counts do not match: {results6a[0]} != {data_unique_test_df3.shape[0]} in operation: {operation}"
+        )

         # testing multiple operations (continually appends operations)
         concat_operations = all_operations[: operation_idx + 1]
@@ -553,6 +553,6 @@ def test_samples_parameter_in_feature_select():
         )

         # checking if no rows were not removed
-        assert (
-            results6b.shape[0] == data_unique_test_df3.shape[0]
-        ), f"Row counts do not match: {results6a[0]} != {data_unique_test_df3.shape[0]} in operation: {concat_operations}"
+        assert results6b.shape[0] == data_unique_test_df3.shape[0], (
+            f"Row counts do not match: {results6a[0]} != {data_unique_test_df3.shape[0]} in operation: {concat_operations}"
+        )
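Minor observation: both messages interpolate `results6a[0]` even though the second assertion checks `results6b`; that looks like a pre-existing copy-paste artifact that this formatting-only change leaves untouched.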
4 changes: 2 additions & 2 deletions tests/test_normalize.py
@@ -556,8 +556,8 @@ def test_output_type():
     parquet_df = pd.read_parquet(output_test_file_parquet)

     # check to make sure the files were read in corrrectly as a pd.Dataframe
-    assert type(csv_df) == pd.DataFrame
-    assert type(parquet_df) == pd.DataFrame
+    assert isinstance(csv_df, pd.DataFrame)
+    assert isinstance(parquet_df, pd.DataFrame)

     # check to make sure both dataframes are the same regardless of the output_type
     pd.testing.assert_frame_equal(csv_df, parquet_df)