ci: add pre-commit-update and related updates #507

Status: Open. Wants to merge 5 commits into base: main.
Changes from 2 commits
51 changes: 29 additions & 22 deletions .pre-commit-config.yaml
@@ -1,33 +1,40 @@
 repos:
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: "v0.3.4"
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: "v0.9.4"
     hooks:
-      - id: ruff
+      - id: ruff
         exclude: tutorials/nbconverted/
-      - id: ruff-format
+      - id: ruff-format
         exclude: tutorials/nbconverted/
-  - repo: https://github.com/pre-commit/mirrors-prettier
-    rev: "v2.7.1"
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: "v3.1.0"
     hooks:
-      - id: prettier
-  - repo: https://github.com/python-poetry/poetry
-    rev: "1.5.1"
+      - id: prettier
+        exclude: .pre-commit-config.yaml
+  - repo: https://github.com/python-poetry/poetry
+    rev: "2.0.1"
     hooks:
-      - id: poetry-check
-  - repo: https://github.com/rhysd/actionlint
-    rev: v1.6.26
+      - id: poetry-check
+  - repo: https://github.com/rhysd/actionlint
+    rev: v1.7.7
     hooks:
-      - id: actionlint
-  - repo: https://github.com/hadolint/hadolint
-    rev: v2.12.1-beta
+      - id: actionlint
+  - repo: https://github.com/hadolint/hadolint
+    rev: v2.12.0
     hooks:
-      - id: hadolint-docker
-  - repo: https://github.com/tox-dev/pyproject-fmt
-    rev: "2.2.4"
+      - id: hadolint-docker
+  - repo: https://github.com/tox-dev/pyproject-fmt
+    rev: "v2.5.0"
     hooks:
-      - id: pyproject-fmt
-  # validates CITATION.cff file formatting expectations
-  - repo: https://github.com/citation-file-format/cffconvert
+      - id: pyproject-fmt
+  # validates CITATION.cff file formatting expectations
+  - repo: https://github.com/citation-file-format/cffconvert
     rev: b6045d78aac9e02b039703b030588d54d53262ac
     hooks:
-      - id: validate-cff
+      - id: validate-cff
+  # update pre-commit hooks if necessary
+  - repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update
+    rev: v0.6.0
+    hooks:
+      - id: pre-commit-update
+        args: ["--keep", "pre-commit-update", "--keep", "cffconvert"]
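The new pre-commit-update hook rewrites the `rev` pins of the other hooks to their latest releases when it runs; the `--keep` arguments exclude pre-commit-update itself and cffconvert (the latter is pinned to a commit SHA rather than a release tag) from being bumped. Presumably it can also be run on demand with something like `pre-commit run pre-commit-update --all-files`, though that exact invocation is an assumption and not part of this diff.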
23 changes: 11 additions & 12 deletions pycytominer/cyto_utils/cell_locations.py
@@ -171,20 +171,19 @@ def _download_s3(self, uri: str):

         bucket, key = self._parse_s3_path(uri)

-        tmp_file = tempfile.NamedTemporaryFile(
+        with tempfile.NamedTemporaryFile(
             delete=False, suffix=pathlib.Path(key).name
-        )
-
-        self.s3.download_file(bucket, key, tmp_file.name)
+        ) as tmp_file:
+            self.s3.download_file(bucket, key, tmp_file.name)

-        # Check if the downloaded file exists and has a size greater than 0
-        tmp_file_path = pathlib.Path(tmp_file.name)
-        if tmp_file_path.exists() and tmp_file_path.stat().st_size > 0:
-            return tmp_file.name
-        else:
-            raise ValueError(
-                f"Downloaded file '{tmp_file.name}' is empty or does not exist."
-            )
+            # Check if the downloaded file exists and has a size greater than 0
+            tmp_file_path = pathlib.Path(tmp_file.name)
+            if tmp_file_path.exists() and tmp_file_path.stat().st_size > 0:
+                return tmp_file.name
+            else:
+                raise ValueError(
+                    f"Downloaded file '{tmp_file.name}' is empty or does not exist."
+                )

     def _load_metadata(self):
         """Load the metadata into a Pandas DataFrame
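For context, a minimal sketch of the pattern this change adopts (illustrative only, not pycytominer code): the `with` block guarantees the handle is closed as soon as the work finishes, while `delete=False` keeps the file on disk so its path can still be returned to the caller, who is then responsible for removing it.

    import pathlib
    import tempfile

    def fetch_to_tempfile(data: bytes, suffix: str = ".parquet") -> str:
        # handle closes when the block exits; delete=False keeps the file on disk
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            tmp_file.write(data)  # stand-in for s3.download_file(bucket, key, tmp_file.name)
            tmp_file.flush()

            tmp_file_path = pathlib.Path(tmp_file.name)
            if tmp_file_path.exists() and tmp_file_path.stat().st_size > 0:
                return tmp_file.name
            raise ValueError(f"File '{tmp_file.name}' is empty or does not exist.")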
4 changes: 3 additions & 1 deletion pycytominer/cyto_utils/cells.py
@@ -624,7 +624,9 @@ def _compartment_df_generator(

         assert (  # noqa: S101
             n_aggregation_memory_strata > 0
-        ), "Number of strata to pull into memory at once (n_aggregation_memory_strata) must be > 0"
+        ), (
+            "Number of strata to pull into memory at once (n_aggregation_memory_strata) must be > 0"
+        )

         # Obtain data types of all columns of the compartment table
         cols = "*"
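The assertion rewrites in this and the following hunks all follow the same shape, which appears to be the output of the newer ruff-format pinned above: the long message moves into its own parenthesized block instead of trailing the condition's closing parenthesis. A generic sketch of the resulting style (hypothetical names and message):

    n_items = 4  # hypothetical value

    assert (  # condition stays wrapped, as in the hunks in this PR
        n_items > 0
    ), (
        "n_items must be > 0 so that at least one batch can be processed"
    )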
8 changes: 3 additions & 5 deletions pycytominer/cyto_utils/load.py
@@ -38,11 +38,9 @@ def is_path_a_parquet_file(file: Union[str, pathlib.PurePath]) -> bool:
     except FileNotFoundError as e:
         print("load_profiles() didn't find the path.", e, sep="\n")

-    # Check if file path is a parquet file
-    if file.suffix.lower() == ".parquet":
-        return True
-
-    return False
+    # return boolean based on whether
+    # file path is a parquet file
+    return file.suffix.lower() == ".parquet"


 def infer_delim(file: str):
4 changes: 3 additions & 1 deletion pycytominer/cyto_utils/single_cell_ingest_utils.py
@@ -71,7 +71,9 @@ def assert_linking_cols_complete(linking_cols="default", compartments="default")
     diff_column = set(compartments).difference(unique_linking_cols)
     assert (  # noqa: S101
         unique_linking_cols == sorted(compartments)
-    ), f"All compartments must be specified in the linking_cols, {diff_column} is missing"
+    ), (
+        f"All compartments must be specified in the linking_cols, {diff_column} is missing"
+    )


 def provide_linking_cols_feature_name_update(linking_cols="default"):
4 changes: 3 additions & 1 deletion pycytominer/cyto_utils/util.py
@@ -159,7 +159,9 @@ def check_consensus_operation(operation):
     except AssertionError:
         assert (  # noqa: S101
             operation in avail_ops
-        ), f"operation {operation} not supported, select one of {avail_ops} or see aggregate.py"
+        ), (
+            f"operation {operation} not supported, select one of {avail_ops} or see aggregate.py"
+        )

     return operation

4 changes: 3 additions & 1 deletion pycytominer/cyto_utils/write_gct.py
@@ -81,7 +81,9 @@ def write_gct(
         nrow_metadata = feature_metadata.shape[1]
         assert (  # noqa: S101
             "id" in feature_metadata.index.tolist()
-        ), "make sure feature metadata has row named 'id' that stores feature metadata names!"
+        ), (
+            "make sure feature metadata has row named 'id' that stores feature metadata names!"
+        )
         full_df = feature_metadata.merge(
             full_df, how="right", left_index=True, right_index=True
         )
2 changes: 1 addition & 1 deletion pycytominer/operations/noise_removal.py
@@ -60,7 +60,7 @@ def noise_removal(
         # Check if the column exists
         if noise_removal_perturb_groups not in population_df.columns:
             raise ValueError(
-                'f"{perturb} not found. Are you sure it is a ' "metadata column?"
+                'f"{perturb} not found. Are you sure it is a metadata column?'
             )
         # Assign the group info to the specified column
         group_info = population_df[noise_removal_perturb_groups]
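Worth noting (and not changed by this diff): the merged literal still begins with the characters f" inside the single quotes, so it remains a plain string rather than an interpolated f-string, just as it was before the implicit concatenation was collapsed. If interpolation were ever the intent, a hypothetical form inside noise_removal might read:

    raise ValueError(
        f"{noise_removal_perturb_groups} not found. Are you sure it is a metadata column?"
    )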
4 changes: 2 additions & 2 deletions tests/test_aggregate.py
@@ -324,8 +324,8 @@ def test_output_type():
     parquet_df = pd.read_parquet(test_output_file_parquet)

     # check to make sure the files were read in corrrectly as a pd.Dataframe
-    assert type(csv_df) == pd.DataFrame
-    assert type(parquet_df) == pd.DataFrame
+    assert isinstance(csv_df, pd.DataFrame)
+    assert isinstance(parquet_df, pd.DataFrame)

     # check to make sure both dataframes are the same regardless of the output_type
     pd.testing.assert_frame_equal(csv_df, parquet_df)
4 changes: 2 additions & 2 deletions tests/test_annotate.py
@@ -203,8 +203,8 @@ def test_output_type():
     parquet_df = pd.read_parquet(OUTPUT_FILE_PARQUET)

     # check to make sure the files were read in corrrectly as a pd.Dataframe
-    assert type(csv_df) == pd.DataFrame
-    assert type(parquet_df) == pd.DataFrame
+    assert isinstance(csv_df, pd.DataFrame)
+    assert isinstance(parquet_df, pd.DataFrame)

     # check to make sure both dataframes are the same regardless of the output_type
     pd.testing.assert_frame_equal(csv_df, parquet_df)
4 changes: 2 additions & 2 deletions tests/test_consensus.py
@@ -119,8 +119,8 @@ def test_output_type():
     parquet_df = pd.read_parquet(output_test_file_parquet)

     # check to make sure the files were read in corrrectly as a pd.Dataframe
-    assert type(csv_df) == pd.DataFrame
-    assert type(parquet_df) == pd.DataFrame
+    assert isinstance(csv_df, pd.DataFrame)
+    assert isinstance(parquet_df, pd.DataFrame)

     # check to make sure both dataframes are the same regardless of the output_type
     pd.testing.assert_frame_equal(csv_df, parquet_df)
2 changes: 1 addition & 1 deletion tests/test_cyto_utils/conftest.py
@@ -46,7 +46,7 @@ def fixture_metadata_input_file_s3() -> str:
"""
Provide a metadata input file for cell_locations test data
"""
return "s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum.parquet"
return "s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum.csv"


@pytest.fixture(name="single_cell_input_file_s3")
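This fixture now points at the CSV flavor of the same load-data file; presumably this pairs with the CSV-reading branch added to get_metadata_input_dataframe in the test file below, so the non-parquet path gets exercised.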
51 changes: 36 additions & 15 deletions tests/test_cyto_utils/test_cell_locations.py
@@ -13,18 +13,35 @@ def get_metadata_input_dataframe(cell_loc: CellLocation) -> pd.DataFrame:
     from a CellLocation object.
     """

-    return (
-        pd.read_parquet(
-            cell_loc.metadata_input,
-            # set storage options if we have an s3 path
-            storage_options={"anon": True}
-            if isinstance(cell_loc.metadata_input, str)
-            and cell_loc.metadata_input.startswith("s3://")
-            else None,
-        )
-        if isinstance(cell_loc.metadata_input, str)
-        else cell_loc.metadata_input
-    )
+    # return a dataframe if it is already a dataframe
+    if isinstance(cell_loc.metadata_input, pd.DataFrame):
+        return cell_loc.metadata_input
+
+    # try to process a string-based path
+    if isinstance(cell_loc.metadata_input, str):
+        storage_opts = (
+            {"anon": True} if cell_loc.metadata_input.startswith("s3://") else None
+        )
+        return (
+            # read from parquet if we have a parquet object path
+            pd.read_parquet(
+                path=cell_loc.metadata_input,
+                # set storage options if we have an s3 path
+                storage_options=storage_opts,
+            )
+            if cell_loc.metadata_input.endswith(".parquet")
+            # read from csv if we have a csv object path
+            else (
+                pd.read_csv(
+                    filepath_or_buffer=cell_loc.metadata_input,
+                    # set storage options if we have an s3 path
+                    storage_options=storage_opts,
+                )
+            )
+        )
+    else:
+        # otherwise raise an error as we don't have a supported format
+        raise ValueError("Unsupported metadata_input type")


 @pytest.mark.parametrize(
@@ -80,11 +97,15 @@ def test_output_value_correctness(
     cell_loc = cls_cell_loc.add_cell_location()
     metadata_input_dataframe = get_metadata_input_dataframe(cell_loc=cls_cell_loc)

+    # Cast cell_loc columns to the data types of metadata_input_dataframe columns
+    # (observed metadata_site as having different types)
+    for col in metadata_input_dataframe.columns:
+        cell_loc[col] = cell_loc[col].astype(metadata_input_dataframe[col].dtype)
+
     # if we restrict the columns of cell_loc to the ones in metadata_input_dataframe, we should get the same dataframe
-    assert (
-        cell_loc[metadata_input_dataframe.columns]
-        .reset_index(drop=True)
-        .equals(metadata_input_dataframe.reset_index(drop=True))
+    pd.testing.assert_frame_equal(
+        cell_loc[metadata_input_dataframe.columns].reset_index(drop=True),
+        metadata_input_dataframe.reset_index(drop=True),
     )

     # gather an engine from the cell_loc class
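A small illustration (made-up data, not from the test suite) of why pd.testing.assert_frame_equal is a friendlier test assertion than chaining .equals() inside a bare assert: on mismatch it raises with the offending column, dtype, and values, which also motivates the dtype-aligning loop added above.

    import pandas as pd

    left = pd.DataFrame({"Metadata_Site": [1, 2]})
    right = pd.DataFrame({"Metadata_Site": ["1", "2"]})  # same values, different dtype

    print(left.equals(right))  # False, with no hint about what differs
    pd.testing.assert_frame_equal(left, right)  # raises and names the dtype mismatch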
8 changes: 5 additions & 3 deletions tests/test_cyto_utils/test_cells.py
@@ -305,9 +305,11 @@ def test_load_compartment():
         if pd.api.types.is_float(CELLS_DF[colname].dtype)
         # check for columns which are of 'int64' type
         # note: pd.api.types.is_integer sometimes is unable to detect int64
-        or CELLS_DF[colname].dtype == "int64"
-        # avoid recasting the metadata_types
-        and colname not in metadata_types
+        or (
+            CELLS_DF[colname].dtype == "int64"
+            # avoid recasting the metadata_types
+            and colname not in metadata_types
+        )
     }

     # create deep copy of CELLS_DF with manually re-typed float columns as float32
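The added parentheses here make the existing grouping explicit rather than changing it: in Python, `and` binds more tightly than `or`, so the old condition already evaluated as "is_float(...) or (int64 check and not in metadata_types)". A quick standalone refresher:

    A, B, C = True, True, False

    # "and" binds tighter than "or", so A or B and C means A or (B and C)
    assert (A or B and C) == (A or (B and C))
    assert (A or B and C) != ((A or B) and C)  # grouping the other way flips the result here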
2 changes: 1 addition & 1 deletion tests/test_cyto_utils/test_util.py
@@ -76,7 +76,7 @@ def test_check_compartments_not_valid():

 def test_get_default_compartments():
     default_comparments = get_default_compartments()
-    assert ["cells", "cytoplasm", "nuclei"] == default_comparments
+    assert default_comparments == ["cells", "cytoplasm", "nuclei"]


 def test_load_known_metadata_dictionary():
16 changes: 8 additions & 8 deletions tests/test_feature_select.py
@@ -489,8 +489,8 @@ def test_output_type():
     parquet_df = pd.read_parquet(output_test_file_parquet)

     # check to make sure the files were read in corrrectly as a pd.Dataframe
-    assert type(csv_df) == pd.DataFrame
-    assert type(parquet_df) == pd.DataFrame
+    assert isinstance(csv_df, pd.DataFrame)
+    assert isinstance(parquet_df, pd.DataFrame)

     # check to make sure both dataframes are the same regardless of the output_type
     pd.testing.assert_frame_equal(csv_df, parquet_df)
@@ -537,9 +537,9 @@ def test_samples_parameter_in_feature_select():
         )

         # checking if no rows were not removed
-        assert (
-            results6a.shape[0] == data_unique_test_df3.shape[0]
-        ), f"Row counts do not match: {results6a[0]} != {data_unique_test_df3.shape[0]} in operation: {operation}"
+        assert results6a.shape[0] == data_unique_test_df3.shape[0], (
+            f"Row counts do not match: {results6a[0]} != {data_unique_test_df3.shape[0]} in operation: {operation}"
+        )

         # testing multiple operations (continually appends operations)
         concat_operations = all_operations[: operation_idx + 1]
@@ -553,6 +553,6 @@ def test_samples_parameter_in_feature_select():
         )

         # checking if no rows were not removed
-        assert (
-            results6b.shape[0] == data_unique_test_df3.shape[0]
-        ), f"Row counts do not match: {results6a[0]} != {data_unique_test_df3.shape[0]} in operation: {concat_operations}"
+        assert results6b.shape[0] == data_unique_test_df3.shape[0], (
+            f"Row counts do not match: {results6a[0]} != {data_unique_test_df3.shape[0]} in operation: {concat_operations}"
+        )
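Minor observation: both messages interpolate `results6a[0]` even though the second assertion checks `results6b`; that looks like a pre-existing copy-paste artifact that this formatting-only change leaves untouched.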
4 changes: 2 additions & 2 deletions tests/test_normalize.py
@@ -556,8 +556,8 @@ def test_output_type():
     parquet_df = pd.read_parquet(output_test_file_parquet)

     # check to make sure the files were read in corrrectly as a pd.Dataframe
-    assert type(csv_df) == pd.DataFrame
-    assert type(parquet_df) == pd.DataFrame
+    assert isinstance(csv_df, pd.DataFrame)
+    assert isinstance(parquet_df, pd.DataFrame)

     # check to make sure both dataframes are the same regardless of the output_type
     pd.testing.assert_frame_equal(csv_df, parquet_df)