Skip to content

Handle invalid obsm keys with actionable error message (SCP-5496) #390

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions ingest/cli_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,13 @@ def create_parser():
help="Accepted values: 'pairwise' or 'rest' (default)",
)

parser_differential_expression.add_argument(
"--raw-location",
required=True,
help="location of raw counts. '.raw' for raw slot, "
"else adata.layers key value",
)

parser_differential_expression.add_argument(
"--study-accession",
required=True,
Expand Down
2 changes: 1 addition & 1 deletion ingest/de.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,7 @@ def write_de_result(adata, group, annotation, rank_key, cluster_name, extra_para
clean_group = DifferentialExpression.sanitize_string(group)
out_file = f'{cluster_name}--{clean_annotation}--{clean_group}--{annot_scope}--{method}.tsv'
DifferentialExpression.de_logger.info(
f"Writing DE output for {clean_group} vs restq"
f"Writing DE output for {clean_group} vs rest"
)
elif de_type == "pairwise":
# rank_genes_groups accepts a list. For SCP pairwise, should be a list with one item
Expand Down
23 changes: 18 additions & 5 deletions ingest/ingest_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -546,12 +546,25 @@ def extract_from_anndata(self):
if not self.kwargs["obsm_keys"]:
self.kwargs["obsm_keys"] = ["X_tsne"]
# TODO (SCP-5104): perform check for successful extraction or report failure and exit
for key in self.kwargs["obsm_keys"]:
AnnDataIngestor.generate_cluster_header(self.anndata.adata, key)
AnnDataIngestor.generate_cluster_type_declaration(
self.anndata.adata, key
try:
for key in self.kwargs["obsm_keys"]:
AnnDataIngestor.generate_cluster_header(self.anndata.adata, key)
AnnDataIngestor.generate_cluster_type_declaration(
self.anndata.adata, key
)
AnnDataIngestor.generate_cluster_body(self.anndata.adata, key)
except KeyError as e:
msg = f"Unable to extract cluster data from anndata file. Please check the provided obsm key, {e}."
self.report_validation("failure")
log_exception(
IngestPipeline.dev_logger, IngestPipeline.user_logger, msg
)
AnnDataIngestor.generate_cluster_body(self.anndata.adata, key)
return 1
except Exception as e:
log_exception(
IngestPipeline.dev_logger, IngestPipeline.user_logger, e
)
return 1
# process matrix data
### TODO (SCP-5102, SCP-5103): how to associate "raw_count" cells to anndata file
if self.kwargs.get("extract") and "processed_expression" in self.kwargs.get(
Expand Down
11 changes: 11 additions & 0 deletions tests/test_anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,17 @@ def test_set_output_filename(self):
"h5ad_frag.cluster.X_Umap.tsv",
)

def test_invalid_obsm_key(self):
valid_synthetic_anndata = AnnDataIngestor(*self.synthetic_args)
invalid_cluster_name = "foo"
self.assertRaisesRegex(
KeyError,
"foo",
lambda: self.anndata_ingest.generate_cluster_header(
valid_synthetic_anndata.obtain_adata(), invalid_cluster_name
),
)

def test_generate_cluster_header(self):
self.anndata_ingest.generate_cluster_header(
self.anndata_ingest.obtain_adata(), self.cluster_name
Expand Down
85 changes: 41 additions & 44 deletions tests/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
pytest --cov=../ingest/

"""

import unittest
from unittest.mock import patch, MagicMock
from test_dense import mock_load_r_files
Expand All @@ -47,7 +48,7 @@
IngestPipeline,
exit_pipeline,
run_ingest,
get_action_from_args
get_action_from_args,
)
from expression_files.expression_files import GeneExpression

Expand Down Expand Up @@ -102,8 +103,7 @@ def execute_ingest(args, mock_storage_client, mock_storage_blob):
return_value=True,
)
def test_ingest_dense_matrix(self, mock_check_unique_cells):
"""Ingest Pipeline should extract, transform, and load dense matrices
"""
"""Ingest Pipeline should extract, transform, and load dense matrices"""
args = [
"--study-id",
"5d276a50421aa9117c982845",
Expand Down Expand Up @@ -132,8 +132,7 @@ def test_ingest_dense_matrix(self, mock_check_unique_cells):
return_value=True,
)
def test_ingest_local_dense_matrix(self, mock_check_unique_cells):
"""Ingest Pipeline should extract and transform local dense matrices
"""
"""Ingest Pipeline should extract and transform local dense matrices"""

args = [
"--study-id",
Expand Down Expand Up @@ -164,7 +163,7 @@ def test_ingest_local_dense_matrix(self, mock_check_unique_cells):
)
def test_ingest_local_compressed_dense_matrix(self, mock_check_unique_cells):
"""Ingest Pipeline should extract and transform local dense matrices
from compressed file in the same manner as uncompressed file
from compressed file in the same manner as uncompressed file
"""

args = [
Expand All @@ -191,8 +190,7 @@ def test_ingest_local_compressed_dense_matrix(self, mock_check_unique_cells):
self.execute_ingest(args)

def test_empty_dense_file(self):
"""Ingest Pipeline should fail gracefully when an empty file is given
"""
"""Ingest Pipeline should fail gracefully when an empty file is given"""

args = [
"--study-id",
Expand Down Expand Up @@ -222,8 +220,7 @@ def test_empty_dense_file(self):
self.assertEqual(cm.exception.code, 1)

def test_empty_mtx_file(self):
"""Ingest Pipeline should fail gracefully when an empty file is given
"""
"""Ingest Pipeline should fail gracefully when an empty file is given"""

args = [
"--study-id",
Expand Down Expand Up @@ -261,8 +258,7 @@ def test_empty_mtx_file(self):
return_value=True,
)
def test_ingest_mtx_matrix(self, mock_check_unique_cells):
"""Ingest Pipeline should extract and transform MTX matrix bundles
"""
"""Ingest Pipeline should extract and transform MTX matrix bundles"""

args = [
"--study-id",
Expand Down Expand Up @@ -296,8 +292,7 @@ def test_ingest_mtx_matrix(self, mock_check_unique_cells):
return_value=True,
)
def test_ingest_unsorted_mtx_matrix(self, mock_check_unique_cells):
"""Ingest Pipeline should extract and transform unsorted MTX matrix bundles
"""
"""Ingest Pipeline should extract and transform unsorted MTX matrix bundles"""

args = [
"--study-id",
Expand Down Expand Up @@ -331,8 +326,7 @@ def test_ingest_unsorted_mtx_matrix(self, mock_check_unique_cells):
return_value=True,
)
def test_ingest_zipped_mtx_matrix(self, mock_check_unique_cells):
"""Ingest Pipeline should extract and transform MTX matrix bundles
"""
"""Ingest Pipeline should extract and transform MTX matrix bundles"""

args = [
"--study-id",
Expand Down Expand Up @@ -366,8 +360,7 @@ def test_ingest_zipped_mtx_matrix(self, mock_check_unique_cells):
return_value=True,
)
def test_remote_mtx_bundles(self, mock_check_unique_cells):
"""Ingest Pipeline should handle MTX matrix files fetched from bucket
"""
"""Ingest Pipeline should handle MTX matrix files fetched from bucket"""

args = [
"--study-id",
Expand Down Expand Up @@ -402,8 +395,7 @@ def test_remote_mtx_bundles(self, mock_check_unique_cells):
return_value=True,
)
def test_mtx_bundle_argument_validation(self, mock_check_unique_cells):
"""Omitting --gene-file and --barcode-file in MTX ingest should error
"""
"""Omitting --gene-file and --barcode-file in MTX ingest should error"""

args = [
"--study-id",
Expand Down Expand Up @@ -452,8 +444,7 @@ def test_r_file_dense(self, mock_check_unique_cells, mock_load):
self.execute_ingest(args)

def test_good_metadata_file(self):
"""Ingest Pipeline should succeed for properly formatted metadata file
"""
"""Ingest Pipeline should succeed for properly formatted metadata file"""
args = [
"--study-id",
"5d276a50421aa9117c982845",
Expand Down Expand Up @@ -488,7 +479,7 @@ def test_exponential_back_off_expression_file(
self, mock_check_unique_cells, mock_load
):
"""Ingest Pipeline should not succeed if mongo cannot connect after 5
reconnection tries.
reconnection tries.
"""
args = [
"--study-id",
Expand Down Expand Up @@ -522,8 +513,7 @@ def test_exponential_back_off_expression_file(
self.assertEqual(cm.exception.code, 1)

def test_bad_metadata_file(self):
"""Ingest Pipeline should not succeed for misformatted metadata file
"""
"""Ingest Pipeline should not succeed for misformatted metadata file"""

args = [
"--study-id",
Expand Down Expand Up @@ -565,8 +555,7 @@ def test_bad_metadata_file_contains_coordinates(self):
self.assertEqual(cm.exception.code, 1)

def test_good_cluster_file(self):
"""Ingest Pipeline should succeed for properly formatted cluster file
"""
"""Ingest Pipeline should succeed for properly formatted cluster file"""
args = [
"--study-id",
"5d276a50421aa9117c982845",
Expand All @@ -590,8 +579,7 @@ def test_good_cluster_file(self):
self.assertEqual(status[0], None)

def test_bad_cluster_file(self):
"""Ingest Pipeline should fail for misformatted cluster file
"""
"""Ingest Pipeline should fail for misformatted cluster file"""
args = [
"--study-id",
"5d276a50421aa9117c982845",
Expand All @@ -611,8 +599,7 @@ def test_bad_cluster_file(self):
self.assertEqual(cm.exception.code, 1)

def test_bad_cluster_missing_coordinate_file(self):
"""Ingest Pipeline should fail for missing coordinate in cluster file
"""
"""Ingest Pipeline should fail for missing coordinate in cluster file"""
args = [
"--study-id",
"5d276a50421aa9117c982845",
Expand All @@ -633,8 +620,7 @@ def test_bad_cluster_missing_coordinate_file(self):

@patch("ingest_pipeline.IngestPipeline.load_subsample", return_value=0)
def test_subsample(self, mock_load_subsample):
"""When cell values in cluster are present in cell metadata file ingest should succeed.
"""
"""When cell values in cluster are present in cell metadata file ingest should succeed."""
args = [
"--study-id",
"5d276a50421aa9117c982845",
Expand Down Expand Up @@ -689,8 +675,7 @@ def test_get_cluster_query(self):

@patch("ingest_pipeline.IngestPipeline.load_subsample", return_value=0)
def test_subsample_no_cell_intersection(self, mock_load_subsample):
"""When cell values in cluster are not present in cell metadata file ingest should fail.
"""
"""When cell values in cluster are not present in cell metadata file ingest should fail."""
args = [
"--study-id",
"5d276a50421aa9117c982845",
Expand Down Expand Up @@ -737,6 +722,24 @@ def test_extract_cluster_file_from_anndata(self):
except:
print(f"Error while deleting file : {filename}")

def test_invalid_obsm_key_for_anndata(self):
args = [
"--study-id",
"5d276a50421aa9117c982845",
"--study-file-id",
"5dd5ae25421aa910a723a337",
"ingest_anndata",
"--ingest-anndata",
"--extract",
"['cluster']",
"--anndata-file",
"../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad",
"--obsm-keys",
"['foo']",
]
ingest, arguments, status, status_cell_metadata = self.execute_ingest(args)
self.assertEqual(status[0], 1)

def test_extract_metadata_file_from_anndata(self):
args = [
"--study-id",
Expand Down Expand Up @@ -857,16 +860,12 @@ def test_insert_reconnect(self):
client_mock["data_arrays"].insert_many.assert_called_with(docs)

client_mock["data_arrays"].insert_many.side_effect = ValueError("Foo")
self.assertRaises(
Exception, ingest.insert_many, "data_arrays", docs
)
self.assertRaises(Exception, ingest.insert_many, "data_arrays", docs)
client_mock.reset_mock()

# Test exponential back off for auto reconnect
client_mock["data_arrays"].insert_many.side_effect = AutoReconnect
self.assertRaises(
AutoReconnect, ingest.insert_many, "data_arrays", docs
)
self.assertRaises(AutoReconnect, ingest.insert_many, "data_arrays", docs)
self.assertEqual(client_mock["data_arrays"].insert_many.call_count, 3)
client_mock.reset_mock()

Expand All @@ -890,9 +889,7 @@ def raiseError(*args, **kwargs):

# Test exponential back off for BulkWriteError
client_mock["data_arrays"].insert_many.side_effect = raiseError
self.assertRaises(
BulkWriteError, ingest.insert_many, "data_arrays", docs
)
self.assertRaises(BulkWriteError, ingest.insert_many, "data_arrays", docs)
self.assertEqual(client_mock["data_arrays"].insert_many.call_count, 3)


Expand Down