Skip to content

Fail AnnData ingest if expected raw data is missing (SCP-5956) #388

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions ingest/anndata_.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def __init__(self, file_path, study_file_id, study_id, **kwargs):
IngestFiles.__init__(
self, file_path, allowed_file_types=self.ALLOWED_FILE_TYPES
)
self.kwargs = kwargs

def obtain_adata(self):
try:
Expand All @@ -58,6 +59,36 @@ def basic_validation(self):
except ValueError:
return False

def validate_raw_location(self):
    """Confirm the AnnData file has data at the requested raw_location.

    The location is read from ``self.kwargs["raw_location"]``:
    ``".raw"`` refers to the ``adata.raw`` slot; any other value is
    treated as a key of ``adata.layers``.

    Every failure is reported through ``log_exception`` (dev and user
    loggers) before returning.

    Returns:
        bool: True if data exists at the requested location; False if no
        location was specified or nothing is stored there.
    """
    raw_location = self.kwargs.get("raw_location")
    if raw_location is None:
        # Fail fast: no need to open the file when the caller never said
        # where the raw counts live.
        msg = 'Must specify location of raw counts in AnnData object'
    else:
        adata = self.obtain_adata()
        if raw_location == ".raw":
            found = adata.raw is not None
            msg = 'No data found in .raw slot'
        else:
            found = raw_location in adata.layers
            msg = f'No data found at adata.layers["{raw_location}"]'
        if found:
            return True
    # Single failure exit: log once for developers and users, then signal
    # the invalid location to the caller.
    log_exception(IngestFiles.dev_logger, IngestFiles.user_logger, msg)
    return False

def create_cell_data_arrays(self):
"""Extract cell name DataArray documents for raw data"""
adata = self.obtain_adata()
Expand Down
13 changes: 6 additions & 7 deletions ingest/cli_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,13 +280,6 @@ def create_parser():
help="Accepted values: 'pairwise' or 'rest' (default)",
)

parser_differential_expression.add_argument(
"--raw-location",
required=True,
help="location of raw counts. '.raw' for raw slot, "
"else adata.layers key value",
)

parser_differential_expression.add_argument(
"--study-accession",
required=True,
Expand Down Expand Up @@ -457,6 +450,12 @@ def create_parser():
help="Array of obsm key(s) to extract as cluster files",
)

parser_anndata.add_argument(
"--raw-location",
help="location of raw counts. '.raw' for raw slot, "
"else adata.layers key value or None if no raw counts",
)

parser_anndata.add_argument(
"--extract",
type=ast.literal_eval,
Expand Down
9 changes: 7 additions & 2 deletions ingest/ingest_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['processed_expression']"

# Ingest AnnData - happy path raw count cell name only extraction
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['raw_counts']"
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['raw_counts']" --raw-location ".raw"

# Ingest AnnData - happy path cluster and metadata extraction
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['cluster', 'metadata']" --obsm-keys "['X_umap','X_tsne']"
Expand All @@ -80,6 +80,7 @@

# Pairwise differential expression analysis (h5ad matrix, raw count in raw slot)
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --raw-location '.raw' --annotation-name cell_type__ontology_label --de-type pairwise --group1 "mature B cell" --group2 "plasma cell" --annotation-type group --annotation-scope study --annotation-file ../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz --cluster-file ../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz --cluster-name umap --matrix-file-path ../tests/data/anndata/compliant_liver.h5ad --matrix-file-type h5ad --study-accession SCPdev --differential-expression

"""

import json
Expand Down Expand Up @@ -561,7 +562,11 @@ def extract_from_anndata(self):
if self.kwargs.get('extract') and "raw_counts" in self.kwargs.get(
'extract'
):
self.anndata.ingest_raw_cells()
if self.anndata.validate_raw_location():
self.anndata.ingest_raw_cells()
else:
self.report_validation("failure")
return 1
self.report_validation("success")
return 0
# scanpy unable to open AnnData file
Expand Down
95 changes: 71 additions & 24 deletions tests/test_anndata.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
""" test_anndata.py
verify basic AnnData validation works as expected
"""test_anndata.py
verify basic AnnData validation works as expected
"""

import unittest
Expand All @@ -25,6 +25,7 @@ class TestAnnDataIngestor(unittest.TestCase):
def setup_class(self):
filepath_valid = "../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad"
filepath_invalid = "../tests/data/anndata/bad.h5"
filepath_layers = "../tests/data/anndata/compliant_liver_layers_counts.h5ad"
filepath_dup_feature = "../tests/data/anndata/dup_feature.h5ad"
filepath_dup_cell = "../tests/data/anndata/dup_cell.h5ad"
filepath_nan = "../tests/data/anndata/nan_value.h5ad"
Expand All @@ -34,6 +35,7 @@ def setup_class(self):
self.study_file_id = "dec0dedfeed0000000000000"
self.valid_args = [filepath_valid, self.study_id, self.study_file_id]
self.invalid_args = [filepath_invalid, self.study_id, self.study_file_id]
self.layers_args = [filepath_layers, self.study_id, self.study_file_id]
self.dup_feature_args = [
filepath_dup_feature,
self.study_id,
Expand All @@ -44,7 +46,7 @@ def setup_class(self):
self.synthetic_args = [filepath_synthetic, self.study_id, self.study_file_id]
self.boolean_args = [filepath_boolean, self.study_id, self.study_file_id]
self.cluster_name = 'X_tsne'
self.valid_kwargs = {'obsm_keys': [self.cluster_name]}
self.valid_kwargs = {'obsm_keys': [self.cluster_name], 'raw_location': '.raw'}
self.anndata_ingest = AnnDataIngestor(*self.valid_args, **self.valid_kwargs)
self.cluster_filename = f"h5ad_frag.cluster.{self.cluster_name}.tsv"
self.metadata_filename = "h5ad_frag.metadata.tsv"
Expand Down Expand Up @@ -158,7 +160,9 @@ def test_generate_metadata_file(self):
"library_preparation_protocol__ontology_label\n",
]
self.assertEqual(
expected_names, name_line, 'did not get expected headers from metadata body'
expected_names,
name_line,
'did not get expected headers from metadata body',
)
type_line = metadata_body.readline().split("\t")
expected_types = [
Expand All @@ -180,43 +184,71 @@ def test_generate_metadata_file(self):
"GROUP\n",
]
self.assertEqual(
expected_types, type_line, 'did not get expected types from metadata body'
expected_types,
type_line,
'did not get expected types from metadata body',
)

def test_generate_metadata_with_boolean(self):
boolean_ingest = AnnDataIngestor(*self.boolean_args, **self.valid_kwargs)
adata = boolean_ingest.obtain_adata()
boolean_filename = "h5ad_frag.metadata_boolean.tsv"
boolean_ingest.generate_metadata_file(
adata, boolean_filename
)
boolean_ingest.generate_metadata_file(adata, boolean_filename)
self.assertEqual(
'bool', adata.obs['is_primary_data'].dtype.name,
'did not correctly get "bool" dtype for "is_primary_data"'
'bool',
adata.obs['is_primary_data'].dtype.name,
'did not correctly get "bool" dtype for "is_primary_data"',
)
compressed_file = boolean_filename + ".gz"
with gzip.open(compressed_file, "rt", encoding="utf-8-sig") as metadata_body:
name_line = metadata_body.readline().split("\t")
expected_headers = [
'NAME', 'donor_id', 'biosample_id', 'sex', 'species', 'species__ontology_label',
'library_preparation_protocol', 'library_preparation_protocol__ontology_label', 'organ',
'organ__ontology_label', 'disease', 'disease__ontology_label', "is_primary_data\n"
'NAME',
'donor_id',
'biosample_id',
'sex',
'species',
'species__ontology_label',
'library_preparation_protocol',
'library_preparation_protocol__ontology_label',
'organ',
'organ__ontology_label',
'disease',
'disease__ontology_label',
"is_primary_data\n",
]
self.assertEqual(
expected_headers, name_line, 'did not get expected headers from metadata body'
expected_headers,
name_line,
'did not get expected headers from metadata body',
)
expected_types = [
'TYPE', 'GROUP', 'GROUP', 'GROUP', 'GROUP', 'GROUP', 'GROUP', 'GROUP', 'GROUP', 'GROUP', 'GROUP',
'GROUP', "GROUP\n"
'TYPE',
'GROUP',
'GROUP',
'GROUP',
'GROUP',
'GROUP',
'GROUP',
'GROUP',
'GROUP',
'GROUP',
'GROUP',
'GROUP',
"GROUP\n",
]
type_line = metadata_body.readline().split("\t")
self.assertEqual(
expected_types, type_line, 'did not get expected types from metadata body'
expected_types,
type_line,
'did not get expected types from metadata body',
)
for line in metadata_body.readlines():
is_primary_data = line.split("\t")[12].strip()
self.assertEqual(
"False", is_primary_data, 'did not correctly read boolean value as string from data'
"False",
is_primary_data,
'did not correctly read boolean value as string from data',
)

def test_gene_id_indexed_generate_processed_matrix(self):
Expand Down Expand Up @@ -248,14 +280,16 @@ def test_gene_id_indexed_generate_processed_matrix(self):
filtered_adata.write('indexed_by_gene_id.h5ad')
"""
indexed_by_geneid = AnnDataIngestor(
"../tests/data/anndata/indexed_by_gene_id.h5ad", self.study_id, self.study_file_id
"../tests/data/anndata/indexed_by_gene_id.h5ad",
self.study_id,
self.study_file_id,
)
adata = indexed_by_geneid.obtain_adata()
self.anndata_ingest.generate_processed_matrix(adata)

now = time.time() # current time (seconds since epoch)
now = time.time() # current time (seconds since epoch)
expected_features_fp = 'h5ad_frag.features.processed.tsv.gz'
mtime = os.path.getmtime(expected_features_fp) # modified time (seconds since epoch)
mtime = os.path.getmtime(expected_features_fp) # modified time (seconds since epoch)
self.assertTrue(abs(now - mtime) < 1000)

with gzip.open(expected_features_fp, 'rt') as f:
Expand All @@ -269,14 +303,18 @@ def test_gene_id_indexed_generate_processed_matrix(self):
def test_check_if_indexed_by_gene_id(self):
# check var.index.name
feature_name = AnnDataIngestor(
"../tests/data/anndata/indexed_by_gene_id.h5ad", self.study_id, self.study_file_id
"../tests/data/anndata/indexed_by_gene_id.h5ad",
self.study_id,
self.study_file_id,
)
adata = feature_name.obtain_adata()
self.assertTrue(feature_name.check_ensembl_index(adata))

# check data inspection
data_inspect = AnnDataIngestor(
"../tests/data/anndata/cellxgene.human_liver_b_cells.h5ad", self.study_id, self.study_file_id
"../tests/data/anndata/cellxgene.human_liver_b_cells.h5ad",
self.study_id,
self.study_file_id,
)
liver_adata = data_inspect.obtain_adata()
self.assertTrue(data_inspect.check_ensembl_index(liver_adata))
Expand Down Expand Up @@ -318,8 +356,17 @@ def test_create_raw_cells_arrays(self):
self.assertEqual('h5ad_frag.matrix.raw.mtx.gz Cells', data_array['name'])
self.assertEqual(2638, len(data_array['values']))


def test_ingest_raw_cells(self):
    """Ingesting raw cells with bypass_mongo_writes patched to return False processes one model."""
    mongo_bypass_off = patch('anndata_.bypass_mongo_writes', return_value=False)
    with mongo_bypass_off:
        ingestor = self.anndata_ingest
        ingestor.ingest_raw_cells()
        self.assertEqual(1, ingestor.models_processed)

def test_validate_raw_location(self):
    """The shared fixture is built with raw_location '.raw' and must validate."""
    self.assertTrue(self.anndata_ingest.validate_raw_location())

def test_invalid_raw_location(self):
    """A raw_location of 'foo' must be rejected for the layers fixture file."""
    # Use locals rather than rebinding self.anndata_ingest, so the shared
    # fixture name is never shadowed by this test's throwaway ingestor.
    invalid_kwargs = {'obsm_keys': [self.cluster_name], 'raw_location': 'foo'}
    layers_ingest = AnnDataIngestor(*self.layers_args, **invalid_kwargs)
    self.assertFalse(
        layers_ingest.validate_raw_location(),
        'expected validation to fail for raw_location "foo"',
    )