Skip to content

Commit 9c81680

Browse files
authored
Merge pull request #388 from broadinstitute/jlc_fail_missing_raw
Fail AnnData ingest if expected raw data is missing (SCP-5956)
2 parents 17f7483 + 15e2ec8 commit 9c81680

File tree

4 files changed

+115
-33
lines changed

4 files changed

+115
-33
lines changed

ingest/anndata_.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def __init__(self, file_path, study_file_id, study_id, **kwargs):
3636
IngestFiles.__init__(
3737
self, file_path, allowed_file_types=self.ALLOWED_FILE_TYPES
3838
)
39+
self.kwargs = kwargs
3940

4041
def obtain_adata(self):
4142
try:
@@ -58,6 +59,36 @@ def basic_validation(self):
5859
except ValueError:
5960
return False
6061

62+
def validate_raw_location(self):
63+
"""
64+
Confirm file has data at raw_location
65+
"""
66+
adata = self.obtain_adata()
67+
raw_location = self.kwargs.get("raw_location")
68+
if raw_location is not None:
69+
try:
70+
if raw_location == ".raw":
71+
if adata.raw is None:
72+
msg = f'No data found in .raw slot'
73+
log_exception(
74+
IngestFiles.dev_logger, IngestFiles.user_logger, msg
75+
)
76+
raise ValueError(msg)
77+
else:
78+
if raw_location not in adata.layers.keys():
79+
msg = f'No data found at adata.layers["{raw_location}"]'
80+
log_exception(
81+
IngestFiles.dev_logger, IngestFiles.user_logger, msg
82+
)
83+
raise ValueError(msg)
84+
return True
85+
except ValueError:
86+
return False
87+
else:
88+
msg = 'Must specify location of raw counts in AnnData object'
89+
log_exception(IngestFiles.dev_logger, IngestFiles.user_logger, msg)
90+
return False
91+
6192
def create_cell_data_arrays(self):
6293
"""Extract cell name DataArray documents for raw data"""
6394
adata = self.obtain_adata()

ingest/cli_parser.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -280,13 +280,6 @@ def create_parser():
280280
help="Accepted values: 'pairwise' or 'rest' (default)",
281281
)
282282

283-
parser_differential_expression.add_argument(
284-
"--raw-location",
285-
required=True,
286-
help="location of raw counts. '.raw' for raw slot, "
287-
"else adata.layers key value",
288-
)
289-
290283
parser_differential_expression.add_argument(
291284
"--study-accession",
292285
required=True,
@@ -457,6 +450,12 @@ def create_parser():
457450
help="Array of obsm key(s) to extract as cluster files",
458451
)
459452

453+
parser_anndata.add_argument(
454+
"--raw-location",
455+
help="location of raw counts. '.raw' for raw slot, "
456+
"else adata.layers key value or None if no raw counts",
457+
)
458+
460459
parser_anndata.add_argument(
461460
"--extract",
462461
type=ast.literal_eval,

ingest/ingest_pipeline.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['processed_expression']"
5656
5757
# Ingest AnnData - happy path raw count cell name only extraction
58-
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['raw_counts']"
58+
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['raw_counts']" --raw-location ".raw"
5959
6060
# Ingest AnnData - happy path cluster and metadata extraction
6161
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['cluster', 'metadata']" --obsm-keys "['X_umap','X_tsne']"
@@ -80,6 +80,7 @@
8080
8181
# Pairwise differential expression analysis (h5ad matrix, raw count in raw slot)
8282
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --raw-location '.raw' --annotation-name cell_type__ontology_label --de-type pairwise --group1 "mature B cell" --group2 "plasma cell" --annotation-type group --annotation-scope study --annotation-file ../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz --cluster-file ../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz --cluster-name umap --matrix-file-path ../tests/data/anndata/compliant_liver.h5ad --matrix-file-type h5ad --study-accession SCPdev --differential-expression
83+
8384
"""
8485

8586
import json
@@ -561,7 +562,11 @@ def extract_from_anndata(self):
561562
if self.kwargs.get('extract') and "raw_counts" in self.kwargs.get(
562563
'extract'
563564
):
564-
self.anndata.ingest_raw_cells()
565+
if self.anndata.validate_raw_location():
566+
self.anndata.ingest_raw_cells()
567+
else:
568+
self.report_validation("failure")
569+
return 1
565570
self.report_validation("success")
566571
return 0
567572
# scanpy unable to open AnnData file

tests/test_anndata.py

Lines changed: 71 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
""" test_anndata.py
2-
verify basic AnnData validation works as expected
1+
"""test_anndata.py
2+
verify basic AnnData validation works as expected
33
"""
44

55
import unittest
@@ -25,6 +25,7 @@ class TestAnnDataIngestor(unittest.TestCase):
2525
def setup_class(self):
2626
filepath_valid = "../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad"
2727
filepath_invalid = "../tests/data/anndata/bad.h5"
28+
filepath_layers = "../tests/data/anndata/compliant_liver_layers_counts.h5ad"
2829
filepath_dup_feature = "../tests/data/anndata/dup_feature.h5ad"
2930
filepath_dup_cell = "../tests/data/anndata/dup_cell.h5ad"
3031
filepath_nan = "../tests/data/anndata/nan_value.h5ad"
@@ -34,6 +35,7 @@ def setup_class(self):
3435
self.study_file_id = "dec0dedfeed0000000000000"
3536
self.valid_args = [filepath_valid, self.study_id, self.study_file_id]
3637
self.invalid_args = [filepath_invalid, self.study_id, self.study_file_id]
38+
self.layers_args = [filepath_layers, self.study_id, self.study_file_id]
3739
self.dup_feature_args = [
3840
filepath_dup_feature,
3941
self.study_id,
@@ -44,7 +46,7 @@ def setup_class(self):
4446
self.synthetic_args = [filepath_synthetic, self.study_id, self.study_file_id]
4547
self.boolean_args = [filepath_boolean, self.study_id, self.study_file_id]
4648
self.cluster_name = 'X_tsne'
47-
self.valid_kwargs = {'obsm_keys': [self.cluster_name]}
49+
self.valid_kwargs = {'obsm_keys': [self.cluster_name], 'raw_location': '.raw'}
4850
self.anndata_ingest = AnnDataIngestor(*self.valid_args, **self.valid_kwargs)
4951
self.cluster_filename = f"h5ad_frag.cluster.{self.cluster_name}.tsv"
5052
self.metadata_filename = "h5ad_frag.metadata.tsv"
@@ -158,7 +160,9 @@ def test_generate_metadata_file(self):
158160
"library_preparation_protocol__ontology_label\n",
159161
]
160162
self.assertEqual(
161-
expected_names, name_line, 'did not get expected headers from metadata body'
163+
expected_names,
164+
name_line,
165+
'did not get expected headers from metadata body',
162166
)
163167
type_line = metadata_body.readline().split("\t")
164168
expected_types = [
@@ -180,43 +184,71 @@ def test_generate_metadata_file(self):
180184
"GROUP\n",
181185
]
182186
self.assertEqual(
183-
expected_types, type_line, 'did not get expected types from metadata body'
187+
expected_types,
188+
type_line,
189+
'did not get expected types from metadata body',
184190
)
185191

186192
def test_generate_metadata_with_boolean(self):
187193
boolean_ingest = AnnDataIngestor(*self.boolean_args, **self.valid_kwargs)
188194
adata = boolean_ingest.obtain_adata()
189195
boolean_filename = "h5ad_frag.metadata_boolean.tsv"
190-
boolean_ingest.generate_metadata_file(
191-
adata, boolean_filename
192-
)
196+
boolean_ingest.generate_metadata_file(adata, boolean_filename)
193197
self.assertEqual(
194-
'bool', adata.obs['is_primary_data'].dtype.name,
195-
'did not correctly get "bool" dtype for "is_primary_data"'
198+
'bool',
199+
adata.obs['is_primary_data'].dtype.name,
200+
'did not correctly get "bool" dtype for "is_primary_data"',
196201
)
197202
compressed_file = boolean_filename + ".gz"
198203
with gzip.open(compressed_file, "rt", encoding="utf-8-sig") as metadata_body:
199204
name_line = metadata_body.readline().split("\t")
200205
expected_headers = [
201-
'NAME', 'donor_id', 'biosample_id', 'sex', 'species', 'species__ontology_label',
202-
'library_preparation_protocol', 'library_preparation_protocol__ontology_label', 'organ',
203-
'organ__ontology_label', 'disease', 'disease__ontology_label', "is_primary_data\n"
206+
'NAME',
207+
'donor_id',
208+
'biosample_id',
209+
'sex',
210+
'species',
211+
'species__ontology_label',
212+
'library_preparation_protocol',
213+
'library_preparation_protocol__ontology_label',
214+
'organ',
215+
'organ__ontology_label',
216+
'disease',
217+
'disease__ontology_label',
218+
"is_primary_data\n",
204219
]
205220
self.assertEqual(
206-
expected_headers, name_line, 'did not get expected headers from metadata body'
221+
expected_headers,
222+
name_line,
223+
'did not get expected headers from metadata body',
207224
)
208225
expected_types = [
209-
'TYPE', 'GROUP', 'GROUP', 'GROUP', 'GROUP', 'GROUP', 'GROUP', 'GROUP', 'GROUP', 'GROUP', 'GROUP',
210-
'GROUP', "GROUP\n"
226+
'TYPE',
227+
'GROUP',
228+
'GROUP',
229+
'GROUP',
230+
'GROUP',
231+
'GROUP',
232+
'GROUP',
233+
'GROUP',
234+
'GROUP',
235+
'GROUP',
236+
'GROUP',
237+
'GROUP',
238+
"GROUP\n",
211239
]
212240
type_line = metadata_body.readline().split("\t")
213241
self.assertEqual(
214-
expected_types, type_line, 'did not get expected types from metadata body'
242+
expected_types,
243+
type_line,
244+
'did not get expected types from metadata body',
215245
)
216246
for line in metadata_body.readlines():
217247
is_primary_data = line.split("\t")[12].strip()
218248
self.assertEqual(
219-
"False", is_primary_data, 'did not correctly read boolean value as string from data'
249+
"False",
250+
is_primary_data,
251+
'did not correctly read boolean value as string from data',
220252
)
221253

222254
def test_gene_id_indexed_generate_processed_matrix(self):
@@ -248,14 +280,16 @@ def test_gene_id_indexed_generate_processed_matrix(self):
248280
filtered_adata.write('indexed_by_gene_id.h5ad')
249281
"""
250282
indexed_by_geneid = AnnDataIngestor(
251-
"../tests/data/anndata/indexed_by_gene_id.h5ad", self.study_id, self.study_file_id
283+
"../tests/data/anndata/indexed_by_gene_id.h5ad",
284+
self.study_id,
285+
self.study_file_id,
252286
)
253287
adata = indexed_by_geneid.obtain_adata()
254288
self.anndata_ingest.generate_processed_matrix(adata)
255289

256-
now = time.time() # current time (ms since epoch)
290+
now = time.time() # current time (ms since epoch)
257291
expected_features_fp = 'h5ad_frag.features.processed.tsv.gz'
258-
mtime = os.path.getmtime(expected_features_fp) # modified time (ms since epoch)
292+
mtime = os.path.getmtime(expected_features_fp) # modified time (ms since epoch)
259293
self.assertTrue(abs(now - mtime) < 1000)
260294

261295
with gzip.open(expected_features_fp, 'rt') as f:
@@ -269,14 +303,18 @@ def test_gene_id_indexed_generate_processed_matrix(self):
269303
def test_check_if_indexed_by_gene_id(self):
270304
# check var.index.name
271305
feature_name = AnnDataIngestor(
272-
"../tests/data/anndata/indexed_by_gene_id.h5ad", self.study_id, self.study_file_id
306+
"../tests/data/anndata/indexed_by_gene_id.h5ad",
307+
self.study_id,
308+
self.study_file_id,
273309
)
274310
adata = feature_name.obtain_adata()
275311
self.assertTrue(feature_name.check_ensembl_index(adata))
276312

277313
# check data inspection
278314
data_inspect = AnnDataIngestor(
279-
"../tests/data/anndata/cellxgene.human_liver_b_cells.h5ad", self.study_id, self.study_file_id
315+
"../tests/data/anndata/cellxgene.human_liver_b_cells.h5ad",
316+
self.study_id,
317+
self.study_file_id,
280318
)
281319
liver_adata = data_inspect.obtain_adata()
282320
self.assertTrue(data_inspect.check_ensembl_index(liver_adata))
@@ -318,8 +356,17 @@ def test_create_raw_cells_arrays(self):
318356
self.assertEqual('h5ad_frag.matrix.raw.mtx.gz Cells', data_array['name'])
319357
self.assertEqual(2638, len(data_array['values']))
320358

321-
322359
def test_ingest_raw_cells(self):
323360
with patch('anndata_.bypass_mongo_writes', return_value=False):
324361
self.anndata_ingest.ingest_raw_cells()
325362
self.assertEqual(1, self.anndata_ingest.models_processed)
363+
364+
def test_validate_raw_location(self):
365+
result = self.anndata_ingest.validate_raw_location()
366+
self.assertTrue(result)
367+
368+
def test_invalid_raw_location(self):
369+
self.invalid_kwargs = {'obsm_keys': [self.cluster_name], 'raw_location': 'foo'}
370+
self.anndata_ingest = AnnDataIngestor(*self.layers_args, **self.invalid_kwargs)
371+
result = self.anndata_ingest.validate_raw_location()
372+
self.assertFalse(result)

0 commit comments

Comments
 (0)