Skip to content

Commit a412c3b

Browse files
authored
Merge pull request #389 from broadinstitute/development
Release 1.41.0
2 parents b8bcf46 + 9c81680 commit a412c3b

19 files changed

+363
-89
lines changed

ingest/anndata_.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def __init__(self, file_path, study_file_id, study_id, **kwargs):
3636
IngestFiles.__init__(
3737
self, file_path, allowed_file_types=self.ALLOWED_FILE_TYPES
3838
)
39+
self.kwargs = kwargs
3940

4041
def obtain_adata(self):
4142
try:
@@ -58,6 +59,36 @@ def basic_validation(self):
5859
except ValueError:
5960
return False
6061

62+
def validate_raw_location(self):
63+
"""
64+
Confirm file has data at raw_location
65+
"""
66+
adata = self.obtain_adata()
67+
raw_location = self.kwargs.get("raw_location")
68+
if raw_location is not None:
69+
try:
70+
if raw_location == ".raw":
71+
if adata.raw is None:
72+
msg = f'No data found in .raw slot'
73+
log_exception(
74+
IngestFiles.dev_logger, IngestFiles.user_logger, msg
75+
)
76+
raise ValueError(msg)
77+
else:
78+
if raw_location not in adata.layers.keys():
79+
msg = f'No data found at adata.layers["{raw_location}"]'
80+
log_exception(
81+
IngestFiles.dev_logger, IngestFiles.user_logger, msg
82+
)
83+
raise ValueError(msg)
84+
return True
85+
except ValueError:
86+
return False
87+
else:
88+
msg = 'Must specify location of raw counts in AnnData object'
89+
log_exception(IngestFiles.dev_logger, IngestFiles.user_logger, msg)
90+
return False
91+
6192
def create_cell_data_arrays(self):
6293
"""Extract cell name DataArray documents for raw data"""
6394
adata = self.obtain_adata()

ingest/cli_parser.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
"""Helper functions for ingest_pipeline.py
2-
"""
1+
"""Helper functions for ingest_pipeline.py"""
32

43
import argparse
54
import ast
@@ -451,6 +450,12 @@ def create_parser():
451450
help="Array of obsm key(s) to extract as cluster files",
452451
)
453452

453+
parser_anndata.add_argument(
454+
"--raw-location",
455+
help="location of raw counts. '.raw' for raw slot, "
456+
"else adata.layers key value or None if no raw counts",
457+
)
458+
454459
parser_anndata.add_argument(
455460
"--extract",
456461
type=ast.literal_eval,

ingest/de.py

Lines changed: 45 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,7 @@ def write_de_result(adata, group, annotation, rank_key, cluster_name, extra_para
367367
clean_group = DifferentialExpression.sanitize_string(group)
368368
out_file = f'{cluster_name}--{clean_annotation}--{clean_group}--{annot_scope}--{method}.tsv'
369369
DifferentialExpression.de_logger.info(
370-
f"Writing DE output for {clean_group} vs rest"
370+
f"Writing DE output for {clean_group} vs rest"
371371
)
372372
elif de_type == "pairwise":
373373
# rank_genes_groups accepts a list. For SCP pairwise, should be a list with one item
@@ -403,6 +403,7 @@ def run_scanpy_de(
403403
):
404404
method = extra_params.get("method")
405405
de_type = extra_params.get("de_type")
406+
raw_location = extra_params.get("raw_location")
406407

407408
try:
408409
DifferentialExpression.assess_annotation(annotation, metadata, extra_params)
@@ -432,24 +433,50 @@ def run_scanpy_de(
432433
)
433434

434435
if matrix_file_type == "h5ad":
435-
if orig_adata.raw is not None:
436-
adata = AnnData(
437-
# using .copy() for the AnnData components is good practice
438-
# but we won't be using orig_adata for analyses
439-
# choosing to avoid .copy() for memory efficiency
440-
X=orig_adata.raw.X,
441-
obs=orig_adata.obs,
442-
var=orig_adata.var,
443-
)
436+
if raw_location == ".raw":
437+
if orig_adata.raw is not None:
438+
DifferentialExpression.de_logger.info(
439+
f"Performing DE on {raw_location} data"
440+
)
441+
adata = AnnData(
442+
# using .copy() for the AnnData components is good practice
443+
# but we won't be using orig_adata for analyses
444+
# choosing to avoid .copy() for memory efficiency
445+
X=orig_adata.raw.X,
446+
obs=orig_adata.obs,
447+
var=orig_adata.var,
448+
)
449+
else:
450+
msg = f'{matrix_file_path} does not have a .raw attribute'
451+
print(msg)
452+
log_exception(
453+
DifferentialExpression.dev_logger,
454+
DifferentialExpression.de_logger,
455+
msg,
456+
)
457+
raise ValueError(msg)
444458
else:
445-
msg = f'{matrix_file_path} does not have a .raw attribute'
446-
print(msg)
447-
log_exception(
448-
DifferentialExpression.dev_logger,
449-
DifferentialExpression.de_logger,
450-
msg,
451-
)
452-
raise ValueError(msg)
459+
if raw_location in orig_adata.layers.keys():
460+
DifferentialExpression.de_logger.info(
461+
f"Performing DE on adata.layers['{raw_location}'] data"
462+
)
463+
adata = AnnData(
464+
# using .copy() for the AnnData components is good practice
465+
# but we won't be using orig_adata for analyses
466+
# choosing to avoid .copy() for memory efficiency
467+
X=orig_adata.layers[raw_location],
468+
obs=orig_adata.obs,
469+
var=orig_adata.var,
470+
)
471+
else:
472+
msg = f'{matrix_file_path} does not have adata.layers["{raw_location}"]'
473+
print(msg)
474+
log_exception(
475+
DifferentialExpression.dev_logger,
476+
DifferentialExpression.de_logger,
477+
msg,
478+
)
479+
raise ValueError(msg)
453480
# AnnData expects cell x gene; dense and mtx matrices are read as gene x cell, so they require transposition
454481
else:
455482
adata = adata.transpose()

ingest/ingest_pipeline.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['processed_expression']"
5656
5757
# Ingest AnnData - happy path raw count cell name only extraction
58-
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['raw_counts']"
58+
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['raw_counts']" --raw-location ".raw"
5959
6060
# Ingest AnnData - happy path cluster and metadata extraction
6161
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['cluster', 'metadata']" --obsm-keys "['X_umap','X_tsne']"
@@ -66,17 +66,20 @@
6666
# Differential expression analysis (sparse matrix)
6767
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/sparse/sparsemini_matrix.mtx --gene-file ../tests/data/differential_expression/sparse/sparsemini_features.tsv --barcode-file ../tests/data/differential_expression/sparse/sparsemini_barcodes.tsv --matrix-file-type mtx --annotation-file ../tests/data/differential_expression/sparse/sparsemini_metadata.txt --cluster-file ../tests/data/differential_expression/sparse/sparsemini_cluster.txt --cluster-name de_sparse_integration --study-accession SCPsparsemini --differential-expression
6868
69-
# Differential expression analysis (h5ad matrix)
70-
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name louvain --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --matrix-file-type h5ad --annotation-file ../tests/data/anndata/h5ad_frag.metadata.tsv --cluster-file ../tests/data/anndata/h5ad_frag.cluster.X_umap.tsv --cluster-name umap --study-accession SCPdev --differential-expression
69+
# Differential expression analysis (h5ad matrix, raw count in raw slot)
70+
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --raw-location '.raw' --annotation-name cell_type__ontology_label --de-type rest --annotation-type group --annotation-scope study --annotation-file ../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz --cluster-file ../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz --cluster-name umap --matrix-file-path ../tests/data/anndata/compliant_liver.h5ad --matrix-file-type h5ad --study-accession SCPdev --differential-expression
71+
72+
# Differential expression analysis (h5ad matrix, raw count in adata.layers['counts'])
73+
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --raw-location 'counts' --annotation-name cell_type__ontology_label --de-type rest --annotation-type group --annotation-scope study --annotation-file ../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz --cluster-file ../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz --cluster-name umap --matrix-file-path ../tests/data/anndata/compliant_liver_layers_counts.h5ad --matrix-file-type h5ad --study-accession SCPdev --differential-expression
7174
7275
# Pairwise differential expression analysis (dense matrix)
7376
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --de-type pairwise --group1 "['cholinergic neuron']" --group2 "cranial somatomotor neuron" --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/de_dense_matrix.tsv --matrix-file-type dense --annotation-file ../tests/data/differential_expression/de_dense_metadata.tsv --cluster-file ../tests/data/differential_expression/de_dense_cluster.tsv --cluster-name de_integration --study-accession SCPdev --differential-expression
7477
7578
# Pairwise differential expression analysis (sparse matrix)
7679
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --de-type pairwise --group1 "['endothelial cell']" --group2 "smooth muscle cell" --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/sparse/sparsemini_matrix.mtx --gene-file ../tests/data/differential_expression/sparse/sparsemini_features.tsv --barcode-file ../tests/data/differential_expression/sparse/sparsemini_barcodes.tsv --matrix-file-type mtx --annotation-file ../tests/data/differential_expression/sparse/sparsemini_metadata.txt --cluster-file ../tests/data/differential_expression/sparse/sparsemini_cluster.txt --cluster-name de_sparse_integration --study-accession SCPsparsemini --differential-expression
7780
78-
# Pairwise differential expression analysis (h5ad matrix)
79-
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --de-type pairwise --group1 "['mature B cell']" --group2 "plasma cell" --annotation-type group --annotation-scope study --annotation-file ../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz --cluster-file ../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz --cluster-name umap --matrix-file-path ../tests/data/anndata/compliant_liver.h5ad --matrix-file-type h5ad --study-accession SCPdev --differential-expression
81+
# Pairwise differential expression analysis (h5ad matrix, raw count in raw slot)
82+
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --raw-location '.raw' --annotation-name cell_type__ontology_label --de-type pairwise --group1 "mature B cell" --group2 "plasma cell" --annotation-type group --annotation-scope study --annotation-file ../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz --cluster-file ../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz --cluster-name umap --matrix-file-path ../tests/data/anndata/compliant_liver.h5ad --matrix-file-type h5ad --study-accession SCPdev --differential-expression
8083
8184
"""
8285

@@ -559,7 +562,11 @@ def extract_from_anndata(self):
559562
if self.kwargs.get('extract') and "raw_counts" in self.kwargs.get(
560563
'extract'
561564
):
562-
self.anndata.ingest_raw_cells()
565+
if self.anndata.validate_raw_location():
566+
self.anndata.ingest_raw_cells()
567+
else:
568+
self.report_validation("failure")
569+
return 1
563570
self.report_validation("success")
564571
return 0
565572
# scanpy unable to open AnnData file
270 Bytes
Binary file not shown.
621 Bytes
Binary file not shown.
3.88 KB
Binary file not shown.
Binary file not shown.
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1 @@
1-
1738072997 # validation cache key
2-
1+
1742404288 # validation cache key
Binary file not shown.

0 commit comments

Comments
 (0)