Commit 1f0dcf5

Merge pull request #357 from broadinstitute/development
Release 1.33.0
2 parents 49b75e1 + ed710aa

5 files changed: +263 -7 lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -30,3 +30,5 @@ errors.txt
 info.txt
 log.txt
 user_log.txt
+
+ingest/.cas-api-token

ingest/cas.py

Lines changed: 206 additions & 0 deletions
@@ -0,0 +1,206 @@
"""Cell Annotation Service (CAS) ETL for Single Cell Portal

Context: https://github.com/broadinstitute/scp-ingest-pipeline/pull/353
"""

import json
import re

import scanpy as sc

from cellarium.cas import CASClient
from cellarium.cas._io import suppress_stderr
import cellarium.cas.postprocessing.ontology_aware as pp
from cellarium.cas.postprocessing.cell_ontology import CellOntologyCache
from cellarium.cas.postprocessing import insert_cas_ontology_aware_response_into_adata

# Comment out block below unless debugging
# from cellarium.cas.visualization import CASCircularTreePlotUMAPDashApp

with suppress_stderr():
    cl = CellOntologyCache()

input_path = "pbmc_10x_v3_4k.h5ad"
cas_response_output_path = f"{input_path}__cas_response_ontology_aware.json"

def cas_annotate(input_path, output_path):
    """Call CAS for an H5AD, write results; return input AnnData, CAS response
    """
    # TODO (SCP-5715):
    # - Add CAS API token to Google Secrets Manager
    # - Update block below to use GSM
    # - Ensure team removes any local .cas-api-token files and .gitignore entry
    with open(".cas-api-token") as f:
        api_token = f.read().strip()

    cas = CASClient(api_token=api_token)
    print('CASClient')
    print(cas)

    adata = sc.read_h5ad(input_path)

    # Returns summary results that have substantial post-processing support
    cas_ontology_aware_response = cas.annotate_matrix_cell_type_ontology_aware_strategy(
        matrix=adata,
        chunk_size=500,
        feature_ids_column_name='gene_ids',
        feature_names_column_name='index'
    )

    # Returns dataset-specific results.
    # Ontology-aware strategy (above, not this) is preferred.
    # cas_response = cas.annotate_matrix_cell_type_summary_statistics_strategy(
    #     matrix=adata
    # )

    with open(output_path, "w") as f:
        f.write(json.dumps(cas_ontology_aware_response))

    adata.write(f"cas_annotated_before_postprocessing__{input_path}")

    print(f"Wrote CAS response to: {output_path}")

    return [adata, cas_ontology_aware_response]

def make_compliant_for_scp(adata):
    """Shim to make AnnData file compliant with SCP metadata schema

    Only use for demo / development purposes. This is generally commented out
    upstream.
    """
    if 'biosample_id' not in adata.obs:
        adata.obs['biosample_id'] = "sample-1"
        adata.obs['donor_id'] = "donor-1"
        adata.obs['species'] = "NCBITaxon_9606"
        adata.obs['species__ontology_label'] = "Homo sapiens"
        adata.obs['disease'] = "PATO_0000461"
        adata.obs['disease__ontology_label'] = "normal"
        adata.obs['organ'] = "UBERON_0000178"
        adata.obs['organ__ontology_label'] = "blood"
        adata.obs['library_preparation_protocol'] = "EFO_0030059"
        adata.obs['library_preparation_protocol__ontology_label'] = "10x 3' v3"
        adata.obs['sex'] = "female"
    return adata

# Stub for potential later expansion
# def format_as_scp_metadatum(cas_response_path):
#     with open(cas_response_path) as f:
#         cas_response = json.loads(f.read())

#     tsv_rows = []
#     for cas_item in cas_response:
#         cell = cas_item["query_cell_id"]
#         first_match = cas_item["matches"][0]
#         annotation_label = cas_item["matches"][0]

def merge_cas_results(adata, cas_ontology_aware_response):
    """Update AnnData with ontology-aware CAS results JSON
    """
    insert_cas_ontology_aware_response_into_adata(cas_ontology_aware_response, adata, cl)

    # Comment out block below unless debugging
    # DASH_SERVER_PORT = 8050
    # with suppress_stderr():
    #     CASCircularTreePlotUMAPDashApp(
    #         adata,  # the AnnData file
    #         cas_ontology_aware_response,  # CAS response
    #         cluster_label_obs_column="cluster_label",  # (optional) The .obs column name containing cluster labels
    #     ).run(port=DASH_SERVER_PORT, debug=False, jupyter_width="100%")

    # TODO: Consider conditionally using this `single` method if (and only if)
    # AnnData lacks "raw annotations" as returned from e.g. Scanpy or Seurat
    # ("0", "1", "2", etc.)
    # pp.compute_most_granular_top_k_calls_single(
    #     adata=adata,
    #     cl=cl,
    #     min_acceptable_score=0.1,  # minimum acceptable score for a call
    #     top_k=3,  # how many top calls to make?
    #     obs_prefix="cell_type_cas"  # .obs column to write the top-k calls to
    # )

    # Use this only if source AnnData has clusterings
    pp.compute_most_granular_top_k_calls_cluster(
        adata=adata,
        cl=cl,
        min_acceptable_score=0.1,  # minimum acceptable score for a call
        cluster_label_obs_column='cluster_label',  # .obs column containing cluster labels
        top_k=3,  # how many top calls to make?
        obs_prefix='cell_type_cas'  # .obs column to write the top-k calls to
    )

    return adata

def trim_cas_adata(adata):
    """Trim CAS AnnData to only annotation labels; omit IDs, scores

    This ensures the AnnData can trivially initialize a polished SCP study.
    The IDs, scores etc. can be easily repopulated via `merge_cas_results` for
    debugging, convenient fuller AnnData output if desired, etc.
    """
    # "Name" is ontology ID, e.g. CL_0000897. Score is CAS confidence.
    annots_to_omit = ["name", "score"]

    columns = list(adata.obs)
    for to_omit in annots_to_omit:
        for col in columns:
            # E.g. re.match(r'.*_cas_.*name_\d+', 'cell_type_cas_name_1')
            match = re.match(rf".*_cas_.*{to_omit}_\d+", col)
            if match:
                del adata.obs[col]

    return adata

# TODO: Consider porting to SCP Core JS
# def friendlify_cas_adata(adata):
#     """Make CAS annotation names human-friendlier, e.g. "CAS
#     """

#     columns = list(adata.obs)
#     for old_col in columns:
#         if not '_cas_' in old_col:
#             continue
#         # E.g. cell_type_cas_label_42 -> cell_type_(CAS 42)
#         col = re.sub(r'_cas_label_(\d+)', r'_(CAS \1)_', old_col)

#         # E.g. cell_type_(CAS 42) -> cell type (CAS 42)
#         col = col.replace('_', ' ')

#         # E.g. cell type (CAS 42) -> Cell type (CAS 42)
#         col = col[0].upper() + col[1:]

#         # E.g. Cell type (CAS 1) -> Cell type (CAS)
#         col = col.replace('CAS 1)', 'CAS)')

#         adata.obs[col] = adata.obs[old_col]
#         del adata.obs[old_col]

#     return adata

def save_anndata(adata, stem, input_path):
    cas_anndata_output_path = f"{stem}__{input_path}"
    adata.write(cas_anndata_output_path)
    print(f"Wrote AnnData: {cas_anndata_output_path}")

def run(input_path, cas_response_output_path):
    print("Running CAS ingest")
    adata, cas_ontology_aware_response = cas_annotate(input_path, cas_response_output_path)

    # Comment out block below unless debugging
    # adata = sc.read_h5ad(f"cas_annotated_before_postprocessing__{input_path}")
    # with open(cas_response_output_path) as f:
    #     cas_ontology_aware_response = json.loads(f.read())

    adata = merge_cas_results(adata, cas_ontology_aware_response)
    save_anndata(adata, "cas_annotated", input_path)

    adata = trim_cas_adata(adata)
    save_anndata(adata, "cas_annotated_trimmed", input_path)

    # Comment out block below unless debugging or generating demo data
    # TODO: Enable SCP metadata validation exemption for AnnData
    # adata = make_compliant_for_scp(adata)
    # save_anndata(adata, "cas_annotated_trimmed_compliant", input_path)


if __name__ == "__main__":
    run(input_path, cas_response_output_path)
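
The TODO (SCP-5715) in cas_annotate above calls for moving the CAS API token out of the local .cas-api-token file and into Google Secrets Manager. A minimal sketch of that lookup, not part of this commit, assuming the google-cloud-secret-manager client library and hypothetical project and secret names:

from google.cloud import secretmanager

def get_cas_api_token(project_id="my-gcp-project", secret_id="cas-api-token"):
    """Fetch the latest version of the CAS API token from Google Secret Manager"""
    # project_id and secret_id are illustrative placeholders, not real SCP values
    client = secretmanager.SecretManagerServiceClient()
    name = f"projects/{project_id}/secrets/{secret_id}/versions/latest"
    response = client.access_secret_version(request={"name": name})
    return response.payload.data.decode("UTF-8").strip()

With something like this in place, cas_annotate could call get_cas_api_token() instead of reading .cas-api-token, and the .gitignore entry added above would no longer be needed.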

ingest/ingest_pipeline.py

Lines changed: 17 additions & 4 deletions
@@ -280,10 +280,8 @@ def load_subsample(
         annot_name = subsampled_data[1][0]
         annot_type = subsampled_data[1][1]
         sample_size = subsampled_data[2]
-        query = {
-            "study_id": ObjectId(self.study_id),
-            "study_file_id": ObjectId(self.study_file_id),
-        }
+        query = self.get_cluster_query()
+
         # Query mongo for linear_id and 'name' of parent
         # Then return 'name' and 'id' fields from query results
         parent_data = self.db[parent_collection_name].find_one(
@@ -312,6 +310,21 @@ def load_subsample(
             return 1
         return 0

+    def get_cluster_query(self):
+        """Generate MongoDB query to load ClusterGroup to set association IDs when subsampling"""
+        query = {
+            "study_id": ObjectId(self.study_id),
+            "study_file_id": ObjectId(self.study_file_id),
+        }
+
+        # If this is an AnnData file, we need to append the cluster name; otherwise AnnData studies with
+        # multiple clusters will fail subsampling, as the first cluster is always returned from the query
+        file_type = config.get_metric_properties().get_properties().get('fileType')
+        if file_type and file_type == "AnnData":
+            query["name"] = self.kwargs.get("name")
+
+        return query
+
     def upload_metadata_to_bq(self):
         """Uploads metadata to BigQuery"""
         if self.kwargs["validate_convention"] is not None:
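
For context on the new get_cluster_query method: a rough sketch of the two query shapes it produces, using the study and study-file IDs from the test added in tests/test_ingest.py below; the "name" value in the AnnData case is an illustrative placeholder:

from bson.objectid import ObjectId

# Non-AnnData ingest: study and study file IDs are enough to find the ClusterGroup.
cluster_query = {
    "study_id": ObjectId("5d276a50421aa9117c982845"),
    "study_file_id": ObjectId("5dd5ae25421aa910a723a337"),
}

# AnnData ingest: one AnnData study file can back multiple ClusterGroups, so the
# cluster name disambiguates which clustering to subsample.
anndata_cluster_query = {**cluster_query, "name": "umap"}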

requirements.txt

Lines changed: 3 additions & 2 deletions
@@ -13,8 +13,9 @@ colorama==0.4.1
 pymongo==4.6.3
 backoff==1.10.0
 scanpy==1.9.2
-anndata==0.9.1
+anndata==0.8.0
 ftfy==6.2.0
+cellarium-cas[vis]==1.4.11

 # Dev dependencies
 pytest==7.2.1
@@ -38,6 +39,6 @@ opencensus==0.7.6
 opencensus-context==0.1.1
 opencensus-ext-stackdriver==0.7.2
 google-cloud-trace==0.23.0
-sentry-sdk==1.14.0
+sentry-sdk==2.8.0

 # memory-profiler==0.57.0

tests/test_ingest.py

Lines changed: 35 additions & 1 deletion
@@ -32,7 +32,7 @@

 """
 import unittest
-from unittest.mock import patch
+from unittest.mock import patch, MagicMock
 from test_dense import mock_load_r_files
 import os

@@ -652,6 +652,40 @@ def test_subsample(self, mock_load_subsample):
         self.assertEqual(len(status), 1)
         self.assertEqual(status[0], 0)

+    def test_get_cluster_query(self):
+        """When subsampling AnnData files, cluster name should be appended to query"""
+        args = [
+            "--study-id",
+            "5d276a50421aa9117c982845",
+            "--study-file-id",
+            "5dd5ae25421aa910a723a337",
+            "ingest_subsample",
+            "--cluster-file",
+            "../tests/data/good_subsample_cluster.csv",
+            "--name",
+            "custer1",
+            "--cell-metadata-file",
+            "../tests/data/test_cell_metadata.csv",
+            "--subsample",
+        ]
+        parsed_args = create_parser().parse_args(args)
+        validate_arguments(parsed_args)
+        arguments = vars(parsed_args)
+        ingest = IngestPipeline(**arguments)
+
+        mock_metrics = MagicMock()
+        mock_metrics.get_properties.return_value = {"fileType": "AnnData"}
+        with patch("config.get_metric_properties", return_value=mock_metrics):
+            query = ingest.get_cluster_query()
+            expected_keys = ['study_id', 'study_file_id', 'name']
+            self.assertEqual(expected_keys, list(query.keys()))
+
+        mock_metrics.get_properties.return_value = {"fileType": "Cluster"}
+        with patch("config.get_metric_properties", return_value=mock_metrics):
+            query = ingest.get_cluster_query()
+            expected_keys = ['study_id', 'study_file_id']
+            self.assertEqual(expected_keys, list(query.keys()))
+
     @patch("ingest_pipeline.IngestPipeline.load_subsample", return_value=0)
     def test_subsample_no_cell_intersection(self, mock_load_subsample):
         """When cell values in cluster are not present in cell metadata file ingest should fail.
