|
| 1 | +"""Cell Annotation Service (CAS) ETL for Single Cell Portal |
| 2 | +
|
| 3 | +Context: https://github.com/broadinstitute/scp-ingest-pipeline/pull/353 |
| 4 | +""" |
| 5 | + |
| 6 | +import json |
| 7 | +import re |
| 8 | + |
| 9 | +import scanpy as sc |
| 10 | + |
| 11 | +from cellarium.cas import CASClient |
| 12 | +from cellarium.cas._io import suppress_stderr |
| 13 | +import cellarium.cas.postprocessing.ontology_aware as pp |
| 14 | +from cellarium.cas.postprocessing.cell_ontology import CellOntologyCache |
| 15 | +from cellarium.cas.postprocessing import insert_cas_ontology_aware_response_into_adata |
| 16 | + |
| 17 | +# Comment out block below below unless debugging |
| 18 | +# from cellarium.cas.visualization import CASCircularTreePlotUMAPDashApp |
| 19 | + |
| 20 | +with suppress_stderr(): |
| 21 | + cl = CellOntologyCache() |
| 22 | + |
| 23 | +input_path = "pbmc_10x_v3_4k.h5ad" |
| 24 | +cas_response_output_path = f"{input_path}__cas_response_ontology_aware.json" |
| 25 | + |
| 26 | +def cas_annotate(input_path, output_path): |
| 27 | + """Call CAS for an H5AD, write results; return input AnnData, CAS response |
| 28 | + """ |
| 29 | + # TODO (SCP-5715): |
| 30 | + # - Add CAS API token to Google Secrets Manager |
| 31 | + # - Update block below to use GSM |
| 32 | + # - Ensure team removes any local .cas-api-token files and .gitignore entry |
| 33 | + with open(".cas-api-token") as f: |
| 34 | + api_token = f.read().strip() |
| 35 | + |
| 36 | + cas = CASClient(api_token=api_token) |
| 37 | + print('CASClient') |
| 38 | + print(cas) |
| 39 | + |
| 40 | + adata = sc.read_h5ad(input_path) |
| 41 | + |
| 42 | + # Returns summary results that have substantial post-processing support |
| 43 | + cas_ontology_aware_response = cas.annotate_matrix_cell_type_ontology_aware_strategy( |
| 44 | + matrix=adata, |
| 45 | + chunk_size=500, |
| 46 | + feature_ids_column_name='gene_ids', |
| 47 | + feature_names_column_name='index' |
| 48 | + ) |
| 49 | + |
| 50 | + # Returns dataset-specific results. |
| 51 | + # Ontology-aware strategy (above, not this) is preferred. |
| 52 | + # cas_response = cas.annotate_matrix_cell_type_summary_statistics_strategy( |
| 53 | + # matrix=adata |
| 54 | + # ) |
| 55 | + |
| 56 | + with open(output_path, "w") as f: |
| 57 | + f.write(json.dumps(cas_ontology_aware_response)) |
| 58 | + |
| 59 | + adata.write(f"cas_annotated_before_postprocessing__{input_path}") |
| 60 | + |
| 61 | + print(f"Wrote CAS response to: {output_path}") |
| 62 | + |
| 63 | + return [adata, cas_ontology_aware_response] |
| 64 | + |
| 65 | +def make_compliant_for_scp(adata): |
| 66 | + """Shim to make AnnData file compliant with SCP metadata schema |
| 67 | +
|
| 68 | + Only use for demo / development purposes. This is generally commented out |
| 69 | + upstream. |
| 70 | + """ |
| 71 | + if 'biosample_id' not in adata.obs: |
| 72 | + adata.obs['biosample_id'] = "sample-1" |
| 73 | + adata.obs['donor_id'] = "donor-1" |
| 74 | + adata.obs['species'] = "NCBITaxon_9606" |
| 75 | + adata.obs['species__ontology_label'] = "Homo sapiens" |
| 76 | + adata.obs['disease'] = "PATO_0000461" |
| 77 | + adata.obs['disease__ontology_label'] = "normal" |
| 78 | + adata.obs['organ'] = "UBERON_0000178" |
| 79 | + adata.obs['organ__ontology_label'] = "blood" |
| 80 | + adata.obs['library_preparation_protocol'] = "EFO_0030059" |
| 81 | + adata.obs['library_preparation_protocol__ontology_label'] = "10x 3' v3" |
| 82 | + adata.obs['sex'] = "female" |
| 83 | + return adata |
| 84 | + |
| 85 | +# Stub for potential later expansion |
| 86 | +# def format_as_scp_metadatum(cas_response_path): |
| 87 | +# with open(cas_response_filepath) as f: |
| 88 | +# cas_response = json.loads(f.read()) |
| 89 | + |
| 90 | +# tsv_rows = [] |
| 91 | +# for cas_item in cas_response: |
| 92 | +# cell = cas_item["query_cell_id"] |
| 93 | +# first_match = cas_item["matches"][0] |
| 94 | +# annotation_label = cas_item["matches"][0] |
| 95 | + |
| 96 | +def merge_cas_results(adata, cas_ontology_aware_response): |
| 97 | + """Update AnnData with ontology-aware CAS results JSON |
| 98 | + """ |
| 99 | + insert_cas_ontology_aware_response_into_adata(cas_ontology_aware_response, adata, cl) |
| 100 | + |
| 101 | + # Comment out block below unless debugging |
| 102 | + # DASH_SERVER_PORT = 8050 |
| 103 | + # with suppress_stderr(): |
| 104 | + # CASCircularTreePlotUMAPDashApp( |
| 105 | + # adata, # the AnnData file |
| 106 | + # cas_ontology_aware_response, # CAS response |
| 107 | + # cluster_label_obs_column="cluster_label", # (optional) The .obs column name containing cluster labels |
| 108 | + # ).run(port=DASH_SERVER_PORT, debug=False, jupyter_width="100%") |
| 109 | + |
| 110 | + # TODO: Consider conditionally using this `single` method if (and only if) |
| 111 | + # AnnData lacks "raw annotations" as returned from e.g. Scanpy or Seurat |
| 112 | + # ("0", "1", "2", etc.) |
| 113 | + # pp.compute_most_granular_top_k_calls_single( |
| 114 | + # adata=adata, |
| 115 | + # cl=cl, |
| 116 | + # min_acceptable_score=0.1, # minimum acceptable score for a call |
| 117 | + # top_k=3, # how many top calls to make? |
| 118 | + # obs_prefix="cell_type_cas" # .obs column to write the top-k calls to |
| 119 | + # ) |
| 120 | + |
| 121 | + # Use this only if source AnnData has clusterings |
| 122 | + pp.compute_most_granular_top_k_calls_cluster( |
| 123 | + adata=adata, |
| 124 | + cl=cl, |
| 125 | + min_acceptable_score=0.1, # minimum acceptable score for a call |
| 126 | + cluster_label_obs_column='cluster_label', # .obs column containing cluster labels |
| 127 | + top_k=3, # how many top calls to make? |
| 128 | + obs_prefix='cell_type_cas' # .obs column to write the top-k calls to |
| 129 | + ) |
| 130 | + |
| 131 | + return adata |
| 132 | + |
| 133 | +def trim_cas_adata(adata): |
| 134 | + """Trim CAS AnnData to only annotation labels; omit IDs, scores |
| 135 | +
|
| 136 | + This ensures the AnnData can trivially initialize a polished SCP study. |
| 137 | + The IDs, scores etc. can be easily repopulated via `merge_cas_results` for |
| 138 | + debugging, convenient fuller AnnData output if desired, etc. |
| 139 | + """ |
| 140 | + # "Name" is ontology ID, e.g. CL_0000897. Score is CAS confidence. |
| 141 | + annots_to_omit = ["name", "score"] |
| 142 | + |
| 143 | + columns = list(adata.obs) |
| 144 | + for to_omit in annots_to_omit: |
| 145 | + for col in columns: |
| 146 | + # E.g. re.match('cas.*_name_\d+', 'cas_cell_type_name_1') |
| 147 | + match = re.match(f".*_cas_.*{to_omit}_\d+", col) |
| 148 | + if match: |
| 149 | + del adata.obs[col] |
| 150 | + |
| 151 | + return adata |
| 152 | + |
| 153 | +# TODO: Consider porting to SCP Core JS |
| 154 | +# def friendlify_cas_adata(adata): |
| 155 | +# """Make CAS annotation names human-friendlier, e.g. "CAS |
| 156 | +# """ |
| 157 | + |
| 158 | +# columns = list(adata.obs) |
| 159 | +# for old_col in columns: |
| 160 | +# if not '_cas_' in old_col: |
| 161 | +# continue |
| 162 | +# # E.g. cell_type_cas_label_42 -> cell_type_(CAS 42) |
| 163 | +# col = re.sub(r'_cas_label_(\d+)', r'_(CAS \1)_', old_col) |
| 164 | + |
| 165 | +# # E.g. cell_type_(CAS 42) -> cell type (CAS 42) |
| 166 | +# col = col.replace('_', ' ') |
| 167 | + |
| 168 | +# # E.g. cell type (CAS 42) -> Cell type (CAS 42) |
| 169 | +# col = col[0].upper() + col[1:] |
| 170 | + |
| 171 | +# # E.g. Cell type (CAS 1) -> Cell type (CAS) |
| 172 | +# col = col.replace('CAS 1)', 'CAS)') |
| 173 | + |
| 174 | +# adata.obs[col] = adata.obs[old_col] |
| 175 | +# del adata.obs[old_col] |
| 176 | + |
| 177 | +# return adata |
| 178 | + |
| 179 | +def save_anndata(adata, stem, input_path): |
| 180 | + cas_anndata_output_path = f"{stem}__{input_path}" |
| 181 | + adata.write(cas_anndata_output_path) |
| 182 | + print(f"Wrote AnnData: {cas_anndata_output_path}") |
| 183 | + |
| 184 | +def run(input_path, cas_response_output_path): |
| 185 | + print("Running CAS ingest") |
| 186 | + adata, cas_ontology_aware_response = cas_annotate(input_path, cas_response_output_path) |
| 187 | + |
| 188 | + # Comment out block below below unless debugging |
| 189 | + # adata = sc.read_h5ad(f"cas_annotated_before_postprocessing__{input_path}") |
| 190 | + # with open(cas_response_output_path) as f: |
| 191 | + # cas_ontology_aware_response = json.loads(f.read()) |
| 192 | + |
| 193 | + adata = merge_cas_results(adata, cas_ontology_aware_response) |
| 194 | + save_anndata(adata, "cas_annotated_", input_path) |
| 195 | + |
| 196 | + adata = trim_cas_adata(adata) |
| 197 | + save_anndata(adata, "cas_annotated_trimmed", input_path) |
| 198 | + |
| 199 | + # Comment out block below below unless debugging or generating demo data |
| 200 | + # TODO: Enable SCP metadata validation exemption for AnnData |
| 201 | + # adata = make_compliant_for_scp(adata) |
| 202 | + # save_anndata(adata, "cas_annotated_trimmed_compliant", input_path) |
| 203 | + |
| 204 | + |
| 205 | +if __name__ == "__main__": |
| 206 | + run(input_path, cas_response_output_path) |
0 commit comments