Commit da12f0b

Merge pull request #382 from broadinstitute/development
Release 1.40.0
2 parents (02cafcc + 9185c71), commit da12f0b

4 files changed: +152 -64 lines changed


ingest/author_de.py (+94 -33)
@@ -6,6 +6,7 @@

 import pandas as pd
 import numpy as np
+from scipy.stats import spearmanr
 import csv
 import logging

@@ -25,7 +26,9 @@ def sort_comparison(groups):
     """

     if any(i.isdigit() for i in groups):
-        sorted_arr = sorted(groups, key=lambda x: int("".join([i for i in x if i.isdigit()])))
+        sorted_arr = sorted(
+            groups, key=lambda x: int("".join([i for i in x if i.isdigit()]))
+        )
         return sorted_arr
     elif "rest" == groups[1]:
         return groups
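For illustration, the sort above keys on the digits embedded in each group label, so numeric suffixes order naturally rather than lexically. A minimal sketch with made-up group labels (not from the commit):

groups = ["type_10", "type_2", "type_1"]  # hypothetical labels
sorted_arr = sorted(
    groups, key=lambda x: int("".join([i for i in x if i.isdigit()]))
)
print(sorted_arr)  # ['type_1', 'type_2', 'type_10']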
@@ -56,6 +59,7 @@ def canonicalize_name_and_order(data, header_refmap):
     data = data.rename(columns=rename_map)

     data = data.astype({"group": "string"})
+    data['order'] = data.index.astype(int)

     # Update headers to expected order
     unsorted_headers = list(data.columns)
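The new `order` column records each row's original position so the author-supplied row order can be compared against significance later in this diff (see `organize_results` below). A tiny sketch on a toy two-column frame, not real pipeline data:

import pandas as pd

data = pd.DataFrame({"gene": ["A", "B"], "group": ["type_0", "type_1"]})
data = data.astype({"group": "string"})
data['order'] = data.index.astype(int)  # 0, 1, ... in author-supplied order
print(data['order'].tolist())  # [0, 1]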
@@ -139,7 +143,12 @@ def generate_manifest(stem, clean_val, clean_val_p, qual):

     if len(file_names_pairwise) != 0:
         for value in range(len(file_names_pairwise)):
-            tsv_output.writerow([file_names_pairwise[value][0], file_names_pairwise[value][1],])
+            tsv_output.writerow(
+                [
+                    file_names_pairwise[value][0],
+                    file_names_pairwise[value][1],
+                ]
+            )


 def sort_all_group(all_group):
@@ -188,44 +197,41 @@ def sort_comparison_metrics(comparison_metrics, size, significance):

     # Arrange significance in expected order (ultimately ranked 3rd)
     comparison_metrics = sorted(
-        comparison_metrics,
-        key=lambda x: x.split('--')[-1] == significance
+        comparison_metrics, key=lambda x: x.split('--')[-1] == significance
     )

     # Arrange size in expected order (ultimately ranked 2nd)
     comparison_metrics = sorted(
-        comparison_metrics,
-        key=lambda x: x.split('--')[-1] == size
+        comparison_metrics, key=lambda x: x.split('--')[-1] == size
     )

     # Rank 1st with "gene", "group", then (if present) "comparison_group"
-    comparison_metrics = sorted(comparison_metrics, key=lambda x: x.split('--')[-1] == "comparison_group")
-    comparison_metrics = sorted(comparison_metrics, key=lambda x: x.split('--')[-1] == "group")
-    comparison_metrics = sorted(comparison_metrics, key=lambda x: x.split('--')[-1] == "gene")
+    comparison_metrics = sorted(
+        comparison_metrics, key=lambda x: x.split('--')[-1] == "comparison_group"
+    )
+    comparison_metrics = sorted(
+        comparison_metrics, key=lambda x: x.split('--')[-1] == "group"
+    )
+    comparison_metrics = sorted(
+        comparison_metrics, key=lambda x: x.split('--')[-1] == "gene"
+    )

     comparison_metrics.reverse()

     return comparison_metrics


 def sort_headers(headers, size, significance):
-    """Like `sort_comparison_metrics`, but for bare headers / metrics
-    """
+    """Like `sort_comparison_metrics`, but for bare headers / metrics"""

     # Sort alphabetically
     headers = sorted(headers)

-    # Rank significance 1st (ultimately ranked 4th)
-    headers = sorted(
-        headers,
-        key=lambda x: x == significance
-    )
+    # Rank significance 1st (ultimately ranked 5th)
+    headers = sorted(headers, key=lambda x: x == significance)

     # Rank size 1st (ultimately ranked 4th)
-    headers = sorted(
-        headers,
-        key=lambda x: x == size
-    )
+    headers = sorted(headers, key=lambda x: x == size)

     # Rank 1st with "gene", "group", then (if present) "comparison_group"
     headers = sorted(headers, key=lambda x: x == "comparison_group")
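For orientation: each pass in `sort_comparison_metrics` is a stable sort on a boolean key, so matching items drift to the end while everything else keeps its relative order, and the final `reverse()` then ranks them first. A condensed sketch with hypothetical metric names (the `qval`, `mean`, and `logfoldchanges` names follow the note further down; the loop is an illustrative shorthand for the separate `sorted` calls above):

metrics = [
    'type_0--type_1--qval',
    'type_0--type_1--mean',
    'type_0--type_1--logfoldchanges',
    'gene',
]  # hypothetical inputs
size, significance = 'logfoldchanges', 'qval'
for key in [significance, size, "comparison_group", "group", "gene"]:
    metrics = sorted(metrics, key=lambda x, k=key: x.split('--')[-1] == k)
metrics.reverse()
print(metrics)
# ['gene', 'type_0--type_1--logfoldchanges', 'type_0--type_1--qval', 'type_0--type_1--mean']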
@@ -236,6 +242,7 @@ def sort_headers(headers, size, significance):

     return headers

+
 # note: my initial files had pval, qval, logfoldchanges.
 # David's files have qval, mean, logfoldchanges.
 # For the purposes of this validation I will be using his column values/formatting.
@@ -248,7 +255,9 @@ def validate_size_and_significance(metrics, size, significance, logger):
     - Log to Sentry / Mixpanel
     """
     has_size = any([metric.split('--')[-1] == size for metric in metrics])
-    has_significance = any([metric.split('--')[-1] == significance for metric in metrics])
+    has_significance = any(
+        [metric.split('--')[-1] == significance for metric in metrics]
+    )

     in_headers = f"in headers: {metrics}"

@@ -263,7 +272,9 @@ def validate_size_and_significance(metrics, size, significance, logger):
         logger.error(msg)
         raise ValueError(msg)
     elif has_size and has_significance:
-        logger.info(f'Found size ("{size}") and significance ("{significance}") metrics {in_headers}')
+        logger.info(
+            f'Found size ("{size}") and significance ("{significance}") metrics {in_headers}'
+        )


 def get_groups_and_metrics(raw_column_names, size, significance, logger):
@@ -292,7 +303,9 @@ def get_groups_and_metrics(raw_column_names, size, significance, logger):
         column_items = raw_column_name.split("--")
         split_header = []
         for item in column_items:
-            item = item.replace("'", "")  # Remove quotes in e.g. 'type_0'--'type_1'--qval
+            item = item.replace(
+                "'", ""
+            )  # Remove quotes in e.g. 'type_0'--'type_1'--qval
             if (item != "") and (item != "_"):
                 split_header.append(item.strip("_"))
         split_headers.append(split_header)
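A quick sketch of the quote-stripping above, applied to the hypothetical pairwise header cited in the inline comment:

raw_column_name = "'type_0'--'type_1'--qval"
split_header = []
for item in raw_column_name.split("--"):
    item = item.replace("'", "")
    if (item != "") and (item != "_"):
        split_header.append(item.strip("_"))
print(split_header)  # ['type_0', 'type_1', 'qval']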
@@ -325,13 +338,53 @@ def detect_seurat_findallmarkers(headers):

     These headers were observed in a real user-uploaded DE file.
     """
-    findallmarkers_headers = ['p_val', 'avg_log2FC', 'pct.1', 'pct.2', 'p_val_adj', 'cluster', 'gene']
-    is_seurat_findallmarkers = (
-        len(headers) == len(findallmarkers_headers) and all(headers == findallmarkers_headers)
+    findallmarkers_headers = [
+        'p_val',
+        'avg_log2FC',
+        'pct.1',
+        'pct.2',
+        'p_val_adj',
+        'cluster',
+        'gene',
+    ]
+    is_seurat_findallmarkers = len(headers) == len(findallmarkers_headers) and all(
+        headers == findallmarkers_headers
     )
     return is_seurat_findallmarkers


+def order_not_significant(array_1, array_2):
+    correlation, pval = spearmanr(array_1, array_2)
+    if correlation > 0.95:
+        return False
+    else:
+        return True
+
+
+def organize_results(df):
+    # processing turned values into strings, convert to numeric for sorting
+    df["order"] = df["order"].astype(float)
+    df["order"] = df["order"].astype(int)
+    # sort dataframe by input row order
+    df = df.sort_values(by="order")
+    df = df.set_index('order')
+    # maintain unnamed index column in DE results file
+    df.index.name = None
+    # processing ensures the significance metric is the 3rd column of the df
+    df[df.columns[2]] = df[df.columns[2]].astype(float)
+    input_order = df[df.columns[2]].to_numpy()
+    sig_sorted = df[df.columns[2]].sort_values()
+    sig_array = sig_sorted.to_numpy()
+
+    if order_not_significant(sig_array, input_order):
+        # sort dataframe by significance metric
+        df = df.sort_values(by=df.columns[2])
+        return df
+    else:
+        # leave dataframe sorted by input row order
+        return df
+
+
 class AuthorDifferentialExpression:
     dev_logger = setup_logger(__name__, "log.txt", format="support_configs")
     author_de_logger = setup_logger(
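To illustrate the new behavior: `organize_results` keeps the author's row order unless the significance column (the third column after processing) fails to track it, judged by `order_not_significant`'s Spearman correlation threshold of 0.95. A rough usage sketch with a made-up frame, assuming the functions above are importable from the module (the import path is an assumption, not from the commit):

import pandas as pd
from author_de import organize_results  # hypothetical import path

# Made-up frame shaped like the pipeline's intermediate output:
# columns 0-2 are gene, group, significance (qval); "order" records input row order.
df = pd.DataFrame(
    {
        "gene": ["A", "B", "C"],
        "group": ["type_0", "type_0", "type_0"],
        "qval": ["0.5", "0.01", "0.2"],  # strings, as after processing
        "order": ["0", "1", "2"],
    }
)
result = organize_results(df)
print(result["gene"].tolist())
# ['B', 'C', 'A'] -- re-sorted by qval, since the input order
# does not correlate with significance here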
@@ -351,7 +404,7 @@ def __init__(
         annotation_scope,
         method,
         differential_expression_file,
-        header_refmap
+        header_refmap,
     ):
         """
         :param cluster_name (string) Name of cluster, e.g. "All Cells UMAP"
@@ -413,13 +466,17 @@ def execute(self):
            groups, clean_val, metrics = get_groups_and_metrics(
                one_vs_rest, self.size_metric, self.significance_metric, logger
            )
-            self.generate_result_files(one_vs_rest, genes, rest, groups, clean_val, metrics)
+            self.generate_result_files(
+                one_vs_rest, genes, rest, groups, clean_val, metrics
+            )

         if len(pairwise) != 0:
            groups_p, clean_val_p, metrics = get_groups_and_metrics(
                pairwise, self.size_metric, self.significance_metric, logger
            )
-            self.generate_result_files(pairwise, genes, rest, groups_p, clean_val_p, metrics)
+            self.generate_result_files(
+                pairwise, genes, rest, groups_p, clean_val_p, metrics
+            )
         generate_manifest(self.stem, clean_val, clean_val_p, metrics)

         print("Author DE transformation succeeded")
@@ -462,7 +519,7 @@ def generate_result_files(self, col, genes, rest, groups, clean_val, metrics):
        for i in all_group_fin:
            for j in range(0, len(i), num_metrics):
                x = j
-                comparison_metrics = i[x: x + num_metrics]
+                comparison_metrics = i[x : x + num_metrics]
                sorted_comparison_metrics = sort_comparison_metrics(
                    comparison_metrics, self.size_metric, self.significance_metric
                )
@@ -510,7 +567,9 @@ def generate_result_files(self, col, genes, rest, groups, clean_val, metrics):
            sorted_list = sort_comparison([group, comparison_group])
            comparison = f'{sorted_list[0]}--{sorted_list[1]}'

-            clean_comparison_metric = '--'.join([sanitize_string(group) for group in comparison.split('--')])
+            clean_comparison_metric = '--'.join(
+                [sanitize_string(group) for group in comparison.split('--')]
+            )

            tsv_name = f'{self.stem}--{clean_comparison_metric}--{self.annot_scope}--{self.method}.tsv'

@@ -521,14 +580,16 @@ def generate_result_files(self, col, genes, rest, groups, clean_val, metrics):
            t_arr = arr.transpose()

            if len(t_arr) == 0:
-                print(f"No data to output for TSV, skip preparation to write {tsv_name}")
+                print(
+                    f"No data to output for TSV, skip preparation to write {tsv_name}"
+                )
                continue

            # Drop rows that are all "nan", as seen sometimes in Seurat FindAllMarkers()
            t_arr = t_arr[~(t_arr == 'nan').any(axis=1)]

            inner_df = pd.DataFrame(data=t_arr, columns=headers)
-
+            inner_df = organize_results(inner_df)
            inner_df.to_csv(tsv_name, sep='\t')

            print(f"Wrote TSV: {tsv_name}")
+2 -1

@@ -1 +1,2 @@
-1737653567 # validation cache key
+1738072997 # validation cache key
+

schema/alexandria_convention/alexandria_convention_schema.tsv (+2 -2)

@@ -52,8 +52,8 @@ growth_factor_perturbation__concentration__unit_label string unit_label gr
 growth_factor_perturbation__ontology_label string TRUE ontology_label growth_factor_perturbation growth_factor_perturbation__ontology_label
 growth_factor_perturbation__solvent string TRUE growth_factor_perturbation Solvent in which the growth factor was added to the cells. Ex. the base media.
 growth_factor_perturbation__source string TRUE growth_factor_perturbation Source from which the growth factor was purchased
-has_electrophysiology boolean boolean
-has_morphology boolean boolean
+has_electrophysiology boolean boolean Indicates availability of electrophysiology data
+has_morphology boolean boolean Indicates availability of morphology data
 is_living string enum "[""yes"", ""no"", ""unknown""]" Whether organism was alive at time of biomaterial collection
 mhc_genotype string MHC genotype for humans and other species
 mouse_strain string ontology https://www.ebi.ac.uk/ols/api/ontologies/ncit https://www.ebi.ac.uk/ols/ontologies/ncit "NCIT_C14420 " species == NCBITaxon_10090 Mouse strain of the donor organism (ex. C57BL/6, BALB/c, 129, undetermined)

0 commit comments
