6
6
7
7
import pandas as pd
8
8
import numpy as np
9
+ from scipy .stats import spearmanr
9
10
import csv
10
11
import logging
11
12
@@ -25,7 +26,9 @@ def sort_comparison(groups):
25
26
"""
26
27
27
28
if any (i .isdigit () for i in groups ):
28
- sorted_arr = sorted (groups , key = lambda x : int ("" .join ([i for i in x if i .isdigit ()])))
29
+ sorted_arr = sorted (
30
+ groups , key = lambda x : int ("" .join ([i for i in x if i .isdigit ()]))
31
+ )
29
32
return sorted_arr
30
33
elif "rest" == groups [1 ]:
31
34
return groups
@@ -56,6 +59,7 @@ def canonicalize_name_and_order(data, header_refmap):
56
59
data = data .rename (columns = rename_map )
57
60
58
61
data = data .astype ({"group" : "string" })
62
+ data ['order' ] = data .index .astype (int )
59
63
60
64
# Update headers to expected order
61
65
unsorted_headers = list (data .columns )
@@ -139,7 +143,12 @@ def generate_manifest(stem, clean_val, clean_val_p, qual):
139
143
140
144
if len (file_names_pairwise ) != 0 :
141
145
for value in range (len (file_names_pairwise )):
142
- tsv_output .writerow ([file_names_pairwise [value ][0 ], file_names_pairwise [value ][1 ],])
146
+ tsv_output .writerow (
147
+ [
148
+ file_names_pairwise [value ][0 ],
149
+ file_names_pairwise [value ][1 ],
150
+ ]
151
+ )
143
152
144
153
145
154
def sort_all_group (all_group ):
@@ -188,44 +197,41 @@ def sort_comparison_metrics(comparison_metrics, size, significance):
188
197
189
198
# Arrange significance in expected order (ultimately ranked 3rd)
190
199
comparison_metrics = sorted (
191
- comparison_metrics ,
192
- key = lambda x : x .split ('--' )[- 1 ] == significance
200
+ comparison_metrics , key = lambda x : x .split ('--' )[- 1 ] == significance
193
201
)
194
202
195
203
# Arrange size in expected order (ultimately ranked 2nd)
196
204
comparison_metrics = sorted (
197
- comparison_metrics ,
198
- key = lambda x : x .split ('--' )[- 1 ] == size
205
+ comparison_metrics , key = lambda x : x .split ('--' )[- 1 ] == size
199
206
)
200
207
201
208
# Rank 1st with "gene", "group", then (if present) "comparison_group"
202
- comparison_metrics = sorted (comparison_metrics , key = lambda x : x .split ('--' )[- 1 ] == "comparison_group" )
203
- comparison_metrics = sorted (comparison_metrics , key = lambda x : x .split ('--' )[- 1 ] == "group" )
204
- comparison_metrics = sorted (comparison_metrics , key = lambda x : x .split ('--' )[- 1 ] == "gene" )
209
+ comparison_metrics = sorted (
210
+ comparison_metrics , key = lambda x : x .split ('--' )[- 1 ] == "comparison_group"
211
+ )
212
+ comparison_metrics = sorted (
213
+ comparison_metrics , key = lambda x : x .split ('--' )[- 1 ] == "group"
214
+ )
215
+ comparison_metrics = sorted (
216
+ comparison_metrics , key = lambda x : x .split ('--' )[- 1 ] == "gene"
217
+ )
205
218
206
219
comparison_metrics .reverse ()
207
220
208
221
return comparison_metrics
209
222
210
223
211
224
def sort_headers (headers , size , significance ):
212
- """Like `sort_comparison_metrics`, but for bare headers / metrics
213
- """
225
+ """Like `sort_comparison_metrics`, but for bare headers / metrics"""
214
226
215
227
# Sort alphabetically
216
228
headers = sorted (headers )
217
229
218
- # Rank significance 1st (ultimately ranked 4th)
219
- headers = sorted (
220
- headers ,
221
- key = lambda x : x == significance
222
- )
230
+ # Rank significance 1st (ultimately ranked 5th)
231
+ headers = sorted (headers , key = lambda x : x == significance )
223
232
224
233
# Rank size 1st (ultimately ranked 4th)
225
- headers = sorted (
226
- headers ,
227
- key = lambda x : x == size
228
- )
234
+ headers = sorted (headers , key = lambda x : x == size )
229
235
230
236
# Rank 1st with "gene", "group", then (if present) "comparison_group"
231
237
headers = sorted (headers , key = lambda x : x == "comparison_group" )
@@ -236,6 +242,7 @@ def sort_headers(headers, size, significance):
236
242
237
243
return headers
238
244
245
+
239
246
# note: my initial files had pval, qval, logfoldchanges.
240
247
# David's files have qval, mean, logfoldchanges.
241
248
# For the purposes of this validation I will be using his column values/formatting.
@@ -248,7 +255,9 @@ def validate_size_and_significance(metrics, size, significance, logger):
248
255
- Log to Sentry / Mixpanel
249
256
"""
250
257
has_size = any ([metric .split ('--' )[- 1 ] == size for metric in metrics ])
251
- has_significance = any ([metric .split ('--' )[- 1 ] == significance for metric in metrics ])
258
+ has_significance = any (
259
+ [metric .split ('--' )[- 1 ] == significance for metric in metrics ]
260
+ )
252
261
253
262
in_headers = f"in headers: { metrics } "
254
263
@@ -263,7 +272,9 @@ def validate_size_and_significance(metrics, size, significance, logger):
263
272
logger .error (msg )
264
273
raise ValueError (msg )
265
274
elif has_size and has_significance :
266
- logger .info (f'Found size ("{ size } ") and significance ("{ significance } ") metrics { in_headers } ' )
275
+ logger .info (
276
+ f'Found size ("{ size } ") and significance ("{ significance } ") metrics { in_headers } '
277
+ )
267
278
268
279
269
280
def get_groups_and_metrics (raw_column_names , size , significance , logger ):
@@ -292,7 +303,9 @@ def get_groups_and_metrics(raw_column_names, size, significance, logger):
292
303
column_items = raw_column_name .split ("--" )
293
304
split_header = []
294
305
for item in column_items :
295
- item = item .replace ("'" , "" ) # Remove quotes in e.g. 'type_0'--'type_1'--qval
306
+ item = item .replace (
307
+ "'" , ""
308
+ ) # Remove quotes in e.g. 'type_0'--'type_1'--qval
296
309
if (item != "" ) and (item != "_" ):
297
310
split_header .append (item .strip ("_" ))
298
311
split_headers .append (split_header )
@@ -325,13 +338,53 @@ def detect_seurat_findallmarkers(headers):
325
338
326
339
These headers were observed in a real user-uploaded DE file.
327
340
"""
328
- findallmarkers_headers = ['p_val' , 'avg_log2FC' , 'pct.1' , 'pct.2' , 'p_val_adj' , 'cluster' , 'gene' ]
329
- is_seurat_findallmarkers = (
330
- len (headers ) == len (findallmarkers_headers ) and all (headers == findallmarkers_headers )
341
+ findallmarkers_headers = [
342
+ 'p_val' ,
343
+ 'avg_log2FC' ,
344
+ 'pct.1' ,
345
+ 'pct.2' ,
346
+ 'p_val_adj' ,
347
+ 'cluster' ,
348
+ 'gene' ,
349
+ ]
350
+ is_seurat_findallmarkers = len (headers ) == len (findallmarkers_headers ) and all (
351
+ headers == findallmarkers_headers
331
352
)
332
353
return is_seurat_findallmarkers
333
354
334
355
356
def order_not_significant(array_1, array_2, threshold=0.95):
    """Return True when the two orderings are not strongly rank-correlated.

    Computes the Spearman rank correlation between array_1 and array_2
    and reports whether it fails to exceed ``threshold``.

    :param array_1: First sequence of values (array-like)
    :param array_2: Second sequence of values, same length as array_1
    :param threshold: Correlation above which the two orderings are
        considered equivalent (default 0.95, the original hard-coded cutoff)
    :return: True if correlation <= threshold (orders differ meaningfully),
        False otherwise

    NOTE(review): spearmanr yields NaN for constant input; ``NaN > threshold``
    is False, so this returns True in that case — confirm that is intended.
    """
    # p-value is not used by callers; only the correlation matters here
    correlation, _ = spearmanr(array_1, array_2)
    return not correlation > threshold
362
+
363
+
364
def organize_results(df):
    """Sort DE results by significance unless the input order already matches.

    Restores the original input row order (tracked in the "order" column),
    then compares the significance-metric ordering against the input order
    via Spearman correlation (see ``order_not_significant``).  If the two
    orderings differ meaningfully, rows are re-sorted by the significance
    metric; otherwise the input row order is kept.

    :param df: DataFrame with an "order" column of row positions (stringified
        by earlier processing) and the significance metric as its 3rd column
    :return: DataFrame with an unnamed index, sorted as described above

    NOTE(review): the dtype conversions below mutate the passed-in frame's
    columns in place before ``df`` is re-bound locally; the caller in this
    file re-assigns the result, so that side effect is currently harmless.
    """
    # Processing turned values into strings; go through float first so
    # values like "3.0" convert cleanly, then to int for a numeric sort key.
    df["order"] = df["order"].astype(float)
    df["order"] = df["order"].astype(int)
    # Restore input row order and expose it as an unnamed index, so the
    # written DE results file keeps its unnamed index column.
    df = df.sort_values(by="order")
    df = df.set_index("order")
    df.index.name = None
    # Processing ensures the significance metric is the 3rd column of the df
    sig_column = df.columns[2]
    df[sig_column] = df[sig_column].astype(float)
    input_order = df[sig_column].to_numpy()
    sig_array = df[sig_column].sort_values().to_numpy()

    # Re-sort by significance only when its order meaningfully differs from
    # the input row order; otherwise leave the frame in input order.
    if order_not_significant(sig_array, input_order):
        df = df.sort_values(by=sig_column)
    return df
386
+
387
+
335
388
class AuthorDifferentialExpression :
336
389
dev_logger = setup_logger (__name__ , "log.txt" , format = "support_configs" )
337
390
author_de_logger = setup_logger (
@@ -351,7 +404,7 @@ def __init__(
351
404
annotation_scope ,
352
405
method ,
353
406
differential_expression_file ,
354
- header_refmap
407
+ header_refmap ,
355
408
):
356
409
"""
357
410
:param cluster_name (string) Name of cluster, e.g. "All Cells UMAP"
@@ -413,13 +466,17 @@ def execute(self):
413
466
groups , clean_val , metrics = get_groups_and_metrics (
414
467
one_vs_rest , self .size_metric , self .significance_metric , logger
415
468
)
416
- self .generate_result_files (one_vs_rest , genes , rest , groups , clean_val , metrics )
469
+ self .generate_result_files (
470
+ one_vs_rest , genes , rest , groups , clean_val , metrics
471
+ )
417
472
418
473
if len (pairwise ) != 0 :
419
474
groups_p , clean_val_p , metrics = get_groups_and_metrics (
420
475
pairwise , self .size_metric , self .significance_metric , logger
421
476
)
422
- self .generate_result_files (pairwise , genes , rest , groups_p , clean_val_p , metrics )
477
+ self .generate_result_files (
478
+ pairwise , genes , rest , groups_p , clean_val_p , metrics
479
+ )
423
480
generate_manifest (self .stem , clean_val , clean_val_p , metrics )
424
481
425
482
print ("Author DE transformation succeeded" )
@@ -462,7 +519,7 @@ def generate_result_files(self, col, genes, rest, groups, clean_val, metrics):
462
519
for i in all_group_fin :
463
520
for j in range (0 , len (i ), num_metrics ):
464
521
x = j
465
- comparison_metrics = i [x : x + num_metrics ]
522
+ comparison_metrics = i [x : x + num_metrics ]
466
523
sorted_comparison_metrics = sort_comparison_metrics (
467
524
comparison_metrics , self .size_metric , self .significance_metric
468
525
)
@@ -510,7 +567,9 @@ def generate_result_files(self, col, genes, rest, groups, clean_val, metrics):
510
567
sorted_list = sort_comparison ([group , comparison_group ])
511
568
comparison = f'{ sorted_list [0 ]} --{ sorted_list [1 ]} '
512
569
513
- clean_comparison_metric = '--' .join ([sanitize_string (group ) for group in comparison .split ('--' )])
570
+ clean_comparison_metric = '--' .join (
571
+ [sanitize_string (group ) for group in comparison .split ('--' )]
572
+ )
514
573
515
574
tsv_name = f'{ self .stem } --{ clean_comparison_metric } --{ self .annot_scope } --{ self .method } .tsv'
516
575
@@ -521,14 +580,16 @@ def generate_result_files(self, col, genes, rest, groups, clean_val, metrics):
521
580
t_arr = arr .transpose ()
522
581
523
582
if len (t_arr ) == 0 :
524
- print (f"No data to output for TSV, skip preparation to write { tsv_name } " )
583
+ print (
584
+ f"No data to output for TSV, skip preparation to write { tsv_name } "
585
+ )
525
586
continue
526
587
527
588
# Drop rows that are all "nan", as seen sometimes in Seurat FindAllMarkers()
528
589
t_arr = t_arr [~ (t_arr == 'nan' ).any (axis = 1 )]
529
590
530
591
inner_df = pd .DataFrame (data = t_arr , columns = headers )
531
-
592
+ inner_df = organize_results ( inner_df )
532
593
inner_df .to_csv (tsv_name , sep = '\t ' )
533
594
534
595
print (f"Wrote TSV: { tsv_name } " )
0 commit comments