 from copy import deepcopy
 import functools as ft
 import logging
+import numpy as np
 from os import PathLike
 from pathlib import Path
 from pathlib import PurePath
@@ -83,6 +84,13 @@ def main():
         "integers. Must be same length as <NUM_SPLITS>. If omitted will "
         "default to randomized seeds."
     )
+    p_process_datasets.add_argument(
+        '-e', '--exclude_improve_drug_id', dest='EXCL_DRUGS_LIST',
+        type=_improve_drug_id_list,
+        default=None,
+        help='comma-separated list of improve_drug_id/improve_chem_id values '
+             'that should be excluded from the reference datasets.'
+    )
 
     p_all = command_parsers.add_parser(
         "all",
@@ -183,7 +191,7 @@ def process_datasets(args):
         columns={'improve_drug_id': 'improve_chem_id'},
         inplace=True,
     )
-    response_data['improve_sample_id'] = "SAMPLE_ID_" + response_data['improve_sample_id'].astype(int).astype(str)
+    response_data['improve_sample_id'] = "SAMPLE-ID-" + response_data['improve_sample_id'].astype(int).astype(str)
     # exporting the drug response data to 'y_data/response.tsv'
     outfile_path = args.WORKDIR.joinpath("data_out", "y_data", "response.tsv")
     response_data.to_csv(
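
Note: the astype(int).astype(str) round-trip strips a float id's trailing
'.0' before the prefix is attached; a toy illustration with made-up values
(the same chain recurs in split_data_sets below):

    import pandas as pd

    ids = pd.Series([101.0, 102.0])
    print("SAMPLE-ID-" + ids.astype(int).astype(str))
    # 0    SAMPLE-ID-101
    # 1    SAMPLE-ID-102
    # dtype: object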
@@ -201,12 +209,12 @@ def process_datasets(args):
     #-------------------------------------------------------------------
 
 
-    split_data_sets(
-        args=args,
-        data_sets=data_sets,
-        data_sets_info=data_sets_info,
-        response_data=response_data
-    )
+    # split_data_sets(
+    #     args=args,
+    #     data_sets=data_sets,
+    #     data_sets_info=data_sets_info,
+    #     response_data=response_data
+    # )
 
     #-------------------------------------------------------------------
     # getting common / reference gene symbols
@@ -276,6 +284,9 @@ def process_datasets(args):
     )
 
     merged_transcriptomics = merged_transcriptomics[merged_transcriptomics['entrez_id'] != 0]
+    merged_transcriptomics = merged_transcriptomics.fillna(0).T.reset_index()
+    for i in range(0, 3):
+        merged_transcriptomics.iloc[i, 0] = np.nan
 
     # writing the expression datatable to '/x_data/*_expression.tsv'
     outfile_path = args.WORKDIR.joinpath(
@@ -287,12 +298,11 @@ def process_datasets(args):
     # This back fills NAs with 0s - the assumed "neutral" value for
     # gene expression data
     (merged_transcriptomics
-        .fillna(0)
-        .transpose()
         .to_csv(
             path_or_buf=outfile_path,
             sep='\t',
-            header=False
+            header=False,
+            index=False
         )
     )
 
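
Note: the three added lines in the previous hunk prepare the layout that the
header=False/index=False export relies on. A minimal sketch of the mechanics,
with hypothetical metadata columns (the real table's first three columns may
differ):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        'entrez_id':   [1, 2],             # hypothetical metadata columns
        'gene_symbol': ['TP53', 'EGFR'],
        'ensembl_id':  ['ENSG1', 'ENSG2'],
        'SAMPLE-ID-1': [0.5, np.nan],      # per-sample expression values
        'SAMPLE-ID-2': [1.2, 3.4],
    })

    out = df.fillna(0).T.reset_index()     # former column names land in column 0
    for i in range(0, 3):                  # blank the three metadata header rows
        out.iloc[i, 0] = np.nan

Written with header=False and index=False, this yields three gene-annotation
rows whose first field is empty, followed by one row per sample. The same
transpose/reset_index/blanking pattern recurs below for copy number,
discretized copy number, and mutation counts.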
@@ -332,6 +342,9 @@ def process_datasets(args):
         'gene_symbol',
         merged_copy_number.pop('gene_symbol')
     )
+    merged_copy_number = merged_copy_number.T.reset_index()
+    for i in range(0, 3):
+        merged_copy_number.iloc[i, 0] = np.nan
 
     # writing the copy number datatable to '/x_data/*_copy_number.tsv'
     outfile_path = args.WORKDIR.joinpath(
@@ -340,11 +353,11 @@ def process_datasets(args):
         "cancer_copy_number.tsv"
     )
     (merged_copy_number
-        .transpose()
         .to_csv(
             path_or_buf=outfile_path,
             sep='\t',
-            header=False
+            header=False,
+            index=False
         )
     )
 
@@ -369,6 +382,9 @@ def process_datasets(args):
         'gene_symbol',
         discretized_copy_number.pop('gene_symbol')
     )
+    discretized_copy_number = discretized_copy_number.T.reset_index()
+    for i in range(0, 3):
+        discretized_copy_number.iloc[i, 0] = np.nan
 
     # writing the discretized copy number datatable to '/x_data/*_copy_number.tsv'
     outfile_path = args.WORKDIR.joinpath(
@@ -377,11 +393,11 @@ def process_datasets(args):
         "cancer_discretized_copy_number.tsv"
     )
     (discretized_copy_number
-        .transpose()
         .to_csv(
             path_or_buf=outfile_path,
             sep='\t',
-            header=False
+            header=False,
+            index=False
         )
     )
 
@@ -398,6 +414,13 @@ def process_datasets(args):
 
     concat_drugs = pd.concat(dfs_to_merge.values())
     out_df = concat_drugs[['improve_drug_id', 'canSMILES']].drop_duplicates()
+
+    if args.EXCL_DRUGS_LIST is not None:
+        logger.info(
+            f"Removing all chemical compounds with ids: '{args.EXCL_DRUGS_LIST}'"
+        )
+        out_df = out_df[~out_df['improve_drug_id'].isin(args.EXCL_DRUGS_LIST)]
+
     out_df.rename(
         columns={'improve_drug_id': 'improve_chem_id'},
         inplace=True,
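
Note: a minimal sketch of the exclusion filter with made-up values:

    import pandas as pd

    out_df = pd.DataFrame({
        'improve_drug_id': ['Drug_1', 'Drug_2', 'Drug_3'],
        'canSMILES': ['C', 'CC', 'CCC'],
    })
    excl = ['Drug_2']  # stand-in for args.EXCL_DRUGS_LIST
    out_df = out_df[~out_df['improve_drug_id'].isin(excl)]  # keeps Drug_1, Drug_3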
@@ -437,7 +460,7 @@ def process_datasets(args):
     # retrieving unique mutations (the above creates duplicates) &
     # adding a prefix to the improve_sample_id
     unique_mutations = merged_mutations[['entrez_id', 'improve_sample_id', 'mutation']].drop_duplicates()
-    unique_mutations['improve_sample_id'] = 'SAMPLE_ID_' + unique_mutations['improve_sample_id'].astype(str)
+    unique_mutations['improve_sample_id'] = 'SAMPLE-ID-' + unique_mutations['improve_sample_id'].astype(str)
 
     # counting the mutations per entrez_id/improve_sample_id pair and
     # aggregating it into a pivot table (also filling NAs with 0s)
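
Note: the counting/pivoting step the comment above refers to is not part of
this diff; a sketch of what such an aggregation could look like (column
values hypothetical):

    import pandas as pd

    unique_mutations = pd.DataFrame({
        'entrez_id': [1, 1, 2],
        'improve_sample_id': ['SAMPLE-ID-1', 'SAMPLE-ID-1', 'SAMPLE-ID-2'],
        'mutation': ['A>T', 'C>G', 'T>A'],
    })

    mutation_counts = pd.pivot_table(
        unique_mutations,
        index='entrez_id',
        columns='improve_sample_id',
        values='mutation',
        aggfunc='count',
        fill_value=0,  # fill NAs with 0s, as the comment describes
    )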
@@ -474,17 +497,21 @@ def process_datasets(args):
     # removing some rows where we don't have a 'gene_symbol' for the
     # entrez id
     mutation_counts = mutation_counts[mutation_counts['gene_symbol'].notna()]
+    mutation_counts = mutation_counts.T.reset_index()
+    for i in range(0, 3):
+        mutation_counts.iloc[i, 0] = np.nan
 
     # writing the dataframe to the mutation counts mastertable
     outfile_path = args.WORKDIR.joinpath(
         "data_out",
         "x_data",
         "cancer_mutation_count.tsv"
     )
-    mutation_counts.T.to_csv(
+    mutation_counts.to_csv(
         path_or_buf=outfile_path,
         sep='\t',
-        header=False
+        header=False,
+        index=False
     )
 
 def split_data_sets(
@@ -518,7 +545,7 @@ def split_data_sets(
         columns={'improve_drug_id': 'improve_chem_id'},
         inplace=True,
     )
-    drug_response_rows['improve_sample_id'] = "SAMPLE_ID_" + drug_response_rows['improve_sample_id'].astype(int).astype(str)
+    drug_response_rows['improve_sample_id'] = "SAMPLE-ID-" + drug_response_rows['improve_sample_id'].astype(int).astype(str)
     row_nums = pd.merge(
         response_data,
         drug_response_rows,
563
590
columns = {'improve_drug_id' : 'improve_chem_id' },
564
591
inplace = True ,
565
592
)
566
- train_keys ['improve_sample_id' ] = "SAMPLE_ID_ " + train_keys ['improve_sample_id' ].astype (int ).astype (str )
593
+ train_keys ['improve_sample_id' ] = "SAMPLE-ID- " + train_keys ['improve_sample_id' ].astype (int ).astype (str )
567
594
row_nums = pd .merge (
568
595
response_data ,
569
596
train_keys ,
@@ -601,7 +628,7 @@ def split_data_sets(
         columns={'improve_drug_id': 'improve_chem_id'},
         inplace=True,
     )
-    test_keys['improve_sample_id'] = "SAMPLE_ID_" + test_keys['improve_sample_id'].astype(int).astype(str)
+    test_keys['improve_sample_id'] = "SAMPLE-ID-" + test_keys['improve_sample_id'].astype(int).astype(str)
     row_nums = pd.merge(
         response_data,
         test_keys,
@@ -632,7 +659,7 @@ def split_data_sets(
         columns={'improve_drug_id': 'improve_chem_id'},
         inplace=True,
     )
-    val_keys['improve_sample_id'] = "SAMPLE_ID_" + val_keys['improve_sample_id'].astype(int).astype(str)
+    val_keys['improve_sample_id'] = "SAMPLE-ID-" + val_keys['improve_sample_id'].astype(int).astype(str)
     row_nums = pd.merge(
         response_data,
         val_keys,
@@ -679,7 +706,7 @@ def merge_master_tables(args, data_sets, data_type: str='transcriptomics'):
         data_sets[data_set]
         .format(data_type=data_type)
         .transpose()
-        .add_prefix('SAMPLE_ID_', axis=1)
+        .add_prefix('SAMPLE-ID-', axis=1)
     )
 
     merged_data = None
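
Note: DataFrame.add_prefix only accepts an axis argument from pandas 2.0
onward; with axis=1 it rewrites the column labels, e.g.:

    import pandas as pd

    df = pd.DataFrame({'101': [1.0], '102': [2.0]})
    df = df.add_prefix('SAMPLE-ID-', axis=1)  # columns: SAMPLE-ID-101, SAMPLE-ID-102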
@@ -805,6 +832,15 @@ def _random_seed_list(list: str) -> list:
     list_ = list.split(',')
     return [int(item) for item in list_]
 
+def _improve_drug_id_list(list: str) -> list:
+    if not isinstance(list, str):
+        raise TypeError(
+            f"'exclude_improve_drug_id' must be of type str. Supplied argument "
+            f"is of type {type(list)}."
+        )
+    list_ = list.split(',')
+    return list_
+
 
 if __name__ == '__main__':
     try: main()