@@ -183,6 +183,7 @@ def process_datasets(args):
         columns={'improve_drug_id': 'improve_chem_id'},
         inplace=True,
     )
+    response_data['improve_sample_id'] = "SAMPLE_ID_" + response_data['improve_sample_id'].astype(int).astype(str)
     # exporting the drug response data to 'y_data/response.tsv'
     outfile_path = args.WORKDIR.joinpath("data_out", "y_data", "response.tsv")
     response_data.to_csv(
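The line added in this hunk builds string labels out of the numeric sample IDs. A minimal sketch of the transformation on toy data (the real IDs come from the response table):

```python
import pandas as pd

# toy stand-in for response_data; real values come from the dataset
response_data = pd.DataFrame({'improve_sample_id': [1.0, 2.0, 3.0]})

# cast to int first so float-formatted IDs don't render as "1.0",
# then to str so they can be concatenated with the prefix
response_data['improve_sample_id'] = (
    "SAMPLE_ID_" + response_data['improve_sample_id'].astype(int).astype(str)
)

print(response_data['improve_sample_id'].tolist())
# ['SAMPLE_ID_1', 'SAMPLE_ID_2', 'SAMPLE_ID_3']
```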
@@ -200,7 +201,6 @@ def process_datasets(args):
     #-------------------------------------------------------------------


-    # TODO: potentially change vars to be read from `args`
     split_data_sets(
         args=args,
         data_sets=data_sets,
@@ -433,8 +433,10 @@ def process_datasets(args):
             how='outer'),
         dfs_to_merge.values())

-    # retrieving unique mutations (the above creates multiplicates)
+    # retrieving unique mutations (the above creates duplicates) &
+    # adding a prefix to the improve_sample_id
     unique_mutations = merged_mutations[['entrez_id', 'improve_sample_id', 'mutation']].drop_duplicates()
+    unique_mutations['improve_sample_id'] = 'SAMPLE_ID_' + unique_mutations['improve_sample_id'].astype(str)

     # counting the mutations per entrez_id/improve_sample_id pair and
     # aggregating it into a pivot table (also filling NAs with 0s)
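The two context comments at the end of this hunk describe the pivot step that follows in the file. That code is not shown in the diff, so the sketch below is an assumption about its shape, using only the column names visible above:

```python
import pandas as pd

# toy stand-in for unique_mutations after the prefix step
unique_mutations = pd.DataFrame({
    'entrez_id':         [7157, 7157, 4893],
    'improve_sample_id': ['SAMPLE_ID_1', 'SAMPLE_ID_2', 'SAMPLE_ID_1'],
    'mutation':          ['p.R175H', 'p.R273C', 'p.Q61K'],
})

# count mutations per entrez_id/improve_sample_id pair and
# fill pairs without any mutation with 0 (assumed aggregation)
mutation_counts = pd.pivot_table(
    unique_mutations,
    index='entrez_id',
    columns='improve_sample_id',
    values='mutation',
    aggfunc='count',
    fill_value=0,
)
```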
@@ -505,7 +507,7 @@ def split_data_sets(
         logger.info(f'creating splits for {data_set}...')
         # getting "<DATASET>_all.txt"
         drug_response_rows = (
-            data_sets['mpnst']
+            data_sets[data_set]
             .experiments[
                 ['improve_sample_id', 'improve_drug_id', "time", "study"]
             ]
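This small change is a real bug fix: the lookup was hardcoded to the 'mpnst' dataset, so every pass through the surrounding loop built its splits from the same experiments table. Indexing with the loop variable `data_set` makes each iteration use the dataset it is actually processing.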
@@ -515,6 +517,7 @@ def split_data_sets(
             columns={'improve_drug_id': 'improve_chem_id'},
             inplace=True,
         )
+        drug_response_rows['improve_sample_id'] = "SAMPLE_ID_" + drug_response_rows['improve_sample_id'].astype(int).astype(str)
         row_nums = pd.merge(
             response_data,
             drug_response_rows,
@@ -559,6 +562,7 @@ def split_data_sets(
             columns={'improve_drug_id': 'improve_chem_id'},
             inplace=True,
         )
+        train_keys['improve_sample_id'] = "SAMPLE_ID_" + train_keys['improve_sample_id'].astype(int).astype(str)
         row_nums = pd.merge(
             response_data,
             train_keys,
@@ -596,6 +600,7 @@ def split_data_sets(
             columns={'improve_drug_id': 'improve_chem_id'},
             inplace=True,
         )
+        test_keys['improve_sample_id'] = "SAMPLE_ID_" + test_keys['improve_sample_id'].astype(int).astype(str)
         row_nums = pd.merge(
             response_data,
             test_keys,
@@ -626,6 +631,7 @@ def split_data_sets(
             columns={'improve_drug_id': 'improve_chem_id'},
             inplace=True,
         )
+        val_keys['improve_sample_id'] = "SAMPLE_ID_" + val_keys['improve_sample_id'].astype(int).astype(str)
         row_nums = pd.merge(
             response_data,
             val_keys,
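The same `SAMPLE_ID_` normalization now appears four times (for `drug_response_rows`, `train_keys`, `test_keys`, and `val_keys`). If the pattern keeps spreading, a small helper would keep the prefix logic in one place; `add_sample_prefix` below is a hypothetical name, not something in this repo:

```python
import pandas as pd

def add_sample_prefix(df: pd.DataFrame, prefix: str = "SAMPLE_ID_") -> pd.DataFrame:
    """Prefix the numeric improve_sample_id column in place and return the frame."""
    df['improve_sample_id'] = prefix + df['improve_sample_id'].astype(int).astype(str)
    return df

# usage, e.g.: val_keys = add_sample_prefix(val_keys)
```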
@@ -669,7 +675,10 @@ def merge_master_tables(args, data_sets, data_type: str='transcriptomics'):
             getattr(data_sets[data_set], data_type, None) is not None
         ):
             dfs_to_merge.append(
-                data_sets[data_set].format(data_type=data_type).transpose()
+                data_sets[data_set]
+                .format(data_type=data_type)
+                .transpose()
+                .add_prefix('SAMPLE_ID_', axis=1)
             )

     merged_data = None
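`add_prefix('SAMPLE_ID_', axis=1)` prefixes the column labels of the transposed frame, i.e. the sample IDs. Worth noting: the `axis` argument of `DataFrame.add_prefix` only exists in pandas >= 2.0 (on a DataFrame the default already targets the columns, so `axis=1` is explicit rather than behavior-changing). A minimal illustration:

```python
import pandas as pd

# genes as rows, numeric sample IDs as columns, as after .transpose()
df = pd.DataFrame([[0.1, 0.2]], index=['BRCA1'], columns=[101, 102])

# requires pandas >= 2.0 for the axis keyword
prefixed = df.add_prefix('SAMPLE_ID_', axis=1)

print(list(prefixed.columns))  # ['SAMPLE_ID_101', 'SAMPLE_ID_102']
```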
@@ -697,7 +706,7 @@ def merge_master_tables(args, data_sets, data_type: str='transcriptomics'):
     )

     # Casting col and row indices back to int
-    merged_data.columns.astype(int)
+    # merged_data.columns.astype(int)
     if not merged_data.index.dtype == int:
         merged_data.index = merged_data.index.astype(int)
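Two notes on the line commented out here: `merged_data.columns.astype(int)` was a no-op even before this change, because `Index.astype` returns a new index rather than mutating in place; and after the `SAMPLE_ID_` prefixing the columns are strings, so the cast would now raise. Deleting the line outright (and updating the "Casting col and row indices" comment above it) would be slightly cleaner than leaving it commented out.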