Skip to content

Commit 309b9e1

Browse files
committed
small fixes to the improve wrapper script
1 parent dcabaea commit 309b9e1

File tree

1 file changed

+58
-22
lines changed

1 file changed

+58
-22
lines changed

scripts/prepare_data_for_improve.py

Lines changed: 58 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from copy import deepcopy
44
import functools as ft
55
import logging
6+
import numpy as np
67
from os import PathLike
78
from pathlib import Path
89
from pathlib import PurePath
@@ -83,6 +84,13 @@ def main():
8384
"integers. Must be same length as <NUM_SPLITS>. If omitted will "
8485
"default to randomized seeds."
8586
)
87+
p_process_datasets.add_argument(
88+
'-e', '--exclude_improve_drug_id', dest='EXCL_DRUGS_LIST',
89+
type=_improve_drug_id_list,
90+
default=None,
91+
help='define a list of improve_drug_id/improve_chem_id[s] that '
92+
'should be excluded from the reference datasets.'
93+
)
8694

8795
p_all = command_parsers.add_parser(
8896
"all",
@@ -183,7 +191,7 @@ def process_datasets(args):
183191
columns={'improve_drug_id': 'improve_chem_id'},
184192
inplace=True,
185193
)
186-
response_data['improve_sample_id'] = "SAMPLE_ID_" + response_data['improve_sample_id'].astype(int).astype(str)
194+
response_data['improve_sample_id'] = "SAMPLE-ID-" + response_data['improve_sample_id'].astype(int).astype(str)
187195
# exporting the drug response data to 'y_data/response.tsv'
188196
outfile_path = args.WORKDIR.joinpath("data_out", "y_data", "response.tsv")
189197
response_data.to_csv(
@@ -201,12 +209,12 @@ def process_datasets(args):
201209
#-------------------------------------------------------------------
202210

203211

204-
split_data_sets(
205-
args=args,
206-
data_sets=data_sets,
207-
data_sets_info=data_sets_info,
208-
response_data=response_data
209-
)
212+
# split_data_sets(
213+
# args=args,
214+
# data_sets=data_sets,
215+
# data_sets_info=data_sets_info,
216+
# response_data=response_data
217+
# )
210218

211219
#-------------------------------------------------------------------
212220
# getting common / reference gene symbols
@@ -276,6 +284,9 @@ def process_datasets(args):
276284
)
277285

278286
merged_transcriptomics = merged_transcriptomics[merged_transcriptomics['entrez_id'] != 0]
287+
merged_transcriptomics = merged_transcriptomics.fillna(0).T.reset_index()
288+
for i in range(0,3):
289+
merged_transcriptomics.iloc[i,0] = np.nan
279290

280291
# writing the expression datatable to '/x_data/*_expression.tsv'
281292
outfile_path = args.WORKDIR.joinpath(
@@ -287,12 +298,11 @@ def process_datasets(args):
287298
# This back fills NAs with 0s - the assumed "neutral" value for
288299
# gene expression data
289300
(merged_transcriptomics
290-
.fillna(0)
291-
.transpose()
292301
.to_csv(
293302
path_or_buf=outfile_path,
294303
sep='\t',
295-
header=False
304+
header=False,
305+
index=False
296306
)
297307
)
298308

@@ -332,6 +342,9 @@ def process_datasets(args):
332342
'gene_symbol',
333343
merged_copy_number.pop('gene_symbol')
334344
)
345+
merged_copy_number = merged_copy_number.T.reset_index()
346+
for i in range(0,3):
347+
merged_copy_number.iloc[i,0] = np.nan
335348

336349
# writing the expression datatable to '/x_data/*_copy_number.tsv'
337350
outfile_path = args.WORKDIR.joinpath(
@@ -340,11 +353,11 @@ def process_datasets(args):
340353
"cancer_copy_number.tsv"
341354
)
342355
(merged_copy_number
343-
.transpose()
344356
.to_csv(
345357
path_or_buf=outfile_path,
346358
sep='\t',
347-
header=False
359+
header=False,
360+
index=False
348361
)
349362
)
350363

@@ -369,6 +382,9 @@ def process_datasets(args):
369382
'gene_symbol',
370383
discretized_copy_number.pop('gene_symbol')
371384
)
385+
discretized_copy_number = discretized_copy_number.T.reset_index()
386+
for i in range(0,3):
387+
discretized_copy_number.iloc[i,0] = np.nan
372388

373389
# writing the expression datatable to '/x_data/*_copy_number.tsv'
374390
outfile_path = args.WORKDIR.joinpath(
@@ -377,11 +393,11 @@ def process_datasets(args):
377393
"cancer_discretized_copy_number.tsv"
378394
)
379395
(discretized_copy_number
380-
.transpose()
381396
.to_csv(
382397
path_or_buf=outfile_path,
383398
sep='\t',
384-
header=False
399+
header=False,
400+
index=False
385401
)
386402
)
387403

@@ -398,6 +414,13 @@ def process_datasets(args):
398414

399415
concat_drugs = pd.concat(dfs_to_merge.values())
400416
out_df = concat_drugs[['improve_drug_id','canSMILES']].drop_duplicates()
417+
418+
if args.EXCL_DRUGS_LIST is not None:
419+
logger.info(
420+
f"Removing all chemical compunds with ids: '{args.EXCL_DRUGS_LIST}'"
421+
)
422+
out_df = out_df[~out_df['improve_drug_id'].isin(args.EXCL_DRUGS_LIST)]
423+
401424
out_df.rename(
402425
columns={'improve_drug_id': 'improve_chem_id'},
403426
inplace=True,
@@ -437,7 +460,7 @@ def process_datasets(args):
437460
# retrieving unique mutations (the above creates multiplicates) &
438461
# adding a prefix to the improve_sample_id
439462
unique_mutations = merged_mutations[['entrez_id', 'improve_sample_id', 'mutation']].drop_duplicates()
440-
unique_mutations['improve_sample_id'] = 'SAMPLE_ID_' + unique_mutations['improve_sample_id'].astype(str)
463+
unique_mutations['improve_sample_id'] = 'SAMPLE-ID-' + unique_mutations['improve_sample_id'].astype(str)
441464

442465
# counting the mutations per entrez_id/improve_sample_id pair and
443466
# aggregating it into a pivot table (also filling NAs with 0s)
@@ -474,17 +497,21 @@ def process_datasets(args):
474497
# removing some rows where we don't have a 'gene_symbol' for the
475498
# entrez id
476499
mutation_counts = mutation_counts[mutation_counts['gene_symbol'].notna()]
500+
mutation_counts = mutation_counts.T.reset_index()
501+
for i in range(0,3):
502+
mutation_counts.iloc[i,0] = np.nan
477503

478504
# writing the dataframe to the mutation counts mastertable
479505
outfile_path = args.WORKDIR.joinpath(
480506
"data_out",
481507
"x_data",
482508
"cancer_mutation_count.tsv"
483509
)
484-
mutation_counts.T.to_csv(
510+
mutation_counts.to_csv(
485511
path_or_buf=outfile_path,
486512
sep='\t',
487-
header=False
513+
header=False,
514+
index=False
488515
)
489516

490517
def split_data_sets(
@@ -518,7 +545,7 @@ def split_data_sets(
518545
columns={'improve_drug_id': 'improve_chem_id'},
519546
inplace=True,
520547
)
521-
drug_response_rows['improve_sample_id'] = "SAMPLE_ID_" + drug_response_rows['improve_sample_id'].astype(int).astype(str)
548+
drug_response_rows['improve_sample_id'] = "SAMPLE-ID-" + drug_response_rows['improve_sample_id'].astype(int).astype(str)
522549
row_nums = pd.merge(
523550
response_data,
524551
drug_response_rows,
@@ -563,7 +590,7 @@ def split_data_sets(
563590
columns={'improve_drug_id': 'improve_chem_id'},
564591
inplace=True,
565592
)
566-
train_keys['improve_sample_id'] = "SAMPLE_ID_" + train_keys['improve_sample_id'].astype(int).astype(str)
593+
train_keys['improve_sample_id'] = "SAMPLE-ID-" + train_keys['improve_sample_id'].astype(int).astype(str)
567594
row_nums = pd.merge(
568595
response_data,
569596
train_keys,
@@ -601,7 +628,7 @@ def split_data_sets(
601628
columns={'improve_drug_id': 'improve_chem_id'},
602629
inplace=True,
603630
)
604-
test_keys['improve_sample_id'] = "SAMPLE_ID_" + test_keys['improve_sample_id'].astype(int).astype(str)
631+
test_keys['improve_sample_id'] = "SAMPLE-ID-" + test_keys['improve_sample_id'].astype(int).astype(str)
605632
row_nums = pd.merge(
606633
response_data,
607634
test_keys,
@@ -632,7 +659,7 @@ def split_data_sets(
632659
columns={'improve_drug_id': 'improve_chem_id'},
633660
inplace=True,
634661
)
635-
val_keys['improve_sample_id'] = "SAMPLE_ID_" + val_keys['improve_sample_id'].astype(int).astype(str)
662+
val_keys['improve_sample_id'] = "SAMPLE-ID-" + val_keys['improve_sample_id'].astype(int).astype(str)
636663
row_nums = pd.merge(
637664
response_data,
638665
val_keys,
@@ -679,7 +706,7 @@ def merge_master_tables(args, data_sets, data_type: str='transcriptomics'):
679706
data_sets[data_set]
680707
.format(data_type=data_type)
681708
.transpose()
682-
.add_prefix('SAMPLE_ID_', axis=1)
709+
.add_prefix('SAMPLE-ID-', axis=1)
683710
)
684711

685712
merged_data = None
@@ -805,6 +832,15 @@ def _random_seed_list(list: str) -> list:
805832
list_ = list.split(',')
806833
return [int(item) for item in list_]
807834

835+
def _improve_drug_id_list(list: str) -> list:
836+
if not isinstance(list, str):
837+
raise TypeError(
838+
f"'exclude_improve_drug_id' must be of type str. Supplied argument "
839+
f"is of type {type(list)}."
840+
)
841+
list_ = list.split(',')
842+
return list_
843+
808844

809845
if __name__ == '__main__':
810846
try: main()

0 commit comments

Comments
 (0)