Commit b9bc5d3

cast improve_sample_id to string and add 'SAMPLE_ID_' prefix

1 parent e5d323e commit b9bc5d3

1 file changed (+14, -5 lines)

scripts/prepare_data_for_improve.py: 14 additions & 5 deletions
@@ -183,6 +183,7 @@ def process_datasets(args):
         columns={'improve_drug_id': 'improve_chem_id'},
         inplace=True,
     )
+    response_data['improve_sample_id'] = "SAMPLE_ID_" + response_data['improve_sample_id'].astype(int).astype(str)
     # exporting the drug response data to 'y_data/response.tsv'
     outfile_path = args.WORKDIR.joinpath("data_out", "y_data", "response.tsv")
     response_data.to_csv(
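This hunk casts the numeric improve_sample_id to int, then to string, before prepending the prefix. A minimal sketch of the transformation (the input values below are hypothetical):

import pandas as pd

# hypothetical response_data; the real ids come from the experiments table
response_data = pd.DataFrame({'improve_sample_id': [1.0, 2.0, 3.0]})

# .astype(int) normalizes floats such as 1.0 so the string reads "1", not "1.0";
# concatenating a str with a Series broadcasts the prefix element-wise
response_data['improve_sample_id'] = (
    "SAMPLE_ID_" + response_data['improve_sample_id'].astype(int).astype(str)
)
print(response_data['improve_sample_id'].tolist())
# ['SAMPLE_ID_1', 'SAMPLE_ID_2', 'SAMPLE_ID_3']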
@@ -200,7 +201,6 @@ def process_datasets(args):
     #-------------------------------------------------------------------


-    # TODO: potentially change vars to be read from `args`
     split_data_sets(
         args=args,
         data_sets=data_sets,
@@ -433,8 +433,10 @@ def process_datasets(args):
                      how='outer'),
         dfs_to_merge.values())

-    # retrieving unique mutations (the above creates duplicates)
+    # retrieving unique mutations (the above creates duplicates) &
+    # adding a prefix to the improve_sample_id
     unique_mutations = merged_mutations[['entrez_id', 'improve_sample_id', 'mutation']].drop_duplicates()
+    unique_mutations['improve_sample_id'] = 'SAMPLE_ID_' + unique_mutations['improve_sample_id'].astype(str)

     # counting the mutations per entrez_id/improve_sample_id pair and
     # aggregating it into a pivot table (also filling NAs with 0s)
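Unlike the other hunks in this commit, this line applies .astype(str) without the intermediate .astype(int). If the mutation table's improve_sample_id ever carries float dtype (a hypothetical scenario, e.g. after a merge introduces NaNs), the generated keys would not match the prefixed ids produced elsewhere:

import pandas as pd

s = pd.Series([1.0, 2.0])                                    # float dtype, for illustration
print(("SAMPLE_ID_" + s.astype(str)).tolist())               # ['SAMPLE_ID_1.0', 'SAMPLE_ID_2.0']
print(("SAMPLE_ID_" + s.astype(int).astype(str)).tolist())   # ['SAMPLE_ID_1', 'SAMPLE_ID_2']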
@@ -505,7 +507,7 @@ def split_data_sets(
         logger.info(f'creating splits for {data_set} ...')
         # getting "<DATASET>_all.txt"
         drug_response_rows = (
-            data_sets['mpnst']
+            data_sets[data_set]
             .experiments[
                 ['improve_sample_id', 'improve_drug_id', "time", "study"]
             ]
@@ -515,6 +517,7 @@ def split_data_sets(
             columns={'improve_drug_id': 'improve_chem_id'},
             inplace=True,
         )
+        drug_response_rows['improve_sample_id'] = "SAMPLE_ID_" + drug_response_rows['improve_sample_id'].astype(int).astype(str)
         row_nums = pd.merge(
             response_data,
             drug_response_rows,
@@ -559,6 +562,7 @@ def split_data_sets(
             columns={'improve_drug_id': 'improve_chem_id'},
             inplace=True,
         )
+        train_keys['improve_sample_id'] = "SAMPLE_ID_" + train_keys['improve_sample_id'].astype(int).astype(str)
         row_nums = pd.merge(
             response_data,
             train_keys,
@@ -596,6 +600,7 @@ def split_data_sets(
             columns={'improve_drug_id': 'improve_chem_id'},
             inplace=True,
         )
+        test_keys['improve_sample_id'] = "SAMPLE_ID_" + test_keys['improve_sample_id'].astype(int).astype(str)
         row_nums = pd.merge(
             response_data,
             test_keys,
@@ -626,6 +631,7 @@ def split_data_sets(
             columns={'improve_drug_id': 'improve_chem_id'},
             inplace=True,
         )
+        val_keys['improve_sample_id'] = "SAMPLE_ID_" + val_keys['improve_sample_id'].astype(int).astype(str)
         row_nums = pd.merge(
             response_data,
             val_keys,
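The same cast-and-prefix statement now appears five times (response_data, drug_response_rows, train_keys, test_keys, val_keys). A hypothetical helper, not part of this commit, could keep the call sites in sync:

def prefix_sample_ids(df, column='improve_sample_id', prefix='SAMPLE_ID_'):
    """Cast a numeric sample-id column to int, then to a prefixed string."""
    df[column] = prefix + df[column].astype(int).astype(str)
    return df

# usage (hypothetical): val_keys = prefix_sample_ids(val_keys)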
@@ -669,7 +675,10 @@ def merge_master_tables(args, data_sets, data_type: str='transcriptomics'):
             getattr(data_sets[data_set], data_type, None) is not None
         ):
             dfs_to_merge.append(
-                data_sets[data_set].format(data_type=data_type).transpose()
+                data_sets[data_set]
+                .format(data_type=data_type)
+                .transpose()
+                .add_prefix('SAMPLE_ID_', axis=1)
             )

     merged_data = None
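After .transpose() the samples sit on the columns, so .add_prefix('SAMPLE_ID_', axis=1) rewrites the column labels rather than the cell values; note that the axis argument of DataFrame.add_prefix requires pandas >= 2.0. A sketch with made-up data:

import pandas as pd

# hypothetical matrix: rows = improve_sample_id, columns = entrez_id
df = pd.DataFrame([[0.5, 1.2], [0.7, 0.9]], index=[101, 102], columns=[7157, 672])

out = df.transpose().add_prefix('SAMPLE_ID_', axis=1)
print(list(out.columns))  # ['SAMPLE_ID_101', 'SAMPLE_ID_102']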
@@ -697,7 +706,7 @@ def merge_master_tables(args, data_sets, data_type: str='transcriptomics'):
     )

     # Casting col and row indices back to int
-    merged_data.columns.astype(int)
+    # merged_data.columns.astype(int)
     if not merged_data.index.dtype == int:
         merged_data.index = merged_data.index.astype(int)
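Disabling this line is consistent with the columns now carrying the string prefix; the remaining int cast applies only to the row index. Worth noting: the removed statement was a no-op even before this commit, because Index.astype returns a new Index instead of modifying in place, so an actual cast would have needed an assignment:

merged_data.columns.astype(int)                         # result discarded (the old no-op)
merged_data.columns = merged_data.columns.astype(int)   # the form that would actually cast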
