Skip to content

Commit e7266a0

Browse files
committed
added drug_mordred.tsv output
1 parent 7a46f96 commit e7266a0

File tree

1 file changed

+49
-6
lines changed

1 file changed

+49
-6
lines changed

scripts/prepare_data_for_improve.py

Lines changed: 49 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -209,12 +209,12 @@ def process_datasets(args):
209209
#-------------------------------------------------------------------
210210

211211

212-
split_data_sets(
213-
args=args,
214-
data_sets=data_sets,
215-
data_sets_info=data_sets_info,
216-
response_data=response_data
217-
)
212+
# split_data_sets(
213+
# args=args,
214+
# data_sets=data_sets,
215+
# data_sets_info=data_sets_info,
216+
# response_data=response_data
217+
# )
218218

219219
#-------------------------------------------------------------------
220220
# getting common / reference gene symbols
@@ -439,6 +439,48 @@ def process_datasets(args):
439439
)
440440

441441

442+
#-------------------------------------------------------------------
443+
# create mordred table
444+
#-------------------------------------------------------------------
445+
446+
dfs_to_merge = {}
447+
for data_set in data_sets:
448+
if (data_sets[data_set].experiments is not None
449+
and data_sets[data_set].drug_descriptors is not None
450+
):
451+
df_tmp = data_sets[data_set].format(data_type='drug_descriptor', shape='wide')
452+
df_tmp = df_tmp.drop(columns=['morgan fingerprint']).add_prefix('mordred.')
453+
dfs_to_merge[data_set] = df_tmp
454+
455+
concat_drugs = pd.concat(dfs_to_merge.values())
456+
concat_drugs = concat_drugs.replace({'False': '0', 'True': '1'})
457+
cols = concat_drugs.columns
458+
concat_drugs[cols] = concat_drugs[cols].apply(pd.to_numeric, errors='coerce')
459+
out_df = concat_drugs.reset_index()
460+
out_df = out_df.fillna(0).round(4).drop_duplicates(subset=['improve_drug_id'], keep='first')
461+
462+
if args.EXCL_DRUGS_LIST is not None:
463+
logger.info(
464+
f"Removing all chemical compunds with ids: '{args.EXCL_DRUGS_LIST}'"
465+
)
466+
out_df = out_df[~out_df['improve_drug_id'].isin(args.EXCL_DRUGS_LIST)]
467+
468+
out_df.rename(
469+
columns={'improve_drug_id': 'improve_chem_id'},
470+
inplace=True,
471+
)
472+
473+
outfile_path = args.WORKDIR.joinpath(
474+
"data_out",
475+
"x_data",
476+
"drug_mordred.tsv"
477+
)
478+
out_df.to_csv(
479+
path_or_buf=outfile_path,
480+
sep='\t',
481+
index=False,
482+
)
483+
442484
#-------------------------------------------------------------------
443485
# create mutation count table
444486
#-------------------------------------------------------------------
@@ -514,6 +556,7 @@ def process_datasets(args):
514556
index=False
515557
)
516558

559+
517560
def split_data_sets(
518561
args: dict,
519562
data_sets: dict,

0 commit comments

Comments
 (0)