Skip to content

Commit 871a0e0

Browse files
committed
Added output for Morgan fingerprint
1 parent e7266a0 commit 871a0e0

File tree

1 file changed

+41
-6
lines changed

1 file changed

+41
-6
lines changed

scripts/prepare_data_for_improve.py

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -209,12 +209,12 @@ def process_datasets(args):
209209
#-------------------------------------------------------------------
210210

211211

212-
# split_data_sets(
213-
# args=args,
214-
# data_sets=data_sets,
215-
# data_sets_info=data_sets_info,
216-
# response_data=response_data
217-
# )
212+
split_data_sets(
213+
args=args,
214+
data_sets=data_sets,
215+
data_sets_info=data_sets_info,
216+
response_data=response_data
217+
)
218218

219219
#-------------------------------------------------------------------
220220
# getting common / reference gene symbols
@@ -481,6 +481,41 @@ def process_datasets(args):
481481
index=False,
482482
)
483483

484+
485+
#-------------------------------------------------------------------
486+
# create Morgan fingerprint (ECFP4) table, one column per fingerprint bit
487+
#-------------------------------------------------------------------
488+
489+
dfs_to_merge = {}
490+
for data_set in data_sets:
491+
if (data_sets[data_set].experiments is not None
492+
and data_sets[data_set].drug_descriptors is not None
493+
):
494+
df_tmp = data_sets[data_set].format(data_type='drug_descriptor', shape='wide')
495+
df_tmp = df_tmp['morgan fingerprint']
496+
dfs_to_merge[data_set] = df_tmp
497+
498+
concat_drugs = pd.concat(dfs_to_merge.values())
499+
out_df = concat_drugs.reset_index()
500+
out_df = out_df.drop_duplicates(subset=['improve_drug_id'], keep='first')
501+
out_df = pd.concat((out_df, out_df['morgan fingerprint'].astype(str).apply(lambda x: pd.Series(list(x))).astype(int).add_prefix('ecfp4.')), axis=1)
502+
out_df = out_df.drop(['morgan fingerprint'], axis=1)
503+
out_df.rename(
504+
columns={'improve_drug_id': 'improve_chem_id'},
505+
inplace=True,
506+
)
507+
508+
outfile_path = args.WORKDIR.joinpath(
509+
"data_out",
510+
"x_data",
511+
"drug_ecfp4_nbits1024.tsv"
512+
)
513+
out_df.to_csv(
514+
path_or_buf=outfile_path,
515+
sep='\t',
516+
index=False,
517+
)
518+
484519
#-------------------------------------------------------------------
485520
# create mutation count table
486521
#-------------------------------------------------------------------

0 commit comments

Comments
 (0)