Skip to content

Commit 8000968

Browse files
authored
Merge pull request #279 from PNNL-CompBio/drug-desc-patch
Fixes drug descriptor issue
2 parents e0c5aec + ea2fe3b commit 8000968

File tree

1 file changed

+12
-3
lines changed

1 file changed

+12
-3
lines changed

build/utils/build_drug_desc.py

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -64,6 +64,7 @@ def smiles_to_mordred(smiles,nproc=2):
6464
##reformat here
6565
longtab = pd.melt(dd,id_vars='smile',value_vars=values)
6666
longtab = longtab.rename({'variable':'structural_descriptor','value':'descriptor_value'},axis=1)
67+
6768
return longtab
6869

6970
def main():
@@ -82,16 +83,24 @@ def main():
8283
cansmiles = [a for a in set(tab.canSMILES) if str(a)!='nan']
8384
# isosmiles = list(set(tab.isoSMILES))
8485
morgs = smiles_to_fingerprint(cansmiles)
85-
# print(morgs)
86+
8687
ids = pd.DataFrame(tab[['improve_drug_id','canSMILES']]).drop_duplicates()
87-
# print(ids)
88+
8889
id_morg = ids.rename({"canSMILES":'smile'},axis=1).merge(morgs)[['improve_drug_id','structural_descriptor','descriptor_value']]
8990

9091
mords = smiles_to_mordred(cansmiles,nproc=ncors)
9192

9293
id_mord = ids.rename({'canSMILES':'smile'},axis=1).merge(mords)[['improve_drug_id','structural_descriptor','descriptor_value']]
9394

94-
full = pd.concat([id_morg,id_mord],axis=0)
95+
full = pd.concat([id_morg,id_mord],axis=0)
96+
97+
# Replace any descriptor value whose text contains one of the following substrings (matched case-insensitively) with "NaN". This list is believed to cover all known cases; add to it if more are found.
98+
strings_to_replace = ["min", "max", "invalid", "multiple", "missing"]
99+
pattern = '|'.join(strings_to_replace)
100+
full['descriptor_value'] = full['descriptor_value'].astype(str)
101+
full.loc[full['descriptor_value'].str.contains(pattern, case=False, na=False), 'descriptor_value'] = "NaN"
102+
103+
95104
full.to_csv(args.outtable,sep='\t',index=False,compression='gzip')
96105

97106
if __name__=='__main__':

0 commit comments

Comments
 (0)