@@ -64,6 +64,7 @@ def smiles_to_mordred(smiles,nproc=2):
64
64
## reshape to long format here: one row per (smile, descriptor) pair
65
65
longtab = pd .melt (dd ,id_vars = 'smile' ,value_vars = values )
66
66
longtab = longtab .rename ({'variable' :'structural_descriptor' ,'value' :'descriptor_value' },axis = 1 )
67
+
67
68
return longtab
68
69
69
70
def main ():
@@ -82,16 +83,24 @@ def main():
82
83
cansmiles = [a for a in set (tab .canSMILES ) if str (a )!= 'nan' ]
83
84
# isosmiles = list(set(tab.isoSMILES))
84
85
morgs = smiles_to_fingerprint (cansmiles )
85
- # print(morgs)
86
+
86
87
ids = pd .DataFrame (tab [['improve_drug_id' ,'canSMILES' ]]).drop_duplicates ()
87
- # print(ids)
88
+
88
89
id_morg = ids .rename ({"canSMILES" :'smile' },axis = 1 ).merge (morgs )[['improve_drug_id' ,'structural_descriptor' ,'descriptor_value' ]]
89
90
90
91
mords = smiles_to_mordred (cansmiles ,nproc = ncors )
91
92
92
93
id_mord = ids .rename ({'canSMILES' :'smile' },axis = 1 ).merge (mords )[['improve_drug_id' ,'structural_descriptor' ,'descriptor_value' ]]
93
94
94
- full = pd .concat ([id_morg ,id_mord ],axis = 0 )
95
+ full = pd .concat ([id_morg ,id_mord ],axis = 0 )
96
+
97
+ # Replace any descriptor value that contains one of the following substrings (matched case-insensitively) with the string "NaN". The list is believed to cover all known cases; add to it if more are found.
98
+ strings_to_replace = ["min" , "max" , "invalid" , "multiple" , "missing" ]
99
+ pattern = '|' .join (strings_to_replace )
100
+ full ['descriptor_value' ] = full ['descriptor_value' ].astype (str )
101
+ full .loc [full ['descriptor_value' ].str .contains (pattern , case = False , na = False ), 'descriptor_value' ] = "NaN"
102
+
103
+
95
104
full .to_csv (args .outtable ,sep = '\t ' ,index = False ,compression = 'gzip' )
96
105
97
106
if __name__ == '__main__' :
0 commit comments