Skip to content

Commit d6b5c9b

Browse files
authored
Merge pull request #373 from PNNL-CompBio/drug_desc_fix
Fixes rare drug descriptor issue
2 parents c914a9c + 4e63418 commit d6b5c9b

File tree

1 file changed

+8
-0
lines changed

1 file changed

+8
-0
lines changed

build/utils/build_drug_desc.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -99,6 +99,14 @@ def main():
9999
pattern = '|'.join(strings_to_replace)
100100
full['descriptor_value'] = full['descriptor_value'].astype(str)
101101
full.loc[full['descriptor_value'].str.contains(pattern, case=False, na=False), 'descriptor_value'] = "NaN"
102+
103+
# Remove rows whose improve_drug_id was incorrectly written by mordred or rdkit (a very rare bug, but it happens).
104+
full['improve_drug_id'] = full['improve_drug_id'].astype(str).str.strip()
105+
mask = full['improve_drug_id'].str.match(r'^SMI_\d+$')
106+
n_dropped = (~mask).sum()
107+
if n_dropped:
108+
print(f"Dropping {n_dropped} malformed improve_drug_id rows.")
109+
full = full[mask].copy()
102110

103111

104112
full.to_csv(args.outtable,sep='\t',index=False,compression='gzip')

0 commit comments

Comments
 (0)