2 changes: 2 additions & 0 deletions .gitignore
@@ -8,3 +8,5 @@ File Search Benchmark.png
# PyCache
__pycache__
*/__pycache__

results/
231 changes: 179 additions & 52 deletions monomer_generation/generate_monomers.py
@@ -2,37 +2,70 @@
Generates JSON files using the new format, up to date as of 3/14/23
"""

from openff.toolkit import Topology, Molecule
from substructure_generator import SubstructureGenerator
import sys
import enum
import json
import multiprocessing
import os
import numpy as np
from monomer_smiles_input import ALL_SMILES_INPUT
from pathlib import Path
from partition import partition
from rdkit import Chem
import signal
import sys
import time
from collections.abc import Generator
from pathlib import Path
from types import FrameType
import logging

import openmm
from openmm.app import PDBFile
from copy import deepcopy
from simtk import unit
from openff.toolkit.typing.engines.smirnoff import ForceField
from openff.toolkit.typing.engines.smirnoff import vdWHandler, AngleHandler
from openff.toolkit.utils.toolkits import GLOBAL_TOOLKIT_REGISTRY, ToolkitRegistry, OpenEyeToolkitWrapper
import tqdm
import tqdm.contrib
from tqdm.contrib.logging import logging_redirect_tqdm
from monomer_smiles_input import ALL_SMILES_INPUT
from openff.toolkit import Topology
from openff.toolkit.utils.toolkits import (
GLOBAL_TOOLKIT_REGISTRY,
OpenEyeToolkitWrapper,
)
from openff.units.openmm import to_openmm as to_openmm_quantity
from simtk import unit
from substructure_generator import SubstructureGenerator

from typing import (
Dict,
List,
Tuple,
)
sys.path.append(os.path.abspath(__file__ + "/../..")) # TODO: fix this mess
from pdb_file_search import PDBFiles


class Columns(enum.IntEnum):
FILENAME = 0
SUCCESS = 1
EXCEPTION_MESSAGE = 2
TIME = 3


# CONFIGURATION CONSTANTS
WRITE_RESULTING_TOPOLOGIES: bool = False
SKIP_EXISTING_JSONTOPS: bool = False
N_PROCESSES: int | None = 8 # None: Auto-detect
SORT_BY: tuple[Columns, ...] = (Columns.SUCCESS, Columns.TIME)
PDB_FILE_TIMEOUT_SEC: None | int = None # None: No timeout.
# END OF CONFIGURATION CONSTANTS

# Note that PDB_FILE_TIMEOUT_SEC uses signal.alarm, which can't interrupt in the
# middle of a Python bytecode instruction. This means that if the process is
# in a long-running C calculation (such as matching SMARTS against an RDMol, or
# splitting an RDMol into its constituent molecules), the timeout will only
# fire once that calculation returns. If the process is taking longer than you
# expect, you can use ctrl+c to interrupt - you'll still get results for
# whatever has already finished. See the illustrative sketch after
# timeout_handler below.

GLOBAL_TOOLKIT_REGISTRY.deregister_toolkit(OpenEyeToolkitWrapper)


def timeout_handler(signum: int, frame: FrameType | None):
raise TimeoutError(f"Timeout after {PDB_FILE_TIMEOUT_SEC} seconds")

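
# Editorial sketch, not part of this PR: the alarm-based timeout pattern that
# load_file() uses below, wrapped in a hypothetical helper for illustration.
# SIGALRM is Unix-only, which the script already assumes.
def _run_with_timeout(func, timeout_sec: int):
    """Run func() and raise TimeoutError if it takes longer than timeout_sec."""
    signal.signal(signal.SIGALRM, timeout_handler)  # install the handler above
    signal.alarm(timeout_sec)  # schedule SIGALRM in timeout_sec seconds
    try:
        return func()
    finally:
        signal.alarm(0)  # always cancel any pending alarm
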

def _identify_all_molecules(
self,
) -> Dict[int, Tuple[int, Dict[int, int]]]:
identity_maps: Dict[int, Tuple[int, Dict[int, int]]] = dict()
) -> dict[int, tuple[int, dict[int, int]]]:
identity_maps: dict[int, tuple[int, dict[int, int]]] = dict()
already_matched_mols = set()

for mol1_idx in range(self.n_molecules):
@@ -46,6 +79,7 @@ def _identify_all_molecules(

return identity_maps

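
# Editorial sketch, not part of this PR: _identify_all_molecules above is used
# below in load_file() to replace Topology._identify_chemically_identical_molecules
# so that no molecule is treated as a duplicate of another, forcing worst-case
# parameterization over all atoms. The hypothetical context manager below shows
# one reversible way to apply that same monkey-patch.
from contextlib import contextmanager  # editorial import for the sketch only


@contextmanager
def _force_worst_case_identity():
    original = Topology._identify_chemically_identical_molecules
    Topology._identify_chemically_identical_molecules = _identify_all_molecules
    try:
        yield
    finally:
        Topology._identify_chemically_identical_molecules = original
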

def minimize_energy(off_topology, forcefield, max_iters):
# mol should already have one conformer...

@@ -57,45 +91,34 @@ def minimize_energy(off_topology, forcefield, max_iters):
m.assign_partial_charges(partial_charge_method="gasteiger")

start = time.time()
system = forcefield.create_openmm_system(off_topology, allow_nonintegral_charges=True, charge_from_molecules=off_topology.molecules)
system = forcefield.create_openmm_system(
off_topology,
allow_nonintegral_charges=True,
charge_from_molecules=off_topology.molecules,
)
time_to_parameterize = time.time() - start

time_step = 2*unit.femtoseconds # simulation timestep
temperature = 1000*unit.kelvin # simulation temperature
friction = 1/unit.picosecond # collision rate
time_step = 2 * unit.femtoseconds # simulation timestep
temperature = 1000 * unit.kelvin # simulation temperature
friction = 1 / unit.picosecond # collision rate
integrator = openmm.LangevinIntegrator(temperature, friction, time_step)
simulation = openmm.app.Simulation(omm_topology, system, integrator)
openmm_positions = to_openmm_quantity(off_topology.get_positions())
simulation.context.setPositions(openmm_positions)

start = time.time()
simulation.minimizeEnergy(maxIterations=max_iters)
time_to_energy_minimize = time.time() - start
simulation.step(2)
st = simulation.context.getState(getPositions=True, getEnergy=True)
return st.getPotentialEnergy(), time_to_parameterize, time_to_energy_minimize
return st.getPotentialEnergy(), time_to_parameterize, time_to_energy_minimize

sys.path.append(os.path.abspath(__file__ + "/../..")) # TODO: fix this mess
from pdb_file_search import PDBFiles

def successfully_loaded(top):
match_info = [atom.metadata["match_info"] for atom in top.atoms]
return all([bool(match) for match in match_info])

# Make a file to store new jsons (TODO: change this to any new file structure)
current_dir = Path(__file__).parent.resolve()
json_dir = current_dir / Path("json_files")
json_dir.mkdir(parents=False, exist_ok=True)
os.chdir(current_dir)

# set flag if the script should try to test_load the new json file
test_load = True

# create object for json creation and loading:
# with open("polymer_energies.txt", "w") as file:
# file.write("name, num_atoms, energy, time_to_load, time_to_parameterize, time_to_energy_minimize\n")

for file_name, monomer_info in ALL_SMILES_INPUT.items():
def load_file(
file_name: str,
monomer_info: dict[str, tuple[str, list[int]]],
test_load: bool = True,
) -> Generator[tuple[Path, bool, str | None, float]]:
# time_to_load
# time_to_parameterize
# time_to_energy_minimize
@@ -112,20 +135,56 @@ def successfully_loaded(top):
pdb_files = PDBFiles.search(file_name)
for pdb_file in pdb_files:
if test_load:
print(f"testing {file_name}")
assert pdb_file is not None
substructs = engine.get_monomer_info_dict()["monomers"]

# manually ensure that no molecules are cached to obtain the worst-case time complexity
# manually ensure that no molecules are cached to obtain the worst-case time complexity
# for if parameterization should be done over ALL atoms, since that is the time-complexity
# problem we are interested in solving. This can be done with some manipulation of
# openff's identical_molecule_groups property for version 0.13.2
Topology._identify_chemically_identical_molecules = _identify_all_molecules

current_dir = Path(__file__).parent.parent.resolve()
pdb_out = (
current_dir
/ "results"
/ Path(pdb_file).relative_to(current_dir / "compatible_pdbs")
)
monomers_out = pdb_out.with_suffix(".monomers.json")
jsontop_out = pdb_out.with_suffix(".topology.json")
pdb_file_short = pdb_file.relative_to(pdb_file.parent.parent)

if WRITE_RESULTING_TOPOLOGIES:
pdb_out.parent.mkdir(parents=True, exist_ok=True)
monomers_out.write_text(json.dumps(substructs))
pdb_out.write_text(pdb_file.read_text())

if SKIP_EXISTING_JSONTOPS and jsontop_out.exists():
continue

start = time.time()
top = Topology.from_pdb(str(pdb_file), _custom_substructures=substructs)
if PDB_FILE_TIMEOUT_SEC is not None:
signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(PDB_FILE_TIMEOUT_SEC)
try:
top: Topology = Topology.from_pdb(
str(pdb_file),
_custom_substructures=substructs,
)
except Exception as e:
success = False
exc = str(e)
else:
success = successfully_loaded(top)
exc = None
if WRITE_RESULTING_TOPOLOGIES and success:
jsontop_out.write_text(top.to_json())
finally:
if PDB_FILE_TIMEOUT_SEC is not None:
signal.alarm(0)
time_to_load = time.time() - start
assert successfully_loaded(top)

yield (pdb_file_short, success, exc, time_to_load)

# # desolvate since not all systems have solvent
# new_top = Topology()
@@ -156,4 +215,72 @@ def successfully_loaded(top):
# print(energy)
# # with open("polymer_energies.txt", "a") as file:
# # file.write(f"{pdb_file.stem}, {num_atoms}, {energy}, {time_to_load}, {time_to_parameterize}, {time_to_energy_minimize}\n")
# # #______________________________________________________________________________
# # #______________________________________________________________________________


def load_file_star(
tup: tuple[str, dict[str, tuple[str, list[int]]]],
) -> list[tuple[Path, bool, str | None, float]]:
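    # pool.imap_unordered passes each work item as a single argument, so this
    # wrapper unpacks the (file_name, monomer_info) tuple and materializes the
    # generator so the results can be pickled back from the worker process.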
return list(load_file(*tup))


def successfully_loaded(top: Topology) -> bool:
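    # Each atom's "match_info" metadata records the custom-substructure match
    # assigned while loading the PDB; an empty entry means the atom went
    # unmatched, so the load is treated here as a failure.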
match_info = [atom.metadata["match_info"] for atom in top.atoms]
return all([bool(match) for match in match_info])


if __name__ == "__main__":
# Make a file to store new jsons (TODO: change this to any new file structure)
current_dir = Path(__file__).parent.resolve()
json_dir = current_dir / Path("json_files")
json_dir.mkdir(parents=False, exist_ok=True)
os.chdir(current_dir)

# create object for json creation and loading:
# with open("polymer_energies.txt", "w") as file:
# file.write("name, num_atoms, energy, time_to_load, time_to_parameterize, time_to_energy_minimize\n")

results: list[tuple[Path, bool, str | None, float]] = []
interrupt = False
with multiprocessing.Pool(N_PROCESSES) as pool:
try:
for elems in tqdm.tqdm(
pool.imap_unordered(
load_file_star,
[*ALL_SMILES_INPUT.items()],
),
total=len(ALL_SMILES_INPUT),
):
results.extend(elems)
except KeyboardInterrupt:
interrupt = True

longest_pdb_file = max((len(str(pdb_file)) for pdb_file, *_ in results), default=0)

n_successes = 0
n_failures = 0
for pdb_file, success, exc, time_to_load in sorted(
results,
key=lambda tup: tuple(tup[i] for i in SORT_BY),
):
if success:
msg = ""
elif exc is None:
msg = ": topology loaded but failed to validate"
else:
msg = f": {exc}"

if success:
n_successes += 1
success_str = "SUCCESS"
else:
n_failures += 1
success_str = "FAILURE"

print(
f"{success_str}: {str(pdb_file):{longest_pdb_file}} in {time_to_load:10.3f} s{msg}",
)

print(f"There were {n_successes} successes out of {len(results)} completed PDB files!")
if interrupt:
print("(processing was interrupted by user)")
14 changes: 7 additions & 7 deletions monomer_generation/monomer_smiles_input.py
@@ -1,5 +1,5 @@
"""
Simply a file to store input smiles info. There are two general ways to input monomer info.
Simply a file to store input smiles info. There are two general ways to input monomer info.

1) There is the "hard" way where you input each monomer and terminal group individually:
ALL_SMILES_INPUT["peg_modified"] = {
Expand All @@ -8,7 +8,7 @@
"peg_TERM4": ("[#1:1]-[#6:4](-[#1:6])(-[#8:7]-[#6:11](-[#1:3])(-[#1:9])-[#6:12](-[#1:2])(-[#1:5])-[#8:8]-*)-[#1:10]", [])
}
2) There is an easier way where you input the monomer as a complete molecule and "chop" off the ends. The ends that
were removed become the different possible terminal groups.
were removed become the different possible terminal groups.
ALL_SMILES_INPUT["peg_modified"] = {
"peg": ("[#1:1]-[#6:4](-[#1:6])(-[#8:7]-[#6:11](-[#1:3])(-[#1:9])-[#6:12](-[#1:2])(-[#1:5])-[#8:8]-[C](-[H])(-[H])(-[H]))-[#1:10]", [0,1,2,3,11,12,13,14,15])
}
@@ -17,7 +17,7 @@
is necessary because the terminal groups have atoms with different connectivity (example: carboxylic acid terminals)
"""

ALL_SMILES_INPUT = dict()
ALL_SMILES_INPUT: dict[str, dict[str, tuple[str, list[int]]]] = dict()

# MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
# SIMPLE POLYMERS
@@ -153,13 +153,13 @@
"rubber8": ("*-[#6:1](-[#6:2](-[#6:3](-[#6:4](-*)(-[#1:9])-[#1:10])(-[#1:8])-*)(-[#1:7])-[#1:11])(-[#1:5])-[#1:6]", []),
"rubber9": ("*-[#6:1](-[#6:2](-[#6:3](-*)(-[#1:8])-[#1:9])(-[#1:6])-[#1:7])(-[#6:4](-*)(-*)-[#1:10])-[#1:5]", [])
}

#MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
# nucleic_acids
#WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW

ALL_SMILES_INPUT["2q1r"] = {
"backbone": ("[#8:1](-[#6:2](-[#6:3]1(-[#8:4]-[#6:9](-[#6:7](-[#6:5]-1(-[#8:6]-[#15:10](=[#8:11])(-[#8-:12])-*)-[#1:16])(-[#8:8]-[#1:18])-[#1:17])(-*)-[#1:19])-[#1:15])(-[#1:13])-[#1:14])-*", []),
"backbone": ("[#8:1](-[#6:2](-[#6:3]1(-[#8:4]-[#6:9](-[#6:7](-[#6:5]-1(-[#8:6]-[#15:10](=[#8:11])(-[#8-:12])-*)-[#1:16])(-[#8:8]-[#1:18])-[#1:17])(-*)-[#1:19])-[#1:15])(-[#1:13])-[#1:14])-*", []),
"mid1": ("*-[#7:1]1-[#6:2](=[#7:3]-[#6:4]2-[#6:5](-[#7:6](-[#1:12])-[#1:13])=[#7:7]-[#6:8](=[#7:9]-[#6:10]=2-1)-[#1:14])-[#1:11]", []),
"mid2": ("*-[#7:1]1-[#6:2](=[#8:3])-[#7:4](-[#6:5](=[#8:6])-[#6:7](=[#6:8]-1-[#1:11])-[#1:10])-[#1:9]", []),
"mid3": ("*-[#7:1]1-[#6:2](=[#8:3])-[#7:4]=[#6:5](-[#7:6](-[#1:9])-[#1:10])-[#6:7](=[#6:8]-1-[#1:12])-[#1:11]", []),
@@ -319,7 +319,7 @@
# "COO_mid": ("*-N(C([H])([H])-C(-[O-])=O)-C([H])([H])-C(=O)-*", []),
# "Me_term": ("*-N([H])-C(C([H])([H])([H]))([H])-C(=O)-[O-]", []),
# "ring": ("n1:c([H]):c([H]):c([H]):c([H]):c([H]):1", [])}
# ALL_SMILES_INPUT["c8sc04240c3"] = {"": ("", [])} # poorly formatted input
# ALL_SMILES_INPUT["c8sc04240c3"] = {"": ("", [])} # poorly formatted input

ALL_SMILES_INPUT["ja7b02319_si_002"] = {"double_ring": ("[*]-[C](=[O])-[C](-[H])(-[H])-[N](-[*])-[C](-[H])(-[C]1=[C]2-[C](-[H])=[C](-[H])-[C](-[H])=[C](-[H])-[C]-2=[C](-[H])-[C](-[H])=[C]-1-[H])-[C](-[H])(-[H])-[H]", []),
"ring_term1": ("[*]-[N](-[H])-[C](-[H])(-[C]1=[C]2-[C](-[H])=[C](-[H])-[C](-[H])=[C](-[H])-[C]-2=[C](-[H])-[C](-[H])=[C]-1-[H])-[C](-[H])(-[H])-[H]" ,[]),
@@ -338,4 +338,4 @@
# ALL_SMILES_INPUT["ja9b10497_si_005"] = {"": ("", [])} # poorly formatted
ALL_SMILES_INPUT["Nspe5_gaff-phi_top6pops"] = {"mid": ("C([H])([H])([H])-C(=O)-N(-C(-C([H])([H])([H]))([H])-C1-C([H])=C([H])-C([H])=C([H])-C([H])=1)-C([H])([H])-C(=O)-N([H])([H])", [0,1,2,3,4,5, -1,-2,-3])}

# # ALL_SMILES_INPUT[""] = {"": ("", [])}
# # ALL_SMILES_INPUT[""] = {"": ("", [])}