2 changes: 2 additions & 0 deletions .gitignore
@@ -8,3 +8,5 @@ File Search Benchmark.png
# PyCache
__pycache__
*/__pycache__

results/
231 changes: 179 additions & 52 deletions monomer_generation/generate_monomers.py
@@ -2,37 +2,70 @@
Generates JSON files using the new format, up to date as of 3/14/23
"""

from openff.toolkit import Topology, Molecule
from substructure_generator import SubstructureGenerator
import sys
import enum
import json
import multiprocessing
import os
import numpy as np
from monomer_smiles_input import ALL_SMILES_INPUT
from pathlib import Path
from partition import partition
from rdkit import Chem
import signal
import sys
import time
from collections.abc import Generator
from pathlib import Path
from types import FrameType
import logging

import openmm
from openmm.app import PDBFile
from copy import deepcopy
from simtk import unit
from openff.toolkit.typing.engines.smirnoff import ForceField
from openff.toolkit.typing.engines.smirnoff import vdWHandler, AngleHandler
from openff.toolkit.utils.toolkits import GLOBAL_TOOLKIT_REGISTRY, ToolkitRegistry, OpenEyeToolkitWrapper
import tqdm
import tqdm.contrib
from tqdm.contrib.logging import logging_redirect_tqdm
from monomer_smiles_input import ALL_SMILES_INPUT
from openff.toolkit import Topology
from openff.toolkit.utils.toolkits import (
GLOBAL_TOOLKIT_REGISTRY,
OpenEyeToolkitWrapper,
)
from openff.units.openmm import to_openmm as to_openmm_quantity
from simtk import unit
from substructure_generator import SubstructureGenerator

from typing import (
Dict,
List,
Tuple,
)
sys.path.append(os.path.abspath(__file__ + "/../..")) # TODO: fix this mess
from pdb_file_search import PDBFiles


class Columns(enum.IntEnum):
FILENAME = 0
SUCCESS = 1
EXCEPTION_MESSAGE = 2
TIME = 3


# CONFIGURATION CONSTANTS
WRITE_RESULTING_TOPOLOGIES: bool = False
SKIP_EXISTING_JSONTOPS: bool = False
N_PROCESSES: int | None = 8 # None: Auto-detect
SORT_BY: tuple[Columns, ...] = (Columns.SUCCESS, Columns.TIME)
PDB_FILE_TIMEOUT_SEC: None | int = None # None: No timeout.
# END OF CONFIGURATION CONSTANTS

# Note that PDB_FILE_TIMEOUT_SEC uses signal.alarm, which can't interrupt in the
# middle of a Python bytecode instruction. This means that if the process is
# in a long-running C calculation (such as matching SMARTS against an RDMol, or
# splitting an RDMol into its constituent molecules), the timeout will only
# fire once that calculation returns. If the process is taking longer than you
# expect, you can use ctrl+c to interrupt - you'll still get results for
# whatever has already finished. See the illustrative sketch after
# timeout_handler below.

GLOBAL_TOOLKIT_REGISTRY.deregister_toolkit(OpenEyeToolkitWrapper)


def timeout_handler(signum: int, frame: FrameType | None):
raise TimeoutError(f"Timeout after {PDB_FILE_TIMEOUT_SEC} seconds")

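
# Editorial sketch, not part of this PR: the alarm-based timeout pattern that
# load_file() uses below, wrapped in a hypothetical helper for illustration.
# SIGALRM is Unix-only, which the script already assumes.
def _run_with_timeout(func, timeout_sec: int):
    """Run func() and raise TimeoutError if it takes longer than timeout_sec."""
    signal.signal(signal.SIGALRM, timeout_handler)  # install the handler above
    signal.alarm(timeout_sec)  # schedule SIGALRM in timeout_sec seconds
    try:
        return func()
    finally:
        signal.alarm(0)  # always cancel any pending alarm
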

def _identify_all_molecules(
self,
) -> Dict[int, Tuple[int, Dict[int, int]]]:
identity_maps: Dict[int, Tuple[int, Dict[int, int]]] = dict()
) -> dict[int, tuple[int, dict[int, int]]]:
identity_maps: dict[int, tuple[int, dict[int, int]]] = dict()
already_matched_mols = set()

for mol1_idx in range(self.n_molecules):
@@ -46,6 +79,7 @@ def _identify_all_molecules(

return identity_maps

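
# Editorial sketch, not part of this PR: _identify_all_molecules above is used
# below in load_file() to replace Topology._identify_chemically_identical_molecules
# so that no molecule is treated as a duplicate of another, forcing worst-case
# parameterization over all atoms. The hypothetical context manager below shows
# one reversible way to apply that same monkey-patch.
from contextlib import contextmanager  # editorial import for the sketch only


@contextmanager
def _force_worst_case_identity():
    original = Topology._identify_chemically_identical_molecules
    Topology._identify_chemically_identical_molecules = _identify_all_molecules
    try:
        yield
    finally:
        Topology._identify_chemically_identical_molecules = original
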

def minimize_energy(off_topology, forcefield, max_iters):
# mol should already have one conformer...

@@ -57,45 +91,34 @@ def minimize_energy(off_topology, forcefield, max_iters):
m.assign_partial_charges(partial_charge_method="gasteiger")

start = time.time()
system = forcefield.create_openmm_system(off_topology, allow_nonintegral_charges=True, charge_from_molecules=off_topology.molecules)
system = forcefield.create_openmm_system(
off_topology,
allow_nonintegral_charges=True,
charge_from_molecules=off_topology.molecules,
)
time_to_parameterize = time.time() - start

time_step = 2*unit.femtoseconds # simulation timestep
temperature = 1000*unit.kelvin # simulation temperature
friction = 1/unit.picosecond # collision rate
time_step = 2 * unit.femtoseconds # simulation timestep
temperature = 1000 * unit.kelvin # simulation temperature
friction = 1 / unit.picosecond # collision rate
integrator = openmm.LangevinIntegrator(temperature, friction, time_step)
simulation = openmm.app.Simulation(omm_topology, system, integrator)
openmm_positions = to_openmm_quantity(off_topology.get_positions())
simulation.context.setPositions(openmm_positions)

start = time.time()
simulation.minimizeEnergy(maxIterations=max_iters)
time_to_energy_minimize = time.time() - start
simulation.step(2)
st = simulation.context.getState(getPositions=True, getEnergy=True)
return st.getPotentialEnergy(), time_to_parameterize, time_to_energy_minimize
return st.getPotentialEnergy(), time_to_parameterize, time_to_energy_minimize

sys.path.append(os.path.abspath(__file__ + "/../..")) # TODO: fix this mess
from pdb_file_search import PDBFiles

def successfully_loaded(top):
match_info = [atom.metadata["match_info"] for atom in top.atoms]
return all([bool(match) for match in match_info])

# Make a file to store new jsons (TODO: change this to any new file structure)
current_dir = Path(__file__).parent.resolve()
json_dir = current_dir / Path("json_files")
json_dir.mkdir(parents=False, exist_ok=True)
os.chdir(current_dir)

# set flag if the script should try to test_load the new json file
test_load = True

# create object for json creation and loading:
# with open("polymer_energies.txt", "w") as file:
# file.write("name, num_atoms, energy, time_to_load, time_to_parameterize, time_to_energy_minimize\n")

for file_name, monomer_info in ALL_SMILES_INPUT.items():
def load_file(
file_name: str,
monomer_info: dict[str, tuple[str, list[int]]],
test_load: bool = True,
) -> Generator[tuple[Path, bool, str | None, float]]:
# time_to_load
# time_to_parameterize
# time_to_energy_minimize
@@ -112,20 +135,56 @@ def successfully_loaded(top):
pdb_files = PDBFiles.search(file_name)
for pdb_file in pdb_files:
if test_load:
print(f"testing {file_name}")
assert pdb_file is not None
substructs = engine.get_monomer_info_dict()["monomers"]

# manually ensure that no molecules are cached to obtain the worst-case time complexity
# manually ensure that no molecules are cached to obtain the worst-case time complexity
# for if parameterization should be done over ALL atoms, since that is the time-complexity
# problem we are interested in solving. This can be done with some manipulation of
# openff's identical_molecule_groups property for version 0.13.2
Topology._identify_chemically_identical_molecules = _identify_all_molecules

current_dir = Path(__file__).parent.parent.resolve()
pdb_out = (
current_dir
/ "results"
/ Path(pdb_file).relative_to(current_dir / "compatible_pdbs")
)
monomers_out = pdb_out.with_suffix(".monomers.json")
jsontop_out = pdb_out.with_suffix(".topology.json")
pdb_file_short = pdb_file.relative_to(pdb_file.parent.parent)

if WRITE_RESULTING_TOPOLOGIES:
pdb_out.parent.mkdir(parents=True, exist_ok=True)
monomers_out.write_text(json.dumps(substructs))
pdb_out.write_text(pdb_file.read_text())

if SKIP_EXISTING_JSONTOPS and jsontop_out.exists():
continue

start = time.time()
top = Topology.from_pdb(str(pdb_file), _custom_substructures=substructs)
if PDB_FILE_TIMEOUT_SEC is not None:
signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(PDB_FILE_TIMEOUT_SEC)
try:
top: Topology = Topology.from_pdb(
str(pdb_file),
_custom_substructures=substructs,
)
except Exception as e:
success = False
exc = str(e)
else:
success = successfully_loaded(top)
exc = None
if WRITE_RESULTING_TOPOLOGIES and success:
jsontop_out.write_text(top.to_json())
finally:
if PDB_FILE_TIMEOUT_SEC is not None:
signal.alarm(0)
time_to_load = time.time() - start
assert successfully_loaded(top)

yield (pdb_file_short, success, exc, time_to_load)

# # desolvate since not all systems have solvent
# new_top = Topology()
@@ -156,4 +215,72 @@ def successfully_loaded(top):
# print(energy)
# # with open("polymer_energies.txt", "a") as file:
# # file.write(f"{pdb_file.stem}, {num_atoms}, {energy}, {time_to_load}, {time_to_parameterize}, {time_to_energy_minimize}\n")
# # #______________________________________________________________________________
# # #______________________________________________________________________________


def load_file_star(
tup: tuple[str, dict[str, tuple[str, list[int]]]],
) -> list[tuple[Path, bool, str | None, float]]:
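    # pool.imap_unordered passes each work item as a single argument, so this
    # wrapper unpacks the (file_name, monomer_info) tuple and materializes the
    # generator so the results can be pickled back from the worker process.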
return list(load_file(*tup))


def successfully_loaded(top: Topology) -> bool:
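    # Each atom's "match_info" metadata records the custom-substructure match
    # assigned while loading the PDB; an empty entry means the atom went
    # unmatched, so the load is treated here as a failure.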
match_info = [atom.metadata["match_info"] for atom in top.atoms]
return all([bool(match) for match in match_info])


if __name__ == "__main__":
# Make a file to store new jsons (TODO: change this to any new file structure)
current_dir = Path(__file__).parent.resolve()
json_dir = current_dir / Path("json_files")
json_dir.mkdir(parents=False, exist_ok=True)
os.chdir(current_dir)

# create object for json creation and loading:
# with open("polymer_energies.txt", "w") as file:
# file.write("name, num_atoms, energy, time_to_load, time_to_parameterize, time_to_energy_minimize\n")

results: list[tuple[Path, bool, str | None, float]] = []
interrupt = False
with multiprocessing.Pool(N_PROCESSES) as pool:
try:
for elems in tqdm.tqdm(
pool.imap_unordered(
load_file_star,
[*ALL_SMILES_INPUT.items()],
),
total=len(ALL_SMILES_INPUT),
):
results.extend(elems)
except KeyboardInterrupt:
interrupt = True

longest_pdb_file = max((len(str(pdb_file)) for pdb_file, *_ in results), default=0)

n_successes = 0
n_failures = 0
for pdb_file, success, exc, time_to_load in sorted(
results,
key=lambda tup: tuple(tup[i] for i in SORT_BY),
):
if success:
msg = ""
elif exc is None:
msg = ": topology loaded but failed to validate"
else:
msg = f": {exc}"

if success:
n_successes += 1
success_str = "SUCCESS"
else:
n_failures += 1
success_str = "FAILURE"

print(
f"{success_str}: {str(pdb_file):{longest_pdb_file}} in {time_to_load:10.3f} s{msg}",
)

print(f"There were {n_successes} successes out of {len(results)} completed PDB files!")
if interrupt:
print("(processing was interrupted by user)")
14 changes: 7 additions & 7 deletions monomer_generation/monomer_smiles_input.py
@@ -1,5 +1,5 @@
"""
Simply a file to store input smiles info. There are two general ways to input monomer info.
Simply a file to store input smiles info. There are two general ways to input monomer info.

1) There is the "hard" way where you input each monomer and terminal group individually:
ALL_SMILES_INPUT["peg_modified"] = {
Expand All @@ -8,7 +8,7 @@
"peg_TERM4": ("[#1:1]-[#6:4](-[#1:6])(-[#8:7]-[#6:11](-[#1:3])(-[#1:9])-[#6:12](-[#1:2])(-[#1:5])-[#8:8]-*)-[#1:10]", [])
}
2) There is an easier way where you input the monomer as a complete molecule and "chop" off the ends. The ends that
were removed become the different possible terminal groups.
were removed become the different possible terminal groups.
ALL_SMILES_INPUT["peg_modified"] = {
"peg": ("[#1:1]-[#6:4](-[#1:6])(-[#8:7]-[#6:11](-[#1:3])(-[#1:9])-[#6:12](-[#1:2])(-[#1:5])-[#8:8]-[C](-[H])(-[H])(-[H]))-[#1:10]", [0,1,2,3,11,12,13,14,15])
}
@@ -17,7 +17,7 @@
is necessary because the terminal groups have atoms with different connectivity (example: carboxylic acid terminals)
"""

ALL_SMILES_INPUT = dict()
ALL_SMILES_INPUT: dict[str, dict[str, tuple[str, list[int]]]] = dict()

# MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
# SIMPLE POLYMERS
@@ -153,13 +153,13 @@
"rubber8": ("*-[#6:1](-[#6:2](-[#6:3](-[#6:4](-*)(-[#1:9])-[#1:10])(-[#1:8])-*)(-[#1:7])-[#1:11])(-[#1:5])-[#1:6]", []),
"rubber9": ("*-[#6:1](-[#6:2](-[#6:3](-*)(-[#1:8])-[#1:9])(-[#1:6])-[#1:7])(-[#6:4](-*)(-*)-[#1:10])-[#1:5]", [])
}

#MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
# nucleic_acids
#WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW

ALL_SMILES_INPUT["2q1r"] = {
"backbone": ("[#8:1](-[#6:2](-[#6:3]1(-[#8:4]-[#6:9](-[#6:7](-[#6:5]-1(-[#8:6]-[#15:10](=[#8:11])(-[#8-:12])-*)-[#1:16])(-[#8:8]-[#1:18])-[#1:17])(-*)-[#1:19])-[#1:15])(-[#1:13])-[#1:14])-*", []),
"backbone": ("[#8:1](-[#6:2](-[#6:3]1(-[#8:4]-[#6:9](-[#6:7](-[#6:5]-1(-[#8:6]-[#15:10](=[#8:11])(-[#8-:12])-*)-[#1:16])(-[#8:8]-[#1:18])-[#1:17])(-*)-[#1:19])-[#1:15])(-[#1:13])-[#1:14])-*", []),
"mid1": ("*-[#7:1]1-[#6:2](=[#7:3]-[#6:4]2-[#6:5](-[#7:6](-[#1:12])-[#1:13])=[#7:7]-[#6:8](=[#7:9]-[#6:10]=2-1)-[#1:14])-[#1:11]", []),
"mid2": ("*-[#7:1]1-[#6:2](=[#8:3])-[#7:4](-[#6:5](=[#8:6])-[#6:7](=[#6:8]-1-[#1:11])-[#1:10])-[#1:9]", []),
"mid3": ("*-[#7:1]1-[#6:2](=[#8:3])-[#7:4]=[#6:5](-[#7:6](-[#1:9])-[#1:10])-[#6:7](=[#6:8]-1-[#1:12])-[#1:11]", []),
@@ -319,7 +319,7 @@
# "COO_mid": ("*-N(C([H])([H])-C(-[O-])=O)-C([H])([H])-C(=O)-*", []),
# "Me_term": ("*-N([H])-C(C([H])([H])([H]))([H])-C(=O)-[O-]", []),
# "ring": ("n1:c([H]):c([H]):c([H]):c([H]):c([H]):1", [])}
# ALL_SMILES_INPUT["c8sc04240c3"] = {"": ("", [])} # poorly formatted input
# ALL_SMILES_INPUT["c8sc04240c3"] = {"": ("", [])} # poorly formatted input

ALL_SMILES_INPUT["ja7b02319_si_002"] = {"double_ring": ("[*]-[C](=[O])-[C](-[H])(-[H])-[N](-[*])-[C](-[H])(-[C]1=[C]2-[C](-[H])=[C](-[H])-[C](-[H])=[C](-[H])-[C]-2=[C](-[H])-[C](-[H])=[C]-1-[H])-[C](-[H])(-[H])-[H]", []),
"ring_term1": ("[*]-[N](-[H])-[C](-[H])(-[C]1=[C]2-[C](-[H])=[C](-[H])-[C](-[H])=[C](-[H])-[C]-2=[C](-[H])-[C](-[H])=[C]-1-[H])-[C](-[H])(-[H])-[H]" ,[]),
@@ -338,4 +338,4 @@
# ALL_SMILES_INPUT["ja9b10497_si_005"] = {"": ("", [])} # poorly formatted
ALL_SMILES_INPUT["Nspe5_gaff-phi_top6pops"] = {"mid": ("C([H])([H])([H])-C(=O)-N(-C(-C([H])([H])([H]))([H])-C1-C([H])=C([H])-C([H])=C([H])-C([H])=1)-C([H])([H])-C(=O)-N([H])([H])", [0,1,2,3,4,5, -1,-2,-3])}

# # ALL_SMILES_INPUT[""] = {"": ("", [])}
# # ALL_SMILES_INPUT[""] = {"": ("", [])}