Merge pull request #24 from jcorreia11/develop

Develop
BioSystemsUM · Oct 27, 2022 · f656ca4
2 parents 6ce4996 + 087d14f
commit f656ca4
Show file tree

Hide file tree

Showing 8 changed files with 79 additions and 45 deletions.
diff --git a/setup.cfg b/setup.cfg
@@ -1,14 +1,14 @@
 [metadata]
 name = biocatalyzer
-version = 0.1.0-beta
+version = 0.1.1-beta
 description = BioCatalyzer: a rule-based tool to predict compound metabolism
 long_description = file: README.md
 long_description_content_type = text/markdown
 keywords = reaction-rules, metabolism, enzymatic-reactions, chemoinformatics, cheminformatics
 author = João Correia
 author_email = [email protected]
 url = https://github.com/jcorreia11/BioCatalyzer
-download_url = https://github.com/jcorreia11/BioCatalyzer/archive/refs/tags/v0.1.0-beta.tar.gz
+download_url = https://github.com/jcorreia11/BioCatalyzer/archive/refs/tags/v0.1.1-beta.tar.gz
 license = MIT
 license_file = LICENSE
 platforms = unix, linux, osx, cygwin, win32

diff --git a/src/biocatalyzer/bioreactor.py b/src/biocatalyzer/bioreactor.py
@@ -581,18 +581,22 @@ def _react_single(self, smiles: str, smarts: str):
         if len(results) > 0:
             smiles_id = self._compounds[self._compounds.smiles == smiles].compound_id.values[0]
             smarts_id = self._reaction_rules[self._reaction_rules.SMARTS == smarts].InternalID.values[0]
+            most_similar_products_set = set()
             for i, result in enumerate(results):
                 products = result.split('>')[-1].split('.')
                 # keep only the most similar compound to the input compound
                 most_similar_product = ChemUtils.most_similar_compound(smiles, products)
-                if self._match_conditions(most_similar_product):
-                    if self._neutralize:
-                        most_similar_product = ChemUtils.uncharge_smiles(most_similar_product)
-                    ecs = self._get_ec_numbers(smarts_id)
-                    with open(self._new_compounds_path, 'a') as f:
-                        f.write(f"{smiles_id}\t{smiles}\t{smarts_id}\t{smiles_id}_{uuid.uuid4()}\t"
-                                f"{most_similar_product}\t{result}\t{ecs}\n")
-                    self._new_compounds_flag = True
+                most_similar_product = ChemUtils.smiles_to_isomerical_smiles(most_similar_product)
+                if most_similar_product not in most_similar_products_set:
+                    most_similar_products_set.add(most_similar_product)
+                    if self._match_conditions(most_similar_product):
+                        if self._neutralize:
+                            most_similar_product = ChemUtils.uncharge_smiles(most_similar_product)
+                        ecs = self._get_ec_numbers(smarts_id)
+                        with open(self._new_compounds_path, 'a') as f:
+                            f.write(f"{smiles_id}\t{smiles}\t{smarts_id}\t{smiles_id}_{uuid.uuid4()}\t"
+                                    f"{most_similar_product}\t{result}\t{ecs}\n")
+                        self._new_compounds_flag = True
 
     def react(self):
         """

diff --git a/src/biocatalyzer/chem/chem_utils.py b/src/biocatalyzer/chem/chem_utils.py
@@ -15,22 +15,22 @@ class ChemUtils:
     """
 
     @staticmethod
-    def mol_to_isomerical_smiles(mol: Mol):
+    def smiles_to_isomerical_smiles(smiles: str):
         """
         Converts a molecule to its canonical SMILES.
 
         Parameters
         ----------
-        mol: Mol
-            The molecule to convert.
+        smiles: str
+            The SMILES of the molecule.
 
         Returns
         -------
         str
             The SMILES string.
         """
         try:
-            return MolToSmiles(RemoveHs(mol), isomericSmiles=True)
+            return MolToSmiles(RemoveHs(MolFromSmiles(smiles)), isomericSmiles=True)
         except TypeError:
             return None
 
@@ -100,6 +100,29 @@ def _remove_hs(mol: Mol):
         except Chem.rdchem.AtomKekulizeException:
             return mol
 
+    @staticmethod
+    def _sanitize_mol(mol: Mol):
+        """
+        Sanitizes a molecule.
+
+        Parameters
+        ----------
+        mol: Mol
+            The molecule to sanitize.
+
+        Returns
+        -------
+        Mol
+            The sanitized molecule.
+        """
+        if mol is None:
+            return None
+        try:
+            Chem.SanitizeMol(mol)
+            return mol
+        except ValueError:
+            return None
+
     @staticmethod
     def react(smiles: Union[str, List[str]], smarts: str):
         """
@@ -127,7 +150,7 @@ def react(smiles: Union[str, List[str]], smarts: str):
         try:
             return ChemUtils._create_reaction_instances(reaction, mol)
         except ValueError:
-            return None
+            return []
 
     @staticmethod
     def _create_reaction_instances(rxn: ChemicalReaction, reactants: List[Mol]):
@@ -149,12 +172,14 @@ def _create_reaction_instances(rxn: ChemicalReaction, reactants: List[Mol]):
         res = []
         ps = rxn.RunReactants(reactants)
         for pset in ps:
-            tres = ChemicalReaction()
-            for p in pset:
-                tres.AddProductTemplate(ChemUtils._remove_hs(p))
-            for reactant in reactants:
-                tres.AddReactantTemplate(ChemUtils._remove_hs(reactant))
-            res.append(tres)
+            pset = [ChemUtils._sanitize_mol(pset_i) for pset_i in pset]
+            if None not in pset:
+                tres = ChemicalReaction()
+                for p in pset:
+                    tres.AddProductTemplate(ChemUtils._remove_hs(p))
+                for reactant in reactants:
+                    tres.AddReactantTemplate(ChemUtils._remove_hs(reactant))
+                res.append(tres)
         return list(set([AllChem.ReactionToSmiles(entry, canonical=True) for entry in res]))
 
     @staticmethod

diff --git a/src/biocatalyzer/matcher.py b/src/biocatalyzer/matcher.py
@@ -289,13 +289,13 @@ def _match_masses(self):
         pd.DataFrame:
             pandas dataframe with the matches.
         """
-        ms_df = pd.DataFrame(columns=['ParentCompound', 'ParentCompoundSmiles', "ParentCompound_ExactMass",
+        ms_df = pd.DataFrame(columns=['Index', 'ParentCompound', 'ParentCompoundSmiles', "ParentCompound_ExactMass",
                                       self._ms_field, 'NewCompoundID', 'NewCompoundSmiles', 'NewCompoundExactMass',
                                       'EC_Numbers'])
         for i, row in self._new_compounds.iterrows():
             mv, mi = match_value(row['NewCompoundExactMass'], self._ms_data[self._ms_field].values, self._tolerance)
             if mv and self._ms_data.loc[mi, 'ParentCompound'] == '_'.join(row['NewCompoundID'].split('_')[:-1]):
-                ms_df.loc[len(ms_df)] = [self._ms_data.loc[mi, 'ParentCompound'],
+                ms_df.loc[len(ms_df)] = [mi, self._ms_data.loc[mi, 'ParentCompound'],
                                          self._ms_data.loc[mi, 'ParentCompoundSmiles'],
                                          ChemUtils.calc_exact_mass(self._ms_data.loc[mi, 'ParentCompoundSmiles']),
                                          self._ms_data.loc[mi, self._ms_field],
@@ -314,13 +314,13 @@ def _match_mass_diff(self):
         pd.DataFrame:
             pandas dataframe with the matches.
         """
-        ms_df = pd.DataFrame(columns=['ParentCompound', 'ParentCompoundSmiles', "ParentCompound_ExactMass",
+        ms_df = pd.DataFrame(columns=['Index', 'ParentCompound', 'ParentCompoundSmiles', "ParentCompound_ExactMass",
                                       self._ms_field, 'NewCompoundID', 'NewCompoundSmiles', 'NewCompoundExactMass',
                                       'NewCompoundExactMassDiff', 'EC_Numbers'])
         for i, row in self._new_compounds.iterrows():
             mv, mi = match_value(row['NewCompoundExactMassDiff'], self._ms_data[self._ms_field].values, self._tolerance)
             if mv and self._ms_data.loc[mi, 'ParentCompound'] == row['NewCompoundID'].split('_')[0]:
-                ms_df.loc[len(ms_df)] = [self._ms_data.loc[mi, 'ParentCompound'],
+                ms_df.loc[len(ms_df)] = [mi, self._ms_data.loc[mi, 'ParentCompound'],
                                          self._ms_data.loc[mi, 'ParentCompoundSmiles'],
                                          ChemUtils.calc_exact_mass(self._ms_data.loc[mi, 'ParentCompoundSmiles']),
                                          self._ms_data.loc[mi, self._ms_field],

diff --git a/tests/data/results_sample/matches.tsv b/tests/data/results_sample/matches.tsv
@@ -1 +1 @@
-ParentCompound	ParentCompoundSmiles	ParentCompound_ExactMass	MassDiff	NewCompoundID	NewCompoundSmiles	NewCompoundExactMass	NewCompoundExactMassDiff	EC_Numbers
+Index	ParentCompound	ParentCompoundSmiles	ParentCompound_ExactMass	MassDiff	NewCompoundID	NewCompoundSmiles	NewCompoundExactMass	NewCompoundExactMassDiff	EC_Numbers
diff --git a/tests/unit_tests/chem/test_chem_utils.py b/tests/unit_tests/chem/test_chem_utils.py
@@ -1,8 +1,8 @@
 from unittest import TestCase
 
 from rdkit import RDLogger
-from rdkit.Chem import MolFromSmiles, MolToInchi
-from rdkit.Chem.rdChemReactions import ChemicalReaction
+from rdkit.Chem import MolFromSmiles, MolToInchi, Mol
+from rdkit.Chem.rdChemReactions import ChemicalReaction, ReactionFromSmarts
 
 from biocatalyzer.chem import ChemUtils
 from biocatalyzer.chem._utils import _correct_number_of_parenthesis
@@ -12,22 +12,22 @@ class TestChemUtils(TestCase):
     # mute rdkit logs
     RDLogger.DisableLog('rdApp.*')
 
-    def test_mol_to_isomerical_smiles(self):
+    def test_smiles_to_isomerical_smiles(self):
         smiles = ['CN1C=NC2=C1C(=O)N(C(=O)N2C)C',
                   'C(C1C(C(C(C(O1)O)O)O)O)O',
                   'CC(=O)OC1=CC=CC=C1C(=O)O']
-        mols = [MolFromSmiles(s) for s in smiles]
         invalid_smiles = 'C(C1C(C(C(C(O1)O)O)O)O)O('
-        invalid_mol = MolFromSmiles(invalid_smiles)
 
-        def same_compound(mol1, mol2):
+        def same_compound(smiles1, smiles2):
+            mol1 = MolFromSmiles(smiles1)
+            mol2 = MolFromSmiles(smiles2)
             return MolToInchi(mol1) == MolToInchi(mol2)
 
-        for i, m in enumerate(mols):
-            self.assertNotEqual(ChemUtils.mol_to_isomerical_smiles(m), smiles[i])
-            self.assertTrue(same_compound(m, MolFromSmiles(ChemUtils.mol_to_isomerical_smiles(m))))
+        for i, m in enumerate(smiles):
+            self.assertNotEqual(ChemUtils.smiles_to_isomerical_smiles(m), smiles[i])
+            self.assertTrue(same_compound(m, ChemUtils.smiles_to_isomerical_smiles(m)))
 
-        self.assertIsNone(ChemUtils.mol_to_isomerical_smiles(invalid_mol))
+        self.assertIsNone(ChemUtils.smiles_to_isomerical_smiles(invalid_smiles))
 
     def test_validate_smiles(self):
         smiles = ['CN1C=NC2=C1C(=O)N(C(=O)N2C)C',
@@ -73,6 +73,14 @@ def check_if_molecule_has_hydrogens(mol):
         atom_valence_exception_mol = MolFromSmiles('CN(C)(C)C', sanitize=False)
         self.assertEqual(ChemUtils._remove_hs(atom_valence_exception_mol), atom_valence_exception_mol)
 
+    def test_sanitized_mol(self):
+        t_butanol = MolFromSmiles('C(C)(C)(C)O')
+        self.assertIsInstance(ChemUtils._sanitize_mol(t_butanol), Mol)
+
+        rxn_1 = ReactionFromSmarts('[#6:1][O:2]>>[#6:1]=[O:2]')
+        invalid_product = rxn_1.RunReactants((t_butanol, ))[0][0]
+        self.assertIsNone(ChemUtils._sanitize_mol(invalid_product))
+
     def test_react(self):
         smiles = ['CN1C=NC2=C1C(=O)N(C(=O)N2C)C',
                   'C(C1C(C(C(C(O1)O)O)O)O)O',
@@ -83,7 +91,7 @@ def test_react(self):
             '[#6:1]-[#6H1:2]=[O:3].[#8:4]-[#8:5]>>[#6:1]-[#6:2](-[#8:5])=[O:3].[#8:4]']
 
         for s in smarts:
-            self.assertIsNone(ChemUtils.react(smiles, s))
+            self.assertEqual(len(ChemUtils.react(smiles, s)), 0)
 
         known_reactant = 'Nc1nc(NC2CC2)c2ncn(C3C=CC(CO)C3)c2n1'
         coreactant = 'O=C1C=CC=CC1=O'
@@ -93,7 +101,7 @@ def test_react(self):
         self.assertEqual(known_reactant, reaction_smiles[0].split('.')[0])
         self.assertEqual(coreactant, reaction_smiles[0].split('.')[1].split('>>')[0])
 
-        self.assertIsNone(ChemUtils.react(smiles[0], smarts[0]))
+        self.assertEqual(len(ChemUtils.react(smiles[0], smarts[0])), 0)
 
         invalid_smiles = 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C('
         self.assertEqual(len(ChemUtils.react(invalid_smiles, smarts[1])), 0)
@@ -104,7 +112,7 @@ def test_create_reaction_instances(self):
         reactants_smiles = 'C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12;O;*C1=C(*)C(=O)C(*)=C(*)C1=O'
         reactants = [MolFromSmiles(s) for s in reactants_smiles.split(';')]
         reaction_instances = ChemUtils._create_reaction_instances(rxn, reactants)
-        self.assertEqual(3, len(reaction_instances))
+        self.assertEqual(2, len(reaction_instances))
         for instance in reaction_instances:
             self.assertEqual(3, len(instance.split('>')))
             for reac in reactants_smiles.split(';'):

diff --git a/tests/unit_tests/test_bioreactor.py b/tests/unit_tests/test_bioreactor.py
@@ -2,8 +2,6 @@
 import shutil
 from unittest import TestCase
 
-import pandas as pd
-
 from biocatalyzer.bioreactor import BioReactor
 
 from tests import TESTS_DATA_PATH
@@ -60,7 +58,7 @@ def test_bioreactor_all_orgs(self):
             _ = br_no_orgs_filter.new_compounds
 
         r = br_no_orgs_filter.process_results(False)
-        self.assertEqual(r.shape, (380, 7))
+        self.assertEqual(r.shape, (352, 7))
 
     def test_bioreactor_all_orgs_keep_all(self):
         compounds_path = os.path.join(TESTS_DATA_PATH, 'compounds_sample/compounds.tsv')
@@ -150,4 +148,3 @@ def test_bioreactor_properties_and_setters(self):
         _ = br.n_jobs
         br.n_jobs = -1
         br.n_jobs = 6
-
diff --git a/tests/unit_tests/test_ms_matcher.py b/tests/unit_tests/test_ms_matcher.py
@@ -39,7 +39,7 @@ def test_ms_data_matcher_mass_mode(self):
         self.assertEqual(ms.tolerance, 0.0015)
         self.assertEqual(ms.compounds_to_match.shape, (266, 8))
         self.assertIsInstance(ms.matches, pd.DataFrame)
-        self.assertEqual(ms.matches.shape, (0, 8))
+        self.assertEqual(ms.matches.shape, (0, 9))
 
     def test_ms_data_matcher_massdiff_mode(self):
         ms_data_path = os.path.join(TESTS_DATA_PATH, 'ms_data_sample/ms_data.tsv')
@@ -56,7 +56,7 @@ def test_ms_data_matcher_massdiff_mode(self):
         self.assertEqual(ms.tolerance, 0.0015)
         self.assertEqual(ms.compounds_to_match.shape, (266, 9))
         self.assertIsInstance(ms.matches, pd.DataFrame)
-        self.assertEqual(ms.matches.shape, (0, 9))
+        self.assertEqual(ms.matches.shape, (0, 10))
 
     def test_ms_data_matcher_properties_and_setters(self):
         ms_data_path = os.path.join(TESTS_DATA_PATH, 'ms_data_sample/ms_data.tsv')
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-ParentCompound	ParentCompoundSmiles	ParentCompound_ExactMass	MassDiff	NewCompoundID	NewCompoundSmiles	NewCompoundExactMass	NewCompoundExactMassDiff	EC_Numbers
		+Index	ParentCompound	ParentCompoundSmiles	ParentCompound_ExactMass	MassDiff	NewCompoundID	NewCompoundSmiles	NewCompoundExactMass	NewCompoundExactMassDiff	EC_Numbers