Skip to content

Commit

Permalink
Merge pull request #24 from jcorreia11/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
jcorreia11 authored Oct 27, 2022
2 parents 6ce4996 + 087d14f commit f656ca4
Show file tree
Hide file tree
Showing 8 changed files with 79 additions and 45 deletions.
4 changes: 2 additions & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
[metadata]
name = biocatalyzer
version = 0.1.0-beta
version = 0.1.1-beta
description = BioCatalyzer: a rule-based tool to predict compound metabolism
long_description = file: README.md
long_description_content_type = text/markdown
keywords = reaction-rules, metabolism, enzymatic-reactions, chemoinformatics, cheminformatics
author = João Correia
author_email = [email protected]
url = https://github.com/jcorreia11/BioCatalyzer
download_url = https://github.com/jcorreia11/BioCatalyzer/archive/refs/tags/v0.1.0-beta.tar.gz
download_url = https://github.com/jcorreia11/BioCatalyzer/archive/refs/tags/v0.1.1-beta.tar.gz
license = MIT
license_file = LICENSE
platforms = unix, linux, osx, cygwin, win32
Expand Down
20 changes: 12 additions & 8 deletions src/biocatalyzer/bioreactor.py
Original file line number Diff line number Diff line change
Expand Up @@ -581,18 +581,22 @@ def _react_single(self, smiles: str, smarts: str):
if len(results) > 0:
smiles_id = self._compounds[self._compounds.smiles == smiles].compound_id.values[0]
smarts_id = self._reaction_rules[self._reaction_rules.SMARTS == smarts].InternalID.values[0]
most_similar_products_set = set()
for i, result in enumerate(results):
products = result.split('>')[-1].split('.')
# keep only the most similar compound to the input compound
most_similar_product = ChemUtils.most_similar_compound(smiles, products)
if self._match_conditions(most_similar_product):
if self._neutralize:
most_similar_product = ChemUtils.uncharge_smiles(most_similar_product)
ecs = self._get_ec_numbers(smarts_id)
with open(self._new_compounds_path, 'a') as f:
f.write(f"{smiles_id}\t{smiles}\t{smarts_id}\t{smiles_id}_{uuid.uuid4()}\t"
f"{most_similar_product}\t{result}\t{ecs}\n")
self._new_compounds_flag = True
most_similar_product = ChemUtils.smiles_to_isomerical_smiles(most_similar_product)
if most_similar_product not in most_similar_products_set:
most_similar_products_set.add(most_similar_product)
if self._match_conditions(most_similar_product):
if self._neutralize:
most_similar_product = ChemUtils.uncharge_smiles(most_similar_product)
ecs = self._get_ec_numbers(smarts_id)
with open(self._new_compounds_path, 'a') as f:
f.write(f"{smiles_id}\t{smiles}\t{smarts_id}\t{smiles_id}_{uuid.uuid4()}\t"
f"{most_similar_product}\t{result}\t{ecs}\n")
self._new_compounds_flag = True

def react(self):
"""
Expand Down
47 changes: 36 additions & 11 deletions src/biocatalyzer/chem/chem_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,22 @@ class ChemUtils:
"""

@staticmethod
def mol_to_isomerical_smiles(mol: Mol):
def smiles_to_isomerical_smiles(smiles: str):
"""
Converts a SMILES string to its canonical isomeric SMILES.
Parameters
----------
mol: Mol
The molecule to convert.
smiles: str
The SMILES of the molecule.
Returns
-------
str
The SMILES string.
"""
try:
return MolToSmiles(RemoveHs(mol), isomericSmiles=True)
return MolToSmiles(RemoveHs(MolFromSmiles(smiles)), isomericSmiles=True)
except TypeError:
return None

Expand Down Expand Up @@ -100,6 +100,29 @@ def _remove_hs(mol: Mol):
except Chem.rdchem.AtomKekulizeException:
return mol

@staticmethod
def _sanitize_mol(mol: Mol):
    """
    Run RDKit sanitization on a molecule, reporting failure as None.

    Parameters
    ----------
    mol: Mol
        The molecule to sanitize. May be None.

    Returns
    -------
    Mol
        The same molecule object after ``Chem.SanitizeMol`` has been applied to it,
        or None when the input is None or sanitization raises a ValueError.
    """
    sanitized = None
    if mol is not None:
        try:
            # SanitizeMol works on the molecule itself; its return value is not used.
            Chem.SanitizeMol(mol)
            sanitized = mol
        except ValueError:
            sanitized = None
    return sanitized

@staticmethod
def react(smiles: Union[str, List[str]], smarts: str):
"""
Expand Down Expand Up @@ -127,7 +150,7 @@ def react(smiles: Union[str, List[str]], smarts: str):
try:
return ChemUtils._create_reaction_instances(reaction, mol)
except ValueError:
return None
return []

@staticmethod
def _create_reaction_instances(rxn: ChemicalReaction, reactants: List[Mol]):
Expand All @@ -149,12 +172,14 @@ def _create_reaction_instances(rxn: ChemicalReaction, reactants: List[Mol]):
res = []
ps = rxn.RunReactants(reactants)
for pset in ps:
tres = ChemicalReaction()
for p in pset:
tres.AddProductTemplate(ChemUtils._remove_hs(p))
for reactant in reactants:
tres.AddReactantTemplate(ChemUtils._remove_hs(reactant))
res.append(tres)
pset = [ChemUtils._sanitize_mol(pset_i) for pset_i in pset]
if None not in pset:
tres = ChemicalReaction()
for p in pset:
tres.AddProductTemplate(ChemUtils._remove_hs(p))
for reactant in reactants:
tres.AddReactantTemplate(ChemUtils._remove_hs(reactant))
res.append(tres)
return list(set([AllChem.ReactionToSmiles(entry, canonical=True) for entry in res]))

@staticmethod
Expand Down
8 changes: 4 additions & 4 deletions src/biocatalyzer/matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,13 +289,13 @@ def _match_masses(self):
pd.DataFrame:
pandas dataframe with the matches.
"""
ms_df = pd.DataFrame(columns=['ParentCompound', 'ParentCompoundSmiles', "ParentCompound_ExactMass",
ms_df = pd.DataFrame(columns=['Index', 'ParentCompound', 'ParentCompoundSmiles', "ParentCompound_ExactMass",
self._ms_field, 'NewCompoundID', 'NewCompoundSmiles', 'NewCompoundExactMass',
'EC_Numbers'])
for i, row in self._new_compounds.iterrows():
mv, mi = match_value(row['NewCompoundExactMass'], self._ms_data[self._ms_field].values, self._tolerance)
if mv and self._ms_data.loc[mi, 'ParentCompound'] == '_'.join(row['NewCompoundID'].split('_')[:-1]):
ms_df.loc[len(ms_df)] = [self._ms_data.loc[mi, 'ParentCompound'],
ms_df.loc[len(ms_df)] = [mi, self._ms_data.loc[mi, 'ParentCompound'],
self._ms_data.loc[mi, 'ParentCompoundSmiles'],
ChemUtils.calc_exact_mass(self._ms_data.loc[mi, 'ParentCompoundSmiles']),
self._ms_data.loc[mi, self._ms_field],
Expand All @@ -314,13 +314,13 @@ def _match_mass_diff(self):
pd.DataFrame:
pandas dataframe with the matches.
"""
ms_df = pd.DataFrame(columns=['ParentCompound', 'ParentCompoundSmiles', "ParentCompound_ExactMass",
ms_df = pd.DataFrame(columns=['Index', 'ParentCompound', 'ParentCompoundSmiles', "ParentCompound_ExactMass",
self._ms_field, 'NewCompoundID', 'NewCompoundSmiles', 'NewCompoundExactMass',
'NewCompoundExactMassDiff', 'EC_Numbers'])
for i, row in self._new_compounds.iterrows():
mv, mi = match_value(row['NewCompoundExactMassDiff'], self._ms_data[self._ms_field].values, self._tolerance)
if mv and self._ms_data.loc[mi, 'ParentCompound'] == row['NewCompoundID'].split('_')[0]:
ms_df.loc[len(ms_df)] = [self._ms_data.loc[mi, 'ParentCompound'],
ms_df.loc[len(ms_df)] = [mi, self._ms_data.loc[mi, 'ParentCompound'],
self._ms_data.loc[mi, 'ParentCompoundSmiles'],
ChemUtils.calc_exact_mass(self._ms_data.loc[mi, 'ParentCompoundSmiles']),
self._ms_data.loc[mi, self._ms_field],
Expand Down
2 changes: 1 addition & 1 deletion tests/data/results_sample/matches.tsv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
ParentCompound ParentCompoundSmiles ParentCompound_ExactMass MassDiff NewCompoundID NewCompoundSmiles NewCompoundExactMass NewCompoundExactMassDiff EC_Numbers
Index ParentCompound ParentCompoundSmiles ParentCompound_ExactMass MassDiff NewCompoundID NewCompoundSmiles NewCompoundExactMass NewCompoundExactMassDiff EC_Numbers
34 changes: 21 additions & 13 deletions tests/unit_tests/chem/test_chem_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from unittest import TestCase

from rdkit import RDLogger
from rdkit.Chem import MolFromSmiles, MolToInchi
from rdkit.Chem.rdChemReactions import ChemicalReaction
from rdkit.Chem import MolFromSmiles, MolToInchi, Mol
from rdkit.Chem.rdChemReactions import ChemicalReaction, ReactionFromSmarts

from biocatalyzer.chem import ChemUtils
from biocatalyzer.chem._utils import _correct_number_of_parenthesis
Expand All @@ -12,22 +12,22 @@ class TestChemUtils(TestCase):
# mute rdkit logs
RDLogger.DisableLog('rdApp.*')

def test_mol_to_isomerical_smiles(self):
def test_smiles_to_isomerical_smiles(self):
smiles = ['CN1C=NC2=C1C(=O)N(C(=O)N2C)C',
'C(C1C(C(C(C(O1)O)O)O)O)O',
'CC(=O)OC1=CC=CC=C1C(=O)O']
mols = [MolFromSmiles(s) for s in smiles]
invalid_smiles = 'C(C1C(C(C(C(O1)O)O)O)O)O('
invalid_mol = MolFromSmiles(invalid_smiles)

def same_compound(mol1, mol2):
def same_compound(smiles1, smiles2):
mol1 = MolFromSmiles(smiles1)
mol2 = MolFromSmiles(smiles2)
return MolToInchi(mol1) == MolToInchi(mol2)

for i, m in enumerate(mols):
self.assertNotEqual(ChemUtils.mol_to_isomerical_smiles(m), smiles[i])
self.assertTrue(same_compound(m, MolFromSmiles(ChemUtils.mol_to_isomerical_smiles(m))))
for i, m in enumerate(smiles):
self.assertNotEqual(ChemUtils.smiles_to_isomerical_smiles(m), smiles[i])
self.assertTrue(same_compound(m, ChemUtils.smiles_to_isomerical_smiles(m)))

self.assertIsNone(ChemUtils.mol_to_isomerical_smiles(invalid_mol))
self.assertIsNone(ChemUtils.smiles_to_isomerical_smiles(invalid_smiles))

def test_validate_smiles(self):
smiles = ['CN1C=NC2=C1C(=O)N(C(=O)N2C)C',
Expand Down Expand Up @@ -73,6 +73,14 @@ def check_if_molecule_has_hydrogens(mol):
atom_valence_exception_mol = MolFromSmiles('CN(C)(C)C', sanitize=False)
self.assertEqual(ChemUtils._remove_hs(atom_valence_exception_mol), atom_valence_exception_mol)

def test_sanitized_mol(self):
    """Check that _sanitize_mol returns a Mol for a valid molecule and None for an invalid product."""
    valid_mol = MolFromSmiles('C(C)(C)(C)O')
    sanitized = ChemUtils._sanitize_mol(valid_mol)
    self.assertIsInstance(sanitized, Mol)

    # The rule rewrites a C-O single bond as C=O; the first product of the first
    # product set is expected to be rejected by sanitization.
    oxidation_rule = ReactionFromSmarts('[#6:1][O:2]>>[#6:1]=[O:2]')
    product_sets = oxidation_rule.RunReactants((valid_mol, ))
    first_product = product_sets[0][0]
    self.assertIsNone(ChemUtils._sanitize_mol(first_product))

def test_react(self):
smiles = ['CN1C=NC2=C1C(=O)N(C(=O)N2C)C',
'C(C1C(C(C(C(O1)O)O)O)O)O',
Expand All @@ -83,7 +91,7 @@ def test_react(self):
'[#6:1]-[#6H1:2]=[O:3].[#8:4]-[#8:5]>>[#6:1]-[#6:2](-[#8:5])=[O:3].[#8:4]']

for s in smarts:
self.assertIsNone(ChemUtils.react(smiles, s))
self.assertEqual(len(ChemUtils.react(smiles, s)), 0)

known_reactant = 'Nc1nc(NC2CC2)c2ncn(C3C=CC(CO)C3)c2n1'
coreactant = 'O=C1C=CC=CC1=O'
Expand All @@ -93,7 +101,7 @@ def test_react(self):
self.assertEqual(known_reactant, reaction_smiles[0].split('.')[0])
self.assertEqual(coreactant, reaction_smiles[0].split('.')[1].split('>>')[0])

self.assertIsNone(ChemUtils.react(smiles[0], smarts[0]))
self.assertEqual(len(ChemUtils.react(smiles[0], smarts[0])), 0)

invalid_smiles = 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C('
self.assertEqual(len(ChemUtils.react(invalid_smiles, smarts[1])), 0)
Expand All @@ -104,7 +112,7 @@ def test_create_reaction_instances(self):
reactants_smiles = 'C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12;O;*C1=C(*)C(=O)C(*)=C(*)C1=O'
reactants = [MolFromSmiles(s) for s in reactants_smiles.split(';')]
reaction_instances = ChemUtils._create_reaction_instances(rxn, reactants)
self.assertEqual(3, len(reaction_instances))
self.assertEqual(2, len(reaction_instances))
for instance in reaction_instances:
self.assertEqual(3, len(instance.split('>')))
for reac in reactants_smiles.split(';'):
Expand Down
5 changes: 1 addition & 4 deletions tests/unit_tests/test_bioreactor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
import shutil
from unittest import TestCase

import pandas as pd

from biocatalyzer.bioreactor import BioReactor

from tests import TESTS_DATA_PATH
Expand Down Expand Up @@ -60,7 +58,7 @@ def test_bioreactor_all_orgs(self):
_ = br_no_orgs_filter.new_compounds

r = br_no_orgs_filter.process_results(False)
self.assertEqual(r.shape, (380, 7))
self.assertEqual(r.shape, (352, 7))

def test_bioreactor_all_orgs_keep_all(self):
compounds_path = os.path.join(TESTS_DATA_PATH, 'compounds_sample/compounds.tsv')
Expand Down Expand Up @@ -150,4 +148,3 @@ def test_bioreactor_properties_and_setters(self):
_ = br.n_jobs
br.n_jobs = -1
br.n_jobs = 6

4 changes: 2 additions & 2 deletions tests/unit_tests/test_ms_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def test_ms_data_matcher_mass_mode(self):
self.assertEqual(ms.tolerance, 0.0015)
self.assertEqual(ms.compounds_to_match.shape, (266, 8))
self.assertIsInstance(ms.matches, pd.DataFrame)
self.assertEqual(ms.matches.shape, (0, 8))
self.assertEqual(ms.matches.shape, (0, 9))

def test_ms_data_matcher_massdiff_mode(self):
ms_data_path = os.path.join(TESTS_DATA_PATH, 'ms_data_sample/ms_data.tsv')
Expand All @@ -56,7 +56,7 @@ def test_ms_data_matcher_massdiff_mode(self):
self.assertEqual(ms.tolerance, 0.0015)
self.assertEqual(ms.compounds_to_match.shape, (266, 9))
self.assertIsInstance(ms.matches, pd.DataFrame)
self.assertEqual(ms.matches.shape, (0, 9))
self.assertEqual(ms.matches.shape, (0, 10))

def test_ms_data_matcher_properties_and_setters(self):
ms_data_path = os.path.join(TESTS_DATA_PATH, 'ms_data_sample/ms_data.tsv')
Expand Down

0 comments on commit f656ca4

Please sign in to comment.