
Improve multiprocessing #19

Merged
merged 9 commits on Oct 24, 2022
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,3 +1,4 @@
rdkit-pypi==2022.3.5
click==8.1.3
pandas==1.5.0
pandas==1.5.1
numpy==1.23.3
4 changes: 2 additions & 2 deletions setup.cfg
@@ -1,14 +1,14 @@
[metadata]
name = biocatalyzer
version = 0.0.4-beta
version = 0.0.5-beta
description = BioCatalyzer: a rule-based tool to predict compound metabolism
long_description = file: README.md
long_description_content_type = text/markdown
keywords = reaction-rules, metabolism, enzymatic-reactions, chemoinformatics, cheminformatics
author = João Correia
author_email = jfscorreia95@gmail.com
url = https://github.com/jcorreia11/BioCatalyzer
download_url = https://github.com/jcorreia11/BioCatalyzer/archive/refs/tags/v0.0.4-beta.tar.gz
download_url = https://github.com/jcorreia11/BioCatalyzer/archive/refs/tags/v0.0.5-beta.tar.gz
license = MIT
license_file = LICENSE
platforms = unix, linux, osx, cygwin, win32
49 changes: 49 additions & 0 deletions src/biocatalyzer/_utils.py
@@ -1,5 +1,8 @@
from typing import List

import numpy as np
import pandas as pd


def match_value(v: float, values: List[float], tol: float = 0.1) -> tuple:
"""
@@ -23,3 +26,49 @@ def match_value(v: float, values: List[float], tol: float = 0.1) -> tuple:
if value - tol <= v <= value + tol:
return True, i
return False, None


def _empty_dfs(dfs: List[pd.DataFrame]):
"""
Check if all dataframes in the list are empty.

Parameters
----------
dfs: List[pd.DataFrame]
The list of dataframes to check.

Returns
-------
bool:
True if all dataframes are empty. Otherwise, False.
"""
for r in dfs:
if not r.empty:
return False
return True


def _merge_fields(value):
"""
Merge a semicolon-separated field, dropping duplicate and empty values.

Parameters
----------
value: str
The semicolon-separated field values to merge.

Returns
-------
str or float:
The merged field values, or np.NaN if no non-empty values remain.
"""
if value == '':
return np.NaN
if len(value.split(';')) == 1:
return value
seen = set()
seen_add = seen.add
fields = ';'.join([x for x in value.split(';') if not (x in seen or seen_add(x) or x == '')])
if fields == '':
return np.NaN
return fields
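For context, the two new helpers can be exercised directly. A minimal sketch (assuming they are importable from biocatalyzer._utils, as the tests further down do), with expected outputs in the comments:

import numpy as np
import pandas as pd

from biocatalyzer._utils import _empty_dfs, _merge_fields

# _empty_dfs is True only when every dataframe in the list is empty
print(_empty_dfs([pd.DataFrame(), pd.DataFrame()]))            # True
print(_empty_dfs([pd.DataFrame(), pd.DataFrame({'a': [1]})]))  # False

# _merge_fields drops duplicates and empty entries from a ';'-separated field,
# preserving order, and collapses to NaN when nothing usable remains
print(_merge_fields('1.1.1.1;1.1.1.2;;1.1.1.1'))  # 1.1.1.1;1.1.1.2
print(np.isnan(_merge_fields(';;;')))             # True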
73 changes: 39 additions & 34 deletions src/biocatalyzer/bioreactor.py
@@ -1,15 +1,16 @@
import itertools
import logging
import multiprocessing
import os
import time
import uuid
from typing import Union

import numpy as np
import pandas as pd
from rdkit import RDLogger
from rdkit.Chem import MolFromSmiles

from biocatalyzer._utils import _empty_dfs, _merge_fields
from biocatalyzer.chem import ChemUtils
from biocatalyzer.io_utils import Loaders

@@ -483,6 +484,31 @@ def _match_byproducts(self, smiles: str):
else:
return False

def _match_conditions(self, smiles: str):
"""
Check if mol matches conditions to remove.

Parameters
----------
smiles: str
The smiles to check.

Returns
-------
bool
True if mol matches conditions to remove, False otherwise.
"""
if self._min_atom_count > 0:
if not self._min_atom_count_filter(smiles):
return False
if len(self._molecules_to_remove) > 0:
if self._match_byproducts(smiles):
return False
if len(self._patterns_to_remove) > 0:
if self._match_patterns(smiles):
return False
return True
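The new _match_conditions consolidates the three product filters (minimum atom count, by-product list, unwanted SMARTS patterns) that _react_single previously chained inline, returning False as soon as any of them rejects the molecule. A standalone paraphrase of that short-circuit pattern, using hypothetical stand-in predicates rather than the real BioReactor attributes:

from typing import Callable, List

def _passes_filters(smiles: str, reject_checks: List[Callable[[str], bool]]) -> bool:
    # stop at the first check that flags the molecule for removal
    for rejects in reject_checks:
        if rejects(smiles):
            return False
    return True

# toy stand-ins for the atom-count, by-product and pattern checks
too_few_atoms = lambda s: len(s) < 4
known_byproduct = lambda s: s in {'O', '[H+]', 'O=C=O'}
unwanted_pattern = lambda s: '[Se]' in s

print(_passes_filters('CC(=O)O', [too_few_atoms, known_byproduct, unwanted_pattern]))  # True
print(_passes_filters('O', [too_few_atoms, known_byproduct, unwanted_pattern]))        # False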

def _get_ec_numbers(self, reaction_rule_id: str):
"""
Get the EC numbers associated with a reaction rule.
@@ -526,25 +552,9 @@ def process_results(results: pd.DataFrame):
results = results[['OriginalCompoundID', 'OriginalCompoundSmiles', 'OriginalReactionRuleID', 'NewCompoundID',
'NewCompoundSmiles', 'NewReactionSmiles', 'EC_Numbers']]

def merge_fields(value):
if len(value.split(';')) == 1:
return value
values = []
for v in value.split(';'):
if v not in values:
values.append(v)
return ';'.join(values)
results['OriginalReactionRuleID'] = results['OriginalReactionRuleID'].apply(lambda x: merge_fields(x))
results['NewReactionSmiles'] = results['NewReactionSmiles'].apply(lambda x: merge_fields(x))

def merge_ec_numbers(x):
if x == '':
return np.NaN
x = list(set(x.split(';')))
x = [i for i in x if i != '']
return ';'.join(x)

results['EC_Numbers'] = results['EC_Numbers'].apply(lambda x: merge_ec_numbers(x))
results['OriginalReactionRuleID'] = results['OriginalReactionRuleID'].apply(lambda x: _merge_fields(x))
results['NewReactionSmiles'] = results['NewReactionSmiles'].apply(lambda x: _merge_fields(x))
results['EC_Numbers'] = results['EC_Numbers'].apply(lambda x: _merge_fields(x))
return results

def _react_single(self, smiles: str, smarts: str):
@@ -560,7 +570,8 @@ def _react_single(self, smiles: str, smarts: str):
The SMARTS string of the reaction.
"""
new_compounds = pd.DataFrame(columns=['OriginalCompoundID', 'OriginalCompoundSmiles', 'OriginalReactionRuleID',
'NewCompoundID', 'NewCompoundSmiles', 'NewReactionSmiles', 'EC_Numbers'])
'NewCompoundID', 'NewCompoundSmiles', 'NewReactionSmiles', 'EC_Numbers'],
dtype=str)
reactants = self._reaction_rules[self._reaction_rules.SMARTS == smarts].Reactants.values[0]
reactants = reactants.replace("Any", smiles).split(';')
results = ChemUtils.react(reactants, smarts)
@@ -572,9 +583,7 @@ def _react_single(self, smiles: str, smarts: str):
# keep only the most similar compound to the input compound
most_similar_product = ChemUtils.most_similar_compound(smiles, products)
if most_similar_product not in new_compounds.NewCompoundSmiles.values:
if not self._match_byproducts(most_similar_product) \
and not self._match_patterns(most_similar_product) \
and self._min_atom_count_filter(most_similar_product):
if self._match_conditions(most_similar_product):
if self._neutralize:
most_similar_product = ChemUtils.uncharge_smiles(most_similar_product)
ecs = self._get_ec_numbers(smarts_id)
@@ -588,18 +597,15 @@ def react(self):
Transform reactants into products using the reaction rules.
"""
t0 = time.time()
results_ = []
for compound in self._compounds.smiles:
with multiprocessing.Pool(self._n_jobs) as pool:
results_.extend(pool.starmap(self._react_single, zip([compound] * self._reaction_rules.shape[0],
self._reaction_rules.SMARTS)))

not_empty = [not df.empty for df in results_]
if not any(not_empty):
params = list(itertools.product(self._compounds.smiles, self._reaction_rules.SMARTS))
with multiprocessing.Pool(self._n_jobs) as pool:
results_ = pool.starmap(self._react_single, params)

if _empty_dfs(results_):
logging.info('No new compounds could be generated using these reaction rules.')
t1 = time.time()
logging.info(f"Time elapsed: {t1 - t0} seconds")
return False
return
results = pd.concat(results_)
results = self.process_results(results)

@@ -609,7 +615,6 @@ def react(self):
self._new_compounds = results
t1 = time.time()
logging.info(f"Time elapsed: {t1 - t0} seconds")
return True


if __name__ == '__main__':
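The heart of the PR is the rewrite of react() above: instead of creating and tearing down a multiprocessing.Pool once per compound, the full (compound, rule) grid is built with itertools.product and dispatched through a single pool.starmap call, and the run bails out early when _empty_dfs reports that every returned frame is empty. A minimal, self-contained sketch of that dispatch pattern, with a toy worker standing in for _react_single and made-up inputs:

import itertools
import multiprocessing


def react_single(smiles: str, smarts: str) -> str:
    # toy stand-in for BioReactor._react_single: just pair compound and rule
    return f'{smiles} + {smarts}'


if __name__ == '__main__':
    compounds = ['CCO', 'c1ccccc1']
    rules = ['rule_1', 'rule_2', 'rule_3']
    n_jobs = 2  # stand-in for self._n_jobs

    # build the parameter grid once, then fan it out through a single pool
    params = list(itertools.product(compounds, rules))
    with multiprocessing.Pool(n_jobs) as pool:
        results = pool.starmap(react_single, params)
    print(results)  # one entry per (compound, rule) pair, 6 in total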
33 changes: 16 additions & 17 deletions src/biocatalyzer/chem/_utils.py
@@ -1,24 +1,23 @@
from typing import List


def _correct_number_of_parenthesis(smiles_list: List[str]):
def _correct_number_of_parenthesis(smiles: str):
"""
Corrects the number of parenthesis in a list of SMILES strings.
Corrects the number of parentheses in a SMILES string.
Sometimes the react method returns a SMILES string with an incorrect number of parentheses.
This method corrects that issue.

Parameters
----------
smiles_list: List[str]
The list of SMILES strings to correct the parenthesis.
smiles: str
The SMILES string whose parentheses should be corrected.

Returns
-------
str
The corrected SMILES string.
"""
corrected_smiles = []
for p in smiles_list:
# deal with cases where invalid number of parentheses are generated
if (p.count('(') + p.count(')')) % 2 != 0:
if p[0] == '(':
p = p[1:]
elif p[-1] == ')':
p = p[:-1]
corrected_smiles.append(p)
return corrected_smiles
# deal with cases where an invalid number of parentheses is generated
if (smiles.count('(') + smiles.count(')')) % 2 != 0:
if smiles[0] == '(':
smiles = smiles[1:]
elif smiles[-1] == ')':
smiles = smiles[:-1]
return smiles
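Since _correct_number_of_parenthesis now takes a single SMILES string, callers map it over their own lists, as most_similar_compound does in the next file. A small usage sketch (the import path is inferred from the file layout above, and the malformed inputs are made up):

from biocatalyzer.chem._utils import _correct_number_of_parenthesis

broken = ['CO)', '(Nc1ccc(N)cc1', 'CC(=O)O']
# each string is corrected independently; balanced strings pass through unchanged
fixed = [_correct_number_of_parenthesis(s) for s in broken]
print(fixed)  # ['CO', 'Nc1ccc(N)cc1', 'CC(=O)O']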
2 changes: 1 addition & 1 deletion src/biocatalyzer/chem/chem_utils.py
@@ -247,7 +247,7 @@ def most_similar_compound(smiles: str, smiles_list: List[str]):
str
The most similar compound SMILES string.
"""
smiles_list = _correct_number_of_parenthesis(smiles_list)
smiles_list = [_correct_number_of_parenthesis(s) for s in smiles_list]
if len(smiles_list) == 1:
return smiles_list[0]
sims = [ChemUtils.calc_fingerprint_similarity(smiles, s) for s in smiles_list]
2 changes: 1 addition & 1 deletion tests/unit_tests/chem/test_chem_utils.py
@@ -186,7 +186,7 @@ def test_correct_number_of_parenthesis(self):
smiles = ['CO)',
'C[NH2+]CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12',
'(Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1OP(=O)(O)O']
corrected_smiles = _correct_number_of_parenthesis(smiles)
corrected_smiles = [_correct_number_of_parenthesis(s) for s in smiles]
self.assertEqual(corrected_smiles[0], 'CO')
self.assertEqual(corrected_smiles[1], 'C[NH2+]CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12')
self.assertEqual(corrected_smiles[2],
26 changes: 25 additions & 1 deletion tests/unit_tests/test_utils.py
@@ -1,10 +1,34 @@
from unittest import TestCase

from biocatalyzer._utils import match_value
import numpy as np
import pandas as pd

from biocatalyzer._utils import match_value, _empty_dfs, _merge_fields


class TestUtils(TestCase):

def test_match_value(self):
self.assertTrue(match_value(10, [10.1, 10.2], 0.1)[0])
self.assertFalse(match_value(10, [10.1, 10.2], 0.01)[0])

def test_empty_dfs(self):
dfs = [pd.DataFrame(), pd.DataFrame()]
self.assertTrue(_empty_dfs(dfs))
dfs = [pd.DataFrame(), pd.DataFrame({'a': [1]})]
self.assertFalse(_empty_dfs(dfs))

def test_merge_fields(self):
self.assertEqual('1.1.1.1;1.1.1.2', _merge_fields('1.1.1.1;1.1.1.2;;1.1.1.1'))
self.assertEqual('1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4', _merge_fields('1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4'))
self.assertEqual('1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4',
_merge_fields('1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4;1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4'))
self.assertTrue(np.isnan(_merge_fields(';;;')))

def test_merge_fields2(self):
self.assertEqual('field', _merge_fields('field'))
self.assertEqual('1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4', _merge_fields('1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4'))
self.assertEqual('1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4',
_merge_fields('1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4;1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4'))
self.assertTrue(np.isnan(_merge_fields('')))