
Improve multiprocessing #19

Merged
merged 9 commits on Oct 24, 2022
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,3 +1,4 @@
rdkit-pypi==2022.3.5
click==8.1.3
pandas==1.5.0
pandas==1.5.1
numpy==1.23.3
4 changes: 2 additions & 2 deletions setup.cfg
@@ -1,14 +1,14 @@
[metadata]
name = biocatalyzer
version = 0.0.4-beta
version = 0.0.5-beta
description = BioCatalyzer: a rule-based tool to predict compound metabolism
long_description = file: README.md
long_description_content_type = text/markdown
keywords = reaction-rules, metabolism, enzymatic-reactions, chemoinformatics, cheminformatics
author = João Correia
author_email = jfscorreia95@gmail.com
url = https://github.com/jcorreia11/BioCatalyzer
download_url = https://github.com/jcorreia11/BioCatalyzer/archive/refs/tags/v0.0.4-beta.tar.gz
download_url = https://github.com/jcorreia11/BioCatalyzer/archive/refs/tags/v0.0.5-beta.tar.gz
license = MIT
license_file = LICENSE
platforms = unix, linux, osx, cygwin, win32
49 changes: 49 additions & 0 deletions src/biocatalyzer/_utils.py
@@ -1,5 +1,8 @@
from typing import List

import numpy as np
import pandas as pd


def match_value(v: float, values: List[float], tol: float = 0.1) -> tuple:
"""
@@ -23,3 +26,49 @@ def match_value(v: float, values: List[float], tol: float = 0.1) -> tuple:
if value - tol <= v <= value + tol:
return True, i
return False, None


def _empty_dfs(dfs: List[pd.DataFrame]):
"""
Check if all dataframes in the list are empty.

Parameters
----------
dfs: List[pd.DataFrame]
The list of dataframes to check.

Returns
-------
bool:
True if all dataframes are empty. Otherwise, False.
"""
for r in dfs:
if not r.empty:
return False
return True


def _merge_fields(value):
"""
Merge a semicolon-separated field, dropping duplicate and empty values.

Parameters
----------
value: str
The semicolon-separated field values to merge.

Returns
-------
str or float:
The merged field values, or np.NaN if no non-empty values remain.
"""
if value == '':
return np.NaN
if len(value.split(';')) == 1:
return value
seen = set()
seen_add = seen.add
fields = ';'.join([x for x in value.split(';') if not (x in seen or seen_add(x) or x == '')])
if fields == '':
return np.NaN
return fields
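For context, the two new helpers can be exercised directly. A minimal sketch (assuming they are importable from biocatalyzer._utils, as the tests further down do), with expected outputs in the comments:

import numpy as np
import pandas as pd

from biocatalyzer._utils import _empty_dfs, _merge_fields

# _empty_dfs is True only when every dataframe in the list is empty
print(_empty_dfs([pd.DataFrame(), pd.DataFrame()]))            # True
print(_empty_dfs([pd.DataFrame(), pd.DataFrame({'a': [1]})]))  # False

# _merge_fields drops duplicates and empty entries from a ';'-separated field,
# preserving order, and collapses to NaN when nothing usable remains
print(_merge_fields('1.1.1.1;1.1.1.2;;1.1.1.1'))  # 1.1.1.1;1.1.1.2
print(np.isnan(_merge_fields(';;;')))             # True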
73 changes: 39 additions & 34 deletions src/biocatalyzer/bioreactor.py
@@ -1,15 +1,16 @@
import itertools
import logging
import multiprocessing
import os
import time
import uuid
from typing import Union

import numpy as np
import pandas as pd
from rdkit import RDLogger
from rdkit.Chem import MolFromSmiles

from biocatalyzer._utils import _empty_dfs, _merge_fields
from biocatalyzer.chem import ChemUtils
from biocatalyzer.io_utils import Loaders

@@ -483,6 +484,31 @@ def _match_byproducts(self, smiles: str):
else:
return False

def _match_conditions(self, smiles: str):
"""
Check if mol matches conditions to remove.

Parameters
----------
smiles: str
The smiles to check.

Returns
-------
bool
True if mol matches conditions to remove, False otherwise.
"""
if self._min_atom_count > 0:
if not self._min_atom_count_filter(smiles):
return False
if len(self._molecules_to_remove) > 0:
if self._match_byproducts(smiles):
return False
if len(self._patterns_to_remove) > 0:
if self._match_patterns(smiles):
return False
return True
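The new _match_conditions consolidates the three product filters (minimum atom count, by-product list, unwanted SMARTS patterns) that _react_single previously chained inline, returning False as soon as any of them rejects the molecule. A standalone paraphrase of that short-circuit pattern, using hypothetical stand-in predicates rather than the real BioReactor attributes:

from typing import Callable, List

def _passes_filters(smiles: str, reject_checks: List[Callable[[str], bool]]) -> bool:
    # stop at the first check that flags the molecule for removal
    for rejects in reject_checks:
        if rejects(smiles):
            return False
    return True

# toy stand-ins for the atom-count, by-product and pattern checks
too_few_atoms = lambda s: len(s) < 4
known_byproduct = lambda s: s in {'O', '[H+]', 'O=C=O'}
unwanted_pattern = lambda s: '[Se]' in s

print(_passes_filters('CC(=O)O', [too_few_atoms, known_byproduct, unwanted_pattern]))  # True
print(_passes_filters('O', [too_few_atoms, known_byproduct, unwanted_pattern]))        # False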

def _get_ec_numbers(self, reaction_rule_id: str):
"""
Get the EC numbers associated with a reaction rule.
@@ -526,25 +552,9 @@ def process_results(results: pd.DataFrame):
results = results[['OriginalCompoundID', 'OriginalCompoundSmiles', 'OriginalReactionRuleID', 'NewCompoundID',
'NewCompoundSmiles', 'NewReactionSmiles', 'EC_Numbers']]

def merge_fields(value):
if len(value.split(';')) == 1:
return value
values = []
for v in value.split(';'):
if v not in values:
values.append(v)
return ';'.join(values)
results['OriginalReactionRuleID'] = results['OriginalReactionRuleID'].apply(lambda x: merge_fields(x))
results['NewReactionSmiles'] = results['NewReactionSmiles'].apply(lambda x: merge_fields(x))

def merge_ec_numbers(x):
if x == '':
return np.NaN
x = list(set(x.split(';')))
x = [i for i in x if i != '']
return ';'.join(x)

results['EC_Numbers'] = results['EC_Numbers'].apply(lambda x: merge_ec_numbers(x))
results['OriginalReactionRuleID'] = results['OriginalReactionRuleID'].apply(lambda x: _merge_fields(x))
results['NewReactionSmiles'] = results['NewReactionSmiles'].apply(lambda x: _merge_fields(x))
results['EC_Numbers'] = results['EC_Numbers'].apply(lambda x: _merge_fields(x))
return results

def _react_single(self, smiles: str, smarts: str):
@@ -560,7 +570,8 @@ def _react_single(self, smiles: str, smarts: str):
The SMARTS string of the reaction.
"""
new_compounds = pd.DataFrame(columns=['OriginalCompoundID', 'OriginalCompoundSmiles', 'OriginalReactionRuleID',
'NewCompoundID', 'NewCompoundSmiles', 'NewReactionSmiles', 'EC_Numbers'])
'NewCompoundID', 'NewCompoundSmiles', 'NewReactionSmiles', 'EC_Numbers'],
dtype=str)
reactants = self._reaction_rules[self._reaction_rules.SMARTS == smarts].Reactants.values[0]
reactants = reactants.replace("Any", smiles).split(';')
results = ChemUtils.react(reactants, smarts)
@@ -572,9 +583,7 @@ def _react_single(self, smiles: str, smarts: str):
# keep only the most similar compound to the input compound
most_similar_product = ChemUtils.most_similar_compound(smiles, products)
if most_similar_product not in new_compounds.NewCompoundSmiles.values:
if not self._match_byproducts(most_similar_product) \
and not self._match_patterns(most_similar_product) \
and self._min_atom_count_filter(most_similar_product):
if self._match_conditions(most_similar_product):
if self._neutralize:
most_similar_product = ChemUtils.uncharge_smiles(most_similar_product)
ecs = self._get_ec_numbers(smarts_id)
@@ -588,18 +597,15 @@ def react(self):
Transform reactants into products using the reaction rules.
"""
t0 = time.time()
results_ = []
for compound in self._compounds.smiles:
with multiprocessing.Pool(self._n_jobs) as pool:
results_.extend(pool.starmap(self._react_single, zip([compound] * self._reaction_rules.shape[0],
self._reaction_rules.SMARTS)))

not_empty = [not df.empty for df in results_]
if not any(not_empty):
params = list(itertools.product(self._compounds.smiles, self._reaction_rules.SMARTS))
with multiprocessing.Pool(self._n_jobs) as pool:
results_ = pool.starmap(self._react_single, params)

if _empty_dfs(results_):
logging.info('No new compounds could be generated using these reaction rules.')
t1 = time.time()
logging.info(f"Time elapsed: {t1 - t0} seconds")
return False
return
results = pd.concat(results_)
results = self.process_results(results)

@@ -609,7 +615,6 @@ def react(self):
self._new_compounds = results
t1 = time.time()
logging.info(f"Time elapsed: {t1 - t0} seconds")
return True


if __name__ == '__main__':
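The heart of the PR is the rewrite of react() above: instead of creating and tearing down a multiprocessing.Pool once per compound, the full (compound, rule) grid is built with itertools.product and dispatched through a single pool.starmap call, and the run bails out early when _empty_dfs reports that every returned frame is empty. A minimal, self-contained sketch of that dispatch pattern, with a toy worker standing in for _react_single and made-up inputs:

import itertools
import multiprocessing


def react_single(smiles: str, smarts: str) -> str:
    # toy stand-in for BioReactor._react_single: just pair compound and rule
    return f'{smiles} + {smarts}'


if __name__ == '__main__':
    compounds = ['CCO', 'c1ccccc1']
    rules = ['rule_1', 'rule_2', 'rule_3']
    n_jobs = 2  # stand-in for self._n_jobs

    # build the parameter grid once, then fan it out through a single pool
    params = list(itertools.product(compounds, rules))
    with multiprocessing.Pool(n_jobs) as pool:
        results = pool.starmap(react_single, params)
    print(results)  # one entry per (compound, rule) pair, 6 in total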
33 changes: 16 additions & 17 deletions src/biocatalyzer/chem/_utils.py
@@ -1,24 +1,23 @@
from typing import List


def _correct_number_of_parenthesis(smiles_list: List[str]):
def _correct_number_of_parenthesis(smiles: str):
"""
Corrects the number of parenthesis in a list of SMILES strings.
Corrects the number of parentheses in a SMILES string.
Sometimes the react method returns a SMILES string with an incorrect number of parentheses.
This method corrects that issue.

Parameters
----------
smiles_list: List[str]
The list of SMILES strings to correct the parenthesis.
smiles: str
The SMILES string whose parentheses should be corrected.

Returns
-------
str
The corrected SMILES string.
"""
corrected_smiles = []
for p in smiles_list:
# deal with cases where invalid number of parentheses are generated
if (p.count('(') + p.count(')')) % 2 != 0:
if p[0] == '(':
p = p[1:]
elif p[-1] == ')':
p = p[:-1]
corrected_smiles.append(p)
return corrected_smiles
# deal with cases where an invalid number of parentheses is generated
if (smiles.count('(') + smiles.count(')')) % 2 != 0:
if smiles[0] == '(':
smiles = smiles[1:]
elif smiles[-1] == ')':
smiles = smiles[:-1]
return smiles
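Since _correct_number_of_parenthesis now takes a single SMILES string, callers map it over their own lists, as most_similar_compound does in the next file. A small usage sketch (the import path is inferred from the file layout above, and the malformed inputs are made up):

from biocatalyzer.chem._utils import _correct_number_of_parenthesis

broken = ['CO)', '(Nc1ccc(N)cc1', 'CC(=O)O']
# each string is corrected independently; balanced strings pass through unchanged
fixed = [_correct_number_of_parenthesis(s) for s in broken]
print(fixed)  # ['CO', 'Nc1ccc(N)cc1', 'CC(=O)O']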
2 changes: 1 addition & 1 deletion src/biocatalyzer/chem/chem_utils.py
@@ -247,7 +247,7 @@ def most_similar_compound(smiles: str, smiles_list: List[str]):
str
The most similar compound SMILES string.
"""
smiles_list = _correct_number_of_parenthesis(smiles_list)
smiles_list = [_correct_number_of_parenthesis(s) for s in smiles_list]
if len(smiles_list) == 1:
return smiles_list[0]
sims = [ChemUtils.calc_fingerprint_similarity(smiles, s) for s in smiles_list]
2 changes: 1 addition & 1 deletion tests/unit_tests/chem/test_chem_utils.py
@@ -186,7 +186,7 @@ def test_correct_number_of_parenthesis(self):
smiles = ['CO)',
'C[NH2+]CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12',
'(Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1OP(=O)(O)O']
corrected_smiles = _correct_number_of_parenthesis(smiles)
corrected_smiles = [_correct_number_of_parenthesis(s) for s in smiles]
self.assertEqual(corrected_smiles[0], 'CO')
self.assertEqual(corrected_smiles[1], 'C[NH2+]CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12')
self.assertEqual(corrected_smiles[2],
26 changes: 25 additions & 1 deletion tests/unit_tests/test_utils.py
@@ -1,10 +1,34 @@
from unittest import TestCase

from biocatalyzer._utils import match_value
import numpy as np
import pandas as pd

from biocatalyzer._utils import match_value, _empty_dfs, _merge_fields


class TestUtils(TestCase):

def test_match_value(self):
self.assertTrue(match_value(10, [10.1, 10.2], 0.1)[0])
self.assertFalse(match_value(10, [10.1, 10.2], 0.01)[0])

def test_empty_dfs(self):
dfs = [pd.DataFrame(), pd.DataFrame()]
self.assertTrue(_empty_dfs(dfs))
dfs = [pd.DataFrame(), pd.DataFrame({'a': [1]})]
self.assertFalse(_empty_dfs(dfs))

def test_merge_fields(self):
self.assertEqual('1.1.1.1;1.1.1.2', _merge_fields('1.1.1.1;1.1.1.2;;1.1.1.1'))
self.assertEqual('1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4', _merge_fields('1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4'))
self.assertEqual('1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4',
_merge_fields('1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4;1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4'))
self.assertTrue(np.isnan(_merge_fields(';;;')))

def test_merge_fields2(self):
self.assertEqual('field', _merge_fields('field'))
self.assertEqual('1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4', _merge_fields('1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4'))
self.assertEqual('1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4',
_merge_fields('1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4;1.1.1.1;1.1.1.2;1.1.1.3;1.1.1.4'))
self.assertTrue(np.isnan(_merge_fields('')))