Skip to content

Commit

Permalink
Merge pull request #21 from jcorreia11/files_in_disk
Browse files: browse the repository at this point in the history
Files in disk
  • Loading branch information
jcorreia11 authored Oct 25, 2022
2 parents 250a0cd + ac800d6 commit c2b9687
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 63 deletions.
1 change: 1 addition & 0 deletions src/biocatalyzer/_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os.path
from typing import List

import numpy as np
Expand Down
68 changes: 31 additions & 37 deletions src/biocatalyzer/bioreactor.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from rdkit.Chem import MolFromSmiles
from tqdm import tqdm

from biocatalyzer._utils import _empty_dfs, _merge_fields
from biocatalyzer._utils import _merge_fields
from biocatalyzer.chem import ChemUtils
from biocatalyzer.io_utils import Loaders

Expand Down Expand Up @@ -79,7 +79,9 @@ def __init__(self,
self._n_jobs = multiprocessing.cpu_count()
else:
self._n_jobs = n_jobs
self._new_compounds_path = os.path.join(self._output_path, 'new_compounds.tsv')
self._new_compounds = None
self._new_compounds_flag = False

@property
def compounds(self):
Expand Down Expand Up @@ -147,20 +149,20 @@ def new_compounds(self):
pd.DataFrame
The new compounds generated.
"""
if isinstance(self._new_compounds, pd.DataFrame):
return self._new_compounds
if self._new_compounds_flag:
return Loaders.load_compounds(self._new_compounds_path, False)
else:
raise ValueError('No compounds generated yet. Run the BioReactor react method first.')

@new_compounds.setter
def new_compounds(self, new_compounds: pd.DataFrame):
def new_compounds(self, new_compounds: str):
"""
Set the new compounds generated.
Parameters
----------
new_compounds: pd.DataFrame
The new compounds generated.
new_compounds: str
The path to the file containing the new compounds generated.
"""
raise AttributeError('New compounds cannot be set manually! You need to run the react method!')

Expand Down Expand Up @@ -526,22 +528,22 @@ def _get_ec_numbers(self, reaction_rule_id: str):
"""
return self._reaction_rules[self._reaction_rules.InternalID == reaction_rule_id].EC_Numbers.values[0]

@staticmethod
def process_results(results: pd.DataFrame):
def process_results(self, save: bool = True):
"""
Process the results of the reactor.
Group results by unique SMILES and merges the other columns.
Parameters
----------
results: pd.DataFrame
The results dataframe to process.
save: bool
If True, save the results to a file.
Returns
-------
pd.DataFrame
The processed results.
"""
results = pd.read_csv(self._new_compounds_path, sep='\t', header=0)
results.EC_Numbers = results.EC_Numbers.fillna('')
results = results.groupby(['OriginalCompoundID', 'NewCompoundSmiles']).agg({'OriginalCompoundSmiles': 'first',
'OriginalReactionRuleID': ';'.join,
Expand All @@ -556,6 +558,9 @@ def process_results(results: pd.DataFrame):
results['OriginalReactionRuleID'] = results['OriginalReactionRuleID'].apply(lambda x: _merge_fields(x))
results['NewReactionSmiles'] = results['NewReactionSmiles'].apply(lambda x: _merge_fields(x))
results['EC_Numbers'] = results['EC_Numbers'].apply(lambda x: _merge_fields(x))
if save:
results_file_proc = os.path.join(self._output_path, 'new_compounds_processed.tsv')
results.to_csv(results_file_proc, sep='\t', index=False)
return results

def _react_single(self, smiles: str, smarts: str):
Expand All @@ -570,9 +575,6 @@ def _react_single(self, smiles: str, smarts: str):
smarts: str
The SMARTS string of the reaction.
"""
new_compounds = pd.DataFrame(columns=['OriginalCompoundID', 'OriginalCompoundSmiles', 'OriginalReactionRuleID',
'NewCompoundID', 'NewCompoundSmiles', 'NewReactionSmiles', 'EC_Numbers'],
dtype=str)
reactants = self._reaction_rules[self._reaction_rules.SMARTS == smarts].Reactants.values[0]
reactants = reactants.replace("Any", smiles).split(';')
results = ChemUtils.react(reactants, smarts)
Expand All @@ -583,40 +585,32 @@ def _react_single(self, smiles: str, smarts: str):
products = result.split('>')[-1].split('.')
# keep only the most similar compound to the input compound
most_similar_product = ChemUtils.most_similar_compound(smiles, products)
if most_similar_product not in new_compounds.NewCompoundSmiles.values:
if self._match_conditions(most_similar_product):
if self._neutralize:
most_similar_product = ChemUtils.uncharge_smiles(most_similar_product)
ecs = self._get_ec_numbers(smarts_id)
new_compounds.loc[len(new_compounds)] = [smiles_id, smiles, smarts_id,
f"{smiles_id}_{uuid.uuid4()}", most_similar_product,
result, ecs]
return new_compounds
if self._match_conditions(most_similar_product):
if self._neutralize:
most_similar_product = ChemUtils.uncharge_smiles(most_similar_product)
ecs = self._get_ec_numbers(smarts_id)
with open(self._new_compounds_path, 'a') as f:
f.write(f"{smiles_id}\t{smiles}\t{smarts_id}\t{smiles_id}_{uuid.uuid4()}\t"
f"{most_similar_product}\t{result}\t{ecs}\n")
self._new_compounds_flag = True

def react(self):
"""
Transform reactants into products using the reaction rules.
"""
t0 = time.time()
with open(self._new_compounds_path, 'w') as f:
f.write('OriginalCompoundID\tOriginalCompoundSmiles\tOriginalReactionRuleID\tNewCompoundID\t'
'NewCompoundSmiles\tNewReactionSmiles\tEC_Numbers\n')
params = list(itertools.product(self._compounds.smiles, self._reaction_rules.SMARTS))
with multiprocessing.Pool(self._n_jobs) as pool:
results_ = pool.starmap(self._react_single, tqdm(params, total=len(params)))

if _empty_dfs(results_):
logging.info('No new compounds could be generated using this reaction rules.')
t1 = time.time()
logging.info(f"Time elapsed: {t1 - t0} seconds")
return False
results = pd.concat(results_)
results = self.process_results(results)

results.to_csv(self._output_path + '/new_compounds.tsv', sep='\t', index=False)
logging.info(f"New compounds saved to {self._output_path}new_compounds.tsv")
logging.info(f"{results.shape[0]} unique new compounds generated!")
self._new_compounds = results
pool.starmap(self._react_single, tqdm(params, total=len(params)))
self._new_compounds = f"New products saved to {self._new_compounds_path}"
t1 = time.time()
logging.info(f"Time elapsed: {t1 - t0} seconds")
return True
if self._new_compounds_flag:
return True
return False


if __name__ == '__main__':
Expand Down
5 changes: 3 additions & 2 deletions src/biocatalyzer/clis/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,9 @@ def biocatalyzer_cli(compounds,
molecules_to_remove_path=molecules_to_remove,
min_atom_count=min_atom_count,
n_jobs=n_jobs)
logging.basicConfig(filename=f'{output_path}logging.log', level=logging.DEBUG)
logging.basicConfig(filename=f'{output_path}_logging.log', level=logging.DEBUG)
brr = br.react()
br.process_results()

if match_ms_data:
if not ms_data_path:
Expand All @@ -134,7 +135,7 @@ def biocatalyzer_cli(compounds,
else:

ms = MSDataMatcher(ms_data_path=ms_data_path,
compounds_to_match=os.path.join(output_path, 'new_compounds.tsv'),
compounds_to_match_path=os.path.join(output_path, 'new_compounds_processed.tsv'),
output_path=output_path,
mode=mode,
tolerance=tolerance)
Expand Down
2 changes: 1 addition & 1 deletion src/biocatalyzer/clis/cli_bioreactor.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def bioreactor_cli(compounds,
molecules_to_remove_path=molecules_to_remove,
min_atom_count=min_atom_count,
n_jobs=n_jobs)
logging.basicConfig(filename=f'{output_path}logging.log', level=logging.DEBUG)
logging.basicConfig(filename=f'{output_path}_logging.log', level=logging.DEBUG)
br.react()


Expand Down
4 changes: 2 additions & 2 deletions src/biocatalyzer/clis/cli_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,11 @@ def matcher_cli(ms_data,
output_path: Path to the output directory.
"""
ms = MSDataMatcher(ms_data_path=ms_data,
compounds_to_match=compounds_to_match,
compounds_to_match_path=compounds_to_match,
output_path=output_path,
mode=mode,
tolerance=tolerance)
logging.basicConfig(filename=f'{output_path}logging.log', level=logging.DEBUG)
logging.basicConfig(filename=f'{output_path}_logging.log', level=logging.DEBUG)
ms.generate_ms_results()


Expand Down
21 changes: 9 additions & 12 deletions src/biocatalyzer/matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class MSDataMatcher:
def __init__(self,
ms_data_path: str,
output_path: str,
compounds_to_match: Union[pd.DataFrame, str],
compounds_to_match_path: str,
mode: str = 'mass',
tolerance: float = 0.02):
"""
Expand All @@ -33,8 +33,8 @@ def __init__(self,
Path to the MS data.
output_path: str
Path to the output directory.
compounds_to_match: Union[pd.DataFrame, str]
The new predicted compounds to match.
compounds_to_match_path: str
Path to the new predicted compounds to match.
mode: str
The mode of the matcher. Either 'mass' or 'mass_diff'.
tolerance: float
Expand All @@ -46,7 +46,7 @@ def __init__(self,
self._set_output_path(self._output_path)
self._mode = mode
self._tolerance = tolerance
self._set_up_data_files(compounds_to_match)
self._set_up_data_files(compounds_to_match_path)
self._prepare_mode()
self._matches = None

Expand Down Expand Up @@ -232,20 +232,17 @@ def _prepare_mode(self):
else:
raise ValueError('The mode must be either "mass" or "mass_dif".')

def _set_up_data_files(self, new_compounds: Union[pd.DataFrame, str]):
def _set_up_data_files(self, new_compounds: str):
"""
Set up the reaction rules and new compounds data files.
Parameters
----------
new_compounds: Union[pd.DataFrame, str]
The new compounds to match.
new_compounds: str
The path to the new compounds to match.
"""
self._set_up_reaction_rules()
if not isinstance(new_compounds, pd.DataFrame):
self._set_up_new_compounds(new_compounds)
else:
self._new_compounds = new_compounds
self._set_up_new_compounds(new_compounds)

def _set_up_reaction_rules(self):
"""
Expand Down Expand Up @@ -360,7 +357,7 @@ def generate_ms_results(self):
if __name__ == '__main__':
output_path_ = 'results/results_example/'
ms = MSDataMatcher(ms_data_path='data/ms_data_example/ms_data_paper.tsv',
compounds_to_match='results/results_example/new_compounds.tsv',
compounds_to_match_path='results/results_example/new_compounds.tsv',
output_path=output_path_,
mode='mass',
tolerance=0.0015)
Expand Down
13 changes: 7 additions & 6 deletions tests/unit_tests/test_bioreactor.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ def test_bioreactor(self):

self.assertEqual(br.reaction_rules.shape, (1368, 7))
self.assertEqual(br.compounds.shape, (4, 2))
self.assertIsInstance(br.new_compounds, pd.DataFrame)
self.assertEqual(br.new_compounds.shape[1], 7)
with self.assertRaises(ValueError):
_ = br.new_compounds

def test_bioreactor_all_orgs(self):
compounds_path = os.path.join(TESTS_DATA_PATH, 'compounds_sample/compounds.tsv')
Expand All @@ -56,8 +56,11 @@ def test_bioreactor_all_orgs(self):

self.assertEqual(br_no_orgs_filter.reaction_rules.shape, (3332, 7))
self.assertEqual(br_no_orgs_filter.compounds.shape, (4, 2))
self.assertIsInstance(br_no_orgs_filter.new_compounds, pd.DataFrame)
self.assertEqual(br_no_orgs_filter.new_compounds.shape[1], 7)
with self.assertRaises(ValueError):
_ = br_no_orgs_filter.new_compounds

r = br_no_orgs_filter.process_results(False)
self.assertEqual(r.shape, (380, 7))

def test_bioreactor_all_orgs_keep_all(self):
compounds_path = os.path.join(TESTS_DATA_PATH, 'compounds_sample/compounds.tsv')
Expand All @@ -72,8 +75,6 @@ def test_bioreactor_all_orgs_keep_all(self):

self.assertEqual(br_no_orgs_filter.reaction_rules.shape, (3332, 7))
self.assertEqual(br_no_orgs_filter.compounds.shape, (4, 2))
self.assertIsInstance(br_no_orgs_filter.new_compounds, pd.DataFrame)
self.assertEqual(br_no_orgs_filter.new_compounds.shape[1], 7)

def test_bioreactor_properties_and_setters(self):
compounds_path = os.path.join(TESTS_DATA_PATH, 'compounds_sample/compounds.tsv')
Expand Down
6 changes: 3 additions & 3 deletions tests/unit_tests/test_ms_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def test_ms_data_matcher_mass_mode(self):
ms_data_path = os.path.join(TESTS_DATA_PATH, 'ms_data_sample/ms_data.tsv')
compounds_to_match = os.path.join(TESTS_DATA_PATH, 'new_compounds_sample/new_compounds.tsv')
ms = MSDataMatcher(ms_data_path=ms_data_path,
compounds_to_match=compounds_to_match,
compounds_to_match_path=compounds_to_match,
output_path=self.output_folder,
mode='mass',
tolerance=0.0015)
Expand All @@ -45,7 +45,7 @@ def test_ms_data_matcher_massdiff_mode(self):
ms_data_path = os.path.join(TESTS_DATA_PATH, 'ms_data_sample/ms_data.tsv')
compounds_to_match = os.path.join(TESTS_DATA_PATH, 'new_compounds_sample/new_compounds.tsv')
ms = MSDataMatcher(ms_data_path=ms_data_path,
compounds_to_match=compounds_to_match,
compounds_to_match_path=compounds_to_match,
output_path=self.output_folder,
mode='mass_diff',
tolerance=0.0015)
Expand All @@ -62,7 +62,7 @@ def test_ms_data_matcher_properties_and_setters(self):
ms_data_path = os.path.join(TESTS_DATA_PATH, 'ms_data_sample/ms_data.tsv')
compounds_to_match = os.path.join(TESTS_DATA_PATH, 'new_compounds_sample/new_compounds.tsv')
ms = MSDataMatcher(ms_data_path=ms_data_path,
compounds_to_match=compounds_to_match,
compounds_to_match_path=compounds_to_match,
output_path=self.output_folder,
mode='mass_diff',
tolerance=0.0015)
Expand Down

0 comments on commit c2b9687

Please sign in to comment.