Merge pull request #21 from jcorreia11/files_in_disk

Files in disk
BioSystemsUM · Oct 25, 2022 · c2b9687
2 parents 250a0cd + ac800d6
commit c2b9687
Show file tree

Hide file tree

Showing 8 changed files with 57 additions and 63 deletions.
diff --git a/src/biocatalyzer/_utils.py b/src/biocatalyzer/_utils.py
@@ -1,3 +1,4 @@
+import os.path
 from typing import List
 
 import numpy as np

diff --git a/src/biocatalyzer/bioreactor.py b/src/biocatalyzer/bioreactor.py
@@ -11,7 +11,7 @@
 from rdkit.Chem import MolFromSmiles
 from tqdm import tqdm
 
-from biocatalyzer._utils import _empty_dfs, _merge_fields
+from biocatalyzer._utils import _merge_fields
 from biocatalyzer.chem import ChemUtils
 from biocatalyzer.io_utils import Loaders
 
@@ -79,7 +79,9 @@ def __init__(self,
             self._n_jobs = multiprocessing.cpu_count()
         else:
             self._n_jobs = n_jobs
+        self._new_compounds_path = os.path.join(self._output_path, 'new_compounds.tsv')
         self._new_compounds = None
+        self._new_compounds_flag = False
 
     @property
     def compounds(self):
@@ -147,20 +149,20 @@ def new_compounds(self):
         pd.DataFrame
             The new compounds generated.
         """
-        if isinstance(self._new_compounds, pd.DataFrame):
-            return self._new_compounds
+        if self._new_compounds_flag:
+            return Loaders.load_compounds(self._new_compounds_path, False)
         else:
             raise ValueError('No compounds generated yet. Run the BioReactor react method first.')
 
     @new_compounds.setter
-    def new_compounds(self, new_compounds: pd.DataFrame):
+    def new_compounds(self, new_compounds: str):
         """
         Set the new compounds generated.
 
         Parameters
         ----------
-        new_compounds: pd.DataFrame
-            The new compounds generated.
+        new_compounds: str
+            The path to the file containing the new compounds generated.
         """
         raise AttributeError('New compounds cannot be set manually! You need to run the react method!')
 
@@ -526,22 +528,22 @@ def _get_ec_numbers(self, reaction_rule_id: str):
         """
         return self._reaction_rules[self._reaction_rules.InternalID == reaction_rule_id].EC_Numbers.values[0]
 
-    @staticmethod
-    def process_results(results: pd.DataFrame):
+    def process_results(self, save: bool = True):
         """
         Process the results of the reactor.
         Group results by unique SMILES and merges the other columns.
 
         Parameters
         ----------
-        results: pd.DataFrame
-            The results dataframe to process.
+        save: bool
+            If True, save the results to a file.
 
         Returns
         -------
         pd.DataFrame
             The processed results.
         """
+        results = pd.read_csv(self._new_compounds_path, sep='\t', header=0)
         results.EC_Numbers = results.EC_Numbers.fillna('')
         results = results.groupby(['OriginalCompoundID', 'NewCompoundSmiles']).agg({'OriginalCompoundSmiles': 'first',
                                                                                     'OriginalReactionRuleID': ';'.join,
@@ -556,6 +558,9 @@ def process_results(results: pd.DataFrame):
         results['OriginalReactionRuleID'] = results['OriginalReactionRuleID'].apply(lambda x: _merge_fields(x))
         results['NewReactionSmiles'] = results['NewReactionSmiles'].apply(lambda x: _merge_fields(x))
         results['EC_Numbers'] = results['EC_Numbers'].apply(lambda x: _merge_fields(x))
+        if save:
+            results_file_proc = os.path.join(self._output_path, 'new_compounds_processed.tsv')
+            results.to_csv(results_file_proc, sep='\t', index=False)
         return results
 
     def _react_single(self, smiles: str, smarts: str):
@@ -570,9 +575,6 @@ def _react_single(self, smiles: str, smarts: str):
         smarts: str
             The SMARTS string of the reaction.
         """
-        new_compounds = pd.DataFrame(columns=['OriginalCompoundID', 'OriginalCompoundSmiles', 'OriginalReactionRuleID',
-                                              'NewCompoundID', 'NewCompoundSmiles', 'NewReactionSmiles', 'EC_Numbers'],
-                                     dtype=str)
         reactants = self._reaction_rules[self._reaction_rules.SMARTS == smarts].Reactants.values[0]
         reactants = reactants.replace("Any", smiles).split(';')
         results = ChemUtils.react(reactants, smarts)
@@ -583,40 +585,32 @@ def _react_single(self, smiles: str, smarts: str):
                 products = result.split('>')[-1].split('.')
                 # keep only the most similar compound to the input compound
                 most_similar_product = ChemUtils.most_similar_compound(smiles, products)
-                if most_similar_product not in new_compounds.NewCompoundSmiles.values:
-                    if self._match_conditions(most_similar_product):
-                        if self._neutralize:
-                            most_similar_product = ChemUtils.uncharge_smiles(most_similar_product)
-                        ecs = self._get_ec_numbers(smarts_id)
-                        new_compounds.loc[len(new_compounds)] = [smiles_id, smiles, smarts_id,
-                                                                 f"{smiles_id}_{uuid.uuid4()}", most_similar_product,
-                                                                 result, ecs]
-        return new_compounds
+                if self._match_conditions(most_similar_product):
+                    if self._neutralize:
+                        most_similar_product = ChemUtils.uncharge_smiles(most_similar_product)
+                    ecs = self._get_ec_numbers(smarts_id)
+                    with open(self._new_compounds_path, 'a') as f:
+                        f.write(f"{smiles_id}\t{smiles}\t{smarts_id}\t{smiles_id}_{uuid.uuid4()}\t"
+                                f"{most_similar_product}\t{result}\t{ecs}\n")
+                    self._new_compounds_flag = True
 
     def react(self):
         """
         Transform reactants into products using the reaction rules.
         """
         t0 = time.time()
+        with open(self._new_compounds_path, 'w') as f:
+            f.write('OriginalCompoundID\tOriginalCompoundSmiles\tOriginalReactionRuleID\tNewCompoundID\t'
+                    'NewCompoundSmiles\tNewReactionSmiles\tEC_Numbers\n')
         params = list(itertools.product(self._compounds.smiles, self._reaction_rules.SMARTS))
         with multiprocessing.Pool(self._n_jobs) as pool:
-            results_ = pool.starmap(self._react_single, tqdm(params, total=len(params)))
-
-        if _empty_dfs(results_):
-            logging.info('No new compounds could be generated using this reaction rules.')
-            t1 = time.time()
-            logging.info(f"Time elapsed: {t1 - t0} seconds")
-            return False
-        results = pd.concat(results_)
-        results = self.process_results(results)
-
-        results.to_csv(self._output_path + '/new_compounds.tsv', sep='\t', index=False)
-        logging.info(f"New compounds saved to {self._output_path}new_compounds.tsv")
-        logging.info(f"{results.shape[0]} unique new compounds generated!")
-        self._new_compounds = results
+            pool.starmap(self._react_single, tqdm(params, total=len(params)))
+        self._new_compounds = f"New products saved to {self._new_compounds_path}"
         t1 = time.time()
         logging.info(f"Time elapsed: {t1 - t0} seconds")
-        return True
+        if self._new_compounds_flag:
+            return True
+        return False
 
 
 if __name__ == '__main__':

diff --git a/src/biocatalyzer/clis/cli.py b/src/biocatalyzer/clis/cli.py
@@ -122,8 +122,9 @@ def biocatalyzer_cli(compounds,
                     molecules_to_remove_path=molecules_to_remove,
                     min_atom_count=min_atom_count,
                     n_jobs=n_jobs)
-    logging.basicConfig(filename=f'{output_path}logging.log', level=logging.DEBUG)
+    logging.basicConfig(filename=f'{output_path}_logging.log', level=logging.DEBUG)
     brr = br.react()
+    br.process_results()
 
     if match_ms_data:
         if not ms_data_path:
@@ -134,7 +135,7 @@ def biocatalyzer_cli(compounds,
         else:
 
             ms = MSDataMatcher(ms_data_path=ms_data_path,
-                               compounds_to_match=os.path.join(output_path, 'new_compounds.tsv'),
+                               compounds_to_match_path=os.path.join(output_path, 'new_compounds_processed.tsv'),
                                output_path=output_path,
                                mode=mode,
                                tolerance=tolerance)

diff --git a/src/biocatalyzer/clis/cli_bioreactor.py b/src/biocatalyzer/clis/cli_bioreactor.py
@@ -93,7 +93,7 @@ def bioreactor_cli(compounds,
                     molecules_to_remove_path=molecules_to_remove,
                     min_atom_count=min_atom_count,
                     n_jobs=n_jobs)
-    logging.basicConfig(filename=f'{output_path}logging.log', level=logging.DEBUG)
+    logging.basicConfig(filename=f'{output_path}_logging.log', level=logging.DEBUG)
     br.react()
 
 

diff --git a/src/biocatalyzer/clis/cli_matcher.py b/src/biocatalyzer/clis/cli_matcher.py
@@ -49,11 +49,11 @@ def matcher_cli(ms_data,
         output_path: Path to the output directory.
     """
     ms = MSDataMatcher(ms_data_path=ms_data,
-                       compounds_to_match=compounds_to_match,
+                       compounds_to_match_path=compounds_to_match,
                        output_path=output_path,
                        mode=mode,
                        tolerance=tolerance)
-    logging.basicConfig(filename=f'{output_path}logging.log', level=logging.DEBUG)
+    logging.basicConfig(filename=f'{output_path}_logging.log', level=logging.DEBUG)
     ms.generate_ms_results()
 
 

diff --git a/src/biocatalyzer/matcher.py b/src/biocatalyzer/matcher.py
@@ -21,7 +21,7 @@ class MSDataMatcher:
     def __init__(self,
                  ms_data_path: str,
                  output_path: str,
-                 compounds_to_match: Union[pd.DataFrame, str],
+                 compounds_to_match_path: str,
                  mode: str = 'mass',
                  tolerance: float = 0.02):
         """
@@ -33,8 +33,8 @@ def __init__(self,
             Path to the MS data.
         output_path: str
             Path to the output directory.
-        compounds_to_match: Union[pd.DataFrame, str]
-            The new predicted compounds to match.
+        compounds_to_match_path: str
+            Path to the new predicted compounds to match.
         mode: str
             The mode of the matcher. Either 'mass' or 'mass_diff'.
         tolerance: float
@@ -46,7 +46,7 @@ def __init__(self,
         self._set_output_path(self._output_path)
         self._mode = mode
         self._tolerance = tolerance
-        self._set_up_data_files(compounds_to_match)
+        self._set_up_data_files(compounds_to_match_path)
         self._prepare_mode()
         self._matches = None
 
@@ -232,20 +232,17 @@ def _prepare_mode(self):
         else:
             raise ValueError('The mode must be either "mass" or "mass_dif".')
 
-    def _set_up_data_files(self, new_compounds: Union[pd.DataFrame, str]):
+    def _set_up_data_files(self, new_compounds: str):
         """
         Set up the reaction rules and new compounds data files.
 
         Parameters
         ----------
-        new_compounds: Union[pd.DataFrame, str]
-            The new compounds to match.
+        new_compounds: str
+            The path to the new compounds to match.
         """
         self._set_up_reaction_rules()
-        if not isinstance(new_compounds, pd.DataFrame):
-            self._set_up_new_compounds(new_compounds)
-        else:
-            self._new_compounds = new_compounds
+        self._set_up_new_compounds(new_compounds)
 
     def _set_up_reaction_rules(self):
         """
@@ -360,7 +357,7 @@ def generate_ms_results(self):
 if __name__ == '__main__':
     output_path_ = 'results/results_example/'
     ms = MSDataMatcher(ms_data_path='data/ms_data_example/ms_data_paper.tsv',
-                       compounds_to_match='results/results_example/new_compounds.tsv',
+                       compounds_to_match_path='results/results_example/new_compounds.tsv',
                        output_path=output_path_,
                        mode='mass',
                        tolerance=0.0015)

diff --git a/tests/unit_tests/test_bioreactor.py b/tests/unit_tests/test_bioreactor.py
@@ -39,8 +39,8 @@ def test_bioreactor(self):
 
         self.assertEqual(br.reaction_rules.shape, (1368, 7))
         self.assertEqual(br.compounds.shape, (4, 2))
-        self.assertIsInstance(br.new_compounds, pd.DataFrame)
-        self.assertEqual(br.new_compounds.shape[1], 7)
+        with self.assertRaises(ValueError):
+            _ = br.new_compounds
 
     def test_bioreactor_all_orgs(self):
         compounds_path = os.path.join(TESTS_DATA_PATH, 'compounds_sample/compounds.tsv')
@@ -56,8 +56,11 @@ def test_bioreactor_all_orgs(self):
 
         self.assertEqual(br_no_orgs_filter.reaction_rules.shape, (3332, 7))
         self.assertEqual(br_no_orgs_filter.compounds.shape, (4, 2))
-        self.assertIsInstance(br_no_orgs_filter.new_compounds, pd.DataFrame)
-        self.assertEqual(br_no_orgs_filter.new_compounds.shape[1], 7)
+        with self.assertRaises(ValueError):
+            _ = br_no_orgs_filter.new_compounds
+
+        r = br_no_orgs_filter.process_results(False)
+        self.assertEqual(r.shape, (380, 7))
 
     def test_bioreactor_all_orgs_keep_all(self):
         compounds_path = os.path.join(TESTS_DATA_PATH, 'compounds_sample/compounds.tsv')
@@ -72,8 +75,6 @@ def test_bioreactor_all_orgs_keep_all(self):
 
         self.assertEqual(br_no_orgs_filter.reaction_rules.shape, (3332, 7))
         self.assertEqual(br_no_orgs_filter.compounds.shape, (4, 2))
-        self.assertIsInstance(br_no_orgs_filter.new_compounds, pd.DataFrame)
-        self.assertEqual(br_no_orgs_filter.new_compounds.shape[1], 7)
 
     def test_bioreactor_properties_and_setters(self):
         compounds_path = os.path.join(TESTS_DATA_PATH, 'compounds_sample/compounds.tsv')

diff --git a/tests/unit_tests/test_ms_matcher.py b/tests/unit_tests/test_ms_matcher.py
@@ -28,7 +28,7 @@ def test_ms_data_matcher_mass_mode(self):
         ms_data_path = os.path.join(TESTS_DATA_PATH, 'ms_data_sample/ms_data.tsv')
         compounds_to_match = os.path.join(TESTS_DATA_PATH, 'new_compounds_sample/new_compounds.tsv')
         ms = MSDataMatcher(ms_data_path=ms_data_path,
-                           compounds_to_match=compounds_to_match,
+                           compounds_to_match_path=compounds_to_match,
                            output_path=self.output_folder,
                            mode='mass',
                            tolerance=0.0015)
@@ -45,7 +45,7 @@ def test_ms_data_matcher_massdiff_mode(self):
         ms_data_path = os.path.join(TESTS_DATA_PATH, 'ms_data_sample/ms_data.tsv')
         compounds_to_match = os.path.join(TESTS_DATA_PATH, 'new_compounds_sample/new_compounds.tsv')
         ms = MSDataMatcher(ms_data_path=ms_data_path,
-                           compounds_to_match=compounds_to_match,
+                           compounds_to_match_path=compounds_to_match,
                            output_path=self.output_folder,
                            mode='mass_diff',
                            tolerance=0.0015)
@@ -62,7 +62,7 @@ def test_ms_data_matcher_properties_and_setters(self):
         ms_data_path = os.path.join(TESTS_DATA_PATH, 'ms_data_sample/ms_data.tsv')
         compounds_to_match = os.path.join(TESTS_DATA_PATH, 'new_compounds_sample/new_compounds.tsv')
         ms = MSDataMatcher(ms_data_path=ms_data_path,
-                           compounds_to_match=compounds_to_match,
+                           compounds_to_match_path=compounds_to_match,
                            output_path=self.output_folder,
                            mode='mass_diff',
                            tolerance=0.0015)