From 755c5e52d57d0389074ad59dce3adc49c23d7d77 Mon Sep 17 00:00:00 2001 From: Colin Grambow Date: Mon, 5 Aug 2019 12:26:22 -0400 Subject: [PATCH 1/5] Switch ML estimator to chemprop Only use two models, one that predicts enthalpy and one that predicts entropy and heat capacity simultaneously. Uncertainty cannot be estimated with the new models. If chemprop is not available, don't raise error until trying to use it. --- rmgpy/data/thermoTest.py | 7 +- rmgpy/ml/__init__.py | 3 - rmgpy/ml/estimator.py | 153 ++++++++++++++++++++++--------------- rmgpy/ml/estimator_test.py | 16 ++-- rmgpy/rmg/input.py | 7 +- 5 files changed, 103 insertions(+), 83 deletions(-) diff --git a/rmgpy/data/thermoTest.py b/rmgpy/data/thermoTest.py index 9b939ae4e6..5970f2aa0d 100644 --- a/rmgpy/data/thermoTest.py +++ b/rmgpy/data/thermoTest.py @@ -81,10 +81,9 @@ def setUpClass(self): # Set up ML estimator models_path = os.path.join(settings['database.directory'], 'thermo', 'ml', 'main') - Hf298_path = os.path.join(models_path, 'H298') - S298_path = os.path.join(models_path, 'S298') - Cp_path = os.path.join(models_path, 'Cp') - self.ml_estimator = MLEstimator(Hf298_path, S298_path, Cp_path) + hf298_path = os.path.join(models_path, 'hf298') + s298_cp_path = os.path.join(models_path, 's298_cp') + self.ml_estimator = MLEstimator(hf298_path, s298_cp_path) def testPickle(self): """ diff --git a/rmgpy/ml/__init__.py b/rmgpy/ml/__init__.py index 930c261434..4ea7187055 100644 --- a/rmgpy/ml/__init__.py +++ b/rmgpy/ml/__init__.py @@ -1,6 +1,3 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - ############################################################################### # # # RMG - Reaction Mechanism Generator # diff --git a/rmgpy/ml/estimator.py b/rmgpy/ml/estimator.py index 4dd1e4e299..eb0569d6f5 100644 --- a/rmgpy/ml/estimator.py +++ b/rmgpy/ml/estimator.py @@ -1,6 +1,3 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - ############################################################################### # # # RMG - Reaction Mechanism Generator # @@ -28,17 +25,24 @@ # # ############################################################################### +import contextlib import os +from argparse import Namespace +from typing import Callable, Union + +try: + import chemprop +except ImportError as e: + chemprop = None + chemprop_exception = e import numpy as np -os.environ['KERAS_BACKEND'] = 'theano' -from dde.predictor import Predictor - +from rmgpy.molecule import Molecule +from rmgpy.species import Species from rmgpy.thermo import ThermoData -class MLEstimator(): - +class MLEstimator: """ A machine learning based estimator for thermochemistry prediction. @@ -48,55 +52,41 @@ class MLEstimator(): Attribute Type Description ==================== ======================= ======================= `hf298_estimator` :class:`Predictor` Hf298 estimator - `hf298_uncertainty` ``bool`` Hf298 uncertainty flag - `s298_estimator` :class:`Predictor` S298 estimator - `s298_uncertainty` ``bool`` S298 uncertainty flag - `cp_estimator` :class:`Predictor` Cp estimator - `cp_uncertainty` ``bool`` Cp uncertainty flag + `s298_cp_estimator` :class:`Predictor` S298 and Cp estimator + `temps` ``list`` Cp temperatures ==================== ======================= ======================= - """ - def __init__(self, hf298_path, s298_path, cp_path): - self.hf298_estimator, self.hf298_uncertainty = load_estimator(hf298_path) - self.s298_estimator, self.s298_uncertainty = load_estimator(s298_path) - self.cp_estimator, self.cp_uncertainty = load_estimator(cp_path) + # These should correspond to the temperatures that the ML model was + # trained on for Cp. + temps = [300.0, 400.0, 500.0, 600.0, 800.0, 1000.0, 1500.0] - def get_thermo_data(self, molecule): + def __init__(self, hf298_path: str, s298_cp_path: str): + self.hf298_estimator = load_estimator(hf298_path) + self.s298_cp_estimator = load_estimator(s298_cp_path) + + def get_thermo_data(self, molecule: Union[Molecule, str]) -> ThermoData: """ Return thermodynamic parameters corresponding to a given - :class:`Molecule` object `molecule`. Also set the - uncertainties estimated by the ML model if available. + :class:`Molecule` object `molecule` or a SMILES string. Returns: ThermoData """ + molecule = Molecule(SMILES=molecule) if isinstance(molecule, str) else molecule - # These should correspond to the temperatures that the ML model was - # trained on for Cp. - temps = [300.0, 400.0, 500.0, 600.0, 800.0, 1000.0, 1500.0] - - hf298 = self.hf298_estimator.predict(molecule=molecule, sigma=self.hf298_uncertainty) - s298 = self.s298_estimator.predict(molecule=molecule, sigma=self.s298_uncertainty) - cp = self.cp_estimator.predict(molecule=molecule, sigma=self.cp_uncertainty) - - # If uncertainty is available for the ML model, a tuple of predicted - # value and estimated uncertainty is returned. An uncertainty of None - # gets set to a valua of 0 by :class:`Quantity`. - hf298, hf298u = hf298 if isinstance(hf298, tuple) else (hf298, None) - s298, s298u = s298 if isinstance(s298, tuple) else (s298, None) - cp, cpu = cp if isinstance(cp, tuple) else (cp, None) - - cp = [np.float64(cp_i) for cp_i in cp] - if cpu is not None: - cpu = [np.float64(cpu_i) for cpu_i in cpu] + hf298 = self.hf298_estimator(molecule.SMILES)[0][0] + s298_cp = self.s298_cp_estimator(molecule.SMILES)[0] + s298, cp = s298_cp[0], s298_cp[1:] cp0 = molecule.calculateCp0() cpinf = molecule.calculateCpInf() + + # Set uncertainties to 0 because the current model cannot estimate them thermo = ThermoData( - Tdata=(temps, 'K'), - Cpdata=(cp, 'cal/(mol*K)', cpu), - H298=(hf298, 'kcal/mol', hf298u), - S298=(s298, 'cal/(mol*K)', s298u), + Tdata=(self.temps, 'K'), + Cpdata=(cp, 'cal/(mol*K)', np.zeros(len(self.temps))), + H298=(hf298, 'kcal/mol', 0), + S298=(s298, 'cal/(mol*K)', 0), Cp0=(cp0, 'J/(mol*K)'), CpInf=(cpinf, 'J/(mol*K)'), Tmin=(300.0, 'K'), @@ -106,7 +96,7 @@ def get_thermo_data(self, molecule): return thermo - def get_thermo_data_for_species(self, species): + def get_thermo_data_for_species(self, species: Species) -> ThermoData: """ Return the set of thermodynamic parameters corresponding to a given :class:`Species` object `species`. @@ -119,21 +109,62 @@ def get_thermo_data_for_species(self, species): return self.get_thermo_data(species.molecule[0]) -def load_estimator(model_path): - estimator = Predictor() - - input_file = os.path.join(model_path, 'predictor_input.py') - weights_file = os.path.join(model_path, 'full_train.h5') - model_file = os.path.join(model_path, 'full_train.json') - mean_and_std_file = os.path.join(model_path, 'full_train_mean_std.npz') - - estimator.load_input(input_file) - if os.path.exists(model_file): - estimator.load_architecture(model_file) - uncertainty = True - else: - uncertainty = False - mean_and_std_file = mean_and_std_file if os.path.exists(mean_and_std_file) else None - estimator.load_parameters(param_path=weights_file, mean_and_std_path=mean_and_std_file) +def load_estimator(model_dir: str) -> Callable[[str], np.ndarray]: + """ + Load chemprop model and return function for evaluating it. + """ + if chemprop is None: + # Delay chemprop ImportError until we actually try to use it + # so that RMG can load successfully without chemprop. + raise chemprop_exception + + args = Namespace() # Simple class to hold attributes + + # Set up chemprop predict arguments + args.checkpoint_dir = model_dir + args.checkpoint_path = None + chemprop.parsing.update_checkpoint_args(args) + args.cuda = False + + scaler, features_scaler = chemprop.utils.load_scalers(args.checkpoint_paths[0]) + train_args = chemprop.utils.load_args(args.checkpoint_paths[0]) + + # Update args with training arguments + for key, value in vars(train_args).items(): + if not hasattr(args, key): + setattr(args, key, value) + + # Load models in ensemble + models = [] + for checkpoint_path in args.checkpoint_paths: + models.append(chemprop.utils.load_checkpoint(checkpoint_path, cuda=args.cuda)) + + # Set up estimator + def estimator(smi: str): + # Make dataset + data = chemprop.data.MoleculeDataset( + [chemprop.data.MoleculeDatapoint(line=[smi], args=args)] + ) - return estimator, uncertainty + # Normalize features + if train_args.features_scaling: + data.normalize_features(features_scaler) + + # Redirect chemprop stderr to null device so that it doesn't + # print progress bars every time a prediction is made + with open(os.devnull, 'w') as f, contextlib.redirect_stderr(f): + # Predict with each model individually and sum predictions + sum_preds = np.zeros((len(data), args.num_tasks)) + for model in models: + model_preds = chemprop.train.predict( + model=model, + data=data, + batch_size=1, # We'll only predict one molecule at a time + scaler=scaler + ) + sum_preds += np.array(model_preds) + + avg_preds = sum_preds / len(models) + return avg_preds + + return estimator diff --git a/rmgpy/ml/estimator_test.py b/rmgpy/ml/estimator_test.py index 807d3e5d8a..3118239cbd 100644 --- a/rmgpy/ml/estimator_test.py +++ b/rmgpy/ml/estimator_test.py @@ -1,6 +1,3 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - ############################################################################### # # # RMG - Reaction Mechanism Generator # @@ -32,7 +29,6 @@ import unittest from rmgpy import settings -from rmgpy.molecule import Molecule from rmgpy.ml.estimator import MLEstimator @@ -47,20 +43,18 @@ def setUp(self): other unit tests. """ models_path = os.path.join(settings['database.directory'], 'thermo', 'ml', 'main') - Hf298_path = os.path.join(models_path, 'H298') - S298_path = os.path.join(models_path, 'S298') - Cp_path = os.path.join(models_path, 'Cp') - self.ml_estimator = MLEstimator(Hf298_path, S298_path, Cp_path) + hf298_path = os.path.join(models_path, 'hf298') + s298_cp_path = os.path.join(models_path, 's298_cp') + self.ml_estimator = MLEstimator(hf298_path, s298_cp_path) def test_get_thermo_data(self): """ Test that we can make a prediction using MLEstimator. """ - mol = Molecule().fromSMILES('C1C2C1C2') - thermo = self.ml_estimator.get_thermo_data(mol) + smi = 'C1C2C1C2' + thermo = self.ml_estimator.get_thermo_data(smi) self.assertTrue(thermo.comment.startswith('ML Estimation')) self.assertAlmostEqual(thermo.Cp0.value_si, 33.3, 1) self.assertAlmostEqual(thermo.CpInf.value_si, 232.8, 1) self.assertEqual(len(thermo.Cpdata.value_si), 7) - self.assertGreater(thermo.S298.uncertainty_si, 0.01) diff --git a/rmgpy/rmg/input.py b/rmgpy/rmg/input.py index 34ab5ec477..4ef9fee444 100644 --- a/rmgpy/rmg/input.py +++ b/rmgpy/rmg/input.py @@ -641,10 +641,9 @@ def mlEstimator(thermo=True, models_path = os.path.join(settings['database.directory'], 'thermo', 'ml', name) if not os.path.exists(models_path): raise InputError('Cannot find ML models folder {}'.format(models_path)) - H298_path = os.path.join(models_path, 'H298') - S298_path = os.path.join(models_path, 'S298') - Cp_path = os.path.join(models_path, 'Cp') - rmg.ml_estimator = MLEstimator(H298_path, S298_path, Cp_path) + hf298_path = os.path.join(models_path, 'hf298') + s298_cp_path = os.path.join(models_path, 's298_cp') + rmg.ml_estimator = MLEstimator(hf298_path, s298_cp_path) uncertainty_cutoffs = dict( H298=Quantity(*H298UncertaintyCutoff), From 128a45890b020380730cde9da932ee5c9759a905 Mon Sep 17 00:00:00 2001 From: Colin Grambow Date: Mon, 5 Aug 2019 12:30:41 -0400 Subject: [PATCH 2/5] Update documentation and ML example --- documentation/source/users/rmg/input.rst | 4 ++-- examples/rmg/minimal_ml/RMG.sh | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/documentation/source/users/rmg/input.rst b/documentation/source/users/rmg/input.rst index 1bdcd467b4..a61983a0cc 100644 --- a/documentation/source/users/rmg/input.rst +++ b/documentation/source/users/rmg/input.rst @@ -662,8 +662,8 @@ cyclic species regardless of the ``onlyCyclics`` setting. If ``onlyCyclics`` is than zero, RMG will log a warning that ``onlyCyclics`` should also be True and the machine learning estimator will be restricted to only cyclic species with the specified minimum cycle overlap. -If the estimated uncertainty of the thermo prediction is greater than any of the ``UncertaintyCutoff`` values, then -machine learning estimation is not used for that species. +Note that the current machine learning model is not yet capable of estimating uncertainty so the ``UncertaintyCutoff`` +values do not yet have any effect. .. _pressuredependence: diff --git a/examples/rmg/minimal_ml/RMG.sh b/examples/rmg/minimal_ml/RMG.sh index d3c233ad22..538ccf6a5a 100644 --- a/examples/rmg/minimal_ml/RMG.sh +++ b/examples/rmg/minimal_ml/RMG.sh @@ -16,5 +16,4 @@ # command. # Run RMG on the input.py file. -export KERAS_BACKEND=theano python ../../../rmg.py input.py From 28f7503a9ef6915e39a3495d38c14f584833860b Mon Sep 17 00:00:00 2001 From: Colin Grambow Date: Mon, 5 Aug 2019 12:31:02 -0400 Subject: [PATCH 3/5] Update meta.yaml --- .conda/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.conda/meta.yaml b/.conda/meta.yaml index 75a0ebad4d..4a522e1acf 100644 --- a/.conda/meta.yaml +++ b/.conda/meta.yaml @@ -34,10 +34,10 @@ requirements: - cairo - cairocffi - cantera >=2.3.0 + - chemprop - coolprop - coverage - cython >=0.25.2 - - dde - ffmpeg - gprof2dot - graphviz From afb821ed80f7dd387e678125f792cdbd3c87ca3a Mon Sep 17 00:00:00 2001 From: Mark Payne Date: Mon, 22 Jul 2019 07:27:53 -0400 Subject: [PATCH 4/5] Revert Fix Travis builds by adding in missing miniconda dependencies Packages requiring libgcc have been rebuilt to require libgcc-ng instead. With this the conda solver can resolve the environment without needing pydot-ng and pygpu to be given explicitly --- environment_linux.yml | 3 --- environment_mac.yml | 3 --- environment_windows.yml | 3 --- 3 files changed, 9 deletions(-) diff --git a/environment_linux.yml b/environment_linux.yml index 3f985f41ea..52a005e830 100644 --- a/environment_linux.yml +++ b/environment_linux.yml @@ -4,7 +4,6 @@ channels: - rmg - rdkit - cantera - - anaconda - omnia dependencies: - cairo @@ -35,9 +34,7 @@ dependencies: - psutil - pydas >=1.0.1 - pydot ==1.2.2 - - pydot-ng - pydqed >=1.0.0 - - pygpu - pymongo - pyparsing - pyrdl diff --git a/environment_mac.yml b/environment_mac.yml index 78ded8392c..9fdb497c4a 100644 --- a/environment_mac.yml +++ b/environment_mac.yml @@ -4,7 +4,6 @@ channels: - rmg - rdkit - cantera - - anaconda - omnia dependencies: - cairo @@ -35,9 +34,7 @@ dependencies: - psutil - pydas >=1.0.1 - pydot ==1.2.2 - - pydot-ng - pydqed >=1.0.0 - - pygpu - pymongo - pyparsing - pyrdl diff --git a/environment_windows.yml b/environment_windows.yml index 647e88ae47..bc09852bd3 100644 --- a/environment_windows.yml +++ b/environment_windows.yml @@ -4,7 +4,6 @@ channels: - rmg - rdkit - cantera - - anaconda - omnia dependencies: - cairo @@ -36,9 +35,7 @@ dependencies: - psutil - pydas >=1.0.1 - pydot ==1.2.2 - - pydot-ng - pydqed >=1.0.0 - - pygpu - pymongo - pyparsing - pyrdl From 3c687ab89473787f19d3b4e6fb4cc157fb05e4fa Mon Sep 17 00:00:00 2001 From: Colin Grambow Date: Mon, 12 Aug 2019 09:09:04 -0400 Subject: [PATCH 5/5] Update Python 3 environment --- environment_py3.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/environment_py3.yml b/environment_py3.yml index c06bcf0c5b..5f024fb043 100644 --- a/environment_py3.yml +++ b/environment_py3.yml @@ -4,12 +4,14 @@ channels: - rmg - rdkit - cantera - - anaconda + - pytorch + - conda-forge dependencies: - cairo - cairocffi - cantera >=2.3.0 - cclib + - chemprop - coolprop - coverage - cython >=0.25.2