Impute missing values (#149)
* Upgraded pymatgen and matminer requirements

* Better handling of NaNs in features by adding the possibility to use the mean of the column. Fixes a bug where they were all set to 0.

* Fix a small bug when adding keys to genes.

* Small typo and bug fix.

* Add a choice of order between scaling and imputing.

* Small bug fix.

* Small name change.

* Typos.

* Rename according to PP's PR

* Add some additional 'bad columns' in testing

* Tidy up merge

* Fix linting

---------

Co-authored-by: Matthew Evans <7916000+ml-evs@users.noreply.github.com>
Co-authored-by: Matthew Evans <git@ml-evs.science>
3 people authored May 31, 2023
1 parent 548ffbf commit 0afb9ef
Showing 5 changed files with 109 additions and 44 deletions.
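
The core behavioural change: NaN features can now be imputed with the mean of their column via scikit-learn's SimpleImputer, instead of always being overwritten with a constant. A minimal runnable sketch of the difference, with invented data (not MODNet code, just the scikit-learn pieces that appear in the diffs below):

import numpy as np
from sklearn.impute import SimpleImputer

# Toy feature matrix with one missing value (invented data).
X = np.array([[1.0, np.nan], [3.0, 4.0], [5.0, 8.0]])

# Previous behaviour: every NaN was replaced by a fixed constant.
old = np.nan_to_num(X, nan=0.0)

# New impute_missing="mean" option: each NaN becomes its column mean (here 6.0).
new = SimpleImputer(missing_values=np.nan, strategy="mean").fit_transform(X)

print(old)
print(new)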
32 changes: 23 additions & 9 deletions modnet/hyper_opt/fit_genetic.py
@@ -37,6 +37,9 @@ def __init__(
             fit_params: Any additional parameters to pass to `MODNetModel.fit(...)`,
         """

+        self.act = "elu"
+        self.loss = loss
+        self.n_neurons_first_layer = 32 * random.randint(1, 10)
         self.max_feat = max_feat
         self.num_classes = num_classes
         self.multi_label = multi_label
@@ -45,18 +48,25 @@ def __init__(
         self.fit_params = fit_params

         self.xscale_list = ["minmax", "standard"]
+        self.impute_missing_list = [0, "mean"]
+        self.xscale_before_impute = True
         self.lr_list = [0.1, 0.01, 0.005, 0.001]
         self.batch_size_list = [32, 64, 128, 256]
         self.fraction_list = [1, 0.75, 0.5, 0.25]

+        if fit_params:
+            self.__dict__.update(fit_params)
+
         self.genes = {
-            "act": "elu",
-            "loss": loss,
-            "n_neurons_first_layer": 32 * random.randint(1, 10),
+            "act": self.act,
+            "loss": self.loss,
+            "n_neurons_first_layer": self.n_neurons_first_layer,
             "fraction1": random.choice(self.fraction_list),
             "fraction2": random.choice(self.fraction_list),
             "fraction3": random.choice(self.fraction_list),
             "xscale": random.choice(self.xscale_list),
+            "impute_missing": random.choice(self.impute_missing_list),
+            "xscale_before_impute": self.xscale_before_impute,
             "lr": random.choice(self.lr_list),
             "batch_size": random.choice(self.batch_size_list),
             "n_feat": 0,
@@ -82,14 +92,14 @@ def crossover(self, partner: Individual) -> Individual:
         """

         genes_from_mother = random.sample(
-            range(10), k=5
-        )  # creates indices to take randomly 5 genes from one parent, and 5 genes from the other
+            range(len(self.genes)), k=len(self.genes) // 2
+        )  # creates indices to take randomly half the genes from one parent, and half the genes from the other

         child_genes = {
             list(self.genes.keys())[i]: list(self.genes.values())[i]
             if i in genes_from_mother
             else list(partner.genes.values())[i]
-            for i in range(10)
+            for i in range(len(self.genes))
         }

         child = Individual(
@@ -221,6 +231,8 @@ def evaluate(
             epochs=800 if not fast else 1,
             batch_size=self.genes["batch_size"],
             xscale=self.genes["xscale"],
+            impute_missing=self.genes["impute_missing"],
+            xscale_before_impute=self.genes["xscale_before_impute"],
             callbacks=callbacks,
             verbose=0,
             **self.fit_params,
@@ -288,6 +300,8 @@ def refit_model(self, data: MODData, n_models=10, n_jobs=1, fast: bool = False):
             epochs=800 if not fast else 1,
             batch_size=self.genes["batch_size"],
             xscale=self.genes["xscale"],
+            impute_missing=self.genes["impute_missing"],
+            xscale_before_impute=self.genes["xscale_before_impute"],
             callbacks=callbacks,
             verbose=0,
             **self.fit_params,
@@ -522,8 +536,8 @@ def run(
             loss: The built-in tf.keras loss to pass to `compile(...)`.
             n_jobs (Optional[int], optional): Number of jobs to parallelize on. Defaults to None.
             early_stopping (Optional[int], optional): Number of successive generations without improvement before stopping. Defaults to 4.
-            refit (Optional[int], optional): Wether to refit (>0) the best hyperparameters on the whole dataset or use the best Individual instead (=0).
-                The amount corresponds the the number of models used in the ensemble. Defaults to 0.
+            refit (Optional[int], optional): Whether to refit (>0) the best hyperparameters on the whole dataset or use the best Individual instead (=0).
+                The amount corresponds to the number of models used in the ensemble. Defaults to 0.
             fast (bool, optional): Use only for debugging and testing. A fast GA run with small number of epochs, generations, individuals and folds.
                 Overrides the size_pop, num_generation and nested arguments. Defaults to False.
             fit_params: Any additional parameters to pass to `MODNetModel.fit(...)`,
@@ -646,7 +660,7 @@ def run(

         else:
             ensemble = []
-            for m in models[ranking[:10]]:
+            for m in models[ranking[:refit]]:
                 ensemble += m.model
             self.best_model = EnsembleMODNetModel(modnet_models=ensemble)

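The crossover change above drops the hard-coded assumption of exactly 10 genes. A simplified, self-contained sketch of the same half-and-half scheme with made-up gene dictionaries (not the actual Individual class):

import random

mother = {"act": "elu", "lr": 0.01, "batch_size": 64, "xscale": "minmax"}
father = {"act": "relu", "lr": 0.1, "batch_size": 128, "xscale": "standard"}

# Take half of the gene indices from one parent, the rest from the other,
# for any number of genes.
genes_from_mother = random.sample(range(len(mother)), k=len(mother) // 2)
child = {
    key: mother[key] if i in genes_from_mother else father[key]
    for i, key in enumerate(mother)
}
print(child)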
8 changes: 4 additions & 4 deletions modnet/models/ensemble.py
@@ -142,9 +142,9 @@ def predict(
         Parameters:
             test_data: A featurized and feature-selected `MODData`
                 object containing the descriptors used in training.
-            return_prob: For a classification tasks only: whether to return the probability of each
+            return_prob: For a classification task only: whether to return the probability of each
                 class OR only return the most probable class.
-            return_unc: wheter to return a second dataframe containing the uncertainties
+            return_unc: whether to return a second dataframe containing the uncertainties
         Returns:
             A `pandas.DataFrame` containing the predicted values of the targets.
@@ -276,8 +276,6 @@ def fit_preset(
             for k, _ in enumerate(presets):
                 presets[k]["epochs"] = 5

-        val_losses = 1e20 * np.ones((len(presets),))
-
         num_nested_folds = 5
         if nested:
             num_nested_folds = nested
@@ -445,6 +443,7 @@ def _validate_ensemble_model(
     act="relu",
     out_act="linear",
     xscale="minmax",
+    impute_missing=-1,
     callbacks=[],
     preset_id=None,
     fold_id=None,
@@ -469,6 +468,7 @@ def _validate_ensemble_model(
         batch_size=batch_size,
         loss=loss,
         xscale=xscale,
+        impute_missing=impute_missing,
         callbacks=callbacks,
         verbose=verbose,
         val_fraction=0,
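Note that `_validate_ensemble_model` defaults to impute_missing=-1, matching the fill value that fit() (see vanilla.py below) uses for min-max-scaled features when scaling happens before imputation.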
88 changes: 66 additions & 22 deletions modnet/models/vanilla.py
@@ -3,7 +3,7 @@
 """

-from typing import List, Tuple, Dict, Optional, Callable, Any
+from typing import List, Tuple, Dict, Optional, Callable, Any, Union
 from pathlib import Path
 import multiprocessing

@@ -12,6 +12,8 @@
 from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_absolute_error, roc_auc_score
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
 import tensorflow as tf

 from modnet.preprocessing import MODData
@@ -87,7 +89,11 @@ def __init__(
         self.act = act
         self.out_act = out_act

         self.xscale = None
+        self._scaler = None
+        self._imputer = None
+        self.impute_missing = None
+        self._scale_impute = None
         self.optimal_descriptors = None
         self.target_names = None
         self.targets = targets
Expand Down Expand Up @@ -215,6 +221,8 @@ def fit(
epochs: int = 200,
batch_size: int = 128,
xscale: Optional[str] = "minmax",
impute_missing: Optional[Union[float, str]] = 0,
xscale_before_impute: bool = True,
metrics: List[str] = ["mae"],
callbacks: List[Callable] = None,
verbose: int = 0,
@@ -240,6 +248,17 @@
             batch_size: The batch size to use for training.
             xscale: The feature scaler to use, either `None`,
                 `'minmax'` or `'standard'`.
+            impute_missing: Determines how the NaN features are treated.
+                If str, defines the strategy used in the scikit-learn SimpleImputer,
+                e.g., "mean" sets the NaNs to the mean of their feature column.
+                If a float is provided, and if xscale_before_impute is False, this
+                float is used to replace NaNs in the original dataset.
+                If a float is provided but xscale_before_impute is True, the float
+                is not used and standard values are used.
+                If you want to do something more sophisticated, make your own
+                modifications to MODData.df_featurized before fitting the model.
+            xscale_before_impute: whether to first scale the input and then impute values, or
+                first impute values and then scale the inputs.
             metrics: A list of tf.keras metrics to pass to `compile(...)`.
             loss: The built-in tf.keras loss to pass to `compile(...)`.
             fit_params: Any additional parameters to pass to `fit(...)`,
@@ -255,6 +274,7 @@
         )

         self.xscale = xscale
+        self.impute_missing = impute_missing
         self.target_names = list(self.weights.keys())
         self.optimal_descriptors = training_data.get_optimal_descriptors()

@@ -300,22 +320,50 @@ def fit(
             )
             y.append(y_inner)

-        # Scale the input features:
+        # Define the scaler
         if self.xscale == "minmax":
             self._scaler = MinMaxScaler(feature_range=(-0.5, 0.5))

         elif self.xscale == "standard":
             self._scaler = StandardScaler()

-        x = self._scaler.fit_transform(x)
-        x = np.nan_to_num(x, nan=-1)
+        # Define the imputer
+        if isinstance(impute_missing, str):
+            self._imputer = SimpleImputer(
+                missing_values=np.nan, strategy=impute_missing
+            )
+        else:
+            if self.xscale == "minmax":
+                impute_missing = -1 if xscale_before_impute else impute_missing
+            elif self.xscale == "standard":
+                impute_missing = (
+                    10 * np.max(np.nan_to_num(StandardScaler().fit_transform(x)))
+                    if xscale_before_impute
+                    else impute_missing
+                )
+            self.impute_missing = impute_missing
+
+            self._imputer = SimpleImputer(
+                missing_values=np.nan, strategy="constant", fill_value=impute_missing
+            )
+
+        # Scale and impute input features in the desired order
+        if xscale_before_impute:
+            self._scale_impute = Pipeline(
+                [("scaler", self._scaler), ("imputer", self._imputer)]
+            )
+        else:
+            self._scale_impute = Pipeline(
+                [("imputer", self._imputer), ("scaler", self._scaler)]
+            )
+
+        x = self._scale_impute.fit_transform(x)

         if val_data is not None:
             val_x = val_data.get_featurized_df()[
                 self.optimal_descriptors[: self.n_feat]
             ].values
-            val_x = self._scaler.transform(val_x)
-            val_x = np.nan_to_num(val_x, nan=-1)
+            val_x = self._scale_impute.transform(val_x)
             val_y = []
             for targ in self.targets_flatten:
                 if self.num_classes[targ] >= 2:  # Classification
@@ -404,6 +452,7 @@ def fit_preset(
         nested: int = 5,
         callbacks: List[Any] = None,
         n_jobs=None,
+        **fit_params,
     ) -> Tuple[
         List[List[Any]],
         np.ndarray,
@@ -596,11 +645,14 @@ def fit_preset(
                 loss=best_preset["loss"],
                 callbacks=callbacks,
                 verbose=verbose,
+                **fit_params,
             )
         else:
             self.n_feat = best_model.n_feat
             self.model = best_model.model
             self._scaler = best_model._scaler
+            self._imputer = best_model._imputer
+            self._scale_impute = best_model._scale_impute

         os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"  # reset

@@ -623,17 +675,13 @@ class OR only return the most probable class.
         # prevents Nan predictions if some features are inf
         x = (
             test_data.get_featurized_df()
-            .replace([np.inf, -np.inf, np.nan], 0)[
-                self.optimal_descriptors[: self.n_feat]
-            ]
+            .replace([np.inf, -np.inf], np.nan)[self.optimal_descriptors[: self.n_feat]]
             .values
         )

-        # Scale the input features:
-        x = np.nan_to_num(x)
-        if self._scaler is not None:
-            x = self._scaler.transform(x)
-            x = np.nan_to_num(x, nan=-1)
+        # Scale and impute input features:
+        if self._scale_impute is not None:
+            x = self._scale_impute.transform(x)

         p = np.array(self.model.predict(x))

@@ -689,17 +737,13 @@ def evaluate(self, test_data: MODData) -> pd.DataFrame:
         # prevents Nan predictions if some features are inf
         x = (
             test_data.get_featurized_df()
-            .replace([np.inf, -np.inf, np.nan], 0)[
-                self.optimal_descriptors[: self.n_feat]
-            ]
+            .replace([np.inf, -np.inf], np.nan)[self.optimal_descriptors[: self.n_feat]]
             .values
         )

-        # Scale the input features:
-        x = np.nan_to_num(x)
-        if self._scaler is not None:
-            x = self._scaler.transform(x)
-            x = np.nan_to_num(x, nan=-1)
+        # Scale and impute input features:
+        if self._scale_impute is not None:
+            x = self._scale_impute.transform(x)

         y_pred = np.array(self.model.predict(x))
         if len(y_pred.shape) == 2:
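The fit() changes above compose the scaler and the imputer into a single scikit-learn Pipeline whose order is controlled by xscale_before_impute. A runnable sketch of the two orderings with invented data; the -1 fill value mirrors the min-max branch in the diff, but this is an illustration rather than the exact MODNet code path:

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

X = np.array([[0.0, np.nan], [5.0, 2.0], [10.0, 4.0]])

# xscale_before_impute=True (default): scale first (scalers ignore NaNs),
# then fill the surviving NaNs with the out-of-range constant -1.
scale_then_impute = Pipeline(
    [
        ("scaler", MinMaxScaler(feature_range=(-0.5, 0.5))),
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1)),
    ]
)
print(scale_then_impute.fit_transform(X))

# xscale_before_impute=False: impute first (e.g. with the column mean), so the
# filled values also contribute to the scaler's statistics.
impute_then_scale = Pipeline(
    [
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaler", MinMaxScaler(feature_range=(-0.5, 0.5))),
    ]
)
print(impute_then_scale.fit_transform(X))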
13 changes: 5 additions & 8 deletions modnet/preprocessing.py
@@ -24,7 +24,7 @@
 import tqdm
 from multiprocessing import Pool

-from modnet.featurizers import MODFeaturizer
+from modnet.featurizers import MODFeaturizer, clean_df
 from modnet import __version__
 from modnet.utils import LOG

@@ -769,7 +769,8 @@ def featurize(self, fast: bool = False, db_file=None, n_jobs=None):
         else:
             df_final = self.featurizer.featurize(self.df_structure)

-        df_final = df_final.replace([np.inf, -np.inf, np.nan], 0)
+        # replace infinite values by nan that are handled during the fit
+        df_final = clean_df(df_final)

         self.df_featurized = df_final
         LOG.info("Data has successfully been featurized!")
@@ -804,7 +805,7 @@ def feature_selection(
         """
         if getattr(self, "df_featurized", None) is None:
             raise RuntimeError(
-                "Mutual information feature selection requiresd featurized data, please call `.featurize()`"
+                "Mutual information feature selection requires featurized data, please call `.featurize()`"
             )
         if getattr(self, "df_targets", None) is None:
             raise RuntimeError(
@@ -822,8 +823,6 @@

         if cross_nmi is not None:
             self.cross_nmi = cross_nmi
-        elif getattr(self, "cross_nmi", None) is None:
-            self.cross_nmi = None

         # Loading mutual information between features
         if use_precomputed_cross_nmi:
@@ -850,9 +849,7 @@
             )

         if self.cross_nmi.isna().sum().sum() > 0:
-            raise RuntimeError(
-                "Cross NMI (`moddata.cross_nmi`) contains NaN values, consider setting them to zero."
-            )
+            raise RuntimeError("Cross NMI (`moddata.cross_nmi`) contains NaN values.")

         selection_names = list(set(self.names).difference(set(ignore_names)))
         for i, name in enumerate(selection_names):
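Per the new comment in featurize(), clean_df maps infinities to NaN so that the imputer configured at fit time can handle them, where the old code zeroed everything. A minimal pandas illustration with invented values (clean_df itself may do more; its full behaviour is not shown in this diff):

import numpy as np
import pandas as pd

df = pd.DataFrame({"feat": [1.0, np.inf, -np.inf, np.nan]})

old = df.replace([np.inf, -np.inf, np.nan], 0)  # previous: everything zeroed
new = df.replace([np.inf, -np.inf], np.nan)     # now: NaNs survive for the imputer

print(old["feat"].tolist())  # [1.0, 0.0, 0.0, 0.0]
print(new["feat"].tolist())  # [1.0, nan, nan, nan]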
12 changes: 11 additions & 1 deletion modnet/tests/test_preprocessing.py
@@ -31,7 +31,17 @@ def check_column_values(new: MODData, reference: MODData, tolerance=0.03):
     # different number of symm ops being detected.

     # We need a mechanism to allow these discrepancies through in certain cases:
-    allowed_bad_columns = ["GlobalSymmetryFeatures|n_symmetry_ops"]
+    allowed_bad_columns = [
+        "GlobalSymmetryFeatures|n_symmetry_ops",
+        "GlobalSymmetryFeatures|crystal_system",
+        "YangSolidSolution|Yang delta",
+        "Miedema|Miedema_deltaH_inter",
+        "AtomicPackingEfficiency|mean simul. packing efficiency",
+        "Miedema|Miedema_deltaH_amor",
+        "AtomicPackingEfficiency|mean abs simul. packing efficiency",
+        "Miedema|Miedema_deltaH_ss_min",
+    ]
+
     for col in allowed_bad_columns:
         if col in error_cols:
             error_cols.remove(col)
