Impute missing values (#149)
* Upgraded pymatgen and matminer requirements

* Better handling of NaNs in features by adding the possibility to use the mean of the column. Fixes a bug where they were all set to 0.

* Fix a small bug when adding keys to genes.

* Small typo and bug fix.

* Add a choice of order between scaling and imputing.

* Small bug fix.

* Small name change.

* Typos.

* Rename according to PP's PR

* Add some additional 'bad columns' in testing

* Tidy up merge

* Fix linting

---------

Co-authored-by: Matthew Evans <7916000+ml-evs@users.noreply.github.com>
Co-authored-by: Matthew Evans <git@ml-evs.science>
3 people authored May 31, 2023
1 parent 548ffbf commit 0afb9ef
Showing 5 changed files with 109 additions and 44 deletions.
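
The core behavioural change: NaN features can now be imputed with the mean of their column via scikit-learn's SimpleImputer, instead of always being overwritten with a constant. A minimal runnable sketch of the difference, with invented data (not MODNet code, just the scikit-learn pieces that appear in the diffs below):

import numpy as np
from sklearn.impute import SimpleImputer

# Toy feature matrix with one missing value (invented data).
X = np.array([[1.0, np.nan], [3.0, 4.0], [5.0, 8.0]])

# Previous behaviour: every NaN was replaced by a fixed constant.
old = np.nan_to_num(X, nan=0.0)

# New impute_missing="mean" option: each NaN becomes its column mean (here 6.0).
new = SimpleImputer(missing_values=np.nan, strategy="mean").fit_transform(X)

print(old)
print(new)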
32 changes: 23 additions & 9 deletions modnet/hyper_opt/fit_genetic.py
@@ -37,6 +37,9 @@ def __init__(
             fit_params: Any additional parameters to pass to `MODNetModel.fit(...)`,
         """

+        self.act = "elu"
+        self.loss = loss
+        self.n_neurons_first_layer = 32 * random.randint(1, 10)
         self.max_feat = max_feat
         self.num_classes = num_classes
         self.multi_label = multi_label
@@ -45,18 +48,25 @@ def __init__(
         self.fit_params = fit_params

         self.xscale_list = ["minmax", "standard"]
+        self.impute_missing_list = [0, "mean"]
+        self.xscale_before_impute = True
         self.lr_list = [0.1, 0.01, 0.005, 0.001]
         self.batch_size_list = [32, 64, 128, 256]
         self.fraction_list = [1, 0.75, 0.5, 0.25]

+        if fit_params:
+            self.__dict__.update(fit_params)
+
         self.genes = {
-            "act": "elu",
-            "loss": loss,
-            "n_neurons_first_layer": 32 * random.randint(1, 10),
+            "act": self.act,
+            "loss": self.loss,
+            "n_neurons_first_layer": self.n_neurons_first_layer,
             "fraction1": random.choice(self.fraction_list),
             "fraction2": random.choice(self.fraction_list),
             "fraction3": random.choice(self.fraction_list),
             "xscale": random.choice(self.xscale_list),
+            "impute_missing": random.choice(self.impute_missing_list),
+            "xscale_before_impute": self.xscale_before_impute,
             "lr": random.choice(self.lr_list),
             "batch_size": random.choice(self.batch_size_list),
             "n_feat": 0,
@@ -82,14 +92,14 @@ def crossover(self, partner: Individual) -> Individual:
         """

         genes_from_mother = random.sample(
-            range(10), k=5
-        )  # creates indices to take randomly 5 genes from one parent, and 5 genes from the other
+            range(len(self.genes)), k=len(self.genes) // 2
+        )  # creates indices to take randomly half the genes from one parent, and half the genes from the other

         child_genes = {
             list(self.genes.keys())[i]: list(self.genes.values())[i]
             if i in genes_from_mother
             else list(partner.genes.values())[i]
-            for i in range(10)
+            for i in range(len(self.genes))
         }

         child = Individual(
@@ -221,6 +231,8 @@ def evaluate(
             epochs=800 if not fast else 1,
             batch_size=self.genes["batch_size"],
             xscale=self.genes["xscale"],
+            impute_missing=self.genes["impute_missing"],
+            xscale_before_impute=self.genes["xscale_before_impute"],
             callbacks=callbacks,
             verbose=0,
             **self.fit_params,
@@ -288,6 +300,8 @@ def refit_model(self, data: MODData, n_models=10, n_jobs=1, fast: bool = False):
             epochs=800 if not fast else 1,
             batch_size=self.genes["batch_size"],
             xscale=self.genes["xscale"],
+            impute_missing=self.genes["impute_missing"],
+            xscale_before_impute=self.genes["xscale_before_impute"],
             callbacks=callbacks,
             verbose=0,
             **self.fit_params,
@@ -522,8 +536,8 @@ def run(
             loss: The built-in tf.keras loss to pass to `compile(...)`.
             n_jobs (Optional[int], optional): Number of jobs to parallelize on. Defaults to None.
             early_stopping (Optional[int], optional): Number of successive generations without improvement before stopping. Defaults to 4.
-            refit (Optional[int], optional): Wether to refit (>0) the best hyperparameters on the whole dataset or use the best Individual instead (=0).
-                The amount corresponds the the number of models used in the ensemble. Defaults to 0.
+            refit (Optional[int], optional): Whether to refit (>0) the best hyperparameters on the whole dataset or use the best Individual instead (=0).
+                The amount corresponds to the number of models used in the ensemble. Defaults to 0.
             fast (bool, optional): Use only for debugging and testing. A fast GA run with small number of epochs, generations, individuals and folds.
                 Overrides the size_pop, num_generation and nested arguments. Defaults to False.
             fit_params: Any additional parameters to pass to `MODNetModel.fit(...)`,
@@ -646,7 +660,7 @@ def run(

         else:
             ensemble = []
-            for m in models[ranking[:10]]:
+            for m in models[ranking[:refit]]:
                 ensemble += m.model
             self.best_model = EnsembleMODNetModel(modnet_models=ensemble)

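The crossover change above drops the hard-coded assumption of exactly 10 genes. A simplified, self-contained sketch of the same half-and-half scheme with made-up gene dictionaries (not the actual Individual class):

import random

mother = {"act": "elu", "lr": 0.01, "batch_size": 64, "xscale": "minmax"}
father = {"act": "relu", "lr": 0.1, "batch_size": 128, "xscale": "standard"}

# Take half of the gene indices from one parent, the rest from the other,
# for any number of genes.
genes_from_mother = random.sample(range(len(mother)), k=len(mother) // 2)
child = {
    key: mother[key] if i in genes_from_mother else father[key]
    for i, key in enumerate(mother)
}
print(child)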
8 changes: 4 additions & 4 deletions modnet/models/ensemble.py
@@ -142,9 +142,9 @@ def predict(
         Parameters:
             test_data: A featurized and feature-selected `MODData`
                 object containing the descriptors used in training.
-            return_prob: For a classification tasks only: whether to return the probability of each
+            return_prob: For a classification task only: whether to return the probability of each
                 class OR only return the most probable class.
-            return_unc: wheter to return a second dataframe containing the uncertainties
+            return_unc: whether to return a second dataframe containing the uncertainties
         Returns:
             A `pandas.DataFrame` containing the predicted values of the targets.
@@ -276,8 +276,6 @@ def fit_preset(
             for k, _ in enumerate(presets):
                 presets[k]["epochs"] = 5

-        val_losses = 1e20 * np.ones((len(presets),))
-
         num_nested_folds = 5
         if nested:
             num_nested_folds = nested
@@ -445,6 +443,7 @@ def _validate_ensemble_model(
     act="relu",
     out_act="linear",
     xscale="minmax",
+    impute_missing=-1,
     callbacks=[],
     preset_id=None,
     fold_id=None,
@@ -469,6 +468,7 @@ def _validate_ensemble_model(
         batch_size=batch_size,
         loss=loss,
         xscale=xscale,
+        impute_missing=impute_missing,
         callbacks=callbacks,
         verbose=verbose,
         val_fraction=0,
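Note that `_validate_ensemble_model` defaults to impute_missing=-1, matching the fill value that fit() (see vanilla.py below) uses for min-max-scaled features when scaling happens before imputation.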
88 changes: 66 additions & 22 deletions modnet/models/vanilla.py
@@ -3,7 +3,7 @@
 """

-from typing import List, Tuple, Dict, Optional, Callable, Any
+from typing import List, Tuple, Dict, Optional, Callable, Any, Union
 from pathlib import Path
 import multiprocessing

@@ -12,6 +12,8 @@
 from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_absolute_error, roc_auc_score
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
 import tensorflow as tf

 from modnet.preprocessing import MODData
@@ -87,7 +89,11 @@ def __init__(
         self.act = act
         self.out_act = out_act

         self.xscale = None
+        self._scaler = None
+        self._imputer = None
+        self.impute_missing = None
+        self._scale_impute = None
         self.optimal_descriptors = None
         self.target_names = None
         self.targets = targets
Expand Down Expand Up @@ -215,6 +221,8 @@ def fit(
epochs: int = 200,
batch_size: int = 128,
xscale: Optional[str] = "minmax",
impute_missing: Optional[Union[float, str]] = 0,
xscale_before_impute: bool = True,
metrics: List[str] = ["mae"],
callbacks: List[Callable] = None,
verbose: int = 0,
@@ -240,6 +248,17 @@
             batch_size: The batch size to use for training.
             xscale: The feature scaler to use, either `None`,
                 `'minmax'` or `'standard'`.
+            impute_missing: Determines how the NaN features are treated.
+                If str, defines the strategy used in the scikit-learn SimpleImputer,
+                e.g., "mean" sets the NaNs to the mean of their feature column.
+                If a float is provided, and if xscale_before_impute is False, this
+                float is used to replace NaNs in the original dataset.
+                If a float is provided but xscale_before_impute is True, the float
+                is not used and standard values are used.
+                If you want to do something more sophisticated, make your own
+                modifications to MODData.df_featurized before fitting the model.
+            xscale_before_impute: whether to first scale the input and then impute values, or
+                first impute values and then scale the inputs.
             metrics: A list of tf.keras metrics to pass to `compile(...)`.
             loss: The built-in tf.keras loss to pass to `compile(...)`.
             fit_params: Any additional parameters to pass to `fit(...)`,
@@ -255,6 +274,7 @@
         )

         self.xscale = xscale
+        self.impute_missing = impute_missing
         self.target_names = list(self.weights.keys())
         self.optimal_descriptors = training_data.get_optimal_descriptors()

@@ -300,22 +320,50 @@ def fit(
             )
             y.append(y_inner)

-        # Scale the input features:
+        # Define the scaler
         if self.xscale == "minmax":
             self._scaler = MinMaxScaler(feature_range=(-0.5, 0.5))

         elif self.xscale == "standard":
             self._scaler = StandardScaler()

-        x = self._scaler.fit_transform(x)
-        x = np.nan_to_num(x, nan=-1)
+        # Define the imputer
+        if isinstance(impute_missing, str):
+            self._imputer = SimpleImputer(
+                missing_values=np.nan, strategy=impute_missing
+            )
+        else:
+            if self.xscale == "minmax":
+                impute_missing = -1 if xscale_before_impute else impute_missing
+            elif self.xscale == "standard":
+                impute_missing = (
+                    10 * np.max(np.nan_to_num(StandardScaler().fit_transform(x)))
+                    if xscale_before_impute
+                    else impute_missing
+                )
+            self.impute_missing = impute_missing
+
+            self._imputer = SimpleImputer(
+                missing_values=np.nan, strategy="constant", fill_value=impute_missing
+            )
+
+        # Scale and impute input features in the desired order
+        if xscale_before_impute:
+            self._scale_impute = Pipeline(
+                [("scaler", self._scaler), ("imputer", self._imputer)]
+            )
+        else:
+            self._scale_impute = Pipeline(
+                [("imputer", self._imputer), ("scaler", self._scaler)]
+            )
+
+        x = self._scale_impute.fit_transform(x)

         if val_data is not None:
             val_x = val_data.get_featurized_df()[
                 self.optimal_descriptors[: self.n_feat]
             ].values
-            val_x = self._scaler.transform(val_x)
-            val_x = np.nan_to_num(val_x, nan=-1)
+            val_x = self._scale_impute.transform(val_x)
             val_y = []
             for targ in self.targets_flatten:
                 if self.num_classes[targ] >= 2:  # Classification
@@ -404,6 +452,7 @@ def fit_preset(
         nested: int = 5,
         callbacks: List[Any] = None,
         n_jobs=None,
+        **fit_params,
     ) -> Tuple[
         List[List[Any]],
         np.ndarray,
@@ -596,11 +645,14 @@ def fit_preset(
                 loss=best_preset["loss"],
                 callbacks=callbacks,
                 verbose=verbose,
+                **fit_params,
             )
         else:
             self.n_feat = best_model.n_feat
             self.model = best_model.model
             self._scaler = best_model._scaler
+            self._imputer = best_model._imputer
+            self._scale_impute = best_model._scale_impute

         os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"  # reset

@@ -623,17 +675,13 @@ class OR only return the most probable class.
         # prevents Nan predictions if some features are inf
         x = (
             test_data.get_featurized_df()
-            .replace([np.inf, -np.inf, np.nan], 0)[
-                self.optimal_descriptors[: self.n_feat]
-            ]
+            .replace([np.inf, -np.inf], np.nan)[self.optimal_descriptors[: self.n_feat]]
             .values
         )

-        # Scale the input features:
-        x = np.nan_to_num(x)
-        if self._scaler is not None:
-            x = self._scaler.transform(x)
-            x = np.nan_to_num(x, nan=-1)
+        # Scale and impute input features:
+        if self._scale_impute is not None:
+            x = self._scale_impute.transform(x)

         p = np.array(self.model.predict(x))

@@ -689,17 +737,13 @@ def evaluate(self, test_data: MODData) -> pd.DataFrame:
         # prevents Nan predictions if some features are inf
         x = (
             test_data.get_featurized_df()
-            .replace([np.inf, -np.inf, np.nan], 0)[
-                self.optimal_descriptors[: self.n_feat]
-            ]
+            .replace([np.inf, -np.inf], np.nan)[self.optimal_descriptors[: self.n_feat]]
             .values
         )

-        # Scale the input features:
-        x = np.nan_to_num(x)
-        if self._scaler is not None:
-            x = self._scaler.transform(x)
-            x = np.nan_to_num(x, nan=-1)
+        # Scale and impute input features:
+        if self._scale_impute is not None:
+            x = self._scale_impute.transform(x)

         y_pred = np.array(self.model.predict(x))
         if len(y_pred.shape) == 2:
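The fit() changes above compose the scaler and the imputer into a single scikit-learn Pipeline whose order is controlled by xscale_before_impute. A runnable sketch of the two orderings with invented data; the -1 fill value mirrors the min-max branch in the diff, but this is an illustration rather than the exact MODNet code path:

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

X = np.array([[0.0, np.nan], [5.0, 2.0], [10.0, 4.0]])

# xscale_before_impute=True (default): scale first (scalers ignore NaNs),
# then fill the surviving NaNs with the out-of-range constant -1.
scale_then_impute = Pipeline(
    [
        ("scaler", MinMaxScaler(feature_range=(-0.5, 0.5))),
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1)),
    ]
)
print(scale_then_impute.fit_transform(X))

# xscale_before_impute=False: impute first (e.g. with the column mean), so the
# filled values also contribute to the scaler's statistics.
impute_then_scale = Pipeline(
    [
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaler", MinMaxScaler(feature_range=(-0.5, 0.5))),
    ]
)
print(impute_then_scale.fit_transform(X))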
13 changes: 5 additions & 8 deletions modnet/preprocessing.py
@@ -24,7 +24,7 @@
 import tqdm
 from multiprocessing import Pool

-from modnet.featurizers import MODFeaturizer
+from modnet.featurizers import MODFeaturizer, clean_df
 from modnet import __version__
 from modnet.utils import LOG

@@ -769,7 +769,8 @@ def featurize(self, fast: bool = False, db_file=None, n_jobs=None):
         else:
             df_final = self.featurizer.featurize(self.df_structure)

-        df_final = df_final.replace([np.inf, -np.inf, np.nan], 0)
+        # replace infinite values by nan that are handled during the fit
+        df_final = clean_df(df_final)

         self.df_featurized = df_final
         LOG.info("Data has successfully been featurized!")
@@ -804,7 +805,7 @@ def feature_selection(
         """
         if getattr(self, "df_featurized", None) is None:
             raise RuntimeError(
-                "Mutual information feature selection requiresd featurized data, please call `.featurize()`"
+                "Mutual information feature selection requires featurized data, please call `.featurize()`"
             )
         if getattr(self, "df_targets", None) is None:
             raise RuntimeError(
@@ -822,8 +823,6 @@

         if cross_nmi is not None:
             self.cross_nmi = cross_nmi
-        elif getattr(self, "cross_nmi", None) is None:
-            self.cross_nmi = None

         # Loading mutual information between features
         if use_precomputed_cross_nmi:
@@ -850,9 +849,7 @@
             )

         if self.cross_nmi.isna().sum().sum() > 0:
-            raise RuntimeError(
-                "Cross NMI (`moddata.cross_nmi`) contains NaN values, consider setting them to zero."
-            )
+            raise RuntimeError("Cross NMI (`moddata.cross_nmi`) contains NaN values.")

         selection_names = list(set(self.names).difference(set(ignore_names)))
         for i, name in enumerate(selection_names):
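Per the new comment in featurize(), clean_df maps infinities to NaN so that the imputer configured at fit time can handle them, where the old code zeroed everything. A minimal pandas illustration with invented values (clean_df itself may do more; its full behaviour is not shown in this diff):

import numpy as np
import pandas as pd

df = pd.DataFrame({"feat": [1.0, np.inf, -np.inf, np.nan]})

old = df.replace([np.inf, -np.inf, np.nan], 0)  # previous: everything zeroed
new = df.replace([np.inf, -np.inf], np.nan)     # now: NaNs survive for the imputer

print(old["feat"].tolist())  # [1.0, 0.0, 0.0, 0.0]
print(new["feat"].tolist())  # [1.0, nan, nan, nan]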
12 changes: 11 additions & 1 deletion modnet/tests/test_preprocessing.py
@@ -31,7 +31,17 @@ def check_column_values(new: MODData, reference: MODData, tolerance=0.03):
     # different number of symm ops being detected.

     # We need a mechanism to allow these discrepancies through in certain cases:
-    allowed_bad_columns = ["GlobalSymmetryFeatures|n_symmetry_ops"]
+    allowed_bad_columns = [
+        "GlobalSymmetryFeatures|n_symmetry_ops",
+        "GlobalSymmetryFeatures|crystal_system",
+        "YangSolidSolution|Yang delta",
+        "Miedema|Miedema_deltaH_inter",
+        "AtomicPackingEfficiency|mean simul. packing efficiency",
+        "Miedema|Miedema_deltaH_amor",
+        "AtomicPackingEfficiency|mean abs simul. packing efficiency",
+        "Miedema|Miedema_deltaH_ss_min",
+    ]
+
     for col in allowed_bad_columns:
         if col in error_cols:
             error_cols.remove(col)
