Skip to content

Commit

Permalink
Merge branch 'AUTOML-49' into 'master'
Browse files Browse the repository at this point in the history
[AUTOML-49] Model saving via joblib and delete joblib upper bound

See merge request ai-lab-pmo/mltools/automl/LightAutoML!35
  • Loading branch information
dev-rinchin committed Dec 12, 2024
2 parents 86bbe76 + e29e365 commit baf5be4
Show file tree
Hide file tree
Showing 11 changed files with 149 additions and 116 deletions.
12 changes: 12 additions & 0 deletions lightautoml/addons/utilization/utilization.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
from ...utils.logging import verbosity_to_loglevel
from ...utils.timer import PipelineTimer

from joblib import dump


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -226,6 +228,7 @@ def fit_predict(
valid_features: Optional[Sequence[str]] = None,
verbose: int = 0,
log_file: str = None,
path_to_save: Optional[str] = None,
) -> LAMLDataset:
"""Fit and get prediction on validation dataset.
Expand Down Expand Up @@ -253,6 +256,7 @@ def fit_predict(
if cannot be inferred from `valid_data`.
verbose: Verbose.
log_file: Log filename.
path_to_save: The path that joblib will use to save the model after the fit stage is completed. Use *.joblib format.
Returns:
Dataset with predictions. Call ``.data`` to get predictions array.
Expand Down Expand Up @@ -361,6 +365,14 @@ def fit_predict(
val_pred = concatenate(inner_preds)
self.outer_pipes = inner_pipes

# saving automl model with joblib
if path_to_save is not None:
# There is 1 parameter for model save:
# "path_to_save" - name of model for saving.

dump_name = path_to_save if path_to_save.endswith(".joblib") else f"{path_to_save}.joblib"
dump(self, dump_name, compress=0)

return val_pred

def predict(
Expand Down
11 changes: 11 additions & 0 deletions lightautoml/automl/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from ..validation.utils import create_validation_iterator
from .blend import BestModelSelector
from .blend import Blender
from joblib import dump


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -157,6 +158,7 @@ def fit_predict(
valid_data: Optional[Any] = None,
valid_features: Optional[Sequence[str]] = None,
verbose: int = 0,
path_to_save: Optional[str] = None,
) -> LAMLDataset:
"""Fit on input data and make prediction on validation part.
Expand All @@ -176,6 +178,7 @@ def fit_predict(
>=2 : the information about folds processing is also displayed;
>=3 : the hyperparameters optimization process is also displayed;
>=4 : the training process for every algorithm is displayed.
path_to_save: The path that joblib will use to save the model after the fit stage is completed. Use *.joblib format.
Returns:
Predicted values.
Expand Down Expand Up @@ -280,6 +283,14 @@ def fit_predict(

del self._levels

# saving automl model with joblib
if path_to_save is not None:
# There is 1 parameter for model save:
# "path_to_save" - name of model for saving.

dump_name = path_to_save if path_to_save.endswith(".joblib") else f"{path_to_save}.joblib"
dump(self, dump_name, compress=0)

if self.return_all_predictions:
return concatenate(level_predictions)
return blended_prediction
Expand Down
3 changes: 3 additions & 0 deletions lightautoml/automl/presets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ def fit_predict(
valid_data: Optional[Any] = None,
valid_features: Optional[Sequence[str]] = None,
verbose: int = 0,
path_to_save: Optional[str] = None,
) -> LAMLDataset:
"""Fit on input data and make prediction on validation part.
Expand All @@ -182,6 +183,7 @@ def fit_predict(
valid_features: Optional validation dataset features if can't be
inferred from `valid_data`.
verbose: Verbosity level that are passed to each automl.
path_to_save: The path that joblib will use to save the model after the fit stage is completed. Use *.joblib format.
Returns:
Dataset with predictions. Call ``.data`` to get predictions array.
Expand Down Expand Up @@ -213,6 +215,7 @@ def fit_predict(
valid_data,
valid_features,
verbose=verbose,
path_to_save=path_to_save,
)

logger.info("\x1b[1mAutoml preset training completed in {:.2f} seconds\x1b[0m\n".format(self.timer.time_spent))
Expand Down
11 changes: 10 additions & 1 deletion lightautoml/automl/presets/tabular_presets.py
Original file line number Diff line number Diff line change
Expand Up @@ -683,6 +683,7 @@ def fit_predict(
valid_features: Optional[Sequence[str]] = None,
log_file: str = None,
verbose: int = 0,
path_to_save: Optional[str] = None,
) -> NumpyDataset:
"""Fit and get prediction on validation dataset.
Expand Down Expand Up @@ -713,6 +714,7 @@ def fit_predict(
>=4 : the training process for every algorithm is displayed;
log_file: Filename for writing logging messages. If log_file is specified,
the messages will be saved in the file. If the file exists, it will be overwritten.
path_to_save: The path that joblib will use to save the model after the fit stage is completed. Use *.joblib format.
Returns:
Dataset with predictions. Call ``.data`` to get predictions array.
Expand All @@ -734,7 +736,14 @@ def fit_predict(
if self.is_time_series:
train = {"seq": {"seq0": train}}

oof_pred = super().fit_predict(train, roles=roles, cv_iter=cv_iter, valid_data=valid_data, verbose=verbose)
oof_pred = super().fit_predict(
train,
roles=roles,
cv_iter=cv_iter,
valid_data=valid_data,
verbose=verbose,
path_to_save=path_to_save,
)

return cast(NumpyDataset, oof_pred)

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ networkx = "*"
cmaes = "*"
pyyaml = "*"
tqdm = "*"
joblib = "<1.3.0"
joblib = "*"
autowoe = ">=1.2"
jinja2 = "*"
json2html = "*"
Expand Down
30 changes: 30 additions & 0 deletions tests/integration/integration_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import numpy as np

from lightautoml.dataset.roles import TargetRole
from joblib import load

from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score


def load_and_test_automl(filename, task, score, pred, data, target_name):
    """Load a saved AutoML model and verify it reproduces the original results.

    The model is loaded from ``filename`` via :func:`joblib.load`, used to
    predict on ``data``, and the resulting metric must agree (to 3 decimals)
    with the pre-saving ``score``; raw predictions must match ``pred``.

    Args:
        filename: Path to the ``.joblib`` dump of a fitted AutoML model.
        task: Task object; its ``name`` attribute selects the metric.
        score: Metric value computed before the model was saved.
        pred: Prediction object computed before saving (``.data`` array).
        data: Holdout dataframe to predict on.
        target_name: Name of the target column in ``data``.

    Raises:
        ValueError: If ``task.name`` is not one of ``binary``, ``multiclass``,
            or ``reg``.
    """
    automl = load(filename)

    test_pred_joblib = automl.predict(data)

    if task.name == "binary":
        score_new = roc_auc_score(data[target_name].values, test_pred_joblib.data[:, 0])
    elif task.name == "multiclass":
        score_new = log_loss(data[target_name].map(automl.reader.class_mapping), test_pred_joblib.data)
    elif task.name == "reg":
        score_new = mean_squared_error(data[target_name].values, test_pred_joblib.data[:, 0])
    else:
        # Fail explicitly: previously an unknown task name left `score_new`
        # unbound and crashed with NameError at the assertion below.
        raise ValueError(f"Unsupported task name: {task.name!r}")

    np.testing.assert_almost_equal(score, score_new, decimal=3)
    np.testing.assert_allclose(pred.data[:, 0], test_pred_joblib.data[:, 0])


def get_target_name(roles):
    """Return the target column name from a roles mapping.

    The first key that is either the literal string ``"target"`` or a
    ``TargetRole`` instance wins; ``None`` is returned when no key matches.
    """
    return next(
        (name for role, name in roles.items() if role == "target" or isinstance(role, TargetRole)),
        None,
    )
72 changes: 28 additions & 44 deletions tests/integration/test_custom_2_level_stacking.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
#!/usr/bin/env python
# coding: utf-8

import os
import pickle
import tempfile

from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error
from os.path import join as pjoin
from sklearn.metrics import roc_auc_score

from lightautoml.automl.base import AutoML
from lightautoml.dataset.roles import TargetRole
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
Expand All @@ -21,32 +16,7 @@
)
from lightautoml.reader.base import PandasToPandasReader


def check_pickling(automl, ho_score, task, test, target_name):
    """Round-trip ``automl`` through pickle and verify results are unchanged.

    The fitted model is dumped to a temporary file, loaded back, used to
    predict on ``test``, and the recomputed holdout metric must be exactly
    equal to the pre-pickling ``ho_score``.

    Args:
        automl: Fitted AutoML model to serialize.
        ho_score: Holdout metric value computed before pickling.
        task: Task object; its ``name`` attribute selects the metric.
        test: Holdout dataframe to predict on.
        target_name: Name of the target column in ``test``.

    Raises:
        ValueError: If ``task.name`` is not one of ``binary``, ``multiclass``,
            or ``reg``.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        filename = os.path.join(tmpdirname, "automl.pickle")
        with open(filename, "wb") as f:
            pickle.dump(automl, f)

        with open(filename, "rb") as f:
            automl = pickle.load(f)

        test_pred = automl.predict(test)

        if task.name == "binary":
            ho_score_new = roc_auc_score(test[target_name].values, test_pred.data[:, 0])
        elif task.name == "multiclass":
            ho_score_new = log_loss(test[target_name].map(automl.reader.class_mapping), test_pred.data)
        elif task.name == "reg":
            ho_score_new = mean_squared_error(test[target_name].values, test_pred.data[:, 0])
        else:
            # Fail explicitly: previously an unknown task name left
            # `ho_score_new` unbound and crashed with NameError below.
            raise ValueError(f"Unsupported task name: {task.name!r}")

        assert ho_score == ho_score_new


def get_target_name(roles):
    """Find the target column name in a roles mapping.

    A key matches when it is the literal string ``"target"`` or an instance
    of ``TargetRole``; the first matching key's value is returned, or
    ``None`` when nothing matches.
    """
    for role_key in roles:
        if role_key == "target" or isinstance(role_key, TargetRole):
            return roles[role_key]
    return None
from integration_utils import get_target_name, load_and_test_automl


def test_manual_pipeline(sampled_app_train_test, sampled_app_roles, binary_task):
Expand Down Expand Up @@ -145,16 +115,30 @@ def test_manual_pipeline(sampled_app_train_test, sampled_app_roles, binary_task)
skip_conn=False,
)

# Start AutoML training
oof_predictions = automl.fit_predict(train, roles=sampled_app_roles)

# predict for test data
ho_predictions = automl.predict(test)

oof_score = roc_auc_score(train[target_name].values, oof_predictions.data[:, 0])
ho_score = roc_auc_score(test[target_name].values, ho_predictions.data[:, 0])

assert oof_score > 0.67
assert ho_score > 0.67
with tempfile.TemporaryDirectory() as tmpdirname:

check_pickling(automl, ho_score, binary_task, test, target_name)
path_to_save = pjoin(tmpdirname, "model.joblib")
# Start AutoML training
oof_predictions = automl.fit_predict(
train,
roles=sampled_app_roles,
path_to_save=path_to_save,
)

# predict for test data
ho_predictions = automl.predict(test)

oof_score = roc_auc_score(train[target_name].values, oof_predictions.data[:, 0])
ho_score = roc_auc_score(test[target_name].values, ho_predictions.data[:, 0])

assert oof_score > 0.67
assert ho_score > 0.67

load_and_test_automl(
filename=path_to_save,
task=binary_task,
score=ho_score,
pred=ho_predictions,
data=test,
target_name=target_name,
)
12 changes: 0 additions & 12 deletions tests/integration/test_demo0.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@

"""Building ML pipeline from blocks and fit + predict the pipeline itself."""

import os
import pickle
import time

import numpy as np
Expand Down Expand Up @@ -148,16 +146,6 @@ def test_simple_pipeline(sampled_app_roles, binary_task):
train_pred = total.predict(pd_dataset)
assert time.time() - start_time < MAX_PREDICT_TIME

# Pickle automl
with open("automl.pickle", "wb") as f:
pickle.dump(total, f)

with open("automl.pickle", "rb") as f:
total = pickle.load(f)

train_pred = total.predict(pd_dataset)
os.remove("automl.pickle")

assert train_pred.shape == (10000, 2)

assert isinstance(model1.get_features_score(), pd.Series)
Expand Down
43 changes: 20 additions & 23 deletions tests/integration/test_demo1.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
#!/usr/bin/env python
# coding: utf-8

import os
import pickle

import tempfile
from os.path import join as pjoin
from sklearn.metrics import roc_auc_score

from lightautoml.automl.base import AutoML
Expand All @@ -17,12 +16,15 @@
)
from lightautoml.reader.base import PandasToPandasReader

from integration_utils import load_and_test_automl, get_target_name


def test_cutoff_selector_in_pipeline(sampled_app_train_test, binary_task):
def test_cutoff_selector_in_pipeline(sampled_app_train_test, sampled_app_roles, binary_task):

train_data, test_data = sampled_app_train_test

task = binary_task
target_name = get_target_name(sampled_app_roles)

reader = PandasToPandasReader(task, cv=5, random_state=1)

Expand Down Expand Up @@ -93,25 +95,20 @@ def test_cutoff_selector_in_pipeline(sampled_app_train_test, binary_task):
debug=True,
)

automl.fit_predict(train_data, roles={"target": "TARGET"}, verbose=5)

# just checking if methods can be called
selector.get_features_score()
automl.levels[-1][0].ml_algos[0].get_features_score()
automl.levels[0][0].ml_algos[0].get_features_score()
automl.levels[0][0].ml_algos[1].get_features_score()

test_pred = automl.predict(test_data)

with open("automl.pickle", "wb") as f:
pickle.dump(automl, f)

with open("automl.pickle", "rb") as f:
automl = pickle.load(f)
with tempfile.TemporaryDirectory() as tmpdirname:
path_to_save = pjoin(tmpdirname, "model.joblib")
automl.fit_predict(train_data, roles={"target": "TARGET"}, verbose=5, path_to_save=path_to_save)

test_pred = automl.predict(test_data)
test_score = roc_auc_score(test_data["TARGET"].values, test_pred.data[:, 0])
# just checking if methods can be called
selector.get_features_score()
automl.levels[-1][0].ml_algos[0].get_features_score()
automl.levels[0][0].ml_algos[0].get_features_score()
automl.levels[0][0].ml_algos[1].get_features_score()

assert test_score > 0.65
test_pred = automl.predict(test_data)
test_score = roc_auc_score(test_data["TARGET"].values, test_pred.data[:, 0])
assert test_score > 0.65

os.remove("automl.pickle")
load_and_test_automl(
path_to_save, task=task, score=test_score, pred=test_pred, data=test_data, target_name=target_name
)
Loading

0 comments on commit baf5be4

Please sign in to comment.