Skip to content

Commit

Permalink
Add NaiveAutoML (#560)
Browse files Browse the repository at this point in the history
  • Loading branch information
PGijsbers authored Jun 29, 2023
1 parent c6a7033 commit ffc76e8
Show file tree
Hide file tree
Showing 9 changed files with 223 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .github/workflows/run_all_frameworks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ jobs:
skip_evaluation=0
if [[ $is_common -eq 0 ]];
then
FRAMEWORKS='["autogluon", "autosklearn", "gama", "h2oautoml", "mlplanweka", "mlr3automl", "tpot", "tunedrandomforest"]'
FRAMEWORKS='["autogluon", "autosklearn", "gama", "h2oautoml", "mlplanweka", "mlr3automl", "naiveautoml", "tpot", "tunedrandomforest"]'
TASKS='["iris", "kc2", "cholesterol"]'
BENCHMARK='["test"]'
else
Expand Down
32 changes: 32 additions & 0 deletions frameworks/NaiveAutoML/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from amlb.benchmark import TaskConfig
from amlb.data import Dataset
from amlb.utils import call_script_in_same_dir


def setup(*args, **kwargs):
    """Run the `setup.sh` script that lives in the same directory as this module.

    All positional and keyword arguments are forwarded unchanged to
    `call_script_in_same_dir`.
    """
    script_name = "setup.sh"
    call_script_in_same_dir(__file__, script_name, *args, **kwargs)


def run(dataset: Dataset, config: TaskConfig):
    """Package the dataset and delegate execution to `exec.py` via `run_in_venv`.

    The payload holds the target name plus the train/test features and the
    encoded targets (`y_enc`). When `config.measure_inference_time` is set,
    the parquet inference subsample files are attached as well.

    Returns whatever `run_in_venv` returns.
    """
    from frameworks.shared.caller import run_in_venv

    data = {
        "target": dataset.target.name,
        "train": {
            "X": dataset.train.X,
            "y": dataset.train.y_enc,
        },
        "test": {
            "X": dataset.test.X,
            "y": dataset.test.y_enc,
        },
    }
    if config.measure_inference_time:
        subsample_files = dataset.inference_subsample_files(fmt="parquet")
        data["inference_subsample_files"] = subsample_files

    # Request that sparse dataframes are deserialized as dense ones.
    serialization = {"sparse_dataframe_deserialized_format": "dense"}
    options = {"serialization": serialization}

    return run_in_venv(
        __file__,
        "exec.py",
        input_data=data,
        dataset=dataset,
        config=config,
        options=options,
    )

125 changes: 125 additions & 0 deletions frameworks/NaiveAutoML/exec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import logging
import os
import pickle
import re
import subprocess
import sys
import tempfile as tmp
from pathlib import Path
from typing import Union

import pandas as pd

if sys.platform == 'darwin':
os.environ['OBJC_DISABLE_INITIALIZE_FORK_SAFETY'] = 'YES'
os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'

from frameworks.shared.callee import call_run, result, output_subdir, \
measure_inference_times
from frameworks.shared.utils import Timer, touch

from naiveautoml import NaiveAutoML

log = logging.getLogger(__name__)


def _installed_naiveautoml_version():
    """Return the installed NaiveAutoML version as listed by `pip list`.

    The version is scraped from pip's output instead of being read from the
    package. pip is invoked through `sys.executable` so that the interpreter
    running this script (and hence its environment) is the one inspected,
    rather than whatever `python` happens to be first on PATH.

    Returns "unknown" when the package is not listed, so that a failed lookup
    of a value that is only used for logging cannot abort the run.
    """
    pip_list = subprocess.run(
        [sys.executable, "-m", "pip", "list"], capture_output=True
    )
    match = re.search(
        r"naiveautoml\s+([^\n]+)", pip_list.stdout.decode(), flags=re.IGNORECASE
    )
    return match.group(1).strip() if match else "unknown"


def run(dataset, config):
    """Fit NaiveAutoML on the training split and produce test predictions.

    Args:
        dataset: object exposing `train.X`, `train.y`, `test.X`, `test.y` and,
            when inference time is measured, `inference_subsample_files`.
        config: task configuration; this function reads `metric`, `cores`,
            `max_runtime_seconds`, `type`, `framework_params`,
            `measure_inference_time` and `output_predictions_file`.

    Returns:
        The value returned by `result(...)`.

    Raises:
        ValueError: if `config.metric` has no entry in the metric mapping.
    """
    log.info("\n**** NaiveAutoML [v%s] ****", _installed_naiveautoml_version())

    # Benchmark metric name -> scoring name passed to NaiveAutoML.
    # NOTE(review): `rmse` is mapped to 'neg_mean_squared_error' rather than a
    # root-mean-squared scorer -- confirm this is intended.
    metrics_mapping = dict(
        acc='accuracy',
        balacc='balanced_accuracy',
        auc='roc_auc',
        logloss='neg_log_loss',
        mae='neg_mean_absolute_error',
        r2='r2',
        rmse='neg_mean_squared_error',
    )
    scoring_metric = metrics_mapping.get(config.metric)
    if scoring_metric is None:
        raise ValueError(f"Performance metric {config.metric} not supported.")

    kwargs = dict(
        scoring=scoring_metric,
        num_cpus=config.cores,
    )
    # NAML wasn't really designed to run for long time constraints, so we
    # make it easy to run NAML with its default configuration for time/iterations.
    if not config.framework_params.get("_use_default_time_and_iterations", False):
        kwargs["timeout"] = config.max_runtime_seconds
        # NAML stops at its first met criterion: iterations or time.
        # To ensure time is the first criterion, set max_hpo_iterations very high
        kwargs["max_hpo_iterations"] = 1e10
        # NAML has a static per-pipeline evaluation time of 10 seconds,
        # which is not accommodating for larger datasets.
        kwargs["execution_timeout"] = max(config.max_runtime_seconds // 20, 10)
    else:
        log.info("`_use_default_time_and_iterations` is set, ignoring time constraint.")

    # Framework params without a leading underscore are forwarded verbatim and
    # take precedence over the values derived above.
    kwargs |= {k: v for k, v in config.framework_params.items() if not k.startswith("_")}
    automl = NaiveAutoML(**kwargs)

    with Timer() as training:
        automl.fit(dataset.train.X, dataset.train.y)
    log.info(f"Finished fit in {training.duration}s.")

    is_classification = (config.type == 'classification')

    def infer(data: Union[str, pd.DataFrame]):
        # `data` is either a path to a parquet file or an in-memory dataframe.
        test_data = pd.read_parquet(data) if isinstance(data, str) else data
        predict_fn = automl.predict_proba if is_classification else automl.predict
        return predict_fn(test_data)

    inference_times = {}
    if config.measure_inference_time:
        inference_times["file"] = measure_inference_times(infer, dataset.inference_subsample_files)
        # 100 single-row samples, each drawn with a different fixed seed.
        inference_times["df"] = measure_inference_times(
            infer,
            [(1, dataset.test.X.sample(1, random_state=i)) for i in range(100)],
        )
        log.info("Finished inference time measurements.")

    with Timer() as predict:
        predictions = automl.predict(dataset.test.X)
        probabilities = automl.predict_proba(dataset.test.X) if is_classification else None
    log.info(f"Finished predict in {predict.duration}s.")

    save_artifacts(automl, config)

    return result(
        output_file=config.output_predictions_file,
        predictions=predictions,
        probabilities=probabilities,
        truth=dataset.test.y,
        training_duration=training.duration,
        predict_duration=predict.duration,
        inference_times=inference_times,
        target_is_encoded=is_classification,
    )


def save_artifacts(naive_automl, config):
    """Write the requested artifacts to the task's "artifacts" output subdirectory.

    The `_save_artifacts` framework param selects what is written (default:
    only 'history'):
      - 'history': `naive_automl.history` as `history.csv`
      - 'model': `naive_automl.chosen_model` as text (`model_str.txt`) and
        pickled (`model.pkl`)

    This is best-effort: any exception is logged as a warning, not raised.
    """
    requested = config.framework_params.get('_save_artifacts', ['history'])
    try:
        out_dir = Path(output_subdir("artifacts", config))

        if 'history' in requested:
            history_path = out_dir / "history.csv"
            naive_automl.history.to_csv(history_path, index=False)

        if 'model' not in requested:
            return

        description_path = out_dir / "model_str.txt"
        description_path.write_text(str(naive_automl.chosen_model))
        pickle_path = out_dir / "model.pkl"
        with open(pickle_path, 'wb') as pickle_file:
            pickle.dump(naive_automl.chosen_model, pickle_file)
    except Exception:
        log.warning("Error when saving artifacts.", exc_info=True)



if __name__ == '__main__':
call_run(run)
2 changes: 2 additions & 0 deletions frameworks/NaiveAutoML/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pandas # https://github.com/fmohr/naiveautoml/issues/19
ConfigSpace<0.7.1 # https://github.com/fmohr/naiveautoml/issues/20
42 changes: 42 additions & 0 deletions frameworks/NaiveAutoML/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env bash
# Installs NaiveAutoML and records the installed version.
#
# Usage: setup.sh [VERSION] [REPO] [PKG]
#   VERSION  "stable" (default): newest release of PKG
#            "latest": the `master` branch of REPO
#            starts with a digit: that exact release of PKG
#            "#<hash>": that commit of REPO
#            anything else: used as a git ref of REPO
#   REPO     git repository used for non-release installs
#   PKG      package name used for release installs
#
# PIP, PY and $pip_exec are not defined in this file; presumably they are
# provided by ../shared/setup.sh, which is sourced below.
HERE=$(dirname "$0")
VERSION=${1:-"stable"}
REPO=${2:-"https://github.com/fmohr/naiveautoml"}
PKG=${3:-"naiveautoml"}

echo "NaiveAutoML/setup.sh" "$@"

if [[ "$VERSION" == "latest" ]]; then
    VERSION="master"
fi

# Quoted so that a checkout path containing spaces does not split into words.
. "${HERE}/../shared/setup.sh" "${HERE}" true

PIP install -r "${HERE}/requirements.txt"

# no __version__ available: https://github.com/fmohr/naiveautoml/issues/22
# Python snippet that scrapes the installed version from `pip list` output.
GET_VERSION_STABLE="import subprocess
import re
pip_list = subprocess.run('$pip_exec list'.split(), capture_output=True)
match = re.search(r'naiveautoml\s+([^\n]+)', pip_list.stdout.decode(), flags=re.IGNORECASE)
version, = match.groups()
print(version)"


if [[ "$VERSION" == "stable" ]]; then
    PIP install --no-cache-dir -U "${PKG}"
    # Replace the "stable" placeholder by the version that was actually installed,
    # so the concrete version is what gets recorded below.
    VERSION=$(PY -c "${GET_VERSION_STABLE}")
elif [[ "$VERSION" =~ ^[0-9] ]]; then
    PIP install --no-cache-dir -U "${PKG}==${VERSION}"
else
    if [[ "$VERSION" =~ ^# ]]; then
        # Versions starting with a `#` are to be interpreted as commit hashes
        # The actual git clone command expects the hash without the `#` prefix.
        VERSION="${VERSION:1}"
    fi
    echo "Attempting to install from git+${REPO}.git@${VERSION}#egg=naiveautoml&subdirectory=python"
    PIP install -U "git+${REPO}.git@${VERSION}#egg=naiveautoml&subdirectory=python"
fi

echo "$VERSION" >> "${HERE}/.setup/installed"
11 changes: 11 additions & 0 deletions resources/frameworks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,17 @@ mlr3automl:
version: 'stable'
project: https://github.com/a-hanf/mlr3automl

NaiveAutoML:
version: 'stable'
project: https://github.com/fmohr/naiveautoml
description: |
NaiveAutoML is an AutoML framework that aims to mimic a simple ML workflow,
where in-isolation optimization of the different components of a pre-defined
pipeline scheme is applied.
refs: [https://link.springer.com/article/10.1007/s10994-022-06200-0#article-info]
# params:
# _save_artifacts: ['history', 'model']

oboe:
version: 'latest'
description: |
Expand Down
4 changes: 4 additions & 0 deletions resources/frameworks_2023Q2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ mlr3automl:
version: '#f667900292e3ded64bb419285c71cd5d1d2c4301'
project: https://github.com/a-hanf/mlr3automl

NaiveAutoML:
repo: https://github.com/pgijsbers/naiveautoml
version: '#182f5148e9d360ad92254fe47c12fc35d9fabd62'

TPOT:
version: '0.12.0'
params:
Expand Down
3 changes: 3 additions & 0 deletions resources/frameworks_latest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ MLPlanWEKA:
mlr3automl:
version: 'latest'

NaiveAutoML:
version: 'latest'

oboe:
version: 'latest'

Expand Down
3 changes: 3 additions & 0 deletions resources/frameworks_stable.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ MLPlanWEKA:
mlr3automl:
version: 'stable'

NaiveAutoML:
version: 'stable'

oboe:
version: 'stable'

Expand Down

0 comments on commit ffc76e8

Please sign in to comment.