Measure inference time #532

Merged Jun 17, 2023 · 39 commits

Changes from 19 commits

Commits
71e7a8b
Add method to split off a subsample of the test set to file
PGijsbers Jun 11, 2023
87f80aa
Add first draft for improving inference time measurements
PGijsbers Jun 12, 2023
94ba496
Store all measured inference times to disk
PGijsbers Jun 13, 2023
0c7bf7d
Also accept a dataframe to allow to infer without disk load
PGijsbers Jun 13, 2023
df46e1b
Make repeats and batch sizes configurable
PGijsbers Jun 13, 2023
0621ee2
Forward inference measurement configuration through task config
PGijsbers Jun 13, 2023
1a72ae1
Randomize samples within the same batch size
PGijsbers Jun 13, 2023
7100391
Rename inference_X_rows column to infer_batch_size_X
PGijsbers Jun 13, 2023
f76e755
Add docstring and move value checking of `fmt`
PGijsbers Jun 13, 2023
bf8cbc7
Add inference time measurements for flaml
PGijsbers Jun 13, 2023
cb04989
Add inference time measurements
PGijsbers Jun 13, 2023
8ea9033
Add inference time measurements
PGijsbers Jun 13, 2023
338fc94
Add inference time measurements
PGijsbers Jun 13, 2023
9b024e2
Document shortcoming of measuring inference time for tpot
PGijsbers Jun 13, 2023
7b3455f
Add inference time measurement
PGijsbers Jun 13, 2023
5733223
Bump ubuntu base to 22.04
PGijsbers Jun 13, 2023
25a1bd4
Add inference measurement for dataframe
PGijsbers Jun 13, 2023
a6f7a52
Defaults to display median inference time
PGijsbers Jun 13, 2023
94f11f6
Add seed to filename
PGijsbers Jun 13, 2023
4119d56
Start with inference measurements (broken)
PGijsbers Jun 13, 2023
0000c63
Add inference measurement on dataframe
PGijsbers Jun 14, 2023
1d2054a
Add inference time measurement with dataframes
PGijsbers Jun 14, 2023
357a63f
Add dataframe inference time measurement
PGijsbers Jun 14, 2023
2dd1ffc
Add dataframe inference time measurement
PGijsbers Jun 14, 2023
1e5a61d
Add dataframe inference measurement (ignores encoding)
PGijsbers Jun 14, 2023
b48c5ed
Add dataframe inference measurement
PGijsbers Jun 14, 2023
42d38ac
Add inference time measurements
PGijsbers Jun 14, 2023
4a38f6c
Add inference time measurements
PGijsbers Jun 14, 2023
6c9ccb3
Allow newer autosklearn versions to use the pandas data instead (#534)
PGijsbers Jun 16, 2023
b24c12c
Add single row file inference for H2O
PGijsbers Jun 16, 2023
d5da2bd
Update inference measurement to record whether it's from file
PGijsbers Jun 16, 2023
cdb0828
Dynamically set type depending on presence of pandas
PGijsbers Jun 16, 2023
4c14dbe
Add time to job timeout if inference measurements enabled
PGijsbers Jun 16, 2023
acae403
Disable inference time measurements for CI
PGijsbers Jun 16, 2023
8caf629
Remove one dash
PGijsbers Jun 17, 2023
f0cbfc0
Disable inference time measurement by default
PGijsbers Jun 17, 2023
b2c9b38
Make measuring inference time optional, also measure single row df
PGijsbers Jun 17, 2023
7a3f433
Add inference time measurement to (T)RF baselines
PGijsbers Jun 17, 2023
54a2dd6
Remove skip inference measurement override since it's the default now
PGijsbers Jun 17, 2023
4 changes: 3 additions & 1 deletion amlb/benchmark.py
@@ -381,7 +381,7 @@ class TaskConfig:

def __init__(self, name, fold, metrics, seed,
max_runtime_seconds, cores, max_mem_size_mb, min_vol_size_mb,
input_dir, output_dir):
input_dir, output_dir, measure_inference_time: bool = False):
self.framework = None
self.framework_params = None
self.framework_version = None
@@ -397,6 +397,7 @@ def __init__(self, name, fold, metrics, seed,
self.input_dir = input_dir
self.output_dir = output_dir
self.output_predictions_file = os.path.join(output_dir, "predictions.csv")
self.measure_inference_time = measure_inference_time
self.ext = ns() # used if frameworks require extra config points

def __setattr__(self, name, value):
@@ -477,6 +478,7 @@ def __init__(self, benchmark: Benchmark, task_def, fold):
min_vol_size_mb=task_def.min_vol_size_mb,
input_dir=rconfig().input_dir,
output_dir=benchmark.output_dirs.session,
measure_inference_time=rconfig().inference_time_measurements.enabled,
)
# allowing to override some task parameters through command line, e.g.: -Xt.max_runtime_seconds=60
if rconfig()['t'] is not None:
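The new `measure_inference_time` flag is read from `rconfig().inference_time_measurements.enabled`; together with the `batch_sizes` and `repeats` fields used in `amlb/datasets/openml.py` below, this implies a config block roughly like the following sketch. Field names are taken from the diff; the example values are assumptions, and measurements are off by default (commit f0cbfc0).

```python
# Hypothetical illustration of the settings these changes read via rconfig();
# the real values live in the AMLB YAML config and may differ.
inference_time_measurements = {
    "enabled": False,               # rconfig().inference_time_measurements.enabled (off by default)
    "batch_sizes": [1, 100, 1000],  # rows per inference call (assumed example values)
    "repeats": 10,                  # subsamples generated per batch size (assumed example value)
}
```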
54 changes: 52 additions & 2 deletions amlb/datasets/openml.py
@@ -2,21 +2,22 @@
**openml** module implements the abstractions defined in **data** module
to expose `OpenML<https://www.openml.org>`_ datasets.
"""
import pathlib
from abc import abstractmethod
import copy
import functools
import logging
import os
import re
from typing import Generic, Tuple, TypeVar, Union
from typing import Generic, Tuple, TypeVar

import arff
import pandas.api.types as pat
import openml as oml
import xmltodict

from ..data import AM, DF, Dataset, DatasetType, Datasplit, Feature
from ..resources import config as rconfig
from ..resources import config as rconfig, get as rget
from ..utils import as_list, lazy_property, path_from_split, profile, split_path, unsparsify


@@ -92,6 +93,55 @@ def test(self):
self._ensure_split_created()
return self._test

def inference_subsample_files(self, fmt: str) -> list[Tuple[int, str]]:
"""Generates n subsamples of size k from the test dataset in `fmt` data format.

We measure the inference time of the models for various batch sizes
(number of rows). We generate config.inference_time_measurements.repeats
subsamples for each of the config.inference_time_measurements.batch_sizes.
These subsamples are stored to file in the `fmt` format (parquet, arff, or csv).
The function returns a list of tuples of (batch size, file path).
"""
seed = rget().seed(self.fold)
return [
(n, str(self._inference_subsample(fmt=fmt, n=n, seed=seed + i)))
for n in rconfig().inference_time_measurements.batch_sizes
for i, _ in enumerate(range(rconfig().inference_time_measurements.repeats))
]

@profile(logger=log)
def _inference_subsample(self, fmt: str, n: int, seed: int = 0) -> pathlib.Path:
""" Write subset of `n` samples from the test split to disk in `fmt` format """
# Just a hack for now, the splitters all work specifically with openml tasks.
# The important thing is that we split to disk and can load it later.

# We should consider taking a stratified sample if n is large enough,
# inference time might differ based on class
subsample = self._test.X.sample(
n=n,
replace=True,
random_state=seed,
)

_, test_path = self._get_split_paths()
test_path = pathlib.Path(test_path)
subsample_path = test_path.parent / f"{test_path.stem}_{n}_{seed}.{fmt}"
if fmt == "csv":
subsample.to_csv(subsample_path, header=True, index=False)
elif fmt == "arff":
ArffSplitter(self)._save_split(
subsample,
subsample_path,
name=f"{self._oml_dataset.name}_inference_{self.fold}_{n}"
)
elif fmt == "parquet":
subsample.to_parquet(subsample_path)
else:
msg = f"{fmt=}, but must be one of 'csv', 'arff', or 'parquet'."
raise ValueError(msg)

return subsample_path

@lazy_property
@profile(logger=log)
def features(self):
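Putting the two methods together: `inference_subsample_files` writes `repeats` subsample files per batch size and returns (batch size, path) pairs, with the seed offset keeping the repeats distinct. A sketch of the return value under assumed settings (`batch_sizes=[1, 100]`, `repeats=2`, fold seed 42); the paths are illustrative only and follow the `f"{test_path.stem}_{n}_{seed}.{fmt}"` pattern above.

```python
# Illustrative only: what inference_subsample_files(fmt="parquet") could return.
files = [
    (1,   ".../test_1_42.parquet"),    # batch size 1, repeat 0 (seed 42)
    (1,   ".../test_1_43.parquet"),    # batch size 1, repeat 1 (seed 43)
    (100, ".../test_100_42.parquet"),  # batch size 100, repeat 0
    (100, ".../test_100_43.parquet"),  # batch size 100, repeat 1
]
```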
7 changes: 6 additions & 1 deletion amlb/results.py
@@ -444,6 +444,11 @@ def compute_score(self, result=None, meta_result=None):
required_meta_res = ['training_duration', 'predict_duration', 'models_count']
for m in required_meta_res:
entry[m] = meta_result[m] if m in meta_result else nan

if inference_times := Namespace.get(meta_result, "inference_times"):
for data_type, measurements in Namespace.dict(inference_times).items():
for n_samples, measured_times in Namespace.dict(measurements).items():
entry[f"infer_batch_size_{data_type}_{n_samples}"] = np.median(measured_times)
result = self.get_result() if result is None else result

scoring_errors = []
@@ -473,7 +478,7 @@ def set_score(score):
entry.info = result.info
if scoring_errors:
entry.info = "; ".join(filter(lambda it: it, [entry.info, *scoring_errors]))
entry |= Namespace({k: v for k, v in meta_result if k not in required_meta_res})
entry |= Namespace({k: v for k, v in meta_result if k not in required_meta_res and k != "inference_times"})
log.info("Metric scores: %s", entry)
return entry

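For clarity, the new block in `compute_score` reduces each list of repeated measurements to a single median per (data type, batch size) column. A small self-contained sketch with made-up timings:

```python
import numpy as np

# Assumed shape of the inference_times meta-result produced by a framework
# integration: {data_type: {batch_size: [seconds per repeat, ...]}}.
inference_times = {
    "file": {1: [0.11, 0.09, 0.10], 100: [0.31, 0.30, 0.33]},
    "df":   {1: [0.02, 0.03, 0.02]},
}

entry = {}
for data_type, measurements in inference_times.items():
    for n_samples, measured_times in measurements.items():
        entry[f"infer_batch_size_{data_type}_{n_samples}"] = np.median(measured_times)

# entry == {"infer_batch_size_file_1": 0.10, "infer_batch_size_file_100": 0.31,
#           "infer_batch_size_df_1": 0.02}
```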
2 changes: 1 addition & 1 deletion amlb/runners/docker.py
@@ -116,7 +116,7 @@ def _upload_image(self, image):
log.info(f"Successfully published docker image {image}.")

def _generate_script(self, custom_commands):
docker_content = """FROM ubuntu:18.04
docker_content = """FROM ubuntu:22.04

ENV DEBIAN_FRONTEND noninteractive
RUN apt-get update
3 changes: 2 additions & 1 deletion frameworks/AutoGluon/__init__.py
@@ -25,7 +25,8 @@ def run_autogluon_tabular(dataset: Dataset, config: TaskConfig):
name=dataset.target.name,
classes=dataset.target.values
),
problem_type=dataset.type.name # AutoGluon problem_type is using same names as amlb.data.DatasetType
problem_type=dataset.type.name, # AutoGluon problem_type is using same names as amlb.data.DatasetType
inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
)

return run_in_venv(__file__, "exec.py",
34 changes: 25 additions & 9 deletions frameworks/AutoGluon/exec.py
@@ -4,6 +4,8 @@
import warnings
import sys
import tempfile
from typing import Union

warnings.simplefilter("ignore")

if sys.platform == 'darwin':
@@ -18,7 +20,8 @@
import autogluon.core.metrics as metrics
from autogluon.tabular.version import __version__

from frameworks.shared.callee import call_run, result, output_subdir
from frameworks.shared.callee import call_run, result, output_subdir, \
measure_inference_times
from frameworks.shared.utils import Timer, zip_path

log = logging.getLogger(__name__)
@@ -64,18 +67,30 @@ def run(dataset, config):
**training_params
)

test_data = TabularDataset(test_path)
# Persist model in memory that is going to be predicting to get correct inference latency
predictor.persist_models('best')

def inference_time_classification(data: Union[str, pd.DataFrame]):
return None, predictor.predict_proba(data, as_multiclass=True)

def inference_time_regression(data: Union[str, pd.DataFrame]):
return predictor.predict(data, as_pandas=False), None

infer = inference_time_classification if is_classification else inference_time_regression
inference_times = {}
if config.measure_inference_time:
inference_times["file"] = measure_inference_times(infer, dataset.inference_subsample_files)
test_data = pd.read_parquet(dataset.test.path)
inference_times["df"] = measure_inference_times(
infer,
[(1, test_data.sample(1, random_state=i)) for i in range(100)],
)

test_data = TabularDataset(test_path)
with Timer() as predict:
predictions, probabilities = infer(test_data)
Comment on lines +89 to +91
Collaborator:
I thought we were going to include the file loading as part of the predict time? Or is this not the case?

Current code would not include loading time as part of the time.

PGijsbers (Author):
It does for measure_inference_times, I'll make sure it does for predict too, where possible. thanks!

if is_classification:
with Timer() as predict:
probabilities = predictor.predict_proba(test_data, as_multiclass=True)
predictions = probabilities.idxmax(axis=1).to_numpy()
Collaborator:
Note: in v0.8, we now have a cleaner way to do this: https://auto.gluon.ai/dev/api/autogluon.tabular.TabularPredictor.get_pred_from_proba.html

We can probably keep as is for now. Switching will only matter if we end up running on metrics where decision threshold calibration matters (balanced accuracy, f1, etc.), as the current code in AMLB is forcibly using decision_threshold=0.5.

PGijsbers (Author):
We won't use those metrics for our next set of experiments, but PR to make things also work for 0.8 is welcome.

else:
with Timer() as predict:
predictions = predictor.predict(test_data, as_pandas=False)
probabilities = None

prob_labels = probabilities.columns.values.astype(str).tolist() if probabilities is not None else None

Expand Down Expand Up @@ -107,7 +122,8 @@ def run(dataset, config):
models_count=num_models_trained,
models_ensemble_count=num_models_ensemble,
training_duration=training.duration,
predict_duration=predict.duration)
predict_duration=predict.duration,
inference_times=inference_times,)


def save_artifacts(predictor, leaderboard, config):
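The `measure_inference_times` helper imported from `frameworks.shared.callee` is not part of this diff. Based on how it is called here — a predict function plus a list of (batch size, file path or DataFrame) pairs — and on the review thread above, the following is a minimal sketch of what it could look like; the actual implementation may differ. Because the timer wraps the whole call, any file loading done inside the predict function is counted in the measurement, as the author confirms.

```python
from collections import defaultdict
from typing import Any, Callable, Iterable, Tuple

from frameworks.shared.utils import Timer  # same Timer used in the exec.py scripts


def measure_inference_times(predict_fn: Callable[[Any], Any],
                            files: Iterable[Tuple[int, Any]]) -> dict:
    """Sketch: time predict_fn on each (batch_size, data) pair, grouped by batch size."""
    measurements = defaultdict(list)
    for batch_size, data in files:
        with Timer() as timer:
            predict_fn(data)  # data may be a path or a DataFrame; loading is included
        measurements[batch_size].append(timer.duration)
    return dict(measurements)
```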
1 change: 1 addition & 0 deletions frameworks/GAMA/__init__.py
@@ -22,6 +22,7 @@ def run(dataset: Dataset, config: TaskConfig):
X=dataset.test.X,
y=dataset.test.y
),
inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
)
options = dict(
serialization=dict(sparse_dataframe_deserialized_format='dense')
21 changes: 17 additions & 4 deletions frameworks/GAMA/exec.py
@@ -3,6 +3,8 @@
import sys
import tempfile as tmp

import pandas as pd

if sys.platform == 'darwin':
os.environ['OBJC_DISABLE_INITIALIZE_FORK_SAFETY'] = 'YES'
os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
@@ -18,7 +20,8 @@
from gama.data_loading import file_to_pandas
from gama import GamaClassifier, GamaRegressor, __version__

from frameworks.shared.callee import call_run, result, output_subdir
from frameworks.shared.callee import call_run, result, output_subdir, \
measure_inference_times
from frameworks.shared.utils import Timer, touch


@@ -83,12 +86,21 @@ def run(dataset, config):
# data = file_to_pandas(dataset.test.path, encoding='utf-8')
# X_test, y_test = data.loc[:, data.columns != dataset.target], data.loc[:, dataset.target]

def infer(path: str):
test_data = pd.read_parquet(path)
predict_fn = gama_automl.predict_proba if is_classification else gama_automl.predict
return predict_fn(test_data)

inference_times = None
if config.measure_inference_time:
inference_times = measure_inference_times(infer, dataset.inference_subsample_files)

with Timer() as predict_timer:
predictions = gama_automl.predict(X_test)

probabilities = None
if is_classification:
probabilities = gama_automl.predict_proba(X_test)
else:
probabilities = None

return result(
output_file=config.output_predictions_file,
Expand All @@ -98,7 +110,8 @@ def run(dataset, config):
target_is_encoded=False,
models_count=len(gama_automl._final_pop),
training_duration=training_timer.duration,
predict_duration=predict_timer.duration
predict_duration=predict_timer.duration,
inference_times=inference_times,
)


3 changes: 2 additions & 1 deletion frameworks/TPOT/__init__.py
@@ -21,7 +21,8 @@ def run(dataset: Dataset, config: TaskConfig):
test=dict(
X=X_test,
y=y_test
)
),
inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
)

def process_results(results):
24 changes: 22 additions & 2 deletions frameworks/TPOT/exec.py
@@ -4,6 +4,8 @@
import sys
import tempfile as tmp

import pandas as pd

if sys.platform == 'darwin':
os.environ['OBJC_DISABLE_INITIALIZE_FORK_SAFETY'] = 'YES'
os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
@@ -13,7 +15,8 @@

from tpot import TPOTClassifier, TPOTRegressor, __version__

from frameworks.shared.callee import call_run, output_subdir, result
from frameworks.shared.callee import call_run, output_subdir, result, \
measure_inference_times
from frameworks.shared.utils import Timer, is_sparse


@@ -67,6 +70,21 @@ def run(dataset, config):
y_test = dataset.test.y
with Timer() as predict:
predictions = tpot.predict(X_test)

def infer(path):
data = pd.read_parquet(path)
if is_classification:
try:
return tpot.predict_proba(data)
except RuntimeError:
return tpot.predict(data)
return tpot.predict(data)

inference_times = None
if config.measure_inference_time:
log.info("TPOT inference time measurements exclude preprocessing time of AMLB.")
inference_times = measure_inference_times(infer, dataset.inference_subsample_files)

try:
probabilities = tpot.predict_proba(X_test) if is_classification else None
except RuntimeError:
@@ -82,7 +100,9 @@
target_is_encoded=is_classification,
models_count=len(tpot.evaluated_individuals_),
training_duration=training.duration,
predict_duration=predict.duration)
predict_duration=predict.duration,
inference_times=inference_times,
)


def save_artifacts(estimator, config):
Expand Down
3 changes: 2 additions & 1 deletion frameworks/autosklearn/__init__.py
@@ -21,7 +21,8 @@ def run(dataset: Dataset, config: TaskConfig):
X=X_test,
y=y_test
),
predictors_type=['Numerical' if p.is_numerical() else 'Categorical' for p in dataset.predictors]
predictors_type=['Numerical' if p.is_numerical() else 'Categorical' for p in dataset.predictors],
inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
)

return run_in_venv(__file__, "exec.py",