From dd23dcd575a388f2af01128164e2f54ad49da377 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Sun, 18 Jun 2023 14:54:32 +0200 Subject: [PATCH 1/2] Add scikit_safe inference time measurement files These files have categorical values numerically encoded and missing values imputed, which makes them usable for any scikit-learn algo. --- amlb/datasets/openml.py | 34 +++++++++++++++++++----- frameworks/RandomForest/__init__.py | 2 +- frameworks/TunedRandomForest/__init__.py | 2 +- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/amlb/datasets/openml.py b/amlb/datasets/openml.py index 3471fe7eb..3779f3d36 100644 --- a/amlb/datasets/openml.py +++ b/amlb/datasets/openml.py @@ -12,11 +12,13 @@ from typing import Generic, Tuple, TypeVar import arff +import pandas as pd import pandas.api.types as pat import openml as oml import xmltodict from ..data import AM, DF, Dataset, DatasetType, Datasplit, Feature +from ..datautils import impute_array from ..resources import config as rconfig, get as rget from ..utils import as_list, lazy_property, path_from_split, profile, split_path, unsparsify @@ -93,32 +95,52 @@ def test(self): self._ensure_split_created() return self._test - def inference_subsample_files(self, fmt: str, with_labels: bool = False) -> list[Tuple[int, str]]: + def inference_subsample_files(self, fmt: str, with_labels: bool = False, scikit_safe: bool = False) -> list[Tuple[int, str]]: """Generates n subsamples of size k from the test dataset in `fmt` data format. We measure the inference time of the models for various batch sizes (number of rows). We generate config.inference_time_measurements.repeats subsamples for each of the config.inference_time_measurements.batch_sizes. + These subsamples are stored to file in the `fmt` format (parquet, arff, or csv). The function returns a list of tuples of (batch size, file path). + + Iff `with_labels` is true, the target column will be included in the split file. + Iff `scikit_safe` is true, categorical values are encoded and missing values + are imputed. """ seed = rget().seed(self.fold) return [ - (n, str(self._inference_subsample(fmt=fmt, n=n, seed=seed + i, with_labels=with_labels))) + (n, str(self._inference_subsample(fmt=fmt, n=n, seed=seed + i, with_labels=with_labels, scikit_safe=scikit_safe))) for n in rconfig().inference_time_measurements.batch_sizes for i, _ in enumerate(range(rconfig().inference_time_measurements.repeats)) ] @profile(logger=log) - def _inference_subsample(self, fmt: str, n: int, seed: int = 0, with_labels: bool = False) -> pathlib.Path: - """ Write subset of `n` samples from the test split to disk in `fmt` format """ + def _inference_subsample(self, fmt: str, n: int, seed: int = 0, with_labels: bool = False, scikit_safe: bool = False) -> pathlib.Path: + """ Write subset of `n` samples from the test split to disk in `fmt` format + + Iff `with_labels` is true, the target column will be included in the split file. + Iff `scikit_safe` is true, categorical values are encoded and missing values + are imputed. + """ # Just a hack for now, the splitters all work specifically with openml tasks. # The important thing is that we split to disk and can load it later. # We should consider taking a stratified sample if n is large enough, # inference time might differ based on class - test = self._test.data if with_labels else self._test.X - subsample = test.sample( + if scikit_safe: + if with_labels: + _, data = impute_array(self.train.data_enc, self.test.data_enc) + else: + _, data = impute_array(self.train.X_enc, self.test.X_enc) + + columns = self._test.data.columns if with_labels else self._test.X.columns + data = pd.DataFrame(data, columns=columns) + else: + data = self._test.data if with_labels else self._test.X + + subsample = data.sample( n=n, replace=True, random_state=seed, diff --git a/frameworks/RandomForest/__init__.py b/frameworks/RandomForest/__init__.py index 3de306f59..9fec28550 100644 --- a/frameworks/RandomForest/__init__.py +++ b/frameworks/RandomForest/__init__.py @@ -24,7 +24,7 @@ def run(dataset: Dataset, config: TaskConfig): X=X_test, y=y_test ), - inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"), + inference_subsample_files=dataset.inference_subsample_files(fmt="parquet", scikit_safe=True), ) return run_in_venv(__file__, "exec.py", diff --git a/frameworks/TunedRandomForest/__init__.py b/frameworks/TunedRandomForest/__init__.py index dc0cad908..a35ed65c3 100644 --- a/frameworks/TunedRandomForest/__init__.py +++ b/frameworks/TunedRandomForest/__init__.py @@ -22,7 +22,7 @@ def run(dataset: Dataset, config: TaskConfig): X=X_test, y=y_test ), - inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"), + inference_subsample_files=dataset.inference_subsample_files(fmt="parquet", scikit_safe=True), ) return run_in_venv(__file__, "exec.py", From 55e3a302ca542b99646d249be040a8e2d99bf849 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Sun, 18 Jun 2023 15:11:17 +0200 Subject: [PATCH 2/2] Only generate inference measurement files if enabled --- frameworks/AutoGluon/__init__.py | 3 ++- frameworks/GAMA/__init__.py | 3 ++- frameworks/H2OAutoML/__init__.py | 3 ++- frameworks/RandomForest/__init__.py | 3 ++- frameworks/TPOT/__init__.py | 3 ++- frameworks/TunedRandomForest/__init__.py | 3 ++- frameworks/autosklearn/__init__.py | 3 ++- frameworks/flaml/__init__.py | 3 ++- frameworks/lightautoml/__init__.py | 3 ++- frameworks/mljarsupervised/__init__.py | 3 ++- 10 files changed, 20 insertions(+), 10 deletions(-) diff --git a/frameworks/AutoGluon/__init__.py b/frameworks/AutoGluon/__init__.py index 4c92d08f1..9d3d980a3 100644 --- a/frameworks/AutoGluon/__init__.py +++ b/frameworks/AutoGluon/__init__.py @@ -26,8 +26,9 @@ def run_autogluon_tabular(dataset: Dataset, config: TaskConfig): classes=dataset.target.values ), problem_type=dataset.type.name, # AutoGluon problem_type is using same names as amlb.data.DatasetType - inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"), ) + if config.measure_inference_time: + data["inference_subsample_files"] = dataset.inference_subsample_files(fmt="parquet") return run_in_venv(__file__, "exec.py", input_data=data, dataset=dataset, config=config) diff --git a/frameworks/GAMA/__init__.py b/frameworks/GAMA/__init__.py index 5476600cb..750f5e74e 100644 --- a/frameworks/GAMA/__init__.py +++ b/frameworks/GAMA/__init__.py @@ -22,8 +22,9 @@ def run(dataset: Dataset, config: TaskConfig): X=dataset.test.X, y=dataset.test.y ), - inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"), ) + if config.measure_inference_time: + data["inference_subsample_files"] = dataset.inference_subsample_files(fmt="parquet") options = dict( serialization=dict(sparse_dataframe_deserialized_format='dense') ) diff --git a/frameworks/H2OAutoML/__init__.py b/frameworks/H2OAutoML/__init__.py index ce51582ef..596513181 100644 --- a/frameworks/H2OAutoML/__init__.py +++ b/frameworks/H2OAutoML/__init__.py @@ -16,8 +16,9 @@ def run(dataset: Dataset, config: TaskConfig): target=dict(index=dataset.target.index), domains=dict(cardinalities=[0 if f.values is None else len(f.values) for f in dataset.features]), format=dataset.train.format, - inference_subsample_files=dataset.inference_subsample_files(fmt=dataset.train.format, with_labels=True), ) + if config.measure_inference_time: + data["inference_subsample_files"] = dataset.inference_subsample_files(fmt=dataset.train.format, with_labels=True) config.ext.monitoring = rconfig().monitoring return run_in_venv(__file__, "exec.py", input_data=data, dataset=dataset, config=config) diff --git a/frameworks/RandomForest/__init__.py b/frameworks/RandomForest/__init__.py index 9fec28550..5d1f0aa49 100644 --- a/frameworks/RandomForest/__init__.py +++ b/frameworks/RandomForest/__init__.py @@ -24,8 +24,9 @@ def run(dataset: Dataset, config: TaskConfig): X=X_test, y=y_test ), - inference_subsample_files=dataset.inference_subsample_files(fmt="parquet", scikit_safe=True), ) + if config.measure_inference_time: + data["inference_subsample_files"] = dataset.inference_subsample_files(fmt="parquet", scikit_safe=True) return run_in_venv(__file__, "exec.py", input_data=data, dataset=dataset, config=config) diff --git a/frameworks/TPOT/__init__.py b/frameworks/TPOT/__init__.py index 1aa3192ea..44cb2cc63 100644 --- a/frameworks/TPOT/__init__.py +++ b/frameworks/TPOT/__init__.py @@ -22,8 +22,9 @@ def run(dataset: Dataset, config: TaskConfig): X=X_test, y=y_test ), - inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"), ) + if config.measure_inference_time: + data["inference_subsample_files"] = dataset.inference_subsample_files(fmt="parquet") def process_results(results): if isinstance(results.probabilities, str) and results.probabilities == "predictions": diff --git a/frameworks/TunedRandomForest/__init__.py b/frameworks/TunedRandomForest/__init__.py index a35ed65c3..b97439508 100644 --- a/frameworks/TunedRandomForest/__init__.py +++ b/frameworks/TunedRandomForest/__init__.py @@ -22,8 +22,9 @@ def run(dataset: Dataset, config: TaskConfig): X=X_test, y=y_test ), - inference_subsample_files=dataset.inference_subsample_files(fmt="parquet", scikit_safe=True), ) + if config.measure_inference_time: + data["inference_subsample_files"] = dataset.inference_subsample_files(fmt="parquet", scikit_safe=True) return run_in_venv(__file__, "exec.py", input_data=data, dataset=dataset, config=config) diff --git a/frameworks/autosklearn/__init__.py b/frameworks/autosklearn/__init__.py index a00a7d833..637a81491 100644 --- a/frameworks/autosklearn/__init__.py +++ b/frameworks/autosklearn/__init__.py @@ -24,8 +24,9 @@ def run(dataset: Dataset, config: TaskConfig): y_enc=unsparsify(dataset.test.y_enc), ), predictors_type=['Numerical' if p.is_numerical() else 'Categorical' for p in dataset.predictors], - inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"), ) + if config.measure_inference_time: + data["inference_subsample_files"] = dataset.inference_subsample_files(fmt="parquet") return run_in_venv(__file__, "exec.py", input_data=data, dataset=dataset, config=config) diff --git a/frameworks/flaml/__init__.py b/frameworks/flaml/__init__.py index bca1b6893..dcec90325 100644 --- a/frameworks/flaml/__init__.py +++ b/frameworks/flaml/__init__.py @@ -18,8 +18,9 @@ def run(dataset, config): y=dataset.test.y ), problem_type=dataset.type.name, - inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"), ) + if config.measure_inference_time: + data["inference_subsample_files"] = dataset.inference_subsample_files(fmt="parquet") options = dict( serialization=dict(sparse_dataframe_deserialized_format='dense') ) diff --git a/frameworks/lightautoml/__init__.py b/frameworks/lightautoml/__init__.py index fedabacf3..4c9654850 100644 --- a/frameworks/lightautoml/__init__.py +++ b/frameworks/lightautoml/__init__.py @@ -22,8 +22,9 @@ def run(dataset: Dataset, config: TaskConfig): name=dataset.target.name, ), problem_type=dataset.type.name, - inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"), ) + if config.measure_inference_time: + data["inference_subsample_files"] = dataset.inference_subsample_files(fmt="parquet") options = dict( serialization=dict(sparse_dataframe_deserialized_format='dense') ) diff --git a/frameworks/mljarsupervised/__init__.py b/frameworks/mljarsupervised/__init__.py index 3cd6003ce..b15d780f8 100644 --- a/frameworks/mljarsupervised/__init__.py +++ b/frameworks/mljarsupervised/__init__.py @@ -20,8 +20,9 @@ def run(dataset: Dataset, config: TaskConfig): y=dataset.test.y ), problem_type=dataset.type.name, - inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"), ) + if config.measure_inference_time: + data["inference_subsample_files"] = dataset.inference_subsample_files(fmt="parquet") options = dict( serialization=dict(sparse_dataframe_deserialized_format='dense') )