From dd23dcd575a388f2af01128164e2f54ad49da377 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Sun, 18 Jun 2023 14:54:32 +0200
Subject: [PATCH 1/2] Add scikit_safe inference time measurement files
These files have categorical values numerically encoded and missing
values imputed, which makes them usable with any scikit-learn algorithm.
---
amlb/datasets/openml.py | 34 +++++++++++++++++++-----
frameworks/RandomForest/__init__.py | 2 +-
frameworks/TunedRandomForest/__init__.py | 2 +-
3 files changed, 30 insertions(+), 8 deletions(-)
diff --git a/amlb/datasets/openml.py b/amlb/datasets/openml.py
index 3471fe7eb..3779f3d36 100644
--- a/amlb/datasets/openml.py
+++ b/amlb/datasets/openml.py
@@ -12,11 +12,13 @@
from typing import Generic, Tuple, TypeVar
import arff
+import pandas as pd
import pandas.api.types as pat
import openml as oml
import xmltodict
from ..data import AM, DF, Dataset, DatasetType, Datasplit, Feature
+from ..datautils import impute_array
from ..resources import config as rconfig, get as rget
from ..utils import as_list, lazy_property, path_from_split, profile, split_path, unsparsify
@@ -93,32 +95,52 @@ def test(self):
self._ensure_split_created()
return self._test
- def inference_subsample_files(self, fmt: str, with_labels: bool = False) -> list[Tuple[int, str]]:
+ def inference_subsample_files(self, fmt: str, with_labels: bool = False, scikit_safe: bool = False) -> list[Tuple[int, str]]:
"""Generates n subsamples of size k from the test dataset in `fmt` data format.
We measure the inference time of the models for various batch sizes
(number of rows). We generate config.inference_time_measurements.repeats
subsamples for each of the config.inference_time_measurements.batch_sizes.
+
These subsamples are stored to file in the `fmt` format (parquet, arff, or csv).
The function returns a list of tuples of (batch size, file path).
+
+ Iff `with_labels` is true, the target column will be included in the split file.
+ Iff `scikit_safe` is true, categorical values are encoded and missing values
+ are imputed.
"""
seed = rget().seed(self.fold)
return [
- (n, str(self._inference_subsample(fmt=fmt, n=n, seed=seed + i, with_labels=with_labels)))
+ (n, str(self._inference_subsample(fmt=fmt, n=n, seed=seed + i, with_labels=with_labels, scikit_safe=scikit_safe)))
for n in rconfig().inference_time_measurements.batch_sizes
for i, _ in enumerate(range(rconfig().inference_time_measurements.repeats))
]
@profile(logger=log)
- def _inference_subsample(self, fmt: str, n: int, seed: int = 0, with_labels: bool = False) -> pathlib.Path:
- """ Write subset of `n` samples from the test split to disk in `fmt` format """
+ def _inference_subsample(self, fmt: str, n: int, seed: int = 0, with_labels: bool = False, scikit_safe: bool = False) -> pathlib.Path:
+ """ Write subset of `n` samples from the test split to disk in `fmt` format
+
+ Iff `with_labels` is true, the target column will be included in the split file.
+ Iff `scikit_safe` is true, categorical values are encoded and missing values
+ are imputed.
+ """
# Just a hack for now, the splitters all work specifically with openml tasks.
# The important thing is that we split to disk and can load it later.
# We should consider taking a stratified sample if n is large enough,
# inference time might differ based on class
- test = self._test.data if with_labels else self._test.X
- subsample = test.sample(
+ if scikit_safe:
+ if with_labels:
+ _, data = impute_array(self.train.data_enc, self.test.data_enc)
+ else:
+ _, data = impute_array(self.train.X_enc, self.test.X_enc)
+
+ columns = self._test.data.columns if with_labels else self._test.X.columns
+ data = pd.DataFrame(data, columns=columns)
+ else:
+ data = self._test.data if with_labels else self._test.X
+
+ subsample = data.sample(
n=n,
replace=True,
random_state=seed,
diff --git a/frameworks/RandomForest/__init__.py b/frameworks/RandomForest/__init__.py
index 3de306f59..9fec28550 100644
--- a/frameworks/RandomForest/__init__.py
+++ b/frameworks/RandomForest/__init__.py
@@ -24,7 +24,7 @@ def run(dataset: Dataset, config: TaskConfig):
X=X_test,
y=y_test
),
- inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
+ inference_subsample_files=dataset.inference_subsample_files(fmt="parquet", scikit_safe=True),
)
return run_in_venv(__file__, "exec.py",
diff --git a/frameworks/TunedRandomForest/__init__.py b/frameworks/TunedRandomForest/__init__.py
index dc0cad908..a35ed65c3 100644
--- a/frameworks/TunedRandomForest/__init__.py
+++ b/frameworks/TunedRandomForest/__init__.py
@@ -22,7 +22,7 @@ def run(dataset: Dataset, config: TaskConfig):
X=X_test,
y=y_test
),
- inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
+ inference_subsample_files=dataset.inference_subsample_files(fmt="parquet", scikit_safe=True),
)
return run_in_venv(__file__, "exec.py",
From 55e3a302ca542b99646d249be040a8e2d99bf849 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Sun, 18 Jun 2023 15:11:17 +0200
Subject: [PATCH 2/2] Only generate inference measurement files if enabled
---
frameworks/AutoGluon/__init__.py | 3 ++-
frameworks/GAMA/__init__.py | 3 ++-
frameworks/H2OAutoML/__init__.py | 3 ++-
frameworks/RandomForest/__init__.py | 3 ++-
frameworks/TPOT/__init__.py | 3 ++-
frameworks/TunedRandomForest/__init__.py | 3 ++-
frameworks/autosklearn/__init__.py | 3 ++-
frameworks/flaml/__init__.py | 3 ++-
frameworks/lightautoml/__init__.py | 3 ++-
frameworks/mljarsupervised/__init__.py | 3 ++-
10 files changed, 20 insertions(+), 10 deletions(-)
diff --git a/frameworks/AutoGluon/__init__.py b/frameworks/AutoGluon/__init__.py
index 4c92d08f1..9d3d980a3 100644
--- a/frameworks/AutoGluon/__init__.py
+++ b/frameworks/AutoGluon/__init__.py
@@ -26,8 +26,9 @@ def run_autogluon_tabular(dataset: Dataset, config: TaskConfig):
classes=dataset.target.values
),
problem_type=dataset.type.name, # AutoGluon problem_type is using same names as amlb.data.DatasetType
- inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
)
+ if config.measure_inference_time:
+ data["inference_subsample_files"] = dataset.inference_subsample_files(fmt="parquet")
return run_in_venv(__file__, "exec.py",
input_data=data, dataset=dataset, config=config)
diff --git a/frameworks/GAMA/__init__.py b/frameworks/GAMA/__init__.py
index 5476600cb..750f5e74e 100644
--- a/frameworks/GAMA/__init__.py
+++ b/frameworks/GAMA/__init__.py
@@ -22,8 +22,9 @@ def run(dataset: Dataset, config: TaskConfig):
X=dataset.test.X,
y=dataset.test.y
),
- inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
)
+ if config.measure_inference_time:
+ data["inference_subsample_files"] = dataset.inference_subsample_files(fmt="parquet")
options = dict(
serialization=dict(sparse_dataframe_deserialized_format='dense')
)
diff --git a/frameworks/H2OAutoML/__init__.py b/frameworks/H2OAutoML/__init__.py
index ce51582ef..596513181 100644
--- a/frameworks/H2OAutoML/__init__.py
+++ b/frameworks/H2OAutoML/__init__.py
@@ -16,8 +16,9 @@ def run(dataset: Dataset, config: TaskConfig):
target=dict(index=dataset.target.index),
domains=dict(cardinalities=[0 if f.values is None else len(f.values) for f in dataset.features]),
format=dataset.train.format,
- inference_subsample_files=dataset.inference_subsample_files(fmt=dataset.train.format, with_labels=True),
)
+ if config.measure_inference_time:
+ data["inference_subsample_files"] = dataset.inference_subsample_files(fmt=dataset.train.format, with_labels=True)
config.ext.monitoring = rconfig().monitoring
return run_in_venv(__file__, "exec.py",
input_data=data, dataset=dataset, config=config)
diff --git a/frameworks/RandomForest/__init__.py b/frameworks/RandomForest/__init__.py
index 9fec28550..5d1f0aa49 100644
--- a/frameworks/RandomForest/__init__.py
+++ b/frameworks/RandomForest/__init__.py
@@ -24,8 +24,9 @@ def run(dataset: Dataset, config: TaskConfig):
X=X_test,
y=y_test
),
- inference_subsample_files=dataset.inference_subsample_files(fmt="parquet", scikit_safe=True),
)
+ if config.measure_inference_time:
+ data["inference_subsample_files"] = dataset.inference_subsample_files(fmt="parquet", scikit_safe=True)
return run_in_venv(__file__, "exec.py",
input_data=data, dataset=dataset, config=config)
diff --git a/frameworks/TPOT/__init__.py b/frameworks/TPOT/__init__.py
index 1aa3192ea..44cb2cc63 100644
--- a/frameworks/TPOT/__init__.py
+++ b/frameworks/TPOT/__init__.py
@@ -22,8 +22,9 @@ def run(dataset: Dataset, config: TaskConfig):
X=X_test,
y=y_test
),
- inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
)
+ if config.measure_inference_time:
+ data["inference_subsample_files"] = dataset.inference_subsample_files(fmt="parquet")
def process_results(results):
if isinstance(results.probabilities, str) and results.probabilities == "predictions":
diff --git a/frameworks/TunedRandomForest/__init__.py b/frameworks/TunedRandomForest/__init__.py
index a35ed65c3..b97439508 100644
--- a/frameworks/TunedRandomForest/__init__.py
+++ b/frameworks/TunedRandomForest/__init__.py
@@ -22,8 +22,9 @@ def run(dataset: Dataset, config: TaskConfig):
X=X_test,
y=y_test
),
- inference_subsample_files=dataset.inference_subsample_files(fmt="parquet", scikit_safe=True),
)
+ if config.measure_inference_time:
+ data["inference_subsample_files"] = dataset.inference_subsample_files(fmt="parquet", scikit_safe=True)
return run_in_venv(__file__, "exec.py",
input_data=data, dataset=dataset, config=config)
diff --git a/frameworks/autosklearn/__init__.py b/frameworks/autosklearn/__init__.py
index a00a7d833..637a81491 100644
--- a/frameworks/autosklearn/__init__.py
+++ b/frameworks/autosklearn/__init__.py
@@ -24,8 +24,9 @@ def run(dataset: Dataset, config: TaskConfig):
y_enc=unsparsify(dataset.test.y_enc),
),
predictors_type=['Numerical' if p.is_numerical() else 'Categorical' for p in dataset.predictors],
- inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
)
+ if config.measure_inference_time:
+ data["inference_subsample_files"] = dataset.inference_subsample_files(fmt="parquet")
return run_in_venv(__file__, "exec.py",
input_data=data, dataset=dataset, config=config)
diff --git a/frameworks/flaml/__init__.py b/frameworks/flaml/__init__.py
index bca1b6893..dcec90325 100644
--- a/frameworks/flaml/__init__.py
+++ b/frameworks/flaml/__init__.py
@@ -18,8 +18,9 @@ def run(dataset, config):
y=dataset.test.y
),
problem_type=dataset.type.name,
- inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
)
+ if config.measure_inference_time:
+ data["inference_subsample_files"] = dataset.inference_subsample_files(fmt="parquet")
options = dict(
serialization=dict(sparse_dataframe_deserialized_format='dense')
)
diff --git a/frameworks/lightautoml/__init__.py b/frameworks/lightautoml/__init__.py
index fedabacf3..4c9654850 100644
--- a/frameworks/lightautoml/__init__.py
+++ b/frameworks/lightautoml/__init__.py
@@ -22,8 +22,9 @@ def run(dataset: Dataset, config: TaskConfig):
name=dataset.target.name,
),
problem_type=dataset.type.name,
- inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
)
+ if config.measure_inference_time:
+ data["inference_subsample_files"] = dataset.inference_subsample_files(fmt="parquet")
options = dict(
serialization=dict(sparse_dataframe_deserialized_format='dense')
)
diff --git a/frameworks/mljarsupervised/__init__.py b/frameworks/mljarsupervised/__init__.py
index 3cd6003ce..b15d780f8 100644
--- a/frameworks/mljarsupervised/__init__.py
+++ b/frameworks/mljarsupervised/__init__.py
@@ -20,8 +20,9 @@ def run(dataset: Dataset, config: TaskConfig):
y=dataset.test.y
),
problem_type=dataset.type.name,
- inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
)
+ if config.measure_inference_time:
+ data["inference_subsample_files"] = dataset.inference_subsample_files(fmt="parquet")
options = dict(
serialization=dict(sparse_dataframe_deserialized_format='dense')
)