From 71e7a8bde8f6bc06241548bd5d7045d152abd9ef Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Sun, 11 Jun 2023 11:59:43 +0200
Subject: [PATCH 01/39] Add method to split off a subsample of the test set to file

---
 amlb/datasets/openml.py | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/amlb/datasets/openml.py b/amlb/datasets/openml.py
index d037252d8..452ce9aa5 100644
--- a/amlb/datasets/openml.py
+++ b/amlb/datasets/openml.py
@@ -2,13 +2,14 @@
 **openml** module implements the abstractions defined in **data** module
 to expose `OpenML`_ datasets.
 """
+import pathlib
 from abc import abstractmethod
 import copy
 import functools
 import logging
 import os
 import re
-from typing import Generic, Tuple, TypeVar, Union
+from typing import Generic, Tuple, TypeVar

 import arff
 import pandas.api.types as pat
@@ -16,7 +17,7 @@
 import xmltodict

 from ..data import AM, DF, Dataset, DatasetType, Datasplit, Feature
-from ..resources import config as rconfig
+from ..resources import config as rconfig, get as rget
 from ..utils import as_list, lazy_property, path_from_split, profile, split_path, unsparsify
@@ -92,6 +93,39 @@ def test(self):
         self._ensure_split_created()
         return self._test

+    @profile(logger=log)
+    def _inference_subsample(self, fmt: str, n: int) -> pathlib.Path:
+        """ Write subset of `n` samples from the test split to disk in `fmt` format """
+        # Just a hack for now; the splitters all work specifically with openml tasks.
+        # The important thing is that we split to disk and can load it later.
+        if fmt not in ["csv", "arff", "parquet"]:
+            msg = f"{fmt=}, but must be one of 'csv', 'arff', or 'parquet'."
+            raise ValueError(msg)
+
+        # We should consider taking a stratified sample if n is large enough,
+        # since inference time might differ based on class
+        subsample = self._test.X.sample(
+            n=n,
+            replace=True,
+            random_state=rget().seed(self.fold)
+        )
+
+        _, test_path = self._get_split_paths()
+        test_path = pathlib.Path(test_path)
+        subsample_path = test_path.parent / f"{test_path.stem}_{n}.{fmt}"
+        if fmt == "csv":
+            subsample.to_csv(subsample_path, header=True, index=False)
+        elif fmt == "arff":
+            ArffSplitter(self)._save_split(
+                subsample,
+                subsample_path,
+                name=f"{self._oml_dataset.name}_inference_{self.fold}_{n}"
+            )
+        elif fmt == "parquet":
+            subsample.to_parquet(subsample_path)
+
+        return subsample_path
+
     @lazy_property
     @profile(logger=log)
     def features(self):

From 87f80aabf26d656e329ba1177cfe768ad3211b77 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Mon, 12 Jun 2023 14:11:41 +0300
Subject: [PATCH 02/39] Add first draft for improving inference time measurements

---
 amlb/datasets/openml.py              |  6 ++++++
 amlb/results.py                      |  6 +++++-
 frameworks/AutoGluon/__init__.py     |  3 ++-
 frameworks/AutoGluon/exec.py         | 25 +++++++++++++++++--------
 frameworks/TPOT/__init__.py          |  3 ++-
 frameworks/TPOT/exec.py              | 21 +++++++++++++++++++--
 frameworks/constantpredictor/exec.py | 11 ++++++++++-
 frameworks/shared/callee.py          | 14 +++++++++++++-
 8 files changed, 74 insertions(+), 15 deletions(-)

diff --git a/amlb/datasets/openml.py b/amlb/datasets/openml.py
index 452ce9aa5..445cf46c0 100644
--- a/amlb/datasets/openml.py
+++ b/amlb/datasets/openml.py
@@ -93,6 +93,12 @@ def test(self):
         self._ensure_split_created()
         return self._test

+    def inference_subsample_files(self, fmt: str) -> list[Tuple[int, str]]:
+        return [
+            (n, str(self._inference_subsample(fmt=fmt, n=n)))
+            for n in [1, 1000, 10_000]
+        ]
+
     @profile(logger=log)
     def _inference_subsample(self, fmt: str, n: int) -> pathlib.Path:
         """ Write subset of `n` samples from the test split to disk in `fmt` format """
diff --git a/amlb/results.py b/amlb/results.py
index 052460fbd..48bd6e447 100644
--- a/amlb/results.py
+++ b/amlb/results.py
@@ -444,6 +444,10 @@ def compute_score(self, result=None, meta_result=None):
         required_meta_res = ['training_duration', 'predict_duration', 'models_count']
         for m in required_meta_res:
             entry[m] = meta_result[m] if m in meta_result else nan
+
+        if inference_times := Namespace.get(meta_result, "inference_times"):
+            for n_samples, measured_times in Namespace.dict(inference_times).items():
+                entry[f"inference_{n_samples}_rows"] = np.mean(measured_times)

         result = self.get_result() if result is None else result
         scoring_errors = []
@@ -473,7 +477,7 @@ def set_score(score):
         entry.info = result.info
         if scoring_errors:
             entry.info = "; ".join(filter(lambda it: it, [entry.info, *scoring_errors]))
-        entry |= Namespace({k: v for k, v in meta_result if k not in required_meta_res})
+        entry |= Namespace({k: v for k, v in meta_result if k not in required_meta_res and k != "inference_times"})
         log.info("Metric scores: %s", entry)
         return entry
diff --git a/frameworks/AutoGluon/__init__.py b/frameworks/AutoGluon/__init__.py
index c8694148c..4c92d08f1 100644
--- a/frameworks/AutoGluon/__init__.py
+++ b/frameworks/AutoGluon/__init__.py
@@ -25,7 +25,8 @@ def run_autogluon_tabular(dataset: Dataset, config: TaskConfig):
             name=dataset.target.name,
             classes=dataset.target.values
         ),
-        problem_type=dataset.type.name  # AutoGluon problem_type is using same names as amlb.data.DatasetType
+        problem_type=dataset.type.name,  # AutoGluon problem_type is using same names as amlb.data.DatasetType
+        inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
     )

     return run_in_venv(__file__, "exec.py",
diff --git a/frameworks/AutoGluon/exec.py b/frameworks/AutoGluon/exec.py
index 6fe76769b..2a50ecf37 100644
--- a/frameworks/AutoGluon/exec.py
+++ b/frameworks/AutoGluon/exec.py
@@ -18,7 +18,8 @@
 import autogluon.core.metrics as metrics
 from autogluon.tabular.version import __version__

-from frameworks.shared.callee import call_run, result, output_subdir
+from frameworks.shared.callee import call_run, result, output_subdir, \
+    measure_inference_times
 from frameworks.shared.utils import Timer, zip_path

 log = logging.getLogger(__name__)
@@ -68,14 +69,21 @@ def run(dataset, config):
     # Persist model in memory that is going to be predicting to get correct inference latency
     predictor.persist_models('best')

+    def inference_time_classification(path: str):
+        data = TabularDataset(path)
+        return None, predictor.predict_proba(data, as_multiclass=True)
+
+    def inference_time_regression(path: str):
+        data = TabularDataset(path)
+        return predictor.predict(data, as_pandas=False), None
+
+    infer = inference_time_classification if is_classification else inference_time_regression
+    with Timer() as predict:
+        predictions, probabilities = infer(test_data)
     if is_classification:
-        with Timer() as predict:
-            probabilities = predictor.predict_proba(test_data, as_multiclass=True)
         predictions = probabilities.idxmax(axis=1).to_numpy()
-    else:
-        with Timer() as predict:
-            predictions = predictor.predict(test_data, as_pandas=False)
-        probabilities = None
+
+    inference_times = measure_inference_times(infer, dataset.inference_subsample_files)

     prob_labels = probabilities.columns.values.astype(str).tolist() if probabilities is not None else None
@@ -107,7 +115,8 @@ def run(dataset, config):
                   models_count=num_models_trained,
                   models_ensemble_count=num_models_ensemble,
                   training_duration=training.duration,
-                  predict_duration=predict.duration)
+                  predict_duration=predict.duration,
+                  inference_times=inference_times,)
diff --git a/frameworks/TPOT/__init__.py b/frameworks/TPOT/__init__.py
index 9828c6473..1aa3192ea 100644
--- a/frameworks/TPOT/__init__.py
+++ b/frameworks/TPOT/__init__.py
@@ -21,7 +21,8 @@ def run(dataset: Dataset, config: TaskConfig):
         test=dict(
             X=X_test,
             y=y_test
-        )
+        ),
+        inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
     )

     def process_results(results):
diff --git a/frameworks/TPOT/exec.py b/frameworks/TPOT/exec.py
index ce70cb7f9..a7e0858ed 100644
--- a/frameworks/TPOT/exec.py
+++ b/frameworks/TPOT/exec.py
@@ -4,6 +4,8 @@
 import sys
 import tempfile as tmp

+import pandas as pd
+
 if sys.platform == 'darwin':
     os.environ['OBJC_DISABLE_INITIALIZE_FORK_SAFETY'] = 'YES'
 os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
@@ -13,7 +15,8 @@

 from tpot import TPOTClassifier, TPOTRegressor, __version__

-from frameworks.shared.callee import call_run, output_subdir, result
+from frameworks.shared.callee import call_run, output_subdir, result, \
+    measure_inference_times
 from frameworks.shared.utils import Timer, is_sparse
@@ -67,6 +70,18 @@ def run(dataset, config):
     y_test = dataset.test.y
     with Timer() as predict:
         predictions = tpot.predict(X_test)
+
+    def infer(path):
+        data = pd.read_parquet(path)
+        if is_classification:
+            try:
+                return tpot.predict_proba(data)
+            except RuntimeError:
+                return tpot.predict(data)
+        return tpot.predict(data)
+
+    inference_times = measure_inference_times(infer, dataset.inference_subsample_files)
+
     try:
         probabilities = tpot.predict_proba(X_test) if is_classification else None
     except RuntimeError:
@@ -82,7 +97,9 @@ def run(dataset, config):
         target_is_encoded=is_classification,
         models_count=len(tpot.evaluated_individuals_),
         training_duration=training.duration,
-        predict_duration=predict.duration)
+        predict_duration=predict.duration,
+        inference_times=inference_times,
+    )
diff --git a/frameworks/constantpredictor/exec.py b/frameworks/constantpredictor/exec.py
index 6a7aae69c..4a2c7cf9d 100644
--- a/frameworks/constantpredictor/exec.py
+++ b/frameworks/constantpredictor/exec.py
@@ -1,11 +1,13 @@
 import logging

+import pandas as pd
 from sklearn.dummy import DummyClassifier, DummyRegressor

 from amlb.benchmark import TaskConfig
 from amlb.data import Dataset
 from amlb.results import save_predictions
 from amlb.utils import Timer, unsparsify
+from frameworks.shared.callee import measure_inference_times

 log = logging.getLogger(__name__)
@@ -29,6 +31,12 @@ def run(dataset: Dataset, config: TaskConfig):
         predictions = predictor.predict(X_test)
         probabilities = predictor.predict_proba(X_test) if is_classification else None

+    def infer(path):
+        data = pd.read_parquet(path)
+        return predictor.predict(data)
+
+    inference_times = measure_inference_times(infer, dataset.inference_subsample_files(fmt="parquet"))
+
     save_predictions(dataset=dataset,
                      output_file=config.output_predictions_file,
                      probabilities=probabilities,
@@ -39,5 +47,6 @@ def run(dataset: Dataset, config: TaskConfig):
     return dict(
         models_count=1,
         training_duration=training.duration,
-        predict_duration=predict.duration
+        predict_duration=predict.duration,
+        inference_times=inference_times,
     )
diff --git a/frameworks/shared/callee.py b/frameworks/shared/callee.py
index c596e01c5..3c13d144a 100644
--- a/frameworks/shared/callee.py
+++ b/frameworks/shared/callee.py
@@ -1,11 +1,14 @@
 import logging
 import os
+import pathlib
 import re
 import signal
 import sys
+from collections import defaultdict
+from typing import Callable, Any, Tuple

 from .utils import InterruptTimeout, Namespace as ns, json_dump, json_loads, kill_proc_tree, touch
-from .utils import deserialize_data, serialize_data
+from .utils import deserialize_data, serialize_data, Timer

 log = logging.getLogger(__name__)
@@ -86,3 +89,12 @@ def load_data(name, path, **_):
         kill_proc_tree(include_parent=False, timeout=5)

     json_dump(res, config.result_file, style='compact')
+
+
+def measure_inference_times(predict_fn: Callable[[str], Any], files: list[Tuple[int, str]]) -> dict[int, list[float]]:
+    inference_times = defaultdict(list)
+    for subsample_size, subsample_path in files:
+        for _ in range(10):
+            with Timer() as predict:
+                predict_fn(subsample_path)
+            inference_times[subsample_size].append(predict.duration)
+    return inference_times
\ No newline at end of file

From 94ba496420128cfc8dc595244735fb2498ecfab9 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 13 Jun 2023 10:43:45 +0300
Subject: [PATCH 03/39] Store all measured inference times to disk

---
 frameworks/shared/callee.py | 5 +++++
 frameworks/shared/caller.py | 9 +++++++++
 2 files changed, 14 insertions(+)

diff --git a/frameworks/shared/callee.py b/frameworks/shared/callee.py
index 3c13d144a..ca807024c 100644
--- a/frameworks/shared/callee.py
+++ b/frameworks/shared/callee.py
@@ -88,6 +88,11 @@ def load_data(name, path, **_):
     # ensure there's no subprocess left
     kill_proc_tree(include_parent=False, timeout=5)

+    inference_measurements = res.get("others", {}).get("inference_times")
+    if inference_measurements:
+        inference_file = pathlib.Path(config.result_file).parent / "inference_times.json"
+        json_dump(inference_measurements, inference_file, style="compact")
+        res["others"]["inference_times"] = str(inference_file)
     json_dump(res, config.result_file, style='compact')
diff --git a/frameworks/shared/caller.py b/frameworks/shared/caller.py
index c833b994b..8422b98d4 100644
--- a/frameworks/shared/caller.py
+++ b/frameworks/shared/caller.py
@@ -1,6 +1,7 @@
 import gc
 import logging
 import os
+import pathlib
 import re
 from tempfile import TemporaryDirectory, mktemp
 from typing import List, Optional, Union
@@ -11,6 +12,7 @@
 from amlb.data import Dataset
 from amlb.resources import config as rconfig
 from amlb.results import NoResultError, save_predictions
+from amlb.utils import json_dump, Namespace

 from .utils import Namespace as ns, Timer, dir_of, run_cmd, json_dumps, json_load, profile
 from .utils import is_serializable_data, deserialize_data, serialize_data
@@ -152,6 +154,13 @@ def run_in_venv(caller_file, script_file: str, *args,
             for name in ['predictions', 'truth', 'probabilities', 'optional_columns']:
                 res[name] = deserialize_data(res[name], config=ser_config) if res[name] is not None else None

+            inference_filepath = Namespace.dict(res.others).get("inference_times")
+            if inference_filepath:
+                inference_times = json_load(inference_filepath)
+                inference_filepath = pathlib.Path(res.output_file).parent / "inference.json"
+                json_dump(inference_times, inference_filepath)
+                res["others"]["inference_times"] = inference_times
+
             if callable(process_results):
                 res = process_results(res)

From 0c7bf7d91938006028e708291f8502eaa6c5ee9d Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 13 Jun 2023 10:46:42 +0300
Subject: [PATCH 04/39] Also accept a dataframe to allow inference without disk load

---
 frameworks/AutoGluon/exec.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/frameworks/AutoGluon/exec.py b/frameworks/AutoGluon/exec.py
index 2a50ecf37..53f8a42f0 100644
--- a/frameworks/AutoGluon/exec.py
+++ b/frameworks/AutoGluon/exec.py
@@ -4,6 +4,8 @@
 import warnings
 import sys
 import tempfile
+from typing import Union
+
 warnings.simplefilter("ignore")

 if sys.platform == 'darwin':
@@ -65,26 +67,24 @@ def run(dataset, config):
         **training_params
     )

-    test_data = TabularDataset(test_path)
     # Persist model in memory that is going to be predicting to get correct inference latency
     predictor.persist_models('best')

-    def inference_time_classification(path: str):
-        data = TabularDataset(path)
+    def inference_time_classification(data: Union[str, pd.DataFrame]):
         return None, predictor.predict_proba(data, as_multiclass=True)

-    def inference_time_regression(path: str):
-        data = TabularDataset(path)
+    def inference_time_regression(data: Union[str, pd.DataFrame]):
         return predictor.predict(data, as_pandas=False), None

     infer = inference_time_classification if is_classification else inference_time_regression
+    inference_times = measure_inference_times(infer, dataset.inference_subsample_files)
+
+    test_data = TabularDataset(test_path)
     with Timer() as predict:
         predictions, probabilities = infer(test_data)
     if is_classification:
         predictions = probabilities.idxmax(axis=1).to_numpy()

-    inference_times = measure_inference_times(infer, dataset.inference_subsample_files)
-
     prob_labels = probabilities.columns.values.astype(str).tolist() if probabilities is not None else None

From df46e1b63cee2d8ca3f9ef88081e06940391f2e5 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 13 Jun 2023 12:20:43 +0300
Subject: [PATCH 05/39] Make repeats and batch sizes configurable

---
 amlb/datasets/openml.py     | 3 ++-
 frameworks/shared/callee.py | 7 +++----
 resources/config.yaml       | 5 +++++
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/amlb/datasets/openml.py b/amlb/datasets/openml.py
index 445cf46c0..bccbd6318 100644
--- a/amlb/datasets/openml.py
+++ b/amlb/datasets/openml.py
@@ -96,7 +96,8 @@ def test(self):
     def inference_subsample_files(self, fmt: str) -> list[Tuple[int, str]]:
         return [
             (n, str(self._inference_subsample(fmt=fmt, n=n)))
-            for n in [1, 1000, 10_000]
+            for n in rconfig().inference_time_measurements.batch_sizes
+            for _ in range(rconfig().inference_time_measurements.repeats)
         ]
diff --git a/frameworks/shared/callee.py b/frameworks/shared/callee.py
index ca807024c..a77be3e18 100644
--- a/frameworks/shared/callee.py
+++ b/frameworks/shared/callee.py
@@ -98,8 +98,7 @@ def load_data(name, path, **_):
 def measure_inference_times(predict_fn: Callable[[str], Any], files: list[Tuple[int, str]]) -> dict[int, list[float]]:
     inference_times = defaultdict(list)
     for subsample_size, subsample_path in files:
-        for _ in range(10):
-            with Timer() as predict:
-                predict_fn(subsample_path)
-            inference_times[subsample_size].append(predict.duration)
+        with Timer() as predict:
+            predict_fn(subsample_path)
+        inference_times[subsample_size].append(predict.duration)
     return inference_times
\ No newline at end of file
diff --git a/resources/config.yaml b/resources/config.yaml
index 91ff68642..73bc81d64 100644
--- a/resources/config.yaml
+++ b/resources/config.yaml
@@ -84,6 +84,11 @@ results:  # configuration namespace for the results.csv file.
   global_lock_timeout: 5  # the timeout used to wait for the lock on the global results file.
   incremental_save: true  # if true save results after each job, otherwise save results only when all jobs are completed.

+inference_time_measurements:  # configuration namespace for performing additional inference time measurements on various batch sizes
+  enabled: true
+  batch_sizes: [1, 10, 100, 1000, 10000]  # the batch sizes for which inference speed should be measured
+  repeats: 100  # the number of times to repeat the inference measurement for each batch size
+
 openml:  # configuration namespace for openML.
   apikey: c1994bdb7ecb3c6f3c8f3b35f4b47f1f
   infer_dtypes: False

From 0621ee24ef6668910b196abef2ba75adb6e3f2b5 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 13 Jun 2023 14:52:07 +0300
Subject: [PATCH 06/39] Forward inference measurement configuration through task config

---
 amlb/benchmark.py            | 4 +++-
 frameworks/AutoGluon/exec.py | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/amlb/benchmark.py b/amlb/benchmark.py
index d2c87ce6a..dee7c5c14 100644
--- a/amlb/benchmark.py
+++ b/amlb/benchmark.py
@@ -381,7 +381,7 @@ class TaskConfig:

     def __init__(self, name, fold, metrics, seed,
                  max_runtime_seconds, cores, max_mem_size_mb, min_vol_size_mb,
-                 input_dir, output_dir):
+                 input_dir, output_dir, measure_inference_time: bool = False):
         self.framework = None
         self.framework_params = None
         self.framework_version = None
@@ -397,6 +397,7 @@ def __init__(self, name, fold, metrics, seed,
         self.input_dir = input_dir
         self.output_dir = output_dir
         self.output_predictions_file = os.path.join(output_dir, "predictions.csv")
+        self.measure_inference_time = measure_inference_time
         self.ext = ns()  # used if frameworks require extra config points

     def __setattr__(self, name, value):
@@ -477,6 +478,7 @@ def __init__(self, benchmark: Benchmark, task_def, fold):
             min_vol_size_mb=task_def.min_vol_size_mb,
             input_dir=rconfig().input_dir,
             output_dir=benchmark.output_dirs.session,
+            measure_inference_time=rconfig().inference_time_measurements.enabled,
         )
         # allowing to override some task parameters through command line, e.g.: -Xt.max_runtime_seconds=60
         if rconfig()['t'] is not None:
diff --git a/frameworks/AutoGluon/exec.py b/frameworks/AutoGluon/exec.py
index 53f8a42f0..79c4d37d6 100644
--- a/frameworks/AutoGluon/exec.py
+++ b/frameworks/AutoGluon/exec.py
@@ -77,7 +77,9 @@ def inference_time_regression(data: Union[str, pd.DataFrame]):
         return predictor.predict(data, as_pandas=False), None

     infer = inference_time_classification if is_classification else inference_time_regression
-    inference_times = measure_inference_times(infer, dataset.inference_subsample_files)
+    inference_times = None
+    if config.measure_inference_time:
+        inference_times = measure_inference_times(infer, dataset.inference_subsample_files)

     test_data = TabularDataset(test_path)
     with Timer() as predict:

From 1a72ae16531a8d56f59ab71df693dfdf2901971e Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 13 Jun 2023 15:08:54 +0300
Subject: [PATCH 07/39] Randomize samples within the same batch size

---
 amlb/datasets/openml.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/amlb/datasets/openml.py b/amlb/datasets/openml.py
index bccbd6318..e926b8e01 100644
--- a/amlb/datasets/openml.py
+++ b/amlb/datasets/openml.py
@@ -94,14 +94,15 @@ def test(self):
         return self._test

     def inference_subsample_files(self, fmt: str) -> list[Tuple[int, str]]:
+        seed = rget().seed(self.fold)
         return [
-            (n, str(self._inference_subsample(fmt=fmt, n=n)))
+            (n, str(self._inference_subsample(fmt=fmt, n=n, seed=seed + i)))
             for n in rconfig().inference_time_measurements.batch_sizes
-            for _ in range(rconfig().inference_time_measurements.repeats)
+            for i in range(rconfig().inference_time_measurements.repeats)
         ]

     @profile(logger=log)
-    def _inference_subsample(self, fmt: str, n: int) -> pathlib.Path:
+    def _inference_subsample(self, fmt: str, n: int, seed: int = 0) -> pathlib.Path:
         """ Write subset of `n` samples from the test split to disk in `fmt` format """
         # Just a hack for now; the splitters all work specifically with openml tasks.
         # The important thing is that we split to disk and can load it later.
@@ -114,7 +115,7 @@ def _inference_subsample(self, fmt: str, n: int) -> pathlib.Path:
         subsample = self._test.X.sample(
             n=n,
             replace=True,
-            random_state=rget().seed(self.fold)
+            random_state=seed,
         )

         _, test_path = self._get_split_paths()

From 7100391b7df7e4815775f8bca099d12bdc6d3102 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 13 Jun 2023 15:17:04 +0300
Subject: [PATCH 08/39] Rename inference_X_rows column to infer_batch_size_X

---
 amlb/results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/amlb/results.py b/amlb/results.py
index 48bd6e447..3d6cffd8c 100644
--- a/amlb/results.py
+++ b/amlb/results.py
@@ -447,7 +447,7 @@ def compute_score(self, result=None, meta_result=None):

         if inference_times := Namespace.get(meta_result, "inference_times"):
             for n_samples, measured_times in Namespace.dict(inference_times).items():
-                entry[f"inference_{n_samples}_rows"] = np.mean(measured_times)
+                entry[f"infer_batch_size_{n_samples}"] = np.mean(measured_times)

         result = self.get_result() if result is None else result
         scoring_errors = []

From f76e75550a9a67fc9c57f8167346ae930b379a96 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 13 Jun 2023 15:23:55 +0300
Subject: [PATCH 09/39] Add docstring and move value checking of `fmt`

Moving the value check makes it less error-prone when the accepted values
change.
---
 amlb/datasets/openml.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/amlb/datasets/openml.py b/amlb/datasets/openml.py
index e926b8e01..1f8adde43 100644
--- a/amlb/datasets/openml.py
+++ b/amlb/datasets/openml.py
@@ -94,6 +94,14 @@ def test(self):
         return self._test

     def inference_subsample_files(self, fmt: str) -> list[Tuple[int, str]]:
+        """Generates subsamples of the test dataset in `fmt` data format.
+
+        We measure the inference time of the models for various batch sizes
+        (number of rows). We generate config.inference_time_measurements.repeats
+        subsamples for each of the config.inference_time_measurements.batch_sizes.
+        These subsamples are stored to file in the `fmt` format (parquet, arff, or csv).
+        The function returns a list of tuples of (batch size, file path).
+        """
         seed = rget().seed(self.fold)
         return [
             (n, str(self._inference_subsample(fmt=fmt, n=n, seed=seed + i)))
             for n in rconfig().inference_time_measurements.batch_sizes
@@ -106,9 +114,6 @@ def _inference_subsample(self, fmt: str, n: int, seed: int = 0) -> pathlib.Path:
         """ Write subset of `n` samples from the test split to disk in `fmt` format """
         # Just a hack for now; the splitters all work specifically with openml tasks.
         # The important thing is that we split to disk and can load it later.
-        if fmt not in ["csv", "arff", "parquet"]:
-            msg = f"{fmt=}, but must be one of 'csv', 'arff', or 'parquet'."
-            raise ValueError(msg)

         # We should consider taking a stratified sample if n is large enough,
         # since inference time might differ based on class
         subsample = self._test.X.sample(
             n=n,
             replace=True,
             random_state=seed,
         )

         _, test_path = self._get_split_paths()
         test_path = pathlib.Path(test_path)
         subsample_path = test_path.parent / f"{test_path.stem}_{n}.{fmt}"
         if fmt == "csv":
             subsample.to_csv(subsample_path, header=True, index=False)
         elif fmt == "arff":
             ArffSplitter(self)._save_split(
                 subsample,
                 subsample_path,
                 name=f"{self._oml_dataset.name}_inference_{self.fold}_{n}"
             )
         elif fmt == "parquet":
             subsample.to_parquet(subsample_path)
+        else:
+            msg = f"{fmt=}, but must be one of 'csv', 'arff', or 'parquet'."
+            raise ValueError(msg)

         return subsample_path

From bf8cbc750961a526e6968406c487e848e9b99bc0 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 13 Jun 2023 18:02:11 +0300
Subject: [PATCH 10/39] Add inference time measurements for flaml

---
 frameworks/flaml/__init__.py |  3 ++-
 frameworks/flaml/exec.py     | 16 ++++++++++++++--
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/frameworks/flaml/__init__.py b/frameworks/flaml/__init__.py
index c911edf3b..bca1b6893 100644
--- a/frameworks/flaml/__init__.py
+++ b/frameworks/flaml/__init__.py
@@ -17,7 +17,8 @@ def run(dataset, config):
             X=dataset.test.X,
             y=dataset.test.y
         ),
-        problem_type=dataset.type.name
+        problem_type=dataset.type.name,
+        inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
     )
     options = dict(
         serialization=dict(sparse_dataframe_deserialized_format='dense')
diff --git a/frameworks/flaml/exec.py b/frameworks/flaml/exec.py
index a8a5131af..d0acebde1 100644
--- a/frameworks/flaml/exec.py
+++ b/frameworks/flaml/exec.py
@@ -1,9 +1,11 @@
 import logging
 import os

+import pandas as pd
 from flaml import AutoML, __version__

-from frameworks.shared.callee import call_run, result, output_subdir
+from frameworks.shared.callee import call_run, result, output_subdir, \
+    measure_inference_times
 from frameworks.shared.utils import Timer

 log = logging.getLogger(__name__)
@@ -49,7 +51,16 @@ def run(dataset, config):
                 n_jobs=n_jobs,
                 log_file_name= flaml_log_file_name,
                 time_budget=time_budget, **training_params)
-
+
+    def infer(path: str):
+        data = pd.read_parquet(path)
+        predict_fn = aml.predict_proba if is_classification else aml.predict
+        return predict_fn(data)
+
+    inference_times = None
+    if config.measure_inference_time:
+        inference_times = measure_inference_times(infer, dataset.inference_subsample_files)
+
     with Timer() as predict:
         predictions = aml.predict(X_test)
     probabilities = aml.predict_proba(X_test) if is_classification else None
@@ -65,6 +76,7 @@ def run(dataset, config):
         training_duration=training.duration,
         predict_duration=predict.duration,
         probabilities_labels=labels,
+        inference_times=inference_times,
     )

From cb04989cbc638ecbb75dc00fde4b35f943e9f87f Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 13 Jun 2023 21:53:58 +0300
Subject: [PATCH 11/39] Add inference time measurements

---
 frameworks/GAMA/__init__.py |  1 +
 frameworks/GAMA/exec.py     | 21 +++++++++++++++++----
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/frameworks/GAMA/__init__.py b/frameworks/GAMA/__init__.py
index f660e2f8f..5476600cb 100644
--- a/frameworks/GAMA/__init__.py
+++ b/frameworks/GAMA/__init__.py
@@ -22,6 +22,7 @@ def run(dataset: Dataset, config: TaskConfig):
             X=dataset.test.X,
             y=dataset.test.y
         ),
+        inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
     )
     options = dict(
         serialization=dict(sparse_dataframe_deserialized_format='dense')
diff --git a/frameworks/GAMA/exec.py b/frameworks/GAMA/exec.py
index e0880bf34..54498c8ac 100644
--- a/frameworks/GAMA/exec.py
+++ b/frameworks/GAMA/exec.py
@@ -3,6 +3,8 @@
 import sys
 import tempfile as tmp

+import pandas as pd
+
 if sys.platform == 'darwin':
     os.environ['OBJC_DISABLE_INITIALIZE_FORK_SAFETY'] = 'YES'
 os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
@@ -18,7 +20,8 @@
 from gama.data_loading import file_to_pandas
 from gama import GamaClassifier, GamaRegressor, __version__

-from frameworks.shared.callee import call_run, result, output_subdir
+from frameworks.shared.callee import call_run, result, output_subdir, \
+    measure_inference_times
 from frameworks.shared.utils import Timer, touch
@@ -83,12 +86,21 @@ def run(dataset, config):
     # data = file_to_pandas(dataset.test.path, encoding='utf-8')
     # X_test, y_test = data.loc[:, data.columns != dataset.target], data.loc[:, dataset.target]

+    def infer(path: str):
+        test_data = pd.read_parquet(path)
+        predict_fn = gama_automl.predict_proba if is_classification else gama_automl.predict
+        return predict_fn(test_data)
+
+    inference_times = None
+    if config.measure_inference_time:
+        inference_times = measure_inference_times(infer, dataset.inference_subsample_files)
+
     with Timer() as predict_timer:
         predictions = gama_automl.predict(X_test)
+
+    probabilities = None
     if is_classification:
         probabilities = gama_automl.predict_proba(X_test)
-    else:
-        probabilities = None

     return result(
         output_file=config.output_predictions_file,
@@ -98,7 +110,8 @@ def run(dataset, config):
         target_is_encoded=False,
         models_count=len(gama_automl._final_pop),
         training_duration=training_timer.duration,
-        predict_duration=predict_timer.duration
+        predict_duration=predict_timer.duration,
+        inference_times=inference_times,
     )

From 8ea90338c214b19d5d5c4d39b201c87c9034824b Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 13 Jun 2023 22:07:19 +0300
Subject: [PATCH 12/39] Add inference time measurements

---
 frameworks/lightautoml/__init__.py |  3 ++-
 frameworks/lightautoml/exec.py     | 15 ++++++++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/frameworks/lightautoml/__init__.py b/frameworks/lightautoml/__init__.py
index 97c09fa0e..fedabacf3 100644
--- a/frameworks/lightautoml/__init__.py
+++ b/frameworks/lightautoml/__init__.py
@@ -21,7 +21,8 @@ def run(dataset: Dataset, config: TaskConfig):
         target=dict(
             name=dataset.target.name,
         ),
-        problem_type=dataset.type.name
+        problem_type=dataset.type.name,
+        inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
     )
     options = dict(
         serialization=dict(sparse_dataframe_deserialized_format='dense')
diff --git a/frameworks/lightautoml/exec.py b/frameworks/lightautoml/exec.py
index aee255902..56cbe87c5 100644
--- a/frameworks/lightautoml/exec.py
+++ b/frameworks/lightautoml/exec.py
@@ -5,13 +5,16 @@

 import matplotlib
 import numpy as np
+import pandas as pd
+
 matplotlib.use("agg")  # no need for tk

 from lightautoml.tasks import Task
 from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
 from lightautoml import __version__

-from frameworks.shared.callee import call_run, result, output_subdir
+from frameworks.shared.callee import call_run, result, output_subdir, \
+    measure_inference_times
 from frameworks.shared.utils import Timer

 log = logging.getLogger(__name__)
@@ -37,6 +40,15 @@ def run(dataset, config):
     with Timer() as training:
         automl.fit_predict(train_data=df_train, roles={'target': label})

+    def infer(path: str):
+        batch = pd.read_parquet(path)
+        return automl.predict(batch)
+
+    inference_times = None
+    if config.measure_inference_time:
+        inference_times = measure_inference_times(infer,
+                                                  dataset.inference_subsample_files)
+
     X_test, y_test = dataset.test.X, dataset.test.y
     log.info("Predicting on the test set...")
     with Timer() as predict:
@@ -75,6 +87,7 @@ def run(dataset, config):
         predictions=predictions,
         training_duration=training.duration,
         predict_duration=predict.duration,
+        inference_times=inference_times,
     )

From 338fc94e2851d32eee337400a222ec5974f78895 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 13 Jun 2023 22:09:39 +0300
Subject: [PATCH 13/39] Add inference time measurements

---
 frameworks/mljarsupervised/__init__.py |  3 ++-
 frameworks/mljarsupervised/exec.py     | 18 ++++++++++++++++--
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/frameworks/mljarsupervised/__init__.py b/frameworks/mljarsupervised/__init__.py
index 9bee9f4a5..3cd6003ce 100644
--- a/frameworks/mljarsupervised/__init__.py
+++ b/frameworks/mljarsupervised/__init__.py
@@ -19,7 +19,8 @@ def run(dataset: Dataset, config: TaskConfig):
             X=dataset.test.X,
             y=dataset.test.y
         ),
-        problem_type=dataset.type.name
+        problem_type=dataset.type.name,
+        inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
     )
     options = dict(
         serialization=dict(sparse_dataframe_deserialized_format='dense')
diff --git a/frameworks/mljarsupervised/exec.py b/frameworks/mljarsupervised/exec.py
index 653d9cfd6..3287e5be3 100644
--- a/frameworks/mljarsupervised/exec.py
+++ b/frameworks/mljarsupervised/exec.py
@@ -4,12 +4,15 @@

 import numpy as np
 import matplotlib
+import pandas as pd
+
 matplotlib.use("agg")  # no need for tk

 import supervised
 from supervised.automl import AutoML

-from frameworks.shared.callee import call_run, result, output_subdir
+from frameworks.shared.callee import call_run, result, output_subdir, \
+    measure_inference_times
 from frameworks.shared.utils import Timer

 log = logging.getLogger(os.path.basename(__file__))
@@ -56,6 +59,16 @@ def run(dataset, config):

     with Timer() as training:
         automl.fit(X_train, y_train)

+    def infer(path: str):
+        batch = pd.read_parquet(path)
+        return automl.predict_all(batch)
+
+    inference_times = None
+    if config.measure_inference_time:
+        inference_times = measure_inference_times(infer,
+                                                  dataset.inference_subsample_files)
+
     with Timer() as predict:
         preds = automl.predict_all(X_test)

@@ -88,7 +101,8 @@ def run(dataset, config):
         probabilities_labels=probabilities_labels,
         models_count=len(automl._models),
         training_duration=training.duration,
-        predict_duration=predict.duration
+        predict_duration=predict.duration,
+        inference_times=inference_times,
     )

From 9b024e2a46c63ca849b237c50d47ac181ac33eef Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 13 Jun 2023 22:11:44 +0300
Subject: [PATCH 14/39] Document shortcoming of measuring inference time for TPOT

---
 frameworks/TPOT/exec.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/frameworks/TPOT/exec.py b/frameworks/TPOT/exec.py
index a7e0858ed..ff445eae7 100644
--- a/frameworks/TPOT/exec.py
+++ b/frameworks/TPOT/exec.py
@@ -80,7 +80,10 @@ def infer(path):
                 return tpot.predict(data)
         return tpot.predict(data)

-    inference_times = measure_inference_times(infer, dataset.inference_subsample_files)
+    inference_times = None
+    if config.measure_inference_time:
+        log.info("TPOT inference time measurements exclude preprocessing time of AMLB.")
+        inference_times = measure_inference_times(infer, dataset.inference_subsample_files)

     try:
         probabilities = tpot.predict_proba(X_test) if is_classification else None

From 7b3455f8ffb0b04c8b3877378a60c3df9d0b6b2d Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 13 Jun 2023 22:29:38 +0300
Subject: [PATCH 15/39] Add inference time measurement

---
 frameworks/autosklearn/__init__.py |  3 ++-
 frameworks/autosklearn/exec.py     | 21 +++++++++++++++++----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/frameworks/autosklearn/__init__.py b/frameworks/autosklearn/__init__.py
index 3e31b6d64..92e059ef7 100644
--- a/frameworks/autosklearn/__init__.py
+++ b/frameworks/autosklearn/__init__.py
@@ -21,7 +21,8 @@ def run(dataset: Dataset, config: TaskConfig):
             X=X_test,
             y=y_test
         ),
-        predictors_type=['Numerical' if p.is_numerical() else 'Categorical' for p in dataset.predictors]
+        predictors_type=['Numerical' if p.is_numerical() else 'Categorical' for p in dataset.predictors],
+        inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"),
     )

     return run_in_venv(__file__, "exec.py",
diff --git a/frameworks/autosklearn/exec.py b/frameworks/autosklearn/exec.py
index 11690e09c..39f301fc5 100644
--- a/frameworks/autosklearn/exec.py
+++ b/frameworks/autosklearn/exec.py
@@ -5,6 +5,8 @@
 import tempfile as tmp
 import warnings

+import pandas as pd
+
 os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
 os.environ['OMP_NUM_THREADS'] = '1'
 os.environ['OPENBLAS_NUM_THREADS'] = '1'
@@ -15,7 +17,8 @@
 import autosklearn.metrics as metrics
 from packaging import version

-from frameworks.shared.callee import call_run, result, output_subdir
+from frameworks.shared.callee import call_run, result, output_subdir, \
+    measure_inference_times
 from frameworks.shared.utils import Timer, system_memory_mb, walk_apply, zip_path

 log = logging.getLogger(__name__)
@@ -130,10 +133,18 @@ def run(dataset, config):
     with Timer() as training:
         auto_sklearn.fit(X_train, y_train, feat_type=predictors_type, **fit_extra_params)

+    def infer(path: str):
+        test_data = pd.read_parquet(path)
+        predict_fn = auto_sklearn.predict_proba if is_classification else auto_sklearn.predict
+        return predict_fn(test_data)
+
+    inference_times = None
+    if config.measure_inference_time:
+        inference_times = measure_inference_times(infer, dataset.inference_subsample_files)
+
     # Convert output to strings for classification
     log.info("Predicting on the test set.")
     X_test = dataset.test.X
-    y_test = dataset.test.y
     with Timer() as predict:
         predictions = auto_sklearn.predict(X_test)
         probabilities = auto_sklearn.predict_proba(X_test) if is_classification else None
@@ -142,12 +153,14 @@ def run(dataset, config):

     return result(output_file=config.output_predictions_file,
                   predictions=predictions,
-                  truth=y_test,
+                  truth=dataset.test.y,
                   probabilities=probabilities,
                   target_is_encoded=is_classification,
                   models_count=len(auto_sklearn.get_models_with_weights()),
                   training_duration=training.duration,
-                  predict_duration=predict.duration)
+                  predict_duration=predict.duration,
+                  inference_times=inference_times,
+                  )

From 573322338ccba61d4605961387a4b65969175632 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 13 Jun 2023 22:30:02 +0300
Subject: [PATCH 16/39] Bump ubuntu base to 22.04

---
 amlb/runners/docker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/amlb/runners/docker.py b/amlb/runners/docker.py
index 27cfedcf3..4dc601d63 100644
--- a/amlb/runners/docker.py
+++ b/amlb/runners/docker.py
@@ -116,7 +116,7 @@ def _upload_image(self, image):
         log.info(f"Successfully published docker image {image}.")

     def _generate_script(self, custom_commands):
-        docker_content = """FROM ubuntu:18.04
+        docker_content = """FROM ubuntu:22.04
 ENV DEBIAN_FRONTEND noninteractive
 RUN apt-get update

From 25a1bd443ac04096549fcecfb5463fb7534aee54 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 13 Jun 2023 23:02:43 +0300
Subject: [PATCH 17/39] Add inference measurement for dataframe

---
 amlb/results.py              | 5 +++--
 frameworks/AutoGluon/exec.py | 9 +++++++--
 frameworks/shared/callee.py  | 7 +++++--
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/amlb/results.py b/amlb/results.py
index 3d6cffd8c..ab3568430 100644
--- a/amlb/results.py
+++ b/amlb/results.py
@@ -446,8 +446,9 @@ def compute_score(self, result=None, meta_result=None):
             entry[m] = meta_result[m] if m in meta_result else nan

         if inference_times := Namespace.get(meta_result, "inference_times"):
-            for n_samples, measured_times in Namespace.dict(inference_times).items():
-                entry[f"infer_batch_size_{n_samples}"] = np.mean(measured_times)
+            for data_type, measurements in Namespace.dict(inference_times).items():
+                for n_samples, measured_times in Namespace.dict(measurements).items():
+                    entry[f"infer_batch_size_{data_type}_{n_samples}"] = np.mean(measured_times)

         result = self.get_result() if result is None else result
         scoring_errors = []
diff --git a/frameworks/AutoGluon/exec.py b/frameworks/AutoGluon/exec.py
index 79c4d37d6..1884c7919 100644
--- a/frameworks/AutoGluon/exec.py
+++ b/frameworks/AutoGluon/exec.py
@@ -77,9 +77,14 @@ def inference_time_regression(data: Union[str, pd.DataFrame]):
         return predictor.predict(data, as_pandas=False), None

     infer = inference_time_classification if is_classification else inference_time_regression
-    inference_times = None
+    inference_times = {}
     if config.measure_inference_time:
-        inference_times = measure_inference_times(infer, dataset.inference_subsample_files)
+        inference_times["file"] = measure_inference_times(infer, dataset.inference_subsample_files)
+        test_data = pd.read_parquet(dataset.test.path)
+        inference_times["df"] = measure_inference_times(
+            infer,
+            [(1, test_data.sample(1, random_state=i)) for i in range(100)],
+        )

     test_data = TabularDataset(test_path)
     with Timer() as predict:
diff --git a/frameworks/shared/callee.py b/frameworks/shared/callee.py
index a77be3e18..2f94e5262 100644
--- a/frameworks/shared/callee.py
+++ b/frameworks/shared/callee.py
@@ -5,7 +5,9 @@
 import signal
 import sys
 from collections import defaultdict
-from typing import Callable, Any, Tuple
+from typing import Callable, Any, Tuple, Union, TypeVar
+
+import pandas as pd

 from .utils import InterruptTimeout, Namespace as ns, json_dump, json_loads, kill_proc_tree, touch
 from .utils import deserialize_data, serialize_data, Timer
@@ -95,7 +97,8 @@ def load_data(name, path, **_):
         res["others"]["inference_times"] = str(inference_file)
     json_dump(res, config.result_file, style='compact')

-def measure_inference_times(predict_fn: Callable[[str], Any], files: list[Tuple[int, str]]) -> dict[int, list[float]]:
+DATA_INPUT = TypeVar("DATA_INPUT", bound=Union[str, pd.DataFrame])
+def measure_inference_times(predict_fn: Callable[[DATA_INPUT], Any], files: list[Tuple[int, DATA_INPUT]]) -> dict[int, list[float]]:
     inference_times = defaultdict(list)
     for subsample_size, subsample_path in files:
         with Timer() as predict:

From a6f7a5213e273fd7d7a8d3c1ffa7bc1b0a1d8a4a Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 13 Jun 2023 23:07:45 +0300
Subject: [PATCH 18/39] Default to displaying the median inference time

This mitigates the effect of outliers, such as cold-start runs.
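A quick illustration of why the median is the safer aggregate here, using made-up timings rather than measured data: with a single cold-start outlier among the repeats, the mean is dominated by it while the median stays at the steady-state latency.

    import numpy as np

    # Hypothetical timings (seconds) for one batch size; the first
    # call is a cold start, the rest are steady-state.
    timings = [2.31, 0.05, 0.04, 0.05, 0.04, 0.05]
    print(np.mean(timings))    # ~0.42, dominated by the cold start
    print(np.median(timings))  # 0.05, the steady-state latency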
---
 amlb/results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/amlb/results.py b/amlb/results.py
index ab3568430..6e1a5bc60 100644
--- a/amlb/results.py
+++ b/amlb/results.py
@@ -448,7 +448,7 @@ def compute_score(self, result=None, meta_result=None):
         if inference_times := Namespace.get(meta_result, "inference_times"):
             for data_type, measurements in Namespace.dict(inference_times).items():
                 for n_samples, measured_times in Namespace.dict(measurements).items():
-                    entry[f"infer_batch_size_{data_type}_{n_samples}"] = np.mean(measured_times)
+                    entry[f"infer_batch_size_{data_type}_{n_samples}"] = np.median(measured_times)

         result = self.get_result() if result is None else result
         scoring_errors = []

From 94f11f6b12fadb67da936a30e09dce488d573552 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 13 Jun 2023 23:17:19 +0300
Subject: [PATCH 19/39] Add seed to filename

Otherwise only one file is actually kept and used for experiments, thus
not actually mitigating the variance of sampling.
---
 amlb/datasets/openml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/amlb/datasets/openml.py b/amlb/datasets/openml.py
index 1f8adde43..1a5d0f805 100644
--- a/amlb/datasets/openml.py
+++ b/amlb/datasets/openml.py
@@ -125,7 +125,7 @@ def _inference_subsample(self, fmt: str, n: int, seed: int = 0) -> pathlib.Path:

         _, test_path = self._get_split_paths()
         test_path = pathlib.Path(test_path)
-        subsample_path = test_path.parent / f"{test_path.stem}_{n}.{fmt}"
+        subsample_path = test_path.parent / f"{test_path.stem}_{n}_{seed}.{fmt}"
         if fmt == "csv":
             subsample.to_csv(subsample_path, header=True, index=False)
         elif fmt == "arff":

From 4119d5664e2b48c6b55bae135dc7d5ebc965228d Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 13 Jun 2023 23:41:48 +0300
Subject: [PATCH 20/39] Start with inference measurements (broken)

---
 amlb/datasets/openml.py          |  9 +++++----
 frameworks/H2OAutoML/__init__.py | 13 ++-----------
 frameworks/H2OAutoML/exec.py     | 18 ++++++++++++++++--
 3 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/amlb/datasets/openml.py b/amlb/datasets/openml.py
index 1a5d0f805..3471fe7eb 100644
--- a/amlb/datasets/openml.py
+++ b/amlb/datasets/openml.py
@@ -93,7 +93,7 @@ def test(self):
         self._ensure_split_created()
         return self._test

-    def inference_subsample_files(self, fmt: str) -> list[Tuple[int, str]]:
+    def inference_subsample_files(self, fmt: str, with_labels: bool = False) -> list[Tuple[int, str]]:
         """Generates subsamples of the test dataset in `fmt` data format.
         seed = rget().seed(self.fold)
         return [
-            (n, str(self._inference_subsample(fmt=fmt, n=n, seed=seed + i)))
+            (n, str(self._inference_subsample(fmt=fmt, n=n, seed=seed + i, with_labels=with_labels)))
             for n in rconfig().inference_time_measurements.batch_sizes
             for i in range(rconfig().inference_time_measurements.repeats)
         ]

     @profile(logger=log)
-    def _inference_subsample(self, fmt: str, n: int, seed: int = 0) -> pathlib.Path:
+    def _inference_subsample(self, fmt: str, n: int, seed: int = 0, with_labels: bool = False) -> pathlib.Path:
         """ Write subset of `n` samples from the test split to disk in `fmt` format """
         # Just a hack for now; the splitters all work specifically with openml tasks.
         # The important thing is that we split to disk and can load it later.

         # We should consider taking a stratified sample if n is large enough,
         # since inference time might differ based on class
-        subsample = self._test.X.sample(
+        test = self._test.data if with_labels else self._test.X
+        subsample = test.sample(
             n=n,
             replace=True,
             random_state=seed,
diff --git a/frameworks/H2OAutoML/__init__.py b/frameworks/H2OAutoML/__init__.py
index 2b45dc6d3..ce51582ef 100644
--- a/frameworks/H2OAutoML/__init__.py
+++ b/frameworks/H2OAutoML/__init__.py
@@ -8,25 +8,16 @@ def setup(*args, **kwargs):
     call_script_in_same_dir(__file__, "setup.sh", *args, **kwargs)

-# def version():
-#     from frameworks.shared.caller import run_cmd_in_venv
-#     out, err = run_cmd_in_venv(__file__, """{py} -c "from h2o import __version__; print(__version__)" | grep "^\d\." """)
-#     if err:
-#         raise ValueError(err)
-#     return out
-
-
 def run(dataset: Dataset, config: TaskConfig):
     from frameworks.shared.caller import run_in_venv
-
     data = dict(
         train=dict(path=dataset.train.path),
         test=dict(path=dataset.test.path),
         target=dict(index=dataset.target.index),
         domains=dict(cardinalities=[0 if f.values is None else len(f.values) for f in dataset.features]),
-        format=dataset.train.format
+        format=dataset.train.format,
+        inference_subsample_files=dataset.inference_subsample_files(fmt=dataset.train.format, with_labels=True),
     )
-
     config.ext.monitoring = rconfig().monitoring
     return run_in_venv(__file__, "exec.py",
                        input_data=data, dataset=dataset, config=config)
diff --git a/frameworks/H2OAutoML/exec.py b/frameworks/H2OAutoML/exec.py
index 026d1d062..5a9e9d7fa 100644
--- a/frameworks/H2OAutoML/exec.py
+++ b/frameworks/H2OAutoML/exec.py
@@ -1,6 +1,8 @@
 import contextlib
 import logging
 import os
+import pathlib
+
 import psutil
 import re
@@ -10,7 +12,8 @@
 import h2o
 from h2o.automl import H2OAutoML

-from frameworks.shared.callee import FrameworkError, call_run, output_subdir, result
+from frameworks.shared.callee import FrameworkError, call_run, output_subdir, result, \
+    measure_inference_times
 from frameworks.shared.utils import Monitoring, Namespace as ns, Timer, clean_dir, touch, zip_path

 log = logging.getLogger(__name__)
@@ -115,6 +118,16 @@ def run(dataset, config):
         if not aml.leader:
             raise FrameworkError("H2O could not produce any model in the requested time.")

+        def infer(path: str):
+            filename = pathlib.Path(path).name
+            batch = h2o.import_file(path, destination_frame=frame_name(filename, config), **import_kwargs)
+            return aml.predict(batch)
+
+        inference_times = None
+        if config.measure_inference_time:
+            inference_times = measure_inference_times(infer,
+                                                      dataset.inference_subsample_files)
+
         with Timer() as predict:
             preds = aml.predict(test)
@@ -129,7 +142,8 @@ def run(dataset, config):
             probabilities_labels=preds.probabilities_labels,
             models_count=len(aml.leaderboard),
             training_duration=training.duration,
-            predict_duration=predict.duration
+            predict_duration=predict.duration,
+            inference_times=inference_times,
         )

     finally:

From 0000c63cce54301eae940213c1ca957fdb858c20 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 14 Jun 2023 11:10:18 +0300
Subject: [PATCH 21/39] Add inference measurement on dataframe

---
 frameworks/GAMA/exec.py | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/frameworks/GAMA/exec.py b/frameworks/GAMA/exec.py
index 54498c8ac..12fa75111 100644
--- a/frameworks/GAMA/exec.py
+++ b/frameworks/GAMA/exec.py
@@ -75,27 +75,24 @@ def run(dataset, config):
     gama_automl = estimator(**kwargs)

     X_train, y_train = dataset.train.X, dataset.train.y
-    # data = file_to_pandas(dataset.train.path, encoding='utf-8')
-    # X_train, y_train = data.loc[:, data.columns != dataset.target], data.loc[:, dataset.target]
-
     with Timer() as training_timer:
         gama_automl.fit(X_train, y_train)

     log.info('Predicting on the test set.')
-    X_test, y_test = dataset.test.X, dataset.test.y
-    # data = file_to_pandas(dataset.test.path, encoding='utf-8')
-    # X_test, y_test = data.loc[:, data.columns != dataset.target], data.loc[:, dataset.target]
-
-    def infer(path: str):
-        test_data = pd.read_parquet(path)
+    def infer(data):
+        test_data = pd.read_parquet(data) if isinstance(data, str) else data
         predict_fn = gama_automl.predict_proba if is_classification else gama_automl.predict
         return predict_fn(test_data)

-    inference_times = None
+    inference_times = {}
     if config.measure_inference_time:
-        inference_times = measure_inference_times(infer, dataset.inference_subsample_files)
-
+        inference_times["file"] = measure_inference_times(infer, dataset.inference_subsample_files)
+        inference_times["df"] = measure_inference_times(
+            infer,
+            [(1, dataset.test.X.sample(1, random_state=i)) for i in range(100)],
+        )
     with Timer() as predict_timer:
+        X_test, y_test = dataset.test.X, dataset.test.y
         predictions = gama_automl.predict(X_test)

     probabilities = None

From 1d2054a3d7d97938bfcc65746d4d084deff08155 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 14 Jun 2023 11:22:01 +0300
Subject: [PATCH 22/39] Add inference time measurement with dataframes

---
 frameworks/flaml/exec.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/frameworks/flaml/exec.py b/frameworks/flaml/exec.py
index d0acebde1..510f02b9c 100644
--- a/frameworks/flaml/exec.py
+++ b/frameworks/flaml/exec.py
@@ -1,5 +1,6 @@
 import logging
 import os
+from typing import Union

 import pandas as pd
 from flaml import AutoML, __version__
@@ -15,7 +16,6 @@ def run(dataset, config):
     log.info(f"\n**** FLAML [v{__version__}] ****\n")

     X_train, y_train = dataset.train.X, dataset.train.y.squeeze()
-    X_test, y_test = dataset.test.X, dataset.test.y.squeeze()

     is_classification = config.type == 'classification'
     time_budget = config.max_runtime_seconds
@@ -52,16 +52,21 @@ def run(dataset, config):
                 log_file_name= flaml_log_file_name,
                 time_budget=time_budget, **training_params)

-    def infer(path: str):
-        data = pd.read_parquet(path)
+    def infer(data: Union[str, pd.DataFrame]):
+        data = pd.read_parquet(data) if isinstance(data, str) else data
         predict_fn = aml.predict_proba if is_classification else aml.predict
         return predict_fn(data)

-    inference_times = None
+    inference_times = {}
     if config.measure_inference_time:
-        inference_times = measure_inference_times(infer, dataset.inference_subsample_files)
+        inference_times["file"] = measure_inference_times(infer, dataset.inference_subsample_files)
+        inference_times["df"] = measure_inference_times(
+            infer,
+            [(1, dataset.test.X.sample(1, random_state=i)) for i in range(100)],
+        )

     with Timer() as predict:
+        X_test, y_test = dataset.test.X, dataset.test.y.squeeze()
         predictions = aml.predict(X_test)
     probabilities = aml.predict_proba(X_test) if is_classification else None
     labels = None

From 357a63fd9087038bc6b3e93e71d1fbd6e4e30fa7 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 14 Jun 2023 11:40:52 +0300
Subject: [PATCH 23/39] Add dataframe inference time measurement

---
 frameworks/mljarsupervised/exec.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/frameworks/mljarsupervised/exec.py b/frameworks/mljarsupervised/exec.py
index 3287e5be3..66d2c6d64 100644
--- a/frameworks/mljarsupervised/exec.py
+++ b/frameworks/mljarsupervised/exec.py
@@ -1,6 +1,7 @@
 import os
 import shutil
 import logging
+from typing import Union

 import numpy as np
 import matplotlib
@@ -45,7 +46,6 @@ def run(dataset, config):
     }

     X_train, y_train = dataset.train.X, dataset.train.y.squeeze()
-    X_test, y_test = dataset.test.X, dataset.test.y.squeeze()

     automl = AutoML(
         results_path=results_path,
@@ -60,16 +60,20 @@ def run(dataset, config):
         automl.fit(X_train, y_train)

-    def infer(path: str):
-        batch = pd.read_parquet(path)
+    def infer(data: Union[str, pd.DataFrame]):
+        batch = pd.read_parquet(data) if isinstance(data, str) else data
         return automl.predict_all(batch)

-    inference_times = None
+    inference_times = {}
     if config.measure_inference_time:
-        inference_times = measure_inference_times(infer,
-                                                  dataset.inference_subsample_files)
+        inference_times["file"] = measure_inference_times(infer, dataset.inference_subsample_files)
+        inference_times["df"] = measure_inference_times(
+            infer,
+            [(1, dataset.test.X.sample(1, random_state=i)) for i in range(100)],
+        )

     with Timer() as predict:
+        X_test, y_test = dataset.test.X, dataset.test.y.squeeze()
         preds = automl.predict_all(X_test)

     predictions, probabilities, probabilities_labels = None, None, None

From 2dd1ffc7d26eaf3c800860a17c6b4fb221dcb175 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 14 Jun 2023 11:42:33 +0300
Subject: [PATCH 24/39] Add dataframe inference time measurement

---
 frameworks/GAMA/exec.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/frameworks/GAMA/exec.py b/frameworks/GAMA/exec.py
index 12fa75111..d9e89a8e8 100644
--- a/frameworks/GAMA/exec.py
+++ b/frameworks/GAMA/exec.py
@@ -2,6 +2,7 @@
 import os
 import sys
 import tempfile as tmp
+from typing import Union

 import pandas as pd

@@ -79,7 +80,7 @@ def run(dataset, config):
         gama_automl.fit(X_train, y_train)

     log.info('Predicting on the test set.')
-    def infer(data):
+    def infer(data: Union[str, pd.DataFrame]):
         test_data = pd.read_parquet(data) if isinstance(data, str) else data
         predict_fn = gama_automl.predict_proba if is_classification else gama_automl.predict
         return predict_fn(test_data)

From 1e5a61dd965768fdb3b79f0709b384e705a6b05c Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 14 Jun 2023 11:58:53 +0300
Subject: [PATCH 25/39] Add dataframe inference measurement (ignores encoding)

---
 frameworks/TPOT/exec.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/frameworks/TPOT/exec.py b/frameworks/TPOT/exec.py
index ff445eae7..56e72609e 100644
--- a/frameworks/TPOT/exec.py
+++ b/frameworks/TPOT/exec.py
@@ -5,6 +5,7 @@
 import tempfile as tmp

 import pandas as pd
+from numpy.random import default_rng

 if sys.platform == 'darwin':
     os.environ['OBJC_DISABLE_INITIALIZE_FORK_SAFETY'] = 'YES'
@@ -65,14 +66,8 @@ def run(dataset, config):
     with Timer() as training:
         tpot.fit(X_train, y_train)

-    log.info('Predicting on the test set.')
-    X_test = dataset.test.X
-    y_test = dataset.test.y
-    with Timer() as predict:
-        predictions = tpot.predict(X_test)
-
-    def infer(path):
-        data = pd.read_parquet(path)
+    def infer(data):
+        data = pd.read_parquet(data) if isinstance(data, str) else data
         if is_classification:
             try:
                 return tpot.predict_proba(data)
@@ -80,10 +75,22 @@ def infer(path):
                 return tpot.predict(data)
         return tpot.predict(data)

-    inference_times = None
+    inference_times = {}
     if config.measure_inference_time:
         log.info("TPOT inference time measurements exclude preprocessing time of AMLB.")
-        inference_times = measure_inference_times(infer, dataset.inference_subsample_files)
+        inference_times["file"] = measure_inference_times(infer, dataset.inference_subsample_files)
+        inference_times["df"] = measure_inference_times(
+            infer, [
+                (1, dataset.test.X[default_rng(seed=i).integers(len(dataset.test.X)), :].reshape(1, -1))
+                for i in range(100)
+            ],
+        )
+
+    log.info('Predicting on the test set.')
+    y_test = dataset.test.y
+    with Timer() as predict:
+        X_test = dataset.test.X
+        predictions = tpot.predict(X_test)

     try:
         probabilities = tpot.predict_proba(X_test) if is_classification else None

From b48c5edc4c91c96cbf2ce74626b74fb2a7da43c1 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 14 Jun 2023 12:04:59 +0300
Subject: [PATCH 26/39] Add dataframe inference measurement

---
 frameworks/autosklearn/exec.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/frameworks/autosklearn/exec.py b/frameworks/autosklearn/exec.py
index 39f301fc5..e7ca55bab 100644
--- a/frameworks/autosklearn/exec.py
+++ b/frameworks/autosklearn/exec.py
@@ -4,8 +4,10 @@
 import shutil
 import tempfile as tmp
 import warnings
+from typing import Union

 import pandas as pd
+from numpy.random import default_rng

 os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
 os.environ['OMP_NUM_THREADS'] = '1'
 os.environ['OPENBLAS_NUM_THREADS'] = '1'
@@ -133,19 +135,25 @@ def run(dataset, config):
     with Timer() as training:
         auto_sklearn.fit(X_train, y_train, feat_type=predictors_type, **fit_extra_params)

-    def infer(path: str):
-        test_data = pd.read_parquet(path)
+    def infer(data: Union[str, pd.DataFrame]):
+        test_data = pd.read_parquet(data) if isinstance(data, str) else data
         predict_fn = auto_sklearn.predict_proba if is_classification else auto_sklearn.predict
         return predict_fn(test_data)

-    inference_times = None
+    inference_times = {}
     if config.measure_inference_time:
-        inference_times = measure_inference_times(infer, dataset.inference_subsample_files)
+        inference_times["file"] = measure_inference_times(infer, dataset.inference_subsample_files)
+        inference_times["df"] = measure_inference_times(
+            infer, [
+                (1, dataset.test.X[default_rng(seed=i).integers(len(dataset.test.X)), :].reshape(1, -1))
+                for i in range(100)
+            ],
+        )

     # Convert output to strings for classification
     log.info("Predicting on the test set.")
-    X_test = dataset.test.X
     with Timer() as predict:
+        X_test = dataset.test.X
         predictions = auto_sklearn.predict(X_test)
         probabilities = auto_sklearn.predict_proba(X_test) if is_classification else None

From 42d38aca074f59672de655e032655da0af3f49b2 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 14 Jun 2023 12:38:19 +0300
Subject: [PATCH 27/39] Add inference time measurements

It seems lightautoml inference is considerably slower than that of any
other framework.
--- frameworks/lightautoml/exec.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/frameworks/lightautoml/exec.py b/frameworks/lightautoml/exec.py index 56cbe87c5..01217abd1 100644 --- a/frameworks/lightautoml/exec.py +++ b/frameworks/lightautoml/exec.py @@ -2,6 +2,7 @@ import os import pickle import warnings +from typing import Union import matplotlib import numpy as np @@ -40,18 +41,21 @@ def run(dataset, config): with Timer() as training: automl.fit_predict(train_data=df_train, roles={'target': label}) - def infer(path: str): - batch = pd.read_parquet(path) + def infer(data: Union[str, pd.DataFrame]): + batch = pd.read_parquet(data) if isinstance(data, str) else data return automl.predict(batch) - inference_times = None + inference_times = {} if config.measure_inference_time: - inference_times = measure_inference_times(infer, - dataset.inference_subsample_files) + inference_times["file"] = measure_inference_times(infer, dataset.inference_subsample_files) + inference_times["df"] = measure_inference_times( + infer, + [(1, dataset.test.X.sample(1, random_state=i)) for i in range(100)], + ) - X_test, y_test = dataset.test.X, dataset.test.y log.info("Predicting on the test set...") with Timer() as predict: + X_test, y_test = dataset.test.X, dataset.test.y preds = automl.predict(X_test).data probabilities_labels = None From 4a38f6cad3cc66e561cacac38b9dd9944d7180b1 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 14 Jun 2023 13:04:25 +0300 Subject: [PATCH 28/39] Add inference time measurements --- frameworks/H2OAutoML/exec.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/frameworks/H2OAutoML/exec.py b/frameworks/H2OAutoML/exec.py index 5a9e9d7fa..f6603c886 100644 --- a/frameworks/H2OAutoML/exec.py +++ b/frameworks/H2OAutoML/exec.py @@ -123,10 +123,16 @@ def infer(path: str): batch = h2o.import_file(path, destination_frame=frame_name(filename, config), **import_kwargs) return aml.predict(batch) - inference_times = None + inference_times = {} if config.measure_inference_time: - inference_times = measure_inference_times(infer, - dataset.inference_subsample_files) + # H2O can't do inference on single row arff: + # https://github.com/h2oai/h2o-3/issues/15572 + without_single_row_files = [ + (subsample_size, subsample_path) + for subsample_size, subsample_path in dataset.inference_subsample_files + if subsample_size > 1 + ] + inference_times["file"] = measure_inference_times(infer, without_single_row_files) with Timer() as predict: preds = aml.predict(test) From 6c9ccb36bb5d7ca235064517dcc8cd5d621c6a26 Mon Sep 17 00:00:00 2001 From: Pieter Gijsbers Date: Fri, 16 Jun 2023 10:04:04 +0300 Subject: [PATCH 29/39] Allow newer autosklearn versions to use the pandas data instead (#534) --- frameworks/autosklearn/__init__.py | 14 ++++++++------ frameworks/autosklearn/exec.py | 28 ++++++++++++++++++---------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/frameworks/autosklearn/__init__.py b/frameworks/autosklearn/__init__.py index 92e059ef7..a00a7d833 100644 --- a/frameworks/autosklearn/__init__.py +++ b/frameworks/autosklearn/__init__.py @@ -10,16 +10,18 @@ def setup(*args, **kwargs): def run(dataset: Dataset, config: TaskConfig): from frameworks.shared.caller import run_in_venv - X_train, X_test = dataset.train.X_enc, dataset.test.X_enc - y_train, y_test = unsparsify(dataset.train.y_enc, dataset.test.y_enc) data = dict( train=dict( - X=X_train, - y=y_train + X=dataset.train.X, + y=dataset.train.y, + 
X_enc=dataset.train.X_enc, + y_enc=unsparsify(dataset.train.y_enc), ), test=dict( - X=X_test, - y=y_test + X=dataset.test.X, + y=dataset.test.y, + X_enc=dataset.test.X_enc, + y_enc=unsparsify(dataset.test.y_enc), ), predictors_type=['Numerical' if p.is_numerical() else 'Categorical' for p in dataset.predictors], inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"), diff --git a/frameworks/autosklearn/exec.py b/frameworks/autosklearn/exec.py index e7ca55bab..6c98fc199 100644 --- a/frameworks/autosklearn/exec.py +++ b/frameworks/autosklearn/exec.py @@ -67,8 +67,9 @@ def run(dataset, config): ) log.info("Environment: %s", os.environ) - X_train = dataset.train.X - y_train = dataset.train.y + use_pandas = askl_version >= version.parse("0.15") + X_train = dataset.train.X if use_pandas else dataset.train.X_enc + y_train = dataset.train.y if use_pandas else dataset.train.y_enc predictors_type = dataset.predictors_type log.debug("predictors_type=%s", predictors_type) @@ -123,6 +124,10 @@ def run(dataset, config): else: fit_extra_params['metric'] = perf_metric + if not use_pandas: + fit_extra_params["feat_type"] = predictors_type + + constr_params["time_left_for_this_task"] = config.max_runtime_seconds constr_params["n_jobs"] = n_jobs constr_params["seed"] = config.seed @@ -133,7 +138,7 @@ def run(dataset, config): auto_sklearn = estimator(**constr_params, **training_params) with Timer() as training: - auto_sklearn.fit(X_train, y_train, feat_type=predictors_type, **fit_extra_params) + auto_sklearn.fit(X_train, y_train, **fit_extra_params) def infer(data: Union[str, pd.DataFrame]): test_data = pd.read_parquet(data) if isinstance(data, str) else data @@ -143,17 +148,20 @@ def infer(data: Union[str, pd.DataFrame]): inference_times = {} if config.measure_inference_time: inference_times["file"] = measure_inference_times(infer, dataset.inference_subsample_files) + test_data = dataset.test.X if use_pandas else dataset.test.X_enc + def sample_one_test_row(seed: int): + if use_pandas: + return test_data.sample(1, random_state=seed) + return test_data[default_rng(seed=seed).integers(len(test_data)), :] + inference_times["df"] = measure_inference_times( - infer, [ - (1, dataset.test.X[default_rng(seed=i).integers(len(dataset.test.X)), :].reshape(1, -1)) - for i in range(100) - ], + infer, [(1, sample_one_test_row(seed=i)) for i in range(100)], ) # Convert output to strings for classification log.info("Predicting on the test set.") with Timer() as predict: - X_test = dataset.test.X + X_test = dataset.test.X if use_pandas else dataset.test.X_enc predictions = auto_sklearn.predict(X_test) probabilities = auto_sklearn.predict_proba(X_test) if is_classification else None @@ -161,9 +169,9 @@ def infer(data: Union[str, pd.DataFrame]): return result(output_file=config.output_predictions_file, predictions=predictions, - truth=dataset.test.y, + truth=dataset.test.y if use_pandas else dataset.test.y_enc, probabilities=probabilities, - target_is_encoded=is_classification, + target_is_encoded=is_classification and not use_pandas, models_count=len(auto_sklearn.get_models_with_weights()), training_duration=training.duration, predict_duration=predict.duration, From b24c12c801698b22dd583fe771b9dc2f2fc714e1 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 16 Jun 2023 15:45:33 +0300 Subject: [PATCH 30/39] Add single row file inference for H2O --- frameworks/H2OAutoML/exec.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/frameworks/H2OAutoML/exec.py 
b/frameworks/H2OAutoML/exec.py index f6603c886..d0288903a 100644 --- a/frameworks/H2OAutoML/exec.py +++ b/frameworks/H2OAutoML/exec.py @@ -120,19 +120,14 @@ def run(dataset, config): def infer(path: str): filename = pathlib.Path(path).name - batch = h2o.import_file(path, destination_frame=frame_name(filename, config), **import_kwargs) + # H2O can't do inference on single row arff, it needs columns explicitly: + # https://github.com/h2oai/h2o-3/issues/15572 + batch = h2o.import_file(path, col_names=train.names, destination_frame=frame_name(filename, config), **import_kwargs) return aml.predict(batch) inference_times = {} if config.measure_inference_time: - # H2O can't do inference on single row arff: - # https://github.com/h2oai/h2o-3/issues/15572 - without_single_row_files = [ - (subsample_size, subsample_path) - for subsample_size, subsample_path in dataset.inference_subsample_files - if subsample_size > 1 - ] - inference_times["file"] = measure_inference_times(infer, without_single_row_files) + inference_times["file"] = measure_inference_times(infer, dataset.inference_subsample_files) with Timer() as predict: preds = aml.predict(test) From d5da2bd63cfae3d066d2d8f76223125667ed3bbd Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 16 Jun 2023 15:47:19 +0300 Subject: [PATCH 31/39] Update inference measurement to record its from file --- frameworks/constantpredictor/exec.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/frameworks/constantpredictor/exec.py b/frameworks/constantpredictor/exec.py index 4a2c7cf9d..332de6592 100644 --- a/frameworks/constantpredictor/exec.py +++ b/frameworks/constantpredictor/exec.py @@ -35,7 +35,8 @@ def infer(path): data = pd.read_parquet(path) return predictor.predict(data) - inference_times = measure_inference_times(infer, dataset.inference_subsample_files(fmt="parquet")) + inference_times = {} + inference_times["file"] = measure_inference_times(infer, dataset.inference_subsample_files(fmt="parquet")) save_predictions(dataset=dataset, output_file=config.output_predictions_file, From cdb08284ea20af4b2f6c0a2f39b65d5b61155743 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 16 Jun 2023 19:28:09 +0300 Subject: [PATCH 32/39] Dynamically set type depending on presence of pandas --- frameworks/shared/callee.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/frameworks/shared/callee.py b/frameworks/shared/callee.py index 2f94e5262..84fe1c6dc 100644 --- a/frameworks/shared/callee.py +++ b/frameworks/shared/callee.py @@ -5,13 +5,12 @@ import signal import sys from collections import defaultdict -from typing import Callable, Any, Tuple, Union, TypeVar - -import pandas as pd +from typing import Callable, Any, Tuple, Union, TypeVar, TYPE_CHECKING from .utils import InterruptTimeout, Namespace as ns, json_dump, json_loads, kill_proc_tree, touch from .utils import deserialize_data, serialize_data, Timer + log = logging.getLogger(__name__) @@ -97,7 +96,14 @@ def load_data(name, path, **_): res["others"]["inference_times"] = str(inference_file) json_dump(res, config.result_file, style='compact') -DATA_INPUT = TypeVar("DATA_INPUT", bound=Union[str, pd.DataFrame]) +try: + import pandas as pd + DATA_TYPES = Union[str, pd.DataFrame] +except ImportError: + DATA_TYPES = str + +DATA_INPUT = TypeVar("DATA_INPUT", bound=DATA_TYPES) + def measure_inference_times(predict_fn: Callable[[DATA_INPUT], Any], files: list[Tuple[int, DATA_INPUT]]) -> dict[int, list[float]]: inference_times = defaultdict(list) for subsample_size, 
subsample_path in files: From 4c14dbee45585306eb37407a987cc979bfa0aa16 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 16 Jun 2023 19:43:34 +0300 Subject: [PATCH 33/39] Add time to job timeout if inference measurements enabled --- amlb/benchmark.py | 7 +++++-- resources/config.yaml | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/amlb/benchmark.py b/amlb/benchmark.py index dee7c5c14..2f913a8df 100644 --- a/amlb/benchmark.py +++ b/amlb/benchmark.py @@ -404,8 +404,11 @@ def __setattr__(self, name, value): if name == 'metrics': self.metric = value[0] if isinstance(value, list) else value elif name == 'max_runtime_seconds': - self.job_timeout_seconds = min(value * 2, - value + rconfig().benchmarks.overhead_time_seconds) + inference_time_extension = 0 + if rconfig().inference_time_measurements.enabled: + inference_time_extension = rconfig().inference_time_measurements.additional_job_time + self.job_timeout_seconds = min(value * 2 + inference_time_extension, + value + rconfig().benchmarks.overhead_time_seconds + inference_time_extension) super().__setattr__(name, value) def __json__(self): diff --git a/resources/config.yaml b/resources/config.yaml index 73bc81d64..5fbbd968b 100644 --- a/resources/config.yaml +++ b/resources/config.yaml @@ -88,6 +88,7 @@ inference_time_measurements: # configuration namespace for performing additiona enabled: true batch_sizes: [1, 10, 100, 1000, 10000] # the batch sizes for which inference speed should be measured repeats: 100 # the number of times to repeat the inference measurement for each batch size + additional_job_time: 300 # the time in seconds that will be added to the maximum job time if inference time is measured openml: # configuration namespace for openML. apikey: c1994bdb7ecb3c6f3c8f3b35f4b47f1f From acae403c5f7842ffd5fd73f17fbcb1632928e699 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 16 Jun 2023 20:56:41 +0300 Subject: [PATCH 34/39] Disable inference time measurements for CI --- .github/workflows/run_all_frameworks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_all_frameworks.yml b/.github/workflows/run_all_frameworks.yml index b11ce4629..ed772acab 100644 --- a/.github/workflows/run_all_frameworks.yml +++ b/.github/workflows/run_all_frameworks.yml @@ -156,6 +156,6 @@ jobs: - name: Run ${{ matrix.framework }} on ${{ matrix.task }} run: | source venv/bin/activate - python runbenchmark.py ${{ matrix.framework }} ${{ matrix.benchmark }} test -f 0 -t ${{ matrix.task }} -e + python runbenchmark.py ${{ matrix.framework }} ${{ matrix.benchmark }} test -f 0 -t ${{ matrix.task }} -e --Xinference_time_measurements.enabled=False env: GITHUB_PAT: ${{ secrets.PUBLIC_ACCESS_GITHUB_PAT }} From 8caf62918a1b807b40ba8392394c5fdb517cc9ef Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Sat, 17 Jun 2023 16:10:17 +0200 Subject: [PATCH 35/39] Remove one dash --- .github/workflows/run_all_frameworks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_all_frameworks.yml b/.github/workflows/run_all_frameworks.yml index ed772acab..15750019a 100644 --- a/.github/workflows/run_all_frameworks.yml +++ b/.github/workflows/run_all_frameworks.yml @@ -156,6 +156,6 @@ jobs: - name: Run ${{ matrix.framework }} on ${{ matrix.task }} run: | source venv/bin/activate - python runbenchmark.py ${{ matrix.framework }} ${{ matrix.benchmark }} test -f 0 -t ${{ matrix.task }} -e --Xinference_time_measurements.enabled=False + python runbenchmark.py ${{ matrix.framework }} ${{ 
matrix.benchmark }} test -f 0 -t ${{ matrix.task }} -e -Xinference_time_measurements.enabled=False env: GITHUB_PAT: ${{ secrets.PUBLIC_ACCESS_GITHUB_PAT }} From f0cbfc055b3b489640bbb516b83148cb0ed11f60 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Sat, 17 Jun 2023 16:30:49 +0200 Subject: [PATCH 36/39] Disable inference time measurement by default --- resources/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/config.yaml b/resources/config.yaml index 5fbbd968b..e0d526a7a 100644 --- a/resources/config.yaml +++ b/resources/config.yaml @@ -85,7 +85,7 @@ results: # configuration namespace for the results.csv file. incremental_save: true # if true save results after each job., otherwise save results only when all jobs are completed. inference_time_measurements: # configuration namespace for performing additional inference time measurements on various batch sizes - enabled: true + enabled: false batch_sizes: [1, 10, 100, 1000, 10000] # the batch sizes for which inference speed should be measured repeats: 100 # the number of times to repeat the inference measurement for each batch size additional_job_time: 300 # the time in seconds that will be added to the maximum job time if inference time is measured From b2c9b385fe58568e09bc254826479a83cf23efdd Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Sat, 17 Jun 2023 16:36:24 +0200 Subject: [PATCH 37/39] Make measuring inference time optional, also measure single row df --- frameworks/constantpredictor/exec.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/frameworks/constantpredictor/exec.py b/frameworks/constantpredictor/exec.py index 332de6592..049e19b76 100644 --- a/frameworks/constantpredictor/exec.py +++ b/frameworks/constantpredictor/exec.py @@ -31,12 +31,18 @@ def run(dataset: Dataset, config: TaskConfig): predictions = predictor.predict(X_test) probabilities = predictor.predict_proba(X_test) if is_classification else None - def infer(path): - data = pd.read_parquet(path) + def infer(data): + data = pd.read_parquet(data) if isinstance(data, str) else data return predictor.predict(data) inference_times = {} - inference_times["file"] = measure_inference_times(infer, dataset.inference_subsample_files(fmt="parquet")) + if config.measure_inference_time: + inference_times["file"] = measure_inference_times(infer, dataset.inference_subsample_files(fmt="parquet")) + test_data = X_test if isinstance(X_test, pd.DataFrame) else pd.DataFrame(X_test) + inference_times["df"] = measure_inference_times( + infer, + [(1, test_data.sample(1, random_state=i)) for i in range(100)], + ) save_predictions(dataset=dataset, output_file=config.output_predictions_file, From 7a3f433674666450331c8b860349f9425ba614bc Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Sat, 17 Jun 2023 17:10:55 +0200 Subject: [PATCH 38/39] Add inference time measurement to (T)RF baselines --- frameworks/RandomForest/__init__.py | 3 ++- frameworks/RandomForest/exec.py | 20 ++++++++++++++++++-- frameworks/TunedRandomForest/__init__.py | 3 ++- frameworks/TunedRandomForest/exec.py | 18 ++++++++++++++++-- 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/frameworks/RandomForest/__init__.py b/frameworks/RandomForest/__init__.py index a25cbeee7..3de306f59 100644 --- a/frameworks/RandomForest/__init__.py +++ b/frameworks/RandomForest/__init__.py @@ -23,7 +23,8 @@ def run(dataset: Dataset, config: TaskConfig): test=dict( X=X_test, y=y_test - ) + ), + inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"), 
) return run_in_venv(__file__, "exec.py", diff --git a/frameworks/RandomForest/exec.py b/frameworks/RandomForest/exec.py index 77bdc99ef..dd23a763d 100644 --- a/frameworks/RandomForest/exec.py +++ b/frameworks/RandomForest/exec.py @@ -3,6 +3,8 @@ import tempfile as tmp from typing import List +import pandas as pd + os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() os.environ['OMP_NUM_THREADS'] = '1' os.environ['OPENBLAS_NUM_THREADS'] = '1' @@ -12,7 +14,7 @@ import sklearn from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from frameworks.shared.callee import call_run, result +from frameworks.shared.callee import call_run, result, measure_inference_times from frameworks.shared.utils import Timer log = logging.getLogger(os.path.basename(__file__)) @@ -86,6 +88,19 @@ def run(dataset, config): predictions = rf.predict(X_test) probabilities = rf.predict_proba(X_test) if is_classification else None + def infer(data): + data = pd.read_parquet(data) if isinstance(data, str) else data + return rf.predict(data) + + inference_times = {} + if config.measure_inference_time: + inference_times["file"] = measure_inference_times(infer, dataset.inference_subsample_files) + test_data = X_test if isinstance(X_test, pd.DataFrame) else pd.DataFrame(X_test) + inference_times["df"] = measure_inference_times( + infer, + [(1, test_data.sample(1, random_state=i)) for i in range(100)], + ) + return result(output_file=config.output_predictions_file, predictions=predictions, truth=y_test, @@ -93,7 +108,8 @@ def run(dataset, config): target_is_encoded=encode, models_count=len(rf), training_duration=training.duration, - predict_duration=predict.duration) + predict_duration=predict.duration, + inference_times=inference_times,) if __name__ == '__main__': diff --git a/frameworks/TunedRandomForest/__init__.py b/frameworks/TunedRandomForest/__init__.py index 561678497..dc0cad908 100644 --- a/frameworks/TunedRandomForest/__init__.py +++ b/frameworks/TunedRandomForest/__init__.py @@ -21,7 +21,8 @@ def run(dataset: Dataset, config: TaskConfig): test=dict( X=X_test, y=y_test - ) + ), + inference_subsample_files=dataset.inference_subsample_files(fmt="parquet"), ) return run_in_venv(__file__, "exec.py", diff --git a/frameworks/TunedRandomForest/exec.py b/frameworks/TunedRandomForest/exec.py index 7c7a7dc15..c724487d0 100644 --- a/frameworks/TunedRandomForest/exec.py +++ b/frameworks/TunedRandomForest/exec.py @@ -21,7 +21,7 @@ import sklearn from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from frameworks.shared.callee import call_run, result +from frameworks.shared.callee import call_run, result, measure_inference_times from frameworks.shared.utils import Timer from custom_validate import cross_validate @@ -211,6 +211,19 @@ def run(dataset, config): predictions = rf.predict(X_test) probabilities = rf.predict_proba(X_test) if is_classification else None + def infer(data): + data = pd.read_parquet(data) if isinstance(data, str) else data + return rf.predict(data) + + inference_times = {} + if config.measure_inference_time: + inference_times["file"] = measure_inference_times(infer, dataset.inference_subsample_files) + test_data = X_test if isinstance(X_test, pd.DataFrame) else pd.DataFrame(X_test) + inference_times["df"] = measure_inference_times( + infer, + [(1, test_data.sample(1, random_state=i)) for i in range(100)], + ) + return result( output_file=config.output_predictions_file, predictions=predictions, @@ -219,7 +232,8 @@ def run(dataset, config): 
         target_is_encoded=is_classification,
         models_count=len(rf),
         training_duration=training.duration,
-        predict_duration=predict.duration
+        predict_duration=predict.duration,
+        inference_times=inference_times,
     )

From 54a2dd6257fdc9cf1f7097add446affc6930c0fc Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Sat, 17 Jun 2023 17:13:35 +0200
Subject: [PATCH 39/39] Remove skip-inference-measurement override since it's the default now

---
 .github/workflows/run_all_frameworks.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/run_all_frameworks.yml b/.github/workflows/run_all_frameworks.yml
index 15750019a..b11ce4629 100644
--- a/.github/workflows/run_all_frameworks.yml
+++ b/.github/workflows/run_all_frameworks.yml
@@ -156,6 +156,6 @@ jobs:
       - name: Run ${{ matrix.framework }} on ${{ matrix.task }}
         run: |
           source venv/bin/activate
-          python runbenchmark.py ${{ matrix.framework }} ${{ matrix.benchmark }} test -f 0 -t ${{ matrix.task }} -e -Xinference_time_measurements.enabled=False
+          python runbenchmark.py ${{ matrix.framework }} ${{ matrix.benchmark }} test -f 0 -t ${{ matrix.task }} -e
         env:
           GITHUB_PAT: ${{ secrets.PUBLIC_ACCESS_GITHUB_PAT }}
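
Across frameworks, these integrations converge on a single pattern; a condensed
sketch of that shape, with `automl_model`, `config`, `dataset`, and `test_data`
as stand-ins for the framework-specific objects used in the diffs above:

    import pandas as pd

    from frameworks.shared.callee import measure_inference_times

    def run_inference_measurements(automl_model, config, dataset,
                                   test_data: pd.DataFrame) -> dict:
        def infer(data):
            # Accept either a path to an on-disk subsample or an in-memory frame.
            batch = pd.read_parquet(data) if isinstance(data, str) else data
            return automl_model.predict(batch)

        inference_times = {}
        if config.measure_inference_time:
            # Batch-size latencies are measured from the pre-written subsample files,
            inference_times["file"] = measure_inference_times(
                infer, dataset.inference_subsample_files)
            # while single-row latency is measured on 100 distinct in-memory rows.
            inference_times["df"] = measure_inference_times(
                infer, [(1, test_data.sample(1, random_state=i)) for i in range(100)])
        return inference_times

Because patch 36 turns `inference_time_measurements.enabled` off by default, a
run now has to opt in explicitly, e.g. with
`-Xinference_time_measurements.enabled=True` on the runbenchmark.py command line
(the inverse of the CI override that patches 34/35 added and patch 39 removes).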