Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add incremental algorithms support #160

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
96 changes: 96 additions & 0 deletions configs/incremental.json
olegkkruglov marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
{
"PARAMETERS_SETS": {
"common": {"bench": {"n_runs": 10, "time_limit": 60}},
"covariance data": {
"data": [
{
"source": "make_blobs",
"generation_kwargs": {
"centers": 1,
"n_samples": 1000,
"n_features": [16, 64]
},
"split_kwargs": {"ignore": true}
}
]
},
"basic_statistics data": {
"data": {
"source": "make_blobs",
"generation_kwargs": {
"centers": 1,
"n_samples": 10000,
"n_features": [16, 64]
},
"split_kwargs": {"ignore": true}
}
},
"linear_regression data": {
"data": {
"source": "make_regression",
"split_kwargs": {"train_size": 0.2, "test_size": 0.8},
"generation_kwargs": {
"n_samples": 5000,
"n_features": [40, 100],
"n_informative": 5,
"noise": 2.0
}
}
},
"pca data": {
"data": {
"source": "make_blobs",
"generation_kwargs": {
"centers": 1,
"n_samples": 1000,
"n_features": [16, 64]
},
"split_kwargs": {"ignore": true}
}
},
olegkkruglov marked this conversation as resolved.
Show resolved Hide resolved
"covariance": {
"algorithm": [
{
"estimator": "IncrementalEmpiricalCovariance",
"library": "sklearnex.covariance",
olegkkruglov marked this conversation as resolved.
Show resolved Hide resolved
"estimator_methods": {"training": "partial_fit"},
"num_batches": {"training": 2}
}
]
},
"basic_statistics": {
"algorithm": [
{
"estimator": "IncrementalBasicStatistics",
"library": "sklearnex.basic_statistics",
"num_batches": {"training": 2}
}
]
},
"linear_regression": {
"algorithm": [
{
"estimator": "IncrementalLinearRegression",
"library": "sklearnex.linear_model",
"num_batches": {"training": 2}
olegkkruglov marked this conversation as resolved.
Show resolved Hide resolved
}
]
},
"pca": {
"algorithm": [
{
"estimator": "IncrementalPCA",
"library": "sklearnex.preview.decomposition",
olegkkruglov marked this conversation as resolved.
Show resolved Hide resolved
"num_batches": {"training": 2}
}
]
}
},
"TEMPLATES": {
"covariance": {"SETS": ["common", "covariance", "covariance data"]},
"linear_regression": {
"SETS": ["common", "linear_regression", "linear_regression data"]
},
"pca": {"SETS": ["common", "pca", "pca data"]}
}
}
40 changes: 29 additions & 11 deletions sklbench/benchmarks/sklearn_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def get_estimator(library_name: str, estimator_name: str):
def get_estimator_methods(bench_case: BenchCase) -> Dict[str, List[str]]:
# default estimator methods
estimator_methods = {
"training": ["fit"],
"training": ["partial_fit", "fit"],
"inference": ["predict", "predict_proba", "transform"],
}
for stage in estimator_methods.keys():
Expand Down Expand Up @@ -334,7 +334,9 @@ def verify_patching(stream: io.StringIO, function_name) -> bool:
return acceleration_lines > 0 and fallback_lines == 0


def create_online_function(method_instance, data_args, batch_size):
def create_online_function(
estimator_instance, method_instance, data_args, num_batches, batch_size
olegkkruglov marked this conversation as resolved.
Show resolved Hide resolved
):
n_batches = data_args[0].shape[0] // batch_size

if "y" in list(inspect.signature(method_instance).parameters):
Expand All @@ -345,23 +347,27 @@ def ndarray_function(x, y):
x[i * batch_size : (i + 1) * batch_size],
y[i * batch_size : (i + 1) * batch_size],
)
estimator_instance._onedal_finalize_fit()
olegkkruglov marked this conversation as resolved.
Show resolved Hide resolved

def dataframe_function(x, y):
for i in range(n_batches):
method_instance(
x.iloc[i * batch_size : (i + 1) * batch_size],
y.iloc[i * batch_size : (i + 1) * batch_size],
)
estimator_instance._onedal_finalize_fit()

else:

def ndarray_function(x):
for i in range(n_batches):
method_instance(x[i * batch_size : (i + 1) * batch_size])
estimator_instance._onedal_finalize_fit()

def dataframe_function(x):
for i in range(n_batches):
method_instance(x.iloc[i * batch_size : (i + 1) * batch_size])
estimator_instance._onedal_finalize_fit()

if "ndarray" in str(type(data_args[0])):
return ndarray_function
Expand Down Expand Up @@ -414,12 +420,28 @@ def measure_sklearn_estimator(
data_args = (x_train,)
else:
data_args = (x_test,)
batch_size = get_bench_case_value(
bench_case, f"algorithm:batch_size:{stage}"
)
if batch_size is not None:

if method == "partial_fit":
num_batches = get_bench_case_value(bench_case, "data:num_batches")
batch_size = get_bench_case_value(bench_case, "data:batch_size")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of separate branch for partial_fit, extend mechanism of online_inference_mode to partial fitting too.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could you provide the exact link to implementation of this mechanism? i was not able to find the usage of this parameter, just see its setting in the config.

Copy link
Contributor

@Alexsandruss Alexsandruss Oct 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, online_inference_mode was removed as unnecessary before merge of refactor branch. This mode is enabled by batch_size != None only.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can split batch size into two for training and inference.

Copy link
Author

@olegkkruglov olegkkruglov Oct 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, online_inference_mode was removed as unnecessary before merge of refactor branch.

what should I extend then?


if batch_size is None:
if num_batches is None:
num_batches = 5
batch_size = (
data_args[0].shape[0] + num_batches - 1
) // num_batches
if num_batches is None:
num_batches = (
data_args[0].shape[0] + batch_size - 1
) // batch_size
Alexsandruss marked this conversation as resolved.
Show resolved Hide resolved

method_instance = create_online_function(
method_instance, data_args, batch_size
estimator_instance,
method_instance,
data_args,
num_batches,
batch_size,
)
# daal4py model builders enabling branch
if enable_modelbuilders and stage == "inference":
Expand All @@ -436,10 +458,6 @@ def measure_sklearn_estimator(
metrics[method]["time std[ms]"],
_,
) = measure_case(bench_case, method_instance, *data_args)
if batch_size is not None:
metrics[method]["throughput[samples/ms]"] = (
(data_args[0].shape[0] // batch_size) * batch_size
) / metrics[method]["time[ms]"]
if ensure_sklearnex_patching:
full_method_name = f"{estimator_class.__name__}.{method}"
sklearnex_logging_stream.seek(0)
Expand Down
10 changes: 7 additions & 3 deletions sklbench/report/implementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

import argparse
import json
from typing import Dict, List
from typing import Dict, Hashable, List

import openpyxl as xl
import pandas as pd
Expand Down Expand Up @@ -239,6 +239,7 @@ def get_result_tables_as_df(
bench_cases = pd.DataFrame(
[flatten_dict(bench_case) for bench_case in results["bench_cases"]]
)
bench_cases = bench_cases.map(lambda x: str(x) if not isinstance(x, Hashable) else x)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is non-hashable object you are trying to convert?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

basic statistics result_options parameter is a list


if compatibility_mode:
bench_cases = transform_results_to_compatible(bench_cases)
Expand All @@ -248,7 +249,7 @@ def get_result_tables_as_df(
bench_cases.drop(columns=[column], inplace=True)
diffby_columns.remove(column)

return split_df_by_columns(bench_cases, splitby_columns)
return split_df_by_columns(bench_cases, splitby_columns, False)


def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame:
Expand All @@ -258,7 +259,10 @@ def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame:
# only relative improvements are included in summary currently
if len(column) > 1 and column[1] == f"{metric_name} relative improvement":
metric_columns.append(column)
summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T
if metric_columns:
summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T
else:
summary = pd.DataFrame()
summary.index = pd.Index([df_name])
return summary

Expand Down
5 changes: 5 additions & 0 deletions test-configuration-linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ steps:
conda activate bench-env
python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json
displayName: Sklearn example run
- script: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate bench-env
python -m sklbench --report -l DEBUG --report -c configs/incremental.json
displayName: Incremental algorithms example run
- script: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate bench-env
Expand Down
4 changes: 4 additions & 0 deletions test-configuration-win.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ steps:
call activate bench-env
python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json
displayName: Sklearn example run
- script: |
call activate bench-env
python -m sklbench --report -l DEBUG --report -c configs/incremental.json
displayName: Incremental algorithms example run
- script: |
call activate bench-env
python -m sklbench --report -l DEBUG --report -c configs/xgboost_example.json
Expand Down