
fix: make multi-processing in baseline models more consistent #909

Merged
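Both baseline modules get the same two-part change: a new `import multiprocessing as mp`, and every `ProcessPoolExecutor` is now constructed with an explicit `mp_context=mp.get_context("spawn")` instead of relying on the platform default start method (historically `fork` on Linux, `spawn` on Windows and macOS). The stdlib-only sketch below illustrates why the default matters; the motivation is inferred from the title and the diff, not quoted from the PR discussion:

```python
# Stdlib-only sketch, assuming the goal is cross-platform consistency:
# a "fork" worker inherits the parent's memory, while a "spawn" worker starts
# from a fresh interpreter and re-imports this module, so state mutated at
# runtime is not visible to it. Pinning one start method makes both behave
# the same everywhere.
import multiprocessing as mp

STATE = "set at import time"


def read_state() -> str:
    return STATE


if __name__ == "__main__":
    STATE = "mutated in __main__"  # a "spawn" worker never sees this
    for method in ("fork", "spawn"):
        try:
            ctx = mp.get_context(method)
        except ValueError:  # "fork" does not exist on Windows
            continue
        with ctx.Pool(1) as pool:
            print(method, "->", pool.apply(read_state))
    # Typical output on Linux:
    #   fork -> mutated in __main__
    #   spawn -> set at import time
```

With one start method pinned, `fit` and `predict` behave the same on every platform.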
18 changes: 14 additions & 4 deletions src/safeds/ml/classical/classification/_baseline_classifier.py
@@ -1,4 +1,5 @@
import copy
+import multiprocessing as mp
from concurrent.futures import ALL_COMPLETED, wait
from typing import Self

@@ -34,8 +35,11 @@ class BaselineClassifier:

Get a baseline by fitting data on multiple different models and comparing the best metrics.

-Parameters ---------- extended_search: If set to true, an extended set of models will be used to fit the
-classifier. This might result in significantly higher runtime.
+Parameters
+----------
+extended_search:
+If set to true, an extended set of models will be used to fit the classifier.
+This might result in significantly higher runtime.
"""

def __init__(self, extended_search: bool = False):
@@ -86,7 +90,10 @@ def fit(self, train_data: TabularDataset) -> Self:

copied_model = copy.deepcopy(self)

-with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor:
+with ProcessPoolExecutor(
+max_workers=len(self._list_of_model_types),
+mp_context=mp.get_context("spawn"),
+) as executor:
futures = []
for model in self._list_of_model_types:
futures.append(executor.submit(_fit_single_model, model, train_data))
@@ -149,7 +156,10 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]:
raise DatasetMissesDataError
_check_columns_are_numeric(test_data_as_table, test_data.features.add_columns(test_data.target).column_names)

-with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor:
+with ProcessPoolExecutor(
+max_workers=len(self._list_of_model_types),
+mp_context=mp.get_context("spawn"),
+) as executor:
results = []
futures = []
for model in self._fitted_models:
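Both `fit` and `predict` in this file keep their submit-then-wait structure; only the executor construction changes. Below is a stripped-down, stdlib-only sketch of that pattern; the `_fit_single_model` body is a stand-in, not the Safe-DS helper:

```python
# Minimal sketch of the pattern the diff applies: an explicit "spawn" context
# passed to ProcessPoolExecutor, one submitted task per model, then a wait for
# all futures before collecting results.
import multiprocessing as mp
from concurrent.futures import ALL_COMPLETED, ProcessPoolExecutor, wait


def _fit_single_model(model_id: int) -> str:
    # Placeholder for fitting one model on the training data.
    return f"model-{model_id} fitted"


if __name__ == "__main__":  # "spawn" workers re-import this module, so guard the entry point
    model_ids = [1, 2, 3]
    with ProcessPoolExecutor(
        max_workers=len(model_ids),
        mp_context=mp.get_context("spawn"),
    ) as executor:
        futures = [executor.submit(_fit_single_model, i) for i in model_ids]
        wait(futures, return_when=ALL_COMPLETED)
    print([future.result() for future in futures])
```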
22 changes: 16 additions & 6 deletions src/safeds/ml/classical/regression/_baseline_regressor.py
@@ -1,4 +1,5 @@
import copy
+import multiprocessing as mp
from concurrent.futures import ALL_COMPLETED, wait
from typing import Self

@@ -38,11 +39,14 @@ class BaselineRegressor:

Get a baseline by fitting data on multiple different models and comparing the best metrics.

-Parameters ---------- extended_search: If set to true, an extended set of models will be used to fit the
-classifier. This might result in significantly higher runtime.
+Parameters
+----------
+extended_search:
+If set to true, an extended set of models will be used to fit the regressor.
+This might result in significantly higher runtime.
"""

-def __init__(self, include_slower_models: bool = False):
+def __init__(self, extended_search: bool = False):
self._is_fitted = False
self._list_of_model_types = [
AdaBoostRegressor(),
@@ -53,7 +57,7 @@ def __init__(self, include_slower_models: bool = False):
SupportVectorRegressor(),
]

-if include_slower_models:
+if extended_search:
self._list_of_model_types.extend(
[ElasticNetRegressor(), LassoRegressor(), GradientBoostingRegressor()],
) # pragma: no cover
@@ -95,7 +99,10 @@ def fit(self, train_data: TabularDataset) -> Self:

copied_model = copy.deepcopy(self)

-with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor:
+with ProcessPoolExecutor(
+max_workers=len(self._list_of_model_types),
+mp_context=mp.get_context("spawn"),
+) as executor:
futures = []
for model in self._list_of_model_types:
futures.append(executor.submit(_fit_single_model, model, train_data))
@@ -159,7 +166,10 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]:
_check_columns_are_numeric(test_data_as_table, test_data.features.add_columns(test_data.target).column_names)

# Start Processes
-with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor:
+with ProcessPoolExecutor(
+max_workers=len(self._list_of_model_types),
+mp_context=mp.get_context("spawn"),
+) as executor:
results = []
futures = []
for model in self._fitted_models:
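The one caller-visible consequence of pinning `spawn` is that the code which triggers worker creation must not run at import time, because every worker re-imports the entry module. Below is a hypothetical usage sketch; the import paths and the `Table`/`to_tabular_dataset` construction are assumptions about the Safe-DS API, not taken from this PR:

```python
# Hypothetical caller-side sketch (assumed import paths and data-construction
# helpers; consult the Safe-DS docs). It shows the practical rule with the
# "spawn" start method: keep the fitting call under an import guard.
from safeds.data.tabular.containers import Table  # assumed import path
from safeds.ml.classical.regression import BaselineRegressor  # assumed import path


def main() -> None:
    table = Table(
        {
            "feature_a": [1.0, 2.0, 3.0, 4.0],
            "feature_b": [0.5, 1.5, 2.5, 3.5],
            "target": [1.0, 2.0, 3.0, 4.0],
        },
    )
    train_data = table.to_tabular_dataset("target")  # assumed helper name

    baseline = BaselineRegressor(extended_search=False)  # True enables the slower, extended model set
    fitted = baseline.fit(train_data)
    metrics = fitted.predict(train_data)  # dict[str, float] of metric name -> value
    print(metrics)


if __name__ == "__main__":  # required: "spawn" workers re-import this module
    main()
```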