From fa245609da84cbdee64c82e537b41ce7d39f2215 Mon Sep 17 00:00:00 2001
From: Simon Breuer <86068340+sibre28@users.noreply.github.com>
Date: Fri, 12 Jul 2024 20:57:22 +0200
Subject: [PATCH] fix: make multi-processing in baseline models more consistent (#909)

Closes #907

Set context of ProcessPoolExecutors to spawn.

---------

Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com>
Co-authored-by: Lars Reimann
---
 .../classification/_baseline_classifier.py | 18 +++++++++++----
 .../regression/_baseline_regressor.py      | 22 ++++++++++++++-----
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py
index 7b58d61e2..d58c85962 100644
--- a/src/safeds/ml/classical/classification/_baseline_classifier.py
+++ b/src/safeds/ml/classical/classification/_baseline_classifier.py
@@ -1,4 +1,5 @@
 import copy
+import multiprocessing as mp
 from concurrent.futures import ALL_COMPLETED, wait
 from typing import Self
 
@@ -34,8 +35,11 @@ class BaselineClassifier:
 
     Get a baseline by fitting data on multiple different models and comparing the best metrics.
 
-    Parameters ---------- extended_search: If set to true, an extended set of models will be used to fit the
-    classifier. This might result in significantly higher runtime.
+    Parameters
+    ----------
+    extended_search:
+        If set to true, an extended set of models will be used to fit the classifier.
+        This might result in significantly higher runtime.
     """
 
     def __init__(self, extended_search: bool = False):
@@ -86,7 +90,10 @@ def fit(self, train_data: TabularDataset) -> Self:
 
         copied_model = copy.deepcopy(self)
 
-        with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor:
+        with ProcessPoolExecutor(
+            max_workers=len(self._list_of_model_types),
+            mp_context=mp.get_context("spawn"),
+        ) as executor:
             futures = []
             for model in self._list_of_model_types:
                 futures.append(executor.submit(_fit_single_model, model, train_data))
@@ -149,7 +156,10 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]:
             raise DatasetMissesDataError
         _check_columns_are_numeric(test_data_as_table, test_data.features.add_columns(test_data.target).column_names)
 
-        with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor:
+        with ProcessPoolExecutor(
+            max_workers=len(self._list_of_model_types),
+            mp_context=mp.get_context("spawn"),
+        ) as executor:
             results = []
             futures = []
             for model in self._fitted_models:
diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py
index 4562ed122..2efd1b10d 100644
--- a/src/safeds/ml/classical/regression/_baseline_regressor.py
+++ b/src/safeds/ml/classical/regression/_baseline_regressor.py
@@ -1,4 +1,5 @@
 import copy
+import multiprocessing as mp
 from concurrent.futures import ALL_COMPLETED, wait
 from typing import Self
 
@@ -38,11 +39,14 @@ class BaselineRegressor:
 
     Get a baseline by fitting data on multiple different models and comparing the best metrics.
 
-    Parameters ---------- extended_search: If set to true, an extended set of models will be used to fit the
-    classifier. This might result in significantly higher runtime.
+    Parameters
+    ----------
+    extended_search:
+        If set to true, an extended set of models will be used to fit the classifier.
+        This might result in significantly higher runtime.
     """
 
-    def __init__(self, include_slower_models: bool = False):
+    def __init__(self, extended_search: bool = False):
         self._is_fitted = False
         self._list_of_model_types = [
             AdaBoostRegressor(),
@@ -53,7 +57,7 @@ def __init__(self, extended_search: bool = False):
             SupportVectorRegressor(),
         ]
 
-        if include_slower_models:
+        if extended_search:
             self._list_of_model_types.extend(
                 [ElasticNetRegressor(), LassoRegressor(), GradientBoostingRegressor()],
             )  # pragma: no cover
@@ -95,7 +99,10 @@ def fit(self, train_data: TabularDataset) -> Self:
 
         copied_model = copy.deepcopy(self)
 
-        with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor:
+        with ProcessPoolExecutor(
+            max_workers=len(self._list_of_model_types),
+            mp_context=mp.get_context("spawn"),
+        ) as executor:
             futures = []
             for model in self._list_of_model_types:
                 futures.append(executor.submit(_fit_single_model, model, train_data))
@@ -159,7 +166,10 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]:
         _check_columns_are_numeric(test_data_as_table, test_data.features.add_columns(test_data.target).column_names)
 
         # Start Processes
-        with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor:
+        with ProcessPoolExecutor(
+            max_workers=len(self._list_of_model_types),
+            mp_context=mp.get_context("spawn"),
+        ) as executor:
             results = []
             futures = []
             for model in self._fitted_models: