From fa245609da84cbdee64c82e537b41ce7d39f2215 Mon Sep 17 00:00:00 2001
From: Simon Breuer <86068340+sibre28@users.noreply.github.com>
Date: Fri, 12 Jul 2024 20:57:22 +0200
Subject: [PATCH] fix: make multi-processing in baseline models more consistent (#909)

Closes #907

Set context of ProcessPoolExecutors to spawn.

---------

Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com>
Co-authored-by: Lars Reimann
---
 .../classification/_baseline_classifier.py | 18 +++++++++++----
 .../regression/_baseline_regressor.py      | 22 ++++++++++++++-----
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py
index 7b58d61e2..d58c85962 100644
--- a/src/safeds/ml/classical/classification/_baseline_classifier.py
+++ b/src/safeds/ml/classical/classification/_baseline_classifier.py
@@ -1,4 +1,5 @@
 import copy
+import multiprocessing as mp
 from concurrent.futures import ALL_COMPLETED, wait
 from typing import Self
 
@@ -34,8 +35,11 @@ class BaselineClassifier:
 
     Get a baseline by fitting data on multiple different models and comparing the best metrics.
 
-    Parameters ---------- extended_search: If set to true, an extended set of models will be used to fit the
-    classifier. This might result in significantly higher runtime.
+    Parameters
+    ----------
+    extended_search:
+        If set to true, an extended set of models will be used to fit the classifier.
+        This might result in significantly higher runtime.
     """
 
     def __init__(self, extended_search: bool = False):
@@ -86,7 +90,10 @@ def fit(self, train_data: TabularDataset) -> Self:
 
         copied_model = copy.deepcopy(self)
 
-        with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor:
+        with ProcessPoolExecutor(
+            max_workers=len(self._list_of_model_types),
+            mp_context=mp.get_context("spawn"),
+        ) as executor:
             futures = []
             for model in self._list_of_model_types:
                 futures.append(executor.submit(_fit_single_model, model, train_data))
@@ -149,7 +156,10 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]:
             raise DatasetMissesDataError
         _check_columns_are_numeric(test_data_as_table, test_data.features.add_columns(test_data.target).column_names)
 
-        with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor:
+        with ProcessPoolExecutor(
+            max_workers=len(self._list_of_model_types),
+            mp_context=mp.get_context("spawn"),
+        ) as executor:
             results = []
             futures = []
             for model in self._fitted_models:
diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py
index 4562ed122..2efd1b10d 100644
--- a/src/safeds/ml/classical/regression/_baseline_regressor.py
+++ b/src/safeds/ml/classical/regression/_baseline_regressor.py
@@ -1,4 +1,5 @@
 import copy
+import multiprocessing as mp
 from concurrent.futures import ALL_COMPLETED, wait
 from typing import Self
 
@@ -38,11 +39,14 @@ class BaselineRegressor:
 
     Get a baseline by fitting data on multiple different models and comparing the best metrics.
 
-    Parameters ---------- extended_search: If set to true, an extended set of models will be used to fit the
-    classifier. This might result in significantly higher runtime.
+    Parameters
+    ----------
+    extended_search:
+        If set to true, an extended set of models will be used to fit the classifier.
+        This might result in significantly higher runtime.
     """
 
-    def __init__(self, include_slower_models: bool = False):
+    def __init__(self, extended_search: bool = False):
         self._is_fitted = False
         self._list_of_model_types = [
             AdaBoostRegressor(),
@@ -53,7 +57,7 @@ def __init__(self, extended_search: bool = False):
             SupportVectorRegressor(),
         ]
 
-        if include_slower_models:
+        if extended_search:
             self._list_of_model_types.extend(
                 [ElasticNetRegressor(), LassoRegressor(), GradientBoostingRegressor()],
             )  # pragma: no cover
@@ -95,7 +99,10 @@ def fit(self, train_data: TabularDataset) -> Self:
 
         copied_model = copy.deepcopy(self)
 
-        with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor:
+        with ProcessPoolExecutor(
+            max_workers=len(self._list_of_model_types),
+            mp_context=mp.get_context("spawn"),
+        ) as executor:
             futures = []
             for model in self._list_of_model_types:
                 futures.append(executor.submit(_fit_single_model, model, train_data))
@@ -159,7 +166,10 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]:
         _check_columns_are_numeric(test_data_as_table, test_data.features.add_columns(test_data.target).column_names)
 
         # Start Processes
-        with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor:
+        with ProcessPoolExecutor(
+            max_workers=len(self._list_of_model_types),
+            mp_context=mp.get_context("spawn"),
+        ) as executor:
             results = []
             futures = []
             for model in self._fitted_models: