Fix serialization and caching (#298)

Due to continuing serialization problems that were thought to be related with caching, #277 deactivated core test caching and #294 was prepared to do the same for the full test environment. This PR reactivates caching and instead refactors the class layout of `SKLearnClusteringRecommender` in an attempt to fix the root cause. Mysteriously, the top-level import of `sklearn.mixture.GaussianMixture` seems to cause trouble. While the reason is still unclear, turning it into a lazy import (which will also become handy later when making `scikit-learn` an optional dependency) seems to resolve the problem. On a side note: deactivating slots for the recommenders solves the problem as well, which suggests that the root cause could be related to classes not being properly garbage collected (since `attrs` needs to create new classes when slots are activated), which could also explain that `GaussianMixtureClusteringRecommender` seemed to have improperly overridden methods after deserialization (for example, the `__repr__` of a created Gaussian mixture recommender correctly pointed to its own class before serialization but to the `__repr__` of `SKLearnClusteringRecommender` after serialization – but weirdly only when executed in `tox`).
emdgroup · Jul 3, 2024 · fdb2109 · fdb2109
2 parents 6e38822 + 0dfba4e
commit fdb2109
Show file tree

Hide file tree

Showing 4 changed files with 41 additions and 29 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -137,6 +137,10 @@ jobs:
         id: setup-python
         with:
           python-version: ${{ matrix.py-version.semantic }}
+      - uses: actions/cache@v4
+        with:
+          path: .tox/coretest-${{ matrix.py-version.tox }}
+          key: coretest-${{ matrix.py-version.tox }}-${{ hashFiles('pyproject.toml') }}-${{ hashFiles('tox.ini') }}
       - name: Run core tests
         run: |
           pip install tox-uv

diff --git a/.github/workflows/regular.yml b/.github/workflows/regular.yml
@@ -1,7 +1,8 @@
 # NOTES:
 # - The map syntax used for matrix is flagged red but actually works
 # - This runs everything in Python 3.10, 3.11 and 3.12
-# - No environments are cached due to space limit
+# - Environments are **not** cached in order to perform a full end-to-end test
+#   (and due to space limit)
 
 name: Regular Checks
 on:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -40,6 +40,7 @@ _ `_optional` subpackage for managing optional dependencies
 
 ### Fixed
 - `sequential` flag of `SequentialGreedyRecommender` is now set to `True`
+- Serialization bug related to class layout of `SKLearnClusteringRecommender`
 
 ### Deprecations
 - `SequentialGreedyRecommender` class replaced with `BotorchRecommender`

diff --git a/baybe/recommenders/pure/nonpredictive/clustering.py b/baybe/recommenders/pure/nonpredictive/clustering.py
@@ -1,18 +1,15 @@
 """Recommenders based on clustering."""
 
-from abc import ABC
+from abc import ABC, abstractmethod
 from typing import ClassVar
 
 import numpy as np
 import pandas as pd
 from attrs import define, field
 from scipy.stats import multivariate_normal
 from sklearn.base import ClusterMixin
-from sklearn.cluster import KMeans
 from sklearn.metrics import pairwise_distances
-from sklearn.mixture import GaussianMixture
 from sklearn.preprocessing import StandardScaler
-from sklearn_extra.cluster import KMedoids
 
 from baybe.recommenders.pure.nonpredictive.base import NonPredictiveRecommender
 from baybe.searchspace import SearchSpaceType, SubspaceDiscrete
@@ -30,19 +27,11 @@ class SKLearnClusteringRecommender(NonPredictiveRecommender, ABC):
     # Class variables
     compatibility: ClassVar[SearchSpaceType] = SearchSpaceType.DISCRETE
     # See base class.
-    # TODO: "Type" should not appear in ClassVar. Both PyCharm and mypy complain, see
-    #   also note in the mypy docs:
-    #       https://peps.python.org/pep-0526/#class-and-instance-variable-annotations
-    #   Figure out what is the right approach here. However, the issue might be
-    #   ultimately related to an overly restrictive PEP:
-    #       https://github.com/python/mypy/issues/5144
+
     # TODO: `use_custom_selector` can probably be replaced with a fallback mechanism
     #   that checks if a custom mechanism is implemented and uses default otherwise
     #   (similar to what is done in the recommenders)
 
-    model_class: ClassVar[type[ClusterMixin]]
-    """Class variable describing the model class."""
-
     model_cluster_num_parameter_name: ClassVar[str]
     """Class variable describing the name of the clustering parameter."""
 
@@ -54,6 +43,11 @@ class SKLearnClusteringRecommender(NonPredictiveRecommender, ABC):
     """Optional model parameter that will be passed to the surrogate constructor.
     This is initialized with reasonable default values for the derived child classes."""
 
+    @staticmethod
+    @abstractmethod
+    def _get_model_cls() -> type[ClusterMixin]:
+        """Return the surrogate model class."""
+
     def _make_selection_default(
         self,
         model: ClusterMixin,
@@ -68,7 +62,7 @@ def _make_selection_default(
             candidates_scaled: The already scaled candidates.
 
         Returns:
-           A list with positional indices of the selected candidates.
+            A list with positional indices of the selected candidates.
         """
         assigned_clusters = model.predict(candidates_scaled)
         selection = [
@@ -91,7 +85,7 @@ def _make_selection_custom(
             candidates_scaled: The already scaled candidates.
 
         Returns:
-           A list with positional indices of the selected candidates.
+            A list with positional indices of the selected candidates.
 
         Raises:
             NotImplementedError: If this function is not implemented. Should be
@@ -115,7 +109,7 @@ def _recommend_discrete(
         candidates_scaled = np.ascontiguousarray(scaler.transform(candidates_comp))
 
         # Set model parameters and perform fit
-        model = self.model_class(
+        model = self._get_model_cls()(
             **{self.model_cluster_num_parameter_name: batch_size},
             **self.model_params,
         )
@@ -135,9 +129,6 @@ def _recommend_discrete(
 class PAMClusteringRecommender(SKLearnClusteringRecommender):
     """Partitioning Around Medoids (PAM) clustering recommender."""
 
-    model_class: ClassVar[type[ClusterMixin]] = KMedoids
-    # See base class.
-
     model_cluster_num_parameter_name: ClassVar[str] = "n_clusters"
     # See base class.
 
@@ -153,6 +144,13 @@ def _default_model_params(self) -> dict:
         """Create the default model parameters."""
         return {"max_iter": 100, "init": "k-medoids++"}
 
+    @staticmethod
+    def _get_model_cls() -> type[ClusterMixin]:
+        # See base class.
+        from sklearn_extra.cluster import KMedoids
+
+        return KMedoids
+
     def _make_selection_custom(
         self,
         model: ClusterMixin,
@@ -168,7 +166,7 @@ def _make_selection_custom(
             candidates_scaled: The already scaled candidates. Unused.
 
         Returns:
-           A list with positional indices of the selected candidates.
+            A list with positional indices of the selected candidates.
         """
         selection = model.medoid_indices_.tolist()
         return selection
@@ -179,9 +177,6 @@ class KMeansClusteringRecommender(SKLearnClusteringRecommender):
     """K-means clustering recommender."""
 
     # Class variables
-    model_class: ClassVar[type[ClusterMixin]] = KMeans
-    # See base class.
-
     model_cluster_num_parameter_name: ClassVar[str] = "n_clusters"
     # See base class.
 
@@ -197,6 +192,13 @@ def _default_model_params(self) -> dict:
         """Create the default model parameters."""
         return {"max_iter": 1000, "n_init": 50}
 
+    @staticmethod
+    def _get_model_cls() -> type[ClusterMixin]:
+        # See base class.
+        from sklearn.cluster import KMeans
+
+        return KMeans
+
     def _make_selection_custom(
         self,
         model: ClusterMixin,
@@ -212,7 +214,7 @@ def _make_selection_custom(
             candidates_scaled: The already scaled candidates.
 
         Returns:
-           A list with positional indices of the selected candidates.
+            A list with positional indices of the selected candidates.
         """
         distances = pairwise_distances(candidates_scaled, model.cluster_centers_)
         # Set the distances of points that were not assigned by the model to that
@@ -231,12 +233,16 @@ class GaussianMixtureClusteringRecommender(SKLearnClusteringRecommender):
     """Gaussian mixture model (GMM) clustering recommender."""
 
     # Class variables
-    model_class: ClassVar[type[ClusterMixin]] = GaussianMixture
-    # See base class.
-
     model_cluster_num_parameter_name: ClassVar[str] = "n_components"
     # See base class.
 
+    @staticmethod
+    def _get_model_cls() -> type[ClusterMixin]:
+        # See base class.
+        from sklearn.mixture import GaussianMixture
+
+        return GaussianMixture
+
     def _make_selection_custom(
         self,
         model: ClusterMixin,
@@ -252,7 +258,7 @@ def _make_selection_custom(
             candidates_scaled: The already scaled candidates.
 
         Returns:
-           A list with positional indices of the selected candidates.
+            A list with positional indices of the selected candidates.
         """
         predicted_clusters = model.predict(candidates_scaled)
         selection = []