Skip to content

Commit

Permalink
Fix serialization and caching (#298)
Browse files Browse the repository at this point in the history
Due to continuing serialization problems that were thought to be related
with caching, #277 deactivated core test caching and #294 was prepared
to do the same for the full test environment.

This PR reactivates caching and instead refactors the class layout of
`SKLearnClusteringRecommender` in an attempt to fix the root cause.
Mysteriously, the top-level import of `sklearn.mixture.GaussianMixture`
seems to cause trouble. While the reason is still unclear, turning it
into a lazy import (which will also become handy later when making
`scikit-learn` an optional dependency) seems to resolve the problem.

On a side note: deactivating slots for the recommenders solves the
problem as well, which suggests that the root cause could be related to
classes not being properly garbage collected (since `attrs` needs to
create new classes when slots are activated), which could also explain
that `GaussianMixtureClusteringRecommender` seemed to have improperly
overridden methods after deserialization (for example, the `__repr__` of
a created Gaussian mixture recommender correctly pointed to its own
class before serialization but to the `__repr__` of
`SKLearnClusteringRecommender` after serialization – but weirdly only
when executed in `tox`).
  • Loading branch information
AdrianSosic authored Jul 3, 2024
2 parents 6e38822 + 0dfba4e commit fdb2109
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 29 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,10 @@ jobs:
id: setup-python
with:
python-version: ${{ matrix.py-version.semantic }}
- uses: actions/cache@v4
with:
path: .tox/coretest-${{ matrix.py-version.tox }}
key: coretest-${{ matrix.py-version.tox }}-${{ hashFiles('pyproject.toml') }}-${{ hashFiles('tox.ini') }}
- name: Run core tests
run: |
pip install tox-uv
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/regular.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# NOTES:
# - The map syntax used for matrix is flagged red but actually works
# - This runs everything in Python 3.10, 3.11 and 3.12
# - No environments are cached due to space limit
# - Environments are **not** cached in order to perform a full end-to-end test
# (and due to space limit)

name: Regular Checks
on:
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ _ `_optional` subpackage for managing optional dependencies

### Fixed
- `sequential` flag of `SequentialGreedyRecommender` is now set to `True`
- Serialization bug related to class layout of `SKLearnClusteringRecommender`

### Deprecations
- `SequentialGreedyRecommender` class replaced with `BotorchRecommender`
Expand Down
62 changes: 34 additions & 28 deletions baybe/recommenders/pure/nonpredictive/clustering.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
"""Recommenders based on clustering."""

from abc import ABC
from abc import ABC, abstractmethod
from typing import ClassVar

import numpy as np
import pandas as pd
from attrs import define, field
from scipy.stats import multivariate_normal
from sklearn.base import ClusterMixin
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn_extra.cluster import KMedoids

from baybe.recommenders.pure.nonpredictive.base import NonPredictiveRecommender
from baybe.searchspace import SearchSpaceType, SubspaceDiscrete
Expand All @@ -30,19 +27,11 @@ class SKLearnClusteringRecommender(NonPredictiveRecommender, ABC):
# Class variables
compatibility: ClassVar[SearchSpaceType] = SearchSpaceType.DISCRETE
# See base class.
# TODO: "Type" should not appear in ClassVar. Both PyCharm and mypy complain, see
# also note in the mypy docs:
# https://peps.python.org/pep-0526/#class-and-instance-variable-annotations
# Figure out what is the right approach here. However, the issue might be
# ultimately related to an overly restrictive PEP:
# https://github.com/python/mypy/issues/5144

# TODO: `use_custom_selector` can probably be replaced with a fallback mechanism
# that checks if a custom mechanism is implemented and uses default otherwise
# (similar to what is done in the recommenders)

model_class: ClassVar[type[ClusterMixin]]
"""Class variable describing the model class."""

model_cluster_num_parameter_name: ClassVar[str]
"""Class variable describing the name of the clustering parameter."""

Expand All @@ -54,6 +43,11 @@ class SKLearnClusteringRecommender(NonPredictiveRecommender, ABC):
"""Optional model parameter that will be passed to the surrogate constructor.
This is initialized with reasonable default values for the derived child classes."""

@staticmethod
@abstractmethod
def _get_model_cls() -> type[ClusterMixin]:
"""Return the surrogate model class."""

def _make_selection_default(
self,
model: ClusterMixin,
Expand All @@ -68,7 +62,7 @@ def _make_selection_default(
candidates_scaled: The already scaled candidates.
Returns:
A list with positional indices of the selected candidates.
A list with positional indices of the selected candidates.
"""
assigned_clusters = model.predict(candidates_scaled)
selection = [
Expand All @@ -91,7 +85,7 @@ def _make_selection_custom(
candidates_scaled: The already scaled candidates.
Returns:
A list with positional indices of the selected candidates.
A list with positional indices of the selected candidates.
Raises:
NotImplementedError: If this function is not implemented. Should be
Expand All @@ -115,7 +109,7 @@ def _recommend_discrete(
candidates_scaled = np.ascontiguousarray(scaler.transform(candidates_comp))

# Set model parameters and perform fit
model = self.model_class(
model = self._get_model_cls()(
**{self.model_cluster_num_parameter_name: batch_size},
**self.model_params,
)
Expand All @@ -135,9 +129,6 @@ def _recommend_discrete(
class PAMClusteringRecommender(SKLearnClusteringRecommender):
"""Partitioning Around Medoids (PAM) clustering recommender."""

model_class: ClassVar[type[ClusterMixin]] = KMedoids
# See base class.

model_cluster_num_parameter_name: ClassVar[str] = "n_clusters"
# See base class.

Expand All @@ -153,6 +144,13 @@ def _default_model_params(self) -> dict:
"""Create the default model parameters."""
return {"max_iter": 100, "init": "k-medoids++"}

@staticmethod
def _get_model_cls() -> type[ClusterMixin]:
# See base class.
from sklearn_extra.cluster import KMedoids

return KMedoids

def _make_selection_custom(
self,
model: ClusterMixin,
Expand All @@ -168,7 +166,7 @@ def _make_selection_custom(
candidates_scaled: The already scaled candidates. Unused.
Returns:
A list with positional indices of the selected candidates.
A list with positional indices of the selected candidates.
"""
selection = model.medoid_indices_.tolist()
return selection
Expand All @@ -179,9 +177,6 @@ class KMeansClusteringRecommender(SKLearnClusteringRecommender):
"""K-means clustering recommender."""

# Class variables
model_class: ClassVar[type[ClusterMixin]] = KMeans
# See base class.

model_cluster_num_parameter_name: ClassVar[str] = "n_clusters"
# See base class.

Expand All @@ -197,6 +192,13 @@ def _default_model_params(self) -> dict:
"""Create the default model parameters."""
return {"max_iter": 1000, "n_init": 50}

@staticmethod
def _get_model_cls() -> type[ClusterMixin]:
# See base class.
from sklearn.cluster import KMeans

return KMeans

def _make_selection_custom(
self,
model: ClusterMixin,
Expand All @@ -212,7 +214,7 @@ def _make_selection_custom(
candidates_scaled: The already scaled candidates.
Returns:
A list with positional indices of the selected candidates.
A list with positional indices of the selected candidates.
"""
distances = pairwise_distances(candidates_scaled, model.cluster_centers_)
# Set the distances of points that were not assigned by the model to that
Expand All @@ -231,12 +233,16 @@ class GaussianMixtureClusteringRecommender(SKLearnClusteringRecommender):
"""Gaussian mixture model (GMM) clustering recommender."""

# Class variables
model_class: ClassVar[type[ClusterMixin]] = GaussianMixture
# See base class.

model_cluster_num_parameter_name: ClassVar[str] = "n_components"
# See base class.

@staticmethod
def _get_model_cls() -> type[ClusterMixin]:
# See base class.
from sklearn.mixture import GaussianMixture

return GaussianMixture

def _make_selection_custom(
self,
model: ClusterMixin,
Expand All @@ -252,7 +258,7 @@ def _make_selection_custom(
candidates_scaled: The already scaled candidates.
Returns:
A list with positional indices of the selected candidates.
A list with positional indices of the selected candidates.
"""
predicted_clusters = model.predict(candidates_scaled)
selection = []
Expand Down

0 comments on commit fdb2109

Please sign in to comment.