diff --git a/botorch/acquisition/max_value_entropy_search.py b/botorch/acquisition/max_value_entropy_search.py index 7fbc73cfcf..0db0e34a54 100644 --- a/botorch/acquisition/max_value_entropy_search.py +++ b/botorch/acquisition/max_value_entropy_search.py @@ -13,8 +13,8 @@ .. [Moss2021gibbon] Moss, H. B., et al., - GIBBON: General-purpose Information-Based Bayesian OptimisatioN - arXiv:2102.03324, 2021 + GIBBON: General-purpose Information-Based Bayesian OptimisatioN. + Journal of Machine Learning Research, 2021. .. [Takeno2020mfmves] S. Takeno, H. Fukuoka, Y. Tsukada, T. Koyama, M. Shiga, I. Takeuchi, diff --git a/botorch/models/approximate_gp.py b/botorch/models/approximate_gp.py index 7d1f70a834..2b1bad5a2d 100644 --- a/botorch/models/approximate_gp.py +++ b/botorch/models/approximate_gp.py @@ -13,22 +13,25 @@ Journal of Machine Learning Research, 2020, http://jmlr.org/papers/v21/19-1015.html. -.. [chen2018dpp] - Laming Chen and Guoxin Zhang and Hanning Zhou, Fast greedy MAP inference - for determinantal point process to improve recommendation diversity, - Proceedings of the 32nd International Conference on Neural Information - Processing Systems, 2018, https://arxiv.org/abs/1709.05135. - .. [hensman2013svgp] James Hensman and Nicolo Fusi and Neil D. Lawrence, Gaussian Processes for Big Data, Proceedings of the 29th Conference on Uncertainty in Artificial Intelligence, 2013, https://arxiv.org/abs/1309.6835. +.. [moss2023ipa] + Henry B. Moss and Sebastian W. Ober and Victor Picheny, + Inducing Point Allocation for Sparse Gaussian Processes + in High-Throughput Bayesian Optimization,Proceedings of + the 25th International Conference on Artificial Intelligence + and Statistics, 2023, https://arxiv.org/pdf/2301.10123.pdf. + """ from __future__ import annotations import copy +import warnings + from typing import Optional, Type, Union import torch @@ -36,6 +39,10 @@ from botorch.models.transforms.input import InputTransform from botorch.models.transforms.outcome import OutcomeTransform from botorch.models.utils import validate_input_scaling +from botorch.models.utils.inducing_point_allocators import ( + GreedyVarianceReduction, + InducingPointAllocator, +) from botorch.posteriors.gpytorch import GPyTorchPosterior from gpytorch.constraints import GreaterThan from gpytorch.distributions import MultivariateNormal @@ -47,7 +54,6 @@ ) from gpytorch.means import ConstantMean, Mean from gpytorch.models import ApproximateGP -from gpytorch.module import Module from gpytorch.priors import GammaPrior from gpytorch.utils.memoize import clear_cache_hook from gpytorch.variational import ( @@ -57,12 +63,10 @@ IndependentMultitaskVariationalStrategy, VariationalStrategy, ) -from linear_operator.operators import LinearOperator from torch import Tensor MIN_INFERRED_NOISE_LEVEL = 1e-4 -NEG_INF = -(torch.tensor(float("inf"))) class ApproximateGPyTorchModel(GPyTorchModel): @@ -148,7 +152,8 @@ class _SingleTaskVariationalGP(ApproximateGP): Base class wrapper for a stochastic variational Gaussian Process (SVGP) model [hensman2013svgp]_. - Uses pivoted Cholesky initialization for the inducing points. + Uses by default pivoted Cholesky initialization for allocating inducing points, + however, custom inducing point allocators can be provided. 
""" def __init__( @@ -162,6 +167,7 @@ def __init__( variational_distribution: Optional[_VariationalDistribution] = None, variational_strategy: Type[_VariationalStrategy] = VariationalStrategy, inducing_points: Optional[Union[Tensor, int]] = None, + inducing_point_allocator: Optional[InducingPointAllocator] = None, ) -> None: r""" Args: @@ -179,6 +185,9 @@ def __init__( VariationalStrategy). The default setting uses "whitening" of the variational distribution to make training easier. inducing_points: The number or specific locations of the inducing points. + inducing_point_allocator: The `InducingPointAllocator` used to + initialize the inducing point locations. If omitted, + uses `GreedyVarianceReduction`. """ # We use the model subclass wrapper to deal with input / outcome transforms. # The number of outputs will be correct here due to the check in @@ -209,14 +218,17 @@ def __init__( "covar_module.base_kernel.raw_lengthscale": -3, } - # initialize inducing points with a pivoted cholesky init if they are not given + if inducing_point_allocator is None: + inducing_point_allocator = GreedyVarianceReduction() + + # initialize inducing points if they are not given if not isinstance(inducing_points, Tensor): if inducing_points is None: # number of inducing points is 25% the number of data points # as a heuristic inducing_points = int(0.25 * train_X.shape[-2]) - inducing_points = _select_inducing_points( + inducing_points = inducing_point_allocator.allocate_inducing_points( inputs=train_X, covar_module=covar_module, num_inducing=inducing_points, @@ -255,8 +267,14 @@ def forward(self, X) -> MultivariateNormal: class SingleTaskVariationalGP(ApproximateGPyTorchModel): - r"""A single-task variational GP model following [hensman2013svgp]_ with pivoted - Cholesky initialization following [chen2018dpp]_ and [burt2020svgp]_. + r"""A single-task variational GP model following [hensman2013svgp]_. + + By default, the inducing points are initialized though the + `GreedyVarianceReduction` of [burt2020svgp]_, which is known to be + effective for building globally accurate models. However, custom + inducing point allocators designed for specific down-stream tasks can also be + provided (see [moss2023ipa]_ for details), e.g. `GreedyImprovementReduction` + when the goal is to build a model suitable for standard BO. A single-task variational GP using relatively strong priors on the Kernel hyperparameters, which work best when covariates are normalized to the unit @@ -299,6 +317,7 @@ def __init__( inducing_points: Optional[Union[Tensor, int]] = None, outcome_transform: Optional[OutcomeTransform] = None, input_transform: Optional[InputTransform] = None, + inducing_point_allocator: Optional[InducingPointAllocator] = None, ) -> None: r""" Args: @@ -319,6 +338,9 @@ def __init__( VariationalStrategy). The default setting uses "whitening" of the variational distribution to make training easier. inducing_points: The number or specific locations of the inducing points. + inducing_point_allocator: The `InducingPointAllocator` used to + initialize the inducing point locations. If omitted, + uses `GreedyVarianceReduction`. """ with torch.no_grad(): transformed_X = self.transform_inputs( @@ -357,6 +379,19 @@ def __init__( else: self._is_custom_likelihood = True + if learn_inducing_points and (inducing_point_allocator is not None): + warnings.warn( + "After all the effort of specifying an inducing point allocator, " + "you probably want to stop the inducing point locations " + "being further optimized during the model fit. 
If so " + "then set `learn_inducing_points` to False.", + UserWarning, + ) + + if inducing_point_allocator is None: + inducing_point_allocator = GreedyVarianceReduction() + self._inducing_point_allocator = inducing_point_allocator + model = _SingleTaskVariationalGP( train_X=transformed_X, train_Y=train_Y, @@ -367,6 +402,7 @@ def __init__( variational_distribution=variational_distribution, variational_strategy=variational_strategy, inducing_points=inducing_points, + inducing_point_allocator=self._inducing_point_allocator, ) super().__init__(model=model, likelihood=likelihood, num_outputs=num_outputs) @@ -390,7 +426,7 @@ def init_inducing_points( ) -> Tensor: r""" Reinitialize the inducing point locations in-place with the current kernel - applied to `inputs`. + applied to `inputs` through the model's inducing point allocation strategy. The variational distribution and variational strategy caches are reset. Args: @@ -407,7 +443,7 @@ def init_inducing_points( with torch.no_grad(): num_inducing = var_strat.inducing_points.size(-2) - inducing_points = _select_inducing_points( + inducing_points = self._inducing_point_allocator.allocate_inducing_points( inputs=inputs, covar_module=self.model.covar_module, num_inducing=num_inducing, @@ -417,131 +453,3 @@ def init_inducing_points( var_strat.variational_params_initialized.fill_(0) return inducing_points - - -def _select_inducing_points( - inputs: Tensor, - covar_module: Module, - num_inducing: int, - input_batch_shape: torch.Size, -) -> Tensor: - r""" - Utility function that evaluates a kernel at given inputs and selects inducing point - locations based on the pivoted Cholesky heuristic. - - Args: - inputs: A (*batch_shape, n, d)-dim input data tensor. - covar_module: GPyTorch Module returning a LinearOperator kernel matrix. - num_inducing: The maximun number (m) of inducing points (m <= n). - input_batch_shape: The non-task-related batch shape. - - Returns: - A (*batch_shape, m, d)-dim tensor of inducing point locations. 
- """ - - train_train_kernel = covar_module(inputs).evaluate_kernel() - - # base case - if train_train_kernel.ndimension() == 2: - inducing_points = _pivoted_cholesky_init( - train_inputs=inputs, - kernel_matrix=train_train_kernel, - max_length=num_inducing, - ) - # multi-task case - elif train_train_kernel.ndimension() == 3 and len(input_batch_shape) == 0: - input_element = inputs[0] if inputs.ndimension() == 3 else inputs - kernel_element = train_train_kernel[0] - inducing_points = _pivoted_cholesky_init( - train_inputs=input_element, - kernel_matrix=kernel_element, - max_length=num_inducing, - ) - # batched input cases - else: - batched_inputs = ( - inputs.expand(*input_batch_shape, -1, -1) - if inputs.ndimension() == 2 - else inputs - ) - reshaped_inputs = batched_inputs.flatten(end_dim=-3) - inducing_points = [] - for input_element in reshaped_inputs: - # the extra kernel evals are a little wasteful but make it - # easier to infer the task batch size - kernel_element = covar_module(input_element).evaluate_kernel() - # handle extra task batch dimension - kernel_element = ( - kernel_element[0] - if kernel_element.ndimension() == 3 - else kernel_element - ) - inducing_points.append( - _pivoted_cholesky_init( - train_inputs=input_element, - kernel_matrix=kernel_element, - max_length=num_inducing, - ) - ) - inducing_points = torch.stack(inducing_points).view( - *input_batch_shape, num_inducing, -1 - ) - - return inducing_points - - -def _pivoted_cholesky_init( - train_inputs: Tensor, - kernel_matrix: Union[Tensor, LinearOperator], - max_length: int, - epsilon: float = 1e-6, -) -> Tensor: - r""" - A pivoted cholesky initialization method for the inducing points, - originally proposed in [burt2020svgp]_ with the algorithm itself coming from - [chen2018dpp]_. Code is a PyTorch version from [chen2018dpp]_, copied from - https://github.com/laming-chen/fast-map-dpp/blob/master/dpp.py. - - Args: - train_inputs: training inputs (of shape n x d) - kernel_matrix: kernel matrix on the training - inputs - max_length: number of inducing points to initialize - epsilon: numerical jitter for stability. - - Returns: - max_length x d tensor of the training inputs corresponding to the top - max_length pivots of the training kernel matrix - """ - - # this is numerically equivalent to iteratively performing a pivoted cholesky - # while storing the diagonal pivots at each iteration - # TODO: use gpytorch's pivoted cholesky instead once that gets an exposed list - # TODO: ensure this works in batch mode, which it does not currently. 
- - item_size = kernel_matrix.shape[-2] - cis = torch.zeros( - (max_length, item_size), device=kernel_matrix.device, dtype=kernel_matrix.dtype - ) - di2s = kernel_matrix.diag() - selected_items = [] - selected_item = torch.argmax(di2s) - selected_items.append(selected_item) - - while len(selected_items) < max_length: - k = len(selected_items) - 1 - ci_optimal = cis[:k, selected_item] - di_optimal = torch.sqrt(di2s[selected_item]) - elements = kernel_matrix[..., selected_item, :] - eis = (elements - torch.matmul(ci_optimal, cis[:k, :])) / di_optimal - cis[k, :] = eis - di2s = di2s - eis.pow(2.0) - di2s[selected_item] = NEG_INF - selected_item = torch.argmax(di2s) - if di2s[selected_item] < epsilon: - break - selected_items.append(selected_item) - - ind_points = train_inputs[torch.stack(selected_items)] - - return ind_points diff --git a/botorch/models/utils/inducing_point_allocators.py b/botorch/models/utils/inducing_point_allocators.py new file mode 100644 index 0000000000..6269339653 --- /dev/null +++ b/botorch/models/utils/inducing_point_allocators.py @@ -0,0 +1,339 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +r""" +Functionality for allocating the inducing points of sparse Gaussian +process models. + +References + +.. [chen2018dpp] + Laming Chen and Guoxin Zhang and Hanning Zhou, Fast greedy MAP inference + for determinantal point process to improve recommendation diversity, + Proceedings of the 32nd International Conference on Neural Information + Processing Systems, 2018, https://arxiv.org/abs/1709.05135. + +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Union + +import torch +from botorch.models.model import Model + +from botorch.utils.probability.utils import ndtr as Phi, phi +from gpytorch.module import Module +from linear_operator.operators import LinearOperator +from torch import Tensor + +NEG_INF = torch.tensor(float("-inf")) + + +class InducingPointAllocator(ABC): + r""" + This class provides functionality to initialize the inducing point locations + of an inducing point-based model, e.g. a `SingleTaskVariationalGP`. + """ + + @abstractmethod + def _get_quality_function( + self, + ) -> QualityFunction: + """ + Build the quality function required for this inducing point allocation strategy. + + Returns: + A quality function. + """ + + pass # pragma: no cover + + def allocate_inducing_points( + self, + inputs: Tensor, + covar_module: Module, + num_inducing: int, + input_batch_shape: torch.Size, + ) -> Tensor: + r""" + Initialize the `num_inducing` inducing point locations according to a + specific initialization strategy, weighting the candidate input + locations by this allocator's quality function. + + Args: + inputs: A (\*batch_shape, n, d)-dim input data tensor. + covar_module: GPyTorch Module returning a LinearOperator kernel matrix. + num_inducing: The maximum number (m) of inducing points (m <= n). + input_batch_shape: The non-task-related batch shape. + + Returns: + A (\*batch_shape, m, d)-dim tensor of inducing point locations.
+ """ + quality_function = self._get_quality_function() + covar_module = covar_module.to(inputs.device) + + train_train_kernel = covar_module(inputs).evaluate_kernel() + + # base case + if train_train_kernel.ndimension() == 2: + quality_scores = quality_function(inputs) + inducing_points = _pivoted_cholesky_init( + train_inputs=inputs, + kernel_matrix=train_train_kernel, + max_length=num_inducing, + quality_scores=quality_scores, + ) + # multi-task case + elif train_train_kernel.ndimension() == 3 and len(input_batch_shape) == 0: + quality_scores = quality_function(inputs) + input_element = inputs[0] if inputs.ndimension() == 3 else inputs + kernel_element = train_train_kernel[0] + quality_scores = quality_function(input_element) + inducing_points = _pivoted_cholesky_init( + train_inputs=input_element, + kernel_matrix=kernel_element, + max_length=num_inducing, + quality_scores=quality_scores, + ) + # batched input cases + else: + batched_inputs = ( + inputs.expand(*input_batch_shape, -1, -1) + if inputs.ndimension() == 2 + else inputs + ) + reshaped_inputs = batched_inputs.flatten(end_dim=-3) + inducing_points = [] + for input_element in reshaped_inputs: + # the extra kernel evals are a little wasteful but make it + # easier to infer the task batch size + kernel_element = covar_module(input_element).evaluate_kernel() + # handle extra task batch dimension + kernel_element = ( + kernel_element[0] + if kernel_element.ndimension() == 3 + else kernel_element + ) + quality_scores = quality_function(input_element) + inducing_points.append( + _pivoted_cholesky_init( + train_inputs=input_element, + kernel_matrix=kernel_element, + max_length=num_inducing, + quality_scores=quality_scores, + ) + ) + inducing_points = torch.stack(inducing_points).view( + *input_batch_shape, num_inducing, -1 + ) + + return inducing_points + + +class QualityFunction(ABC): + """A function that scores inputs with respect + to a specific criterion.""" + + @abstractmethod + def __call__(self, inputs: Tensor) -> Tensor: # [n, d] -> [n] + """ + Args: + inputs: inputs (of shape n x d) + + Returns: + A tensor of quality scores for each input, of shape [n] + """ + + pass # pragma: no cover + + +class UnitQualityFunction(QualityFunction): + """ + A function returning ones for each element. Using this quality function + for inducing point allocation corresponds to allocating inducing points + with the sole aim of minimizing predictive variance, i.e. the approach + of [burt2020svgp]_. + """ + + @torch.no_grad() + def __call__(self, inputs: Tensor) -> Tensor: # [n, d]-> [n] + """ + Args: + inputs: inputs (of shape n x d) + + Returns: + A tensor of ones for each input, of shape [n] + """ + return torch.ones([inputs.shape[0]], device=inputs.device, dtype=inputs.dtype) + + +class ExpectedImprovementQualityFunction(QualityFunction): + """ + A function measuring the quality of input points as their expected + improvement with respect to a conservative baseline. Expectations + are according to the model from the previous BO step. See [moss2023ipa]_ + for details and justification. + """ + + def __init__(self, model: Model, maximize: bool): + r""" + Args: + model: The model fitted during the previous BO step. For now, this + must be a single task model (i.e. num_outputs=1). + maximize: Set True if we are performing function maximization, else + set False. + """ + if model.num_outputs != 1: + raise NotImplementedError( + "Multi-output models are currently not supported. 
" + ) + self._model = model + self._maximize = maximize + + @torch.no_grad() + def __call__(self, inputs: Tensor) -> Tensor: # [n, d] -> [n] + """ + Args: + inputs: inputs (of shape n x d) + + Returns: + A tensor of quality scores for each input, of shape [n] + """ + + posterior = self._model.posterior(inputs) + mean = posterior.mean.squeeze(-2).squeeze(-1) # removing redundant dimensions + sigma = posterior.variance.clamp_min(1e-12).sqrt().view(mean.shape) + + best_f = torch.max(mean) if self._maximize else torch.min(mean) + u = (mean - best_f) / sigma if self._maximize else -(mean - best_f) / sigma + return sigma * (phi(u) + u * Phi(u)) + + +class GreedyVarianceReduction(InducingPointAllocator): + r""" + The inducing point allocator proposed by [burt2020svgp]_, that + greedily chooses inducing point locations with maximal (conditional) + predictive variance. + """ + + def _get_quality_function( + self, + ) -> QualityFunction: + """ + Build the unit quality function required for the greedy variance + reduction inducing point allocation strategy. + + Returns: + A quality function. + """ + + return UnitQualityFunction() + + +class GreedyImprovementReduction(InducingPointAllocator): + r""" + An inducing point allocator that greedily chooses inducing points with large + predictive variance and that are in promising regions of the search + space (according to the model form the previous BO step), see [moss2023ipa]_. + """ + + def __init__(self, model: Model, maximize: bool): + r""" + + Args: + model: The model fitted during the previous BO step. + maximize: Set True if we are performing function maximization, else + set False. + """ + self._model = model + self._maximize = maximize + + def _get_quality_function( + self, + ) -> QualityFunction: + """ + Build the improvement-based quality function required for the greedy + improvement reduction inducing point allocation strategy. + + Returns: + A quality function. + """ + + return ExpectedImprovementQualityFunction(self._model, self._maximize) + + +def _pivoted_cholesky_init( + train_inputs: Tensor, + kernel_matrix: Union[Tensor, LinearOperator], + max_length: int, + quality_scores: Tensor, + epsilon: float = 1e-6, +) -> Tensor: + r""" + A pivoted Cholesky initialization method for the inducing points, + originally proposed in [burt2020svgp]_ with the algorithm itself coming from + [chen2018dpp]_. Code is a PyTorch version from [chen2018dpp]_, based on + https://github.com/laming-chen/fast-map-dpp/blob/master/dpp.py but with a small + modification to allow the underlying DPP to be defined through its diversity-quality + decomposition,as discussed by [moss2023ipa]_. This method returns a greedy + approximation of the MAP estimate of the specified DPP, i.e. its returns a + set of points that are highly diverse (according to the provided kernel_matrix) + and have high quality (according to the provided quality_scores). + + Args: + train_inputs: training inputs (of shape n x d) + kernel_matrix: kernel matrix on the training inputs + max_length: number of inducing points to initialize + quality_scores: scores representing the quality of each candidate + input (of shape [n]) + epsilon: numerical jitter for stability. 
+ + Returns: + max_length x d tensor of the training inputs corresponding to the top + max_length pivots of the training kernel matrix + """ + + # this is numerically equivalent to iteratively performing a pivoted cholesky + # while storing the diagonal pivots at each iteration + # TODO: use gpytorch's pivoted cholesky instead once that gets an exposed list + # TODO: ensure this works in batch mode, which it does not currently. + + # todo test for shape of quality function + + if quality_scores.shape[0] != train_inputs.shape[0]: + raise ValueError( + "_pivoted_cholesky_init requires a quality score for each of train_inputs" + ) + + item_size = kernel_matrix.shape[-2] + cis = torch.zeros( + (max_length, item_size), device=kernel_matrix.device, dtype=kernel_matrix.dtype + ) + di2s = kernel_matrix.diag() + scores = di2s * (quality_scores**2) + selected_items = [] + selected_item = torch.argmax(scores) + selected_items.append(selected_item) + + while len(selected_items) < max_length: + k = len(selected_items) - 1 + ci_optimal = cis[:k, selected_item] + di_optimal = torch.sqrt(di2s[selected_item]) + elements = kernel_matrix[..., selected_item, :] + eis = (elements - torch.matmul(ci_optimal, cis[:k, :])) / di_optimal + cis[k, :] = eis + di2s = di2s - eis.pow(2.0) + di2s[selected_item] = NEG_INF + scores = di2s * (quality_scores**2) + selected_item = torch.argmax(scores) + if di2s[selected_item] < epsilon: + break + selected_items.append(selected_item) + + ind_points = train_inputs[torch.stack(selected_items)] + + return ind_points[:max_length, :] diff --git a/sphinx/source/models.rst b/sphinx/source/models.rst index 533f6fe52c..c7f3c12730 100644 --- a/sphinx/source/models.rst +++ b/sphinx/source/models.rst @@ -165,6 +165,12 @@ Model Conversion .. automodule:: botorch.models.converter :members: +Inducing Point Allocators +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: botorch.models.utils.inducing_point_allocators + :members: + :private-members: _pivoted_cholesky_init + Other Utilties ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
automodule:: botorch.models.utils.assorted diff --git a/test/models/test_approximate_gp.py b/test/models/test_approximate_gp.py index 5c1b5daabb..40d10ca29d 100644 --- a/test/models/test_approximate_gp.py +++ b/test/models/test_approximate_gp.py @@ -14,6 +14,10 @@ ) from botorch.models.transforms.input import Normalize from botorch.models.transforms.outcome import Log +from botorch.models.utils.inducing_point_allocators import ( + GreedyImprovementReduction, + GreedyVarianceReduction, +) from botorch.posteriors import GPyTorchPosterior, TransformedPosterior from botorch.utils.testing import BotorchTestCase from gpytorch.likelihoods import GaussianLikelihood, MultitaskGaussianLikelihood @@ -181,6 +185,27 @@ def test_initializations(self): else: self.assertFalse(hasattr(model, "outcome_transform")) + # test default inducing point allocator + self.assertIsInstance(model._inducing_point_allocator, GreedyVarianceReduction) + + # test that can specify an inducing point allocator + for ipa in [ + GreedyVarianceReduction(), + GreedyImprovementReduction(model, maximize=True), + ]: + model = SingleTaskVariationalGP(train_X, inducing_point_allocator=ipa) + self.assertTrue(type(model._inducing_point_allocator), type(ipa)) + + # test warning when learning on and custom IPA provided + with self.assertWarnsRegex( + UserWarning, r"set `learn_inducing_points` to False" + ): + SingleTaskVariationalGP( + train_X, + learn_inducing_points=True, + inducing_point_allocator=GreedyVarianceReduction(), + ) + def test_inducing_point_init(self): train_X_1 = torch.rand(15, 1, device=self.device) train_X_2 = torch.rand(15, 1, device=self.device) @@ -193,6 +218,8 @@ def test_inducing_point_init(self): model_2 = SingleTaskVariationalGP(train_X=train_X_2, inducing_points=5) model_2_inducing = model_2.model.variational_strategy.inducing_points + self.assertEqual(model_1_inducing.shape, (5, 1)) + self.assertEqual(model_2_inducing.shape, (5, 1)) self.assertAllClose(model_1_inducing, model_2_inducing) # multi-task @@ -211,6 +238,8 @@ def test_inducing_point_init(self): model_2.model.variational_strategy.base_variational_strategy.inducing_points ) + self.assertEqual(model_1_inducing.shape, (5, 1)) + self.assertEqual(model_2_inducing.shape, (5, 1)) self.assertAllClose(model_1_inducing, model_2_inducing) # batched inputs @@ -223,12 +252,55 @@ def test_inducing_point_init(self): ) model_1.init_inducing_points(train_X_2) model_1_inducing = model_1.model.variational_strategy.inducing_points - model_2 = SingleTaskVariationalGP( train_X=train_X_2, train_Y=train_Y, inducing_points=5 ) model_2_inducing = model_2.model.variational_strategy.inducing_points - self.assertTrue(model_1_inducing.shape == (2, 5, 1)) - self.assertTrue(model_2_inducing.shape == (2, 5, 1)) + self.assertEqual(model_1_inducing.shape, (2, 5, 1)) + self.assertEqual(model_2_inducing.shape, (2, 5, 1)) + self.assertAllClose(model_1_inducing, model_2_inducing) + + def test_custom_inducing_point_init(self): + train_X_0 = torch.rand(15, 1, device=self.device) + train_X_1 = torch.rand(15, 1, device=self.device) + train_X_2 = torch.rand(15, 1, device=self.device) + train_X_3 = torch.rand(15, 1, device=self.device) + + model_from_previous_step = SingleTaskVariationalGP( + train_X=train_X_0, inducing_points=5 + ) + + model_1 = SingleTaskVariationalGP( + train_X=train_X_1, + inducing_points=5, + inducing_point_allocator=GreedyImprovementReduction( + model_from_previous_step, maximize=True + ), + ) + model_1.init_inducing_points(train_X_2) + model_1_inducing = 
model_1.model.variational_strategy.inducing_points + + model_2 = SingleTaskVariationalGP( + train_X=train_X_2, + inducing_points=5, + inducing_point_allocator=GreedyImprovementReduction( + model_from_previous_step, maximize=True + ), + ) + model_2_inducing = model_2.model.variational_strategy.inducing_points + + model_3 = SingleTaskVariationalGP( + train_X=train_X_3, + inducing_points=5, + inducing_point_allocator=GreedyImprovementReduction( + model_from_previous_step, maximize=False + ), + ) + model_3.init_inducing_points(train_X_2) + model_3_inducing = model_3.model.variational_strategy.inducing_points + + self.assertEqual(model_1_inducing.shape, (5, 1)) + self.assertEqual(model_2_inducing.shape, (5, 1)) self.assertAllClose(model_1_inducing, model_2_inducing) + self.assertFalse(model_1_inducing[0, 0] == model_3_inducing[0, 0]) diff --git a/test/models/utils/test_inducing_point_allocators.py b/test/models/utils/test_inducing_point_allocators.py new file mode 100644 index 0000000000..4873629fe5 --- /dev/null +++ b/test/models/utils/test_inducing_point_allocators.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import torch +from botorch.models.approximate_gp import SingleTaskVariationalGP +from botorch.models.utils.inducing_point_allocators import ( + _pivoted_cholesky_init, + ExpectedImprovementQualityFunction, + GreedyImprovementReduction, + GreedyVarianceReduction, + UnitQualityFunction, +) +from botorch.utils.testing import BotorchTestCase + +from gpytorch.kernels import MaternKernel +from gpytorch.likelihoods import GaussianLikelihood +from gpytorch.mlls import VariationalELBO + + +class TestUnitQualityFunction(BotorchTestCase): + def setUp(self): + super().setUp() + self.quality_function = UnitQualityFunction() + + def test_returns_ones_and_correct_shape(self): + train_X = torch.rand(15, 1, device=self.device) + scores = self.quality_function(train_X) + self.assertTrue(torch.equal(scores, torch.ones([15], device=self.device))) + + +class TestExpectedImprovementQualityFunction(BotorchTestCase): + def setUp(self): + super().setUp() + train_X = torch.rand(10, 1, device=self.device) + train_y = torch.sin(train_X) + torch.randn_like(train_X) * 0.2 + + self.previous_model = SingleTaskVariationalGP( + train_X=train_X, likelihood=GaussianLikelihood() + ).to(self.device) + + mll = VariationalELBO( + self.previous_model.likelihood, self.previous_model.model, num_data=10 + ) + loss = -mll( + self.previous_model.likelihood(self.previous_model(train_X)), train_y + ).sum() + loss.backward() + + def test_returns_correct_shape(self): + train_X = torch.rand(15, 1, device=self.device) + for maximize in [True, False]: + quality_function = ExpectedImprovementQualityFunction( + self.previous_model, maximize=maximize + ) + scores = quality_function(train_X) + self.assertEqual(scores.shape, torch.Size([15])) + + def test_raises_for_multi_output_model(self): + train_X = torch.rand(15, 1, device=self.device) + mo_model = SingleTaskVariationalGP( + train_X=train_X, likelihood=GaussianLikelihood(), num_outputs=5 + ).to(self.device) + with self.assertRaises(NotImplementedError): + ExpectedImprovementQualityFunction(mo_model, maximize=True) + + def test_different_for_maximize_and_minimize(self): + train_X = torch.rand(15, 1, device=self.device) + + quality_function_for_max = ExpectedImprovementQualityFunction( + 
self.previous_model, maximize=True + ) + scores_for_max = quality_function_for_max(train_X) + + quality_function_for_min = ExpectedImprovementQualityFunction( + self.previous_model, maximize=False + ) + scores_for_min = quality_function_for_min(train_X) + + self.assertFalse(torch.equal(scores_for_min, scores_for_max)) + + def test_ei_calc_via_monte_carlo(self): + for maximize in [True, False]: + train_X = torch.rand(10, 1, device=self.device) + posterior = self.previous_model.posterior(train_X) + mean = posterior.mean.squeeze(-2).squeeze(-1) + sigma = posterior.variance.sqrt().view(mean.shape) + normal = torch.distributions.Normal(mean, sigma) + samples = normal.sample([1_000_000]) + if maximize: + baseline = torch.min(mean) + ei = torch.clamp(samples - baseline, min=0.0).mean(axis=0) + else: + baseline = torch.max(mean) + ei = torch.clamp(baseline - samples, min=0.0).mean(axis=0) + + quality_function = ExpectedImprovementQualityFunction( + self.previous_model, maximize + ) + + self.assertAllClose(ei, quality_function(train_X), atol=0.01, rtol=0.01) + + +class TestGreedyVarianceReduction(BotorchTestCase): + def setUp(self): + super().setUp() + self.ipa = GreedyVarianceReduction() + + def test_initialization(self): + self.assertIsInstance(self.ipa, GreedyVarianceReduction) + + def test_inducing_points_shape_and_repeatability(self): + + for train_X in [ + torch.rand(15, 1, device=self.device), # single task + torch.rand(2, 15, 1, device=self.device), # batched inputs + ]: + + inducing_points_1 = self.ipa.allocate_inducing_points( + inputs=train_X, + covar_module=MaternKernel(), + num_inducing=5, + input_batch_shape=torch.Size([]), + ) + + inducing_points_2 = self.ipa.allocate_inducing_points( + inputs=train_X, + covar_module=MaternKernel(), + num_inducing=5, + input_batch_shape=torch.Size([]), + ) + + if len(train_X) == 3: # batched inputs + self.assertEqual(inducing_points_1.shape, (2, 5, 1)) + self.assertEqual(inducing_points_2.shape, (2, 5, 1)) + else: + self.assertEqual(inducing_points_1.shape, (5, 1)) + self.assertEqual(inducing_points_2.shape, (5, 1)) + self.assertAllClose(inducing_points_1, inducing_points_2) + + def test_that_we_dont_get_redundant_inducing_points(self): + train_X = torch.rand(15, 1, device=self.device) + stacked_train_X = torch.cat((train_X, train_X), dim=0) + num_inducing = 20 + inducing_points_1 = self.ipa.allocate_inducing_points( + inputs=stacked_train_X, + covar_module=MaternKernel(), + num_inducing=num_inducing, + input_batch_shape=torch.Size([]), + ) + # should not have 20 inducing points when 15 singular dimensions + # are passed + self.assertLess(inducing_points_1.shape[-2], num_inducing) + + +class TestGreedyImprovementReduction(BotorchTestCase): + def setUp(self): + super().setUp() + train_X = torch.rand(10, 1, device=self.device) + train_y = torch.sin(train_X) + torch.randn_like(train_X) * 0.2 + + self.previous_model = SingleTaskVariationalGP( + train_X=train_X, likelihood=GaussianLikelihood() + ).to(self.device) + + mll = VariationalELBO( + self.previous_model.likelihood, self.previous_model.model, num_data=10 + ) + loss = -mll( + self.previous_model.likelihood(self.previous_model(train_X)), train_y + ).sum() + loss.backward() + + self.ipa = GreedyImprovementReduction(self.previous_model, maximize=True) + + def test_initialization(self): + self.assertIsInstance(self.ipa, GreedyImprovementReduction) + self.assertIsInstance(self.ipa._model, SingleTaskVariationalGP) + self.assertEqual(self.ipa._maximize, True) + + def 
test_raises_for_multi_output_model(self): + train_X = torch.rand(10, 1, device=self.device) + model = SingleTaskVariationalGP( + train_X=train_X, likelihood=GaussianLikelihood(), num_outputs=5 + ).to(self.device) + ipa = GreedyImprovementReduction(model, maximize=True) + with self.assertRaises(NotImplementedError): + ipa.allocate_inducing_points( + inputs=train_X, + covar_module=MaternKernel(), + num_inducing=5, + input_batch_shape=torch.Size([]), + ) + + def test_inducing_points_shape_and_repeatability(self): + train_X = torch.rand(15, 1, device=self.device) + + for train_X in [ + torch.rand(15, 1, device=self.device), # single task + torch.rand(2, 15, 1, device=self.device), # batched inputs + ]: + + inducing_points_1 = self.ipa.allocate_inducing_points( + inputs=train_X, + covar_module=MaternKernel(), + num_inducing=5, + input_batch_shape=torch.Size([]), + ) + + inducing_points_2 = self.ipa.allocate_inducing_points( + inputs=train_X, + covar_module=MaternKernel(), + num_inducing=5, + input_batch_shape=torch.Size([]), + ) + + if len(train_X) == 3: # batched inputs + self.assertEqual(inducing_points_1.shape, (2, 5, 1)) + self.assertEqual(inducing_points_2.shape, (2, 5, 1)) + else: + self.assertEqual(inducing_points_1.shape, (5, 1)) + self.assertEqual(inducing_points_2.shape, (5, 1)) + self.assertAllClose(inducing_points_1, inducing_points_2) + + def test_that_we_dont_get_redundant_inducing_points(self): + train_X = torch.rand(15, 1, device=self.device) + stacked_train_X = torch.cat((train_X, train_X), dim=0) + num_inducing = 20 + inducing_points_1 = self.ipa.allocate_inducing_points( + inputs=stacked_train_X, + covar_module=MaternKernel(), + num_inducing=num_inducing, + input_batch_shape=torch.Size([]), + ) + # should not have 20 inducing points when 15 singular dimensions + # are passed + self.assertLess(inducing_points_1.shape[-2], num_inducing) + + def test_inducing_points_different_when_minimizing(self): + ipa_for_max = GreedyImprovementReduction(self.previous_model, maximize=True) + ipa_for_min = GreedyImprovementReduction(self.previous_model, maximize=False) + + train_X = torch.rand(15, 1, device=self.device) + inducing_points_for_max = ipa_for_max.allocate_inducing_points( + inputs=train_X, + covar_module=MaternKernel(), + num_inducing=10, + input_batch_shape=torch.Size([]), + ) + inducing_points_for_min = ipa_for_min.allocate_inducing_points( + inputs=train_X, + covar_module=MaternKernel(), + num_inducing=10, + input_batch_shape=torch.Size([]), + ) + + self.assertFalse(torch.equal(inducing_points_for_min, inducing_points_for_max)) + + +class TestPivotedCholeskyInit(BotorchTestCase): + def test_raises_for_quality_function_with_invalid_shape(self): + with self.assertRaises(ValueError): + inputs = torch.rand(15, 1, device=self.device) + train_train_kernel = ( + MaternKernel().to(self.device)(inputs).evaluate_kernel() + ) + quality_scores = torch.ones([10, 1], device=self.device) + _pivoted_cholesky_init( + train_inputs=inputs, + kernel_matrix=train_train_kernel, + max_length=10, + quality_scores=quality_scores, + )
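Example usage of the allocator API introduced above (a minimal sketch, not part of the patch; the data, shapes, and choice of 10 inducing points are illustrative):

    import torch
    from botorch.models.approximate_gp import SingleTaskVariationalGP
    from botorch.models.utils.inducing_point_allocators import GreedyImprovementReduction

    train_X = torch.rand(50, 2)
    train_Y = torch.sin(train_X.sum(dim=-1, keepdim=True))

    # Defaults: inducing point locations are allocated by GreedyVarianceReduction
    # and then further optimized during fitting (learn_inducing_points=True).
    # Here this model also stands in for the (fitted) model from a previous BO step.
    model = SingleTaskVariationalGP(train_X=train_X, train_Y=train_Y, inducing_points=10)

    # BO-oriented allocation: weight candidate locations by expected improvement
    # under the previous model, and keep the allocated locations fixed during
    # fitting (see the warning added above).
    next_model = SingleTaskVariationalGP(
        train_X=train_X,
        train_Y=train_Y,
        inducing_points=10,
        learn_inducing_points=False,
        inducing_point_allocator=GreedyImprovementReduction(model, maximize=True),
    )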
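An allocator can also be called directly, mirroring the new unit tests (the Matern kernel and tensor shapes below are illustrative):

    import torch
    from botorch.models.utils.inducing_point_allocators import GreedyVarianceReduction
    from gpytorch.kernels import MaternKernel

    inputs = torch.rand(100, 3)
    inducing_points = GreedyVarianceReduction().allocate_inducing_points(
        inputs=inputs,
        covar_module=MaternKernel(),
        num_inducing=20,
        input_batch_shape=torch.Size([]),
    )
    # The quality-weighted pivoted Cholesky selection returns at most
    # `num_inducing` rows of `inputs`, here of shape (20, 3).
    print(inducing_points.shape)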