
Commit

Fix fantasization with FixedNoiseGP and outcome transforms and use FantasizeMixin (#2011)

Summary:

This fixes fantasization with FixedNoiseGP when using outcome transforms: previously, already-transformed noise was transformed again during fantasization.

This also improves fantasization for batched and batched multi-output models by using the average noise for each batch and output.

This also removes repeated code and uses the logic in `FantasizeMixin.fantasize` for handling `X` with size 0 on the -2 dimension.

This also deprecates the use of `observation_noise` as a boolean argument to `fantasize`.
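
As a rough sketch of the resulting calling convention (the toy data and model setup below are illustrative, not taken from this diff):

```python
import torch
from botorch.models import FixedNoiseGP
from botorch.models.transforms.outcome import Standardize
from botorch.sampling.normal import SobolQMCNormalSampler

# Toy training data: 8 points, 2 features, 1 output.
train_X = torch.rand(8, 2, dtype=torch.double)
train_Y = torch.sin(train_X.sum(dim=-1, keepdim=True))
train_Yvar = torch.full_like(train_Y, 1e-2)

# With an outcome transform, the stored noise is already transformed;
# after this fix, fantasize() no longer transforms it a second time.
model = FixedNoiseGP(
    train_X, train_Y, train_Yvar, outcome_transform=Standardize(m=1)
)

sampler = SobolQMCNormalSampler(sample_shape=torch.Size([4]))
X_fant = torch.rand(3, 2, dtype=torch.double)

# New default: omit `observation_noise`; the average training noise is used.
fantasy_model = model.fantasize(X=X_fant, sampler=sampler)

# Passing `observation_noise=True` now raises a DeprecationError; an explicit
# noise tensor (in the outcome-transformed space) is still accepted.
```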

Reviewed By: Balandat

Differential Revision: D49200325
sdaulton authored and facebook-github-bot committed Sep 18, 2023
1 parent fa51038 commit 169cb69
Showing 15 changed files with 259 additions and 100 deletions.
3 changes: 2 additions & 1 deletion botorch/acquisition/active_learning.py
@@ -93,7 +93,8 @@ def forward(self, X: Tensor) -> Tensor:
         # Construct the fantasy model (we actually do not use the full model,
         # this is just a convenient way of computing fast posterior covariances
         fantasy_model = self.model.fantasize(
-            X=X, sampler=self.sampler, observation_noise=True
+            X=X,
+            sampler=self.sampler,
         )

         bdims = tuple(1 for _ in X.shape[:-2])
9 changes: 6 additions & 3 deletions botorch/acquisition/knowledge_gradient.py
@@ -184,7 +184,8 @@ def forward(self, X: Tensor) -> Tensor:

         # construct the fantasy model of shape `num_fantasies x b`
         fantasy_model = self.model.fantasize(
-            X=X_actual, sampler=self.sampler, observation_noise=True
+            X=X_actual,
+            sampler=self.sampler,
         )

         # get the value function
@@ -233,7 +234,8 @@ def evaluate(self, X: Tensor, bounds: Tensor, **kwargs: Any) -> Tensor:

         # construct the fantasy model of shape `num_fantasies x b`
         fantasy_model = self.model.fantasize(
-            X=X, sampler=self.sampler, observation_noise=True
+            X=X,
+            sampler=self.sampler,
         )

         # get the value function
@@ -451,7 +453,8 @@ def forward(self, X: Tensor) -> Tensor:
         # construct the fantasy model of shape `num_fantasies x b`
         # expand X (to potentially add trace observations)
         fantasy_model = self.model.fantasize(
-            X=self.expand(X_eval), sampler=self.sampler, observation_noise=True
+            X=self.expand(X_eval),
+            sampler=self.sampler,
         )
         # get the value function
         value_function = _get_value_function(
3 changes: 2 additions & 1 deletion botorch/acquisition/max_value_entropy_search.py
@@ -389,7 +389,8 @@ def set_X_pending(self, X_pending: Optional[Tensor] = None) -> None:
         if X_pending is not None:
             # fantasize the model and use this as the new model
             self.model = init_model.fantasize(
-                X=X_pending, sampler=self.fantasies_sampler, observation_noise=True
+                X=X_pending,
+                sampler=self.fantasies_sampler,
             )
         else:
             self.model = init_model
@@ -146,7 +146,8 @@ def set_X_pending(self, X_pending: Optional[Tensor] = None) -> None:
         if X_pending is not None:
             # fantasize the model
             fantasy_model = self._init_model.fantasize(
-                X=X_pending, sampler=self.fantasies_sampler, observation_noise=True
+                X=X_pending,
+                sampler=self.fantasies_sampler,
             )
             self.mo_model = fantasy_model
             # convert model to batched single outcome model.
3 changes: 1 addition & 2 deletions botorch/acquisition/multi_step_lookahead.py
@@ -399,7 +399,7 @@ def _step(
         # construct fantasy model (with batch shape f_{j+1} x ... x f_1 x batch_shape)
         prop_grads = step_index > 0  # need to propagate gradients for steps > 0
         fantasy_model = model.fantasize(
-            X=X, sampler=samplers[0], observation_noise=True, propagate_grads=prop_grads
+            X=X, sampler=samplers[0], propagate_grads=prop_grads
         )

         # augment sample weights appropriately
@@ -585,7 +585,6 @@ def _get_induced_fantasy_model(
         fantasy_model = model.fantasize(
             X=Xs[0],
             sampler=samplers[0],
-            observation_noise=True,
         )

         return _get_induced_fantasy_model(
50 changes: 26 additions & 24 deletions botorch/models/gp_regression.py
@@ -30,15 +30,14 @@

 from __future__ import annotations

-from typing import Any, List, NoReturn, Optional, Union
+from typing import Any, List, NoReturn, Optional

 import torch
-from botorch import settings
 from botorch.models.gpytorch import BatchedMultiOutputGPyTorchModel
 from botorch.models.model import FantasizeMixin
 from botorch.models.transforms.input import InputTransform
 from botorch.models.transforms.outcome import Log, OutcomeTransform
-from botorch.models.utils import fantasize as fantasize_flag, validate_input_scaling
+from botorch.models.utils import validate_input_scaling
 from botorch.models.utils.gpytorch_modules import (
     get_gaussian_likelihood_with_gamma_prior,
     get_matern_kernel_with_gamma_prior,
@@ -164,7 +163,7 @@ def forward(self, x: Tensor) -> MultivariateNormal:
         return MultivariateNormal(mean_x, covar_x)


-class FixedNoiseGP(BatchedMultiOutputGPyTorchModel, ExactGP):
+class FixedNoiseGP(BatchedMultiOutputGPyTorchModel, ExactGP, FantasizeMixin):
     r"""A single-task exact GP model using fixed noise levels.

     A single-task exact GP that uses fixed observation noise levels, differing from
@@ -270,7 +269,7 @@ def fantasize(
         self,
         X: Tensor,
         sampler: MCSampler,
-        observation_noise: Union[bool, Tensor] = True,
+        observation_noise: Optional[Tensor] = None,
         **kwargs: Any,
     ) -> FixedNoiseGP:
         r"""Construct a fantasy model.
@@ -290,29 +289,32 @@ def fantasize(
                 `batch_shape` is the batch shape (must be compatible with the
                 batch shape of the model).
             sampler: The sampler used for sampling from the posterior at `X`.
-            observation_noise: If True, include the mean across the observation
-                noise in the training data as observation noise in the posterior
-                from which the samples are drawn. If a Tensor, use it directly
-                as the specified measurement noise.
+            observation_noise: The noise level for fantasization if
+                provided. If `None`, the mean across the observation
+                noise in the training data is used as observation noise in
+                the posterior from which the samples are drawn and
+                the fantasized noise level. If observation noise is
+                provided, it is assumed to be in the outcome-transformed
+                space, if an outcome transform is used.

         Returns:
             The constructed fantasy model.
         """
-        propagate_grads = kwargs.pop("propagate_grads", False)
-        with fantasize_flag():
-            with settings.propagate_grads(propagate_grads):
-                post_X = self.posterior(
-                    X, observation_noise=observation_noise, **kwargs
-                )
-                Y_fantasized = sampler(post_X)  # num_fantasies x batch_shape x n' x m
-                # Use the mean of the previous noise values (TODO: be smarter here).
-                # noise should be batch_shape x q x m when X is batch_shape x q x d, and
-                # Y_fantasized is num_fantasies x batch_shape x q x m.
-                noise_shape = Y_fantasized.shape[1:]
-                noise = self.likelihood.noise.mean().expand(noise_shape)
-                return self.condition_on_observations(
-                    X=self.transform_inputs(X), Y=Y_fantasized, noise=noise
-                )
+        # self.likelihood.noise is a `batch_shape x n x s(m)`-dimensional tensor
+        if observation_noise is None:
+            if self.num_outputs > 1:
+                # make noise ... x n x m
+                observation_noise = self.likelihood.noise.transpose(-1, -2)
+            else:
+                observation_noise = self.likelihood.noise.unsqueeze(-1)
+            observation_noise = observation_noise.mean(dim=-2, keepdim=True)
+
+        return super().fantasize(
+            X=X,
+            sampler=sampler,
+            observation_noise=observation_noise,
+            **kwargs,
+        )

     def forward(self, x: Tensor) -> MultivariateNormal:
         # TODO: reduce redundancy with the 'forward' method of
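
A standalone sketch of the averaging above, assuming (as in a batched multi-output FixedNoiseGP) that the likelihood stores noise with the output dimension folded into the batch, i.e. `batch_shape x m x n`; the shapes here are hypothetical:

```python
import torch

# Hypothetical stored noise: batch_shape=(2,), m=3 outputs, n=5 points.
noise = torch.rand(2, 3, 5)

# Multi-output case: move outputs last (`... x n x m`), then average over
# the n training points, keeping a broadcastable `batch_shape x 1 x m` tensor.
observation_noise = noise.transpose(-1, -2).mean(dim=-2, keepdim=True)
assert observation_noise.shape == torch.Size([2, 1, 3])
```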
32 changes: 25 additions & 7 deletions botorch/models/gpytorch.py
@@ -159,7 +159,9 @@ def posterior(
                 jointly.
             observation_noise: If True, add the observation noise from the
                 likelihood to the posterior. If a Tensor, use it directly as the
-                observation noise (must be of shape `(batch_shape) x q`).
+                observation noise (must be of shape `(batch_shape) x q`). It is
+                assumed to be in the outcome-transformed space if an outcome
+                transform is used.
             posterior_transform: An optional PosteriorTransform.

         Returns:
@@ -223,7 +225,8 @@ def condition_on_observations(self, X: Tensor, Y: Tensor, **kwargs: Any) -> Model:
             # pass the transformed data to get_fantasy_model below
             # (unless we've already trasnformed if BatchedMultiOutputGPyTorchModel)
             if not isinstance(self, BatchedMultiOutputGPyTorchModel):
-                Y, Yvar = self.outcome_transform(Y, Yvar)
+                # `noise` is assumed to already be outcome-transformed.
+                Y, _ = self.outcome_transform(Y, Yvar)
         # validate using strict=False, since we cannot tell if Y has an explicit
         # output dimension
         self._validate_tensor_args(X=X, Y=Y, Yvar=Yvar, strict=False)
@@ -373,18 +376,32 @@ def posterior(
             )
         mvn = self(X)
         if observation_noise is not False:
+            if self._num_outputs > 1:
+                noise_shape = X.shape[:-3] + torch.Size(
+                    [self._num_outputs, X.shape[-2]]
+                )
+            else:
+                noise_shape = X.shape[:-1]
             if torch.is_tensor(observation_noise):
                 # TODO: Validate noise shape
                 # make observation_noise `batch_shape x q x n`
                 if self.num_outputs > 1:
                     obs_noise = observation_noise.transpose(-1, -2)
                 else:
                     obs_noise = observation_noise.squeeze(-1)
-                mvn = self.likelihood(mvn, X, noise=obs_noise)
+                mvn = self.likelihood(
+                    mvn,
+                    X,
+                    noise=obs_noise.expand(noise_shape),
+                )
             elif isinstance(self.likelihood, FixedNoiseGaussianLikelihood):
                 # Use the mean of the previous noise values (TODO: be smarter here).
-                noise = self.likelihood.noise.mean().expand(X.shape[:-1])
-                mvn = self.likelihood(mvn, X, noise=noise)
+                observation_noise = self.likelihood.noise.mean(dim=-1, keepdim=True)
+                mvn = self.likelihood(
+                    mvn,
+                    X,
+                    noise=observation_noise.expand(noise_shape),
+                )
             else:
                 mvn = self.likelihood(mvn, X)
         if self._num_outputs > 1:
@@ -443,8 +460,9 @@ def condition_on_observations(
         """
         noise = kwargs.get("noise")
         if hasattr(self, "outcome_transform"):
-            # we need to apply transforms before shifting batch indices around
-            Y, noise = self.outcome_transform(Y, noise)
+            # We need to apply transforms before shifting batch indices around.
+            # `noise` is assumed to already be outcome-transformed.
+            Y, _ = self.outcome_transform(Y)
         self._validate_tensor_args(X=X, Y=Y, Yvar=noise, strict=False)
         inputs = X
         if self._num_outputs > 1:
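
To make the broadcasting in `posterior` concrete, a small sketch with hypothetical shapes (a batched multi-output model sees `X` as `batch_shape x m x q x d` at this point):

```python
import torch

# Hypothetical: batch_shape=(2,), m=3 outputs, q=4 points, d=6 features.
X = torch.rand(2, 3, 4, 6)
num_outputs = 3

if num_outputs > 1:
    noise_shape = X.shape[:-3] + torch.Size([num_outputs, X.shape[-2]])
else:
    noise_shape = X.shape[:-1]

# An average-noise tensor of shape `batch_shape x 1 x 1` broadcasts cleanly
# across all outputs and query points.
mean_noise = torch.rand(2, 1, 1)
assert mean_noise.expand(noise_shape).shape == torch.Size([2, 3, 4])
```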
57 changes: 47 additions & 10 deletions botorch/models/model.py
@@ -33,7 +33,11 @@
 import numpy as np
 import torch
 from botorch import settings
-from botorch.exceptions.errors import BotorchTensorDimensionError, InputDataError
+from botorch.exceptions.errors import (
+    BotorchTensorDimensionError,
+    DeprecationError,
+    InputDataError,
+)
 from botorch.logging import shape_to_str
 from botorch.models.utils.assorted import fantasize as fantasize_flag
 from botorch.posteriors import Posterior, PosteriorList
@@ -83,7 +87,7 @@ def posterior(
         self,
         X: Tensor,
         output_indices: Optional[List[int]] = None,
-        observation_noise: bool = False,
+        observation_noise: Union[bool, Tensor] = False,
         posterior_transform: Optional[PosteriorTransform] = None,
         **kwargs: Any,
     ) -> Posterior:
@@ -102,7 +106,12 @@
                 Can be used to speed up computation if only a subset of the
                 model's outputs are required for optimization. If omitted,
                 computes the posterior over all model outputs.
-            observation_noise: If True, add observation noise to the posterior.
+            observation_noise: For models with an inferred noise level, if True,
+                include observation noise. For models with an observed noise level,
+                this must be a `model_batch_shape x 1 x m`-dim tensor or
+                a `model_batch_shape x n' x m`-dim tensor containing the average
+                noise for each batch and output. `noise` must be in the
+                outcome-transformed space if an outcome transform is used.
             posterior_transform: An optional PosteriorTransform.

         Returns:
@@ -310,7 +319,7 @@ def fantasize(
         # TODO: see if any of these can be imported only if TYPE_CHECKING
         X: Tensor,
         sampler: MCSampler,
-        observation_noise: bool = True,
+        observation_noise: Optional[Tensor] = None,
         **kwargs: Any,
     ) -> TFantasizeMixin:
         r"""Construct a fantasy model.
@@ -328,12 +337,21 @@
                 `batch_shape` is the batch shape (must be compatible with the
                 batch shape of the model).
             sampler: The sampler used for sampling from the posterior at `X`.
-            observation_noise: If True, include observation noise.
+            observation_noise: A `model_batch_shape x 1 x m`-dim tensor or
+                a `model_batch_shape x n' x m`-dim tensor containing the average
+                noise for each batch and output, where `m` is the number of outputs.
+                `noise` must be in the outcome-transformed space if an outcome
+                transform is used. If None, then the noise will be the inferred
+                noise level.
             kwargs: Will be passed to `model.condition_on_observations`

         Returns:
             The constructed fantasy model.
         """
+        if not isinstance(observation_noise, Tensor) and observation_noise is not None:
+            raise DeprecationError(
+                "`fantasize` no longer accepts a boolean for `observation_noise`."
+            )
         # if the inputs are empty, expand the inputs
         if X.shape[-2] == 0:
             output_shape = (
@@ -350,8 +368,15 @@
         propagate_grads = kwargs.pop("propagate_grads", False)
         with fantasize_flag():
             with settings.propagate_grads(propagate_grads):
-                post_X = self.posterior(X, observation_noise=observation_noise)
+                post_X = self.posterior(
+                    X,
+                    observation_noise=True
+                    if observation_noise is None
+                    else observation_noise,
+                )
                 Y_fantasized = sampler(post_X)  # num_fantasies x batch_shape x n' x m
+                if observation_noise is not None:
+                    kwargs["noise"] = observation_noise.expand(Y_fantasized.shape[1:])
                 return self.condition_on_observations(
                     X=self.transform_inputs(X), Y=Y_fantasized, **kwargs
                 )
@@ -434,7 +459,9 @@ def posterior(
                 respective likelihoods to the posterior. If a Tensor of shape
                 `(batch_shape) x q x m`, use it directly as the observation
                 noise (with `observation_noise[...,i]` added to the posterior
-                of the `i`-th model).
+                of the `i`-th model). `observation_noise` is assumed
+                to be in the outcome-transformed space, if an outcome transform
+                is used by the model.
             posterior_transform: An optional PosteriorTransform.

         Returns:
@@ -553,7 +580,7 @@ def fantasize(
         self,
         X: Tensor,
         sampler: MCSampler,
-        observation_noise: bool = True,
+        observation_noise: Optional[Tensor] = None,
         evaluation_mask: Optional[Tensor] = None,
         **kwargs: Any,
     ) -> Model:
@@ -573,7 +600,12 @@
                 batch shape of the model).
             sampler: The sampler used for sampling from the posterior at `X`. If
                 evaluation_mask is not None, this must be a `ListSampler`.
-            observation_noise: If True, include observation noise.
+            observation_noise: A `model_batch_shape x 1 x m`-dim tensor or
+                a `model_batch_shape x n' x m`-dim tensor containing the average
+                noise for each batch and output, where `m` is the number of outputs.
+                `noise` must be in the outcome-transformed space if an outcome
+                transform is used. If None, then the noise will be the inferred
+                noise level.
             evaluation_mask: A `n' x m`-dim tensor of booleans indicating which
                 outputs should be fantasized for a given design. This uses the same
                 evaluation mask for all batches.
@@ -595,6 +627,8 @@

         fant_models = []
         X_i = X
+        if observation_noise is None:
+            observation_noise_i = observation_noise
         for i in range(self.num_outputs):
             # get the inputs to fantasize at for output i
             if evaluation_mask is not None:
@@ -604,12 +638,15 @@
                 # samples from a single Sobol sequence or consider requiring that the
                 # sampling is IID to ensure good coverage.
                 sampler_i = sampler.samplers[i]
+                if observation_noise is not None:
+                    observation_noise_i = observation_noise[..., mask_i, i : i + 1]
             else:
                 sampler_i = sampler

             fant_model = self.models[i].fantasize(
                 X=X_i,
                 sampler=sampler_i,
-                observation_noise=observation_noise,
+                observation_noise=observation_noise_i,
                 **kwargs,
             )
             fant_models.append(fant_model)
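
A minimal sketch of the new deprecation behavior; the helper function here is hypothetical (the real check lives inline in `FantasizeMixin.fantasize`):

```python
import torch
from botorch.exceptions.errors import DeprecationError

def _check_observation_noise(observation_noise) -> None:
    # Mirrors the guard added above: only a Tensor or None is accepted.
    if not isinstance(observation_noise, torch.Tensor) and observation_noise is not None:
        raise DeprecationError(
            "`fantasize` no longer accepts a boolean for `observation_noise`."
        )

_check_observation_noise(None)              # OK: fall back to average training noise
_check_observation_noise(torch.rand(1, 1))  # OK: explicit noise tensor
try:
    _check_observation_noise(True)          # deprecated boolean
except DeprecationError as err:
    print(err)
```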
1 change: 1 addition & 0 deletions botorch/utils/testing.py
@@ -375,6 +375,7 @@ def _get_random_data(
         [torch.linspace(0, 0.95, n, **tkwargs) for _ in range(d)], dim=-1
     )
     train_x = train_x + 0.05 * torch.rand_like(train_x).repeat(rep_shape)
+    train_x[0] += 0.02  # modify the first batch
     train_y = torch.sin(train_x[..., :1] * (2 * math.pi))
     train_y = train_y + 0.2 * torch.randn(n, m, **tkwargs).repeat(rep_shape)
     return train_x, train_y