Use Standardize/Normalize by default for SingleTaskGP (#2458)
Summary:
X-link: facebook/Ax#2630

Pull Request resolved: #2458

D60080819 recently updated the default BoTorch `SingleTaskGP` priors. One significant change was removing the outputscale, which may not work well if the outputs aren't standardized. This diff changes `SingleTaskGP` to use a `Standardize` outcome transform and a `Normalize` input transform by default when no input/outcome transforms are specified; users can still explicitly pass `None` to opt out of either transform.
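
For illustration, a minimal sketch of the new behavior (the data and shapes below are made up; the transform arguments mirror the defaults added in this diff):

import torch

from botorch.models import SingleTaskGP
from botorch.models.transforms.input import Normalize
from botorch.models.transforms.outcome import Standardize

train_X = 10.0 * torch.rand(20, 3)    # inputs on an arbitrary scale
train_Y = 100.0 + torch.randn(20, 1)  # outcomes far from zero mean / unit variance

# With no transforms specified, this is now equivalent to
#   SingleTaskGP(train_X, train_Y,
#                outcome_transform=Standardize(m=1),
#                input_transform=Normalize(d=3))
model = SingleTaskGP(train_X, train_Y)

# Pass `None` explicitly to opt out of either default.
model_raw = SingleTaskGP(
    train_X, train_Y, outcome_transform=None, input_transform=None
)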

Differential Revision: D60492937
David Eriksson authored and facebook-github-bot committed Aug 7, 2024
1 parent c1b73b8 commit ec16d7a
Showing 15 changed files with 228 additions and 83 deletions.
8 changes: 5 additions & 3 deletions botorch/acquisition/analytic.py
@@ -1091,15 +1091,17 @@ def _get_noiseless_fantasy_model(
     # are used across all batches (by default, a GP with batched training data
     # uses independent hyperparameters for each batch).
 
-    # Don't apply `outcome_transform` and `input_transform` here,
-    # since the data being passed has already been transformed.
-    # So we will instead set them afterwards.
+    # We don't want to use the true `outcome_transform` and `input_transform` here
+    # since the data being passed has already been transformed. We thus pass `None`
+    # and will instead set them afterwards.
     fantasy_model = SingleTaskGP(
         train_X=model.train_inputs[0],
         train_Y=model.train_targets.unsqueeze(-1),
         train_Yvar=model.likelihood.noise_covar.noise.unsqueeze(-1),
         covar_module=deepcopy(model.covar_module),
         mean_module=deepcopy(model.mean_module),
+        outcome_transform=None,
+        input_transform=None,
     )
 
     Yvar = torch.full_like(Y_fantasized, 1e-7)
botorch/acquisition/multi_objective/max_value_entropy_search.py
@@ -64,7 +64,7 @@ class qMultiObjectiveMaxValueEntropy(
         _default_sample_shape: The `sample_shape` for the default sampler.
 
     Example:
-        >>> model = SingleTaskGP(train_X, train_Y)
+        >>> model = SingleTaskGP(train_X, train_Y, outcome_transform=None)
         >>> MESMO = qMultiObjectiveMaxValueEntropy(model, sample_pfs)
         >>> mesmo = MESMO(test_X)
     """
8 changes: 7 additions & 1 deletion botorch/models/contextual.py
@@ -102,7 +102,13 @@ def __init__(
                 dimension is set to 1 for each categorical variable.
             context_weight_dict: Known population weights of each context.
         """
-        super().__init__(train_X=train_X, train_Y=train_Y, train_Yvar=train_Yvar)
+        super().__init__(
+            train_X=train_X,
+            train_Y=train_Y,
+            train_Yvar=train_Yvar,
+            input_transform=None,
+            outcome_transform=None,
+        )
         self.covar_module = LCEAKernel(
             decomposition=decomposition,
             batch_shape=self._aug_batch_shape,
12 changes: 12 additions & 0 deletions botorch/models/converter.py
@@ -17,6 +17,7 @@
 import torch
 from botorch.exceptions import UnsupportedError
 from botorch.exceptions.warnings import BotorchWarning
+from botorch.models import SingleTaskGP
 from botorch.models.gp_regression import HeteroskedasticSingleTaskGP
 from botorch.models.gp_regression_fidelity import SingleTaskMultiFidelityGP
 from botorch.models.gp_regression_mixed import MixedSingleTaskGP
@@ -179,6 +180,11 @@ def model_list_to_batched(model_list: ModelListGP) -> BatchedMultiOutputGPyTorchModel:
     batch_length = len(models)
     covar_module = _batched_kernel(models[0].covar_module, batch_length)
     kwargs["covar_module"] = covar_module
+    # SingleTaskGP uses default input/outcome transforms while this converter doesn't
+    # support outcome transforms. We need to explicitly pass down `None` to make sure
+    # no outcome transform is being used.
+    if isinstance(models[0], SingleTaskGP):
+        kwargs["outcome_transform"] = None
 
     # construct the batched GP model
     input_transform = getattr(models[0], "input_transform", None)
@@ -418,6 +424,12 @@ def batched_multi_output_to_single_output(
         kwargs["train_Yvar"] = noise_covar.noise.clone().unsqueeze(-1)
     if isinstance(batch_mo_model, SingleTaskMultiFidelityGP):
         kwargs.update(batch_mo_model._init_args)
+    # SingleTaskGP uses default input/outcome transforms while this converter doesn't
+    # support outcome transforms. We need to explicitly pass down `None` to make sure
+    # no outcome transform is being used.
+    if isinstance(batch_mo_model, SingleTaskGP):
+        kwargs["outcome_transform"] = None
+
     single_outcome_model = batch_mo_model.__class__(
         input_transform=input_transform, **kwargs
     )
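To illustrate how the converter interacts with the new defaults, a hedged sketch (the models and shapes are illustrative, not from this diff). Note that the sub-models must themselves be built with `outcome_transform=None`, since the converter raises an `UnsupportedError` for models with outcome transforms:

import torch

from botorch.models import SingleTaskGP
from botorch.models.converter import model_list_to_batched
from botorch.models.model_list_gp_regression import ModelListGP

train_X = torch.rand(10, 2)
# Build sub-models without the (new) default Standardize outcome transform.
m1 = SingleTaskGP(train_X, torch.rand(10, 1), outcome_transform=None)
m2 = SingleTaskGP(train_X, torch.rand(10, 1), outcome_transform=None)

# The converter now passes `outcome_transform=None` down to the batched
# SingleTaskGP it constructs, so no default transform sneaks back in.
batched = model_list_to_batched(ModelListGP(m1, m2))
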
27 changes: 20 additions & 7 deletions botorch/models/gp_regression.py
@@ -36,8 +36,8 @@
 import torch
 from botorch.models.gpytorch import BatchedMultiOutputGPyTorchModel
 from botorch.models.model import FantasizeMixin
-from botorch.models.transforms.input import InputTransform
-from botorch.models.transforms.outcome import Log, OutcomeTransform
+from botorch.models.transforms.input import InputTransform, Normalize
+from botorch.models.transforms.outcome import Log, OutcomeTransform, Standardize
 from botorch.models.utils import validate_input_scaling
 from botorch.models.utils.gpytorch_modules import (
     get_covar_module_with_dim_scaled_prior,
@@ -46,6 +46,7 @@
 )
 from botorch.utils.containers import BotorchContainer
 from botorch.utils.datasets import SupervisedDataset
+from botorch.utils.types import _DefaultType, DEFAULT
 from gpytorch.constraints.constraints import GreaterThan
 from gpytorch.distributions.multivariate_normal import MultivariateNormal
 from gpytorch.likelihoods.gaussian_likelihood import (
@@ -134,8 +135,8 @@ def __init__(
         likelihood: Optional[Likelihood] = None,
         covar_module: Optional[Module] = None,
         mean_module: Optional[Mean] = None,
-        outcome_transform: Optional[OutcomeTransform] = None,
-        input_transform: Optional[InputTransform] = None,
+        outcome_transform: Optional[Union[OutcomeTransform, _DefaultType]] = DEFAULT,
+        input_transform: Optional[Union[InputTransform, _DefaultType]] = DEFAULT,
     ) -> None:
         r"""
         Args:
@@ -154,16 +155,27 @@
             outcome_transform: An outcome transform that is applied to the
                 training data during instantiation and to the posterior during
                 inference (that is, the `Posterior` obtained by calling
-                `.posterior` on the model will be on the original scale).
-            input_transform: An input transform that is applied in the model's
-                forward pass.
+                `.posterior` on the model will be on the original scale). We use a
+                `Standardize` transform if no `outcome_transform` is specified.
+                Pass down `None` to use no outcome transform.
+            input_transform: An input transform that is applied in the model's forward
+                pass. We use a `Normalize` transform if no `input_transform` is
+                specified. Pass down `None` to use no input transform.
         """
+        self._validate_tensor_args(X=train_X, Y=train_Y, Yvar=train_Yvar)
+        if outcome_transform == DEFAULT:
+            outcome_transform = Standardize(
+                m=train_Y.shape[-1], batch_shape=train_X.shape[:-2]
+            )
+        if input_transform == DEFAULT:
+            input_transform = Normalize(d=train_X.shape[-1], transform_on_train=True)
         with torch.no_grad():
             transformed_X = self.transform_inputs(
                 X=train_X, input_transform=input_transform
             )
         if outcome_transform is not None:
             train_Y, train_Yvar = outcome_transform(train_Y, train_Yvar)
+        # Validate again after applying the transforms
+        self._validate_tensor_args(X=transformed_X, Y=train_Y, Yvar=train_Yvar)
         ignore_X_dims = getattr(self, "_ignore_X_dims_scaling_check", None)
         validate_input_scaling(
@@ -352,6 +364,7 @@ def __init__(
             train_X=train_X,
             train_Y=train_Y,
             likelihood=likelihood,
+            outcome_transform=None,
             input_transform=input_transform,
         )
         self.register_added_loss_term("noise_added_loss")
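The `DEFAULT` sentinel from `botorch.utils.types` is what lets the constructor distinguish "argument not specified" from an explicit `None`. A minimal sketch of the pattern (the helper below is hypothetical, mirroring the constructor logic above):

from typing import Optional, Union

from botorch.models.transforms.outcome import OutcomeTransform, Standardize
from botorch.utils.types import _DefaultType, DEFAULT

def resolve_outcome_transform(  # hypothetical helper, not part of this diff
    m: int,
    outcome_transform: Optional[Union[OutcomeTransform, _DefaultType]] = DEFAULT,
) -> Optional[OutcomeTransform]:
    if outcome_transform == DEFAULT:
        # Not specified -> fall back to the new Standardize default.
        return Standardize(m=m)
    # An explicit `None` (or a user-supplied transform) is respected as-is.
    return outcome_transform
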
35 changes: 28 additions & 7 deletions test/acquisition/multi_objective/test_max_value_entropy_search.py
@@ -14,6 +14,7 @@
     qMultiObjectiveMaxValueEntropy,
 )
 from botorch.acquisition.multi_objective.utils import compute_sample_box_decomposition
+from botorch.exceptions.errors import UnsupportedError
 from botorch.models.gp_regression import SingleTaskGP
 from botorch.models.model_list_gp_regression import ModelListGP
 from botorch.models.transforms.outcome import Standardize
@@ -71,15 +72,30 @@ def test_multi_objective_max_value_entropy(self):
             # test batched model
             train_X = torch.rand(1, 1, 2, dtype=dtype, device=self.device)
             train_Y = torch.rand(1, 1, m, dtype=dtype, device=self.device)
-            model = SingleTaskGP(train_X, train_Y)
+            model = SingleTaskGP(train_X, train_Y, outcome_transform=None)
             with self.assertRaises(NotImplementedError):
-                qMultiObjectiveMaxValueEntropy(model, dummy_sample_pareto_frontiers)
+                qMultiObjectiveMaxValueEntropy(
+                    model=model, sample_pareto_frontiers=dummy_sample_pareto_frontiers
+                )
             # test initialization
             train_X = torch.rand(4, 2, dtype=dtype, device=self.device)
             train_Y = torch.rand(4, m, dtype=dtype, device=self.device)
-            # test batched MO model
+            # Models with outcome transforms aren't supported.
             model = SingleTaskGP(train_X, train_Y)
-            mesmo = qMultiObjectiveMaxValueEntropy(model, dummy_sample_pareto_frontiers)
+            with self.assertRaisesRegex(
+                UnsupportedError,
+                "Conversion of models with outcome transforms is currently "
+                "unsupported.",
+            ):
+                qMultiObjectiveMaxValueEntropy(
+                    model=ModelListGP(model, model),
+                    sample_pareto_frontiers=dummy_sample_pareto_frontiers,
+                )
+            # test batched MO model
+            model = SingleTaskGP(train_X, train_Y, outcome_transform=None)
+            mesmo = qMultiObjectiveMaxValueEntropy(
+                model=model, sample_pareto_frontiers=dummy_sample_pareto_frontiers
+            )
             self.assertEqual(mesmo.num_fantasies, 16)
             # Initialize the sampler.
             dummy_post = model.posterior(train_X[:1])
@@ -98,11 +114,16 @@ def test_multi_objective_max_value_entropy(self):
             )
             # test ModelListGP
             model = ModelListGP(
-                *[SingleTaskGP(train_X, train_Y[:, i : i + 1]) for i in range(m)]
+                *[
+                    SingleTaskGP(train_X, train_Y[:, i : i + 1], outcome_transform=None)
+                    for i in range(m)
+                ]
             )
             mock_sample_pfs = mock.Mock()
             mock_sample_pfs.return_value = dummy_sample_pareto_frontiers(model=model)
-            mesmo = qMultiObjectiveMaxValueEntropy(model, mock_sample_pfs)
+            mesmo = qMultiObjectiveMaxValueEntropy(
+                model=model, sample_pareto_frontiers=mock_sample_pfs
+            )
             self.assertEqual(mesmo.num_fantasies, 16)
             # Initialize the sampler.
             dummy_post = model.posterior(train_X[:1])
@@ -156,7 +177,7 @@ def test_multi_objective_max_value_entropy(self):
                 ],
                 dim=1,
             )
-            fantasy_model = SingleTaskGP(fant_X, fant_Y)
+            fantasy_model = SingleTaskGP(fant_X, fant_Y, outcome_transform=None)
 
             # test with X_pending is not None
             with mock.patch.object(
18 changes: 10 additions & 8 deletions test/acquisition/test_proximal.py
@@ -245,7 +245,7 @@ def test_proximal_model_list(self):
             train_X = torch.rand(5, 3, device=self.device, dtype=dtype)
             train_Y = train_X.norm(dim=-1, keepdim=True)
 
-            gp = SingleTaskGP(train_X, train_Y).to(device=self.device)
+            gp = SingleTaskGP(train_X, train_Y)
             model = ModelListGP(gp, gp)
 
             scalarized_posterior_transform = ScalarizedPosteriorTransform(
@@ -263,11 +263,12 @@
             EI_prox = ProximalAcquisitionFunction(EI, proximal_weights=proximal_weights)
 
             ei = EI(test_X)
-            mv_normal = MultivariateNormal(train_X[-1], torch.diag(proximal_weights))
-            test_prox_weight = torch.exp(mv_normal.log_prob(test_X)) / torch.exp(
-                mv_normal.log_prob(train_X[-1])
+            train_X_trans = gp.input_transform.transform(train_X[-1])
+            test_X_trans = gp.input_transform.transform(test_X)
+            mv_normal = MultivariateNormal(train_X_trans, torch.diag(proximal_weights))
+            test_prox_weight = torch.exp(mv_normal.log_prob(test_X_trans)) / torch.exp(
+                mv_normal.log_prob(train_X_trans)
             )
-
             # test calculation
             ei_prox = EI_prox(test_X)
 
@@ -282,9 +283,10 @@
             )
 
             qei = qEI(test_X)
-            mv_normal = MultivariateNormal(train_X[-1], torch.diag(proximal_weights))
-            test_prox_weight = torch.exp(mv_normal.log_prob(test_X)) / torch.exp(
-                mv_normal.log_prob(train_X[-1])
+            test_X_trans = gp.input_transform.transform(test_X)
+            mv_normal = MultivariateNormal(train_X_trans, torch.diag(proximal_weights))
+            test_prox_weight = torch.exp(mv_normal.log_prob(test_X_trans)) / torch.exp(
+                mv_normal.log_prob(train_X_trans)
             )
 
             qei_prox = qEI_prox(test_X)
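One visible consequence of the default `Normalize`: `ProximalAcquisitionFunction` now measures proximity in the transformed input space, which is why these tests transform the points before computing the expected weight. A sketch of the reference computation (the helper is hypothetical and assumes a model with an `input_transform`, as in the test above):

import torch
from torch.distributions import MultivariateNormal

def expected_proximal_weight(gp, test_X, last_X, proximal_weights):
    # Map the candidates and the most recent training point into the
    # model's (normalized) input space before weighting.
    last_X_t = gp.input_transform.transform(last_X)
    test_X_t = gp.input_transform.transform(test_X)
    mvn = MultivariateNormal(last_X_t, torch.diag(proximal_weights))
    return torch.exp(mvn.log_prob(test_X_t)) / torch.exp(mvn.log_prob(last_X_t))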