From 0836fb51d1c220aaeb02480e2feb1b20bb80f410 Mon Sep 17 00:00:00 2001
From: Jerry Lin
Date: Tue, 26 Apr 2022 00:05:58 -0700
Subject: [PATCH] PairwiseGP modularization and PairwiseLogitLikelihood

Summary:
Modularize PairwiseGP's likelihood and add logit likelihood support in
addition to the original probit likelihood

Differential Revision: D35921953

fbshipit-source-id: 3136a3ed5f81100df8b37368609f73e03af5dc8f
---
 botorch/models/likelihoods/__init__.py |  12 ++
 botorch/models/likelihoods/pairwise.py | 161 +++++++++++++++++++++++
 botorch/models/pairwise_gp.py          | 173 +++++++------------------
 3 files changed, 217 insertions(+), 129 deletions(-)
 create mode 100644 botorch/models/likelihoods/__init__.py
 create mode 100644 botorch/models/likelihoods/pairwise.py

diff --git a/botorch/models/likelihoods/__init__.py b/botorch/models/likelihoods/__init__.py
new file mode 100644
index 0000000000..03e53f0dc6
--- /dev/null
+++ b/botorch/models/likelihoods/__init__.py
@@ -0,0 +1,12 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from botorch.models.likelihoods.pairwise import PairwiseProbitLikelihood
+
+
+__all__ = [
+    "PairwiseProbitLikelihood",
+]
diff --git a/botorch/models/likelihoods/pairwise.py b/botorch/models/likelihoods/pairwise.py
new file mode 100644
index 0000000000..80abc1d44d
--- /dev/null
+++ b/botorch/models/likelihoods/pairwise.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Tuple, Any
+
+import torch
+from gpytorch.likelihoods import Likelihood
+from torch import Tensor
+from torch.distributions import Bernoulli
+
+
+class PairwiseLikelihood(Likelihood):
+    """Pairwise likelihood base class for Laplace approximation-based PairwiseGP class"""
+
+    def forward(self, utility: Tensor, D: Tensor, **kwargs: Any) -> Bernoulli:
+        """Given the difference in (estimated) utility util_diff = f(v) - f(u),
+        return a Bernoulli distribution object representing the likelihood of
+        the user preferring v over u."""
+        return Bernoulli(probs=self.p(utility=utility, D=D, log=False))
+
+    def p(self, utility: Tensor, D: Tensor, log: bool = False) -> Tensor:
+        """Given the difference in (estimated) utility util_diff = f(v) - f(u),
+        return the probability of the user preferring v over u.
+
+        Args:
+            utility: A Tensor of shape `(batch_size x) n`, the utility at the MAP point
+            D: D is `(batch_size x) m x n` matrix with all elements being zero in last
+                dimension except at two positions D[..., i] = 1 and D[..., j] = -1
+                respectively, representing item i is preferred over item j.
+            log: if true, return log probability
+        """
+        raise NotImplementedError
+
+    def negative_log_gradient_sum(self, utility: Tensor, D: Tensor) -> Tensor:
+        """Calculate the gradient of the negative log likelihood with respect to each
+            item's latent utility value, summed over all comparisons.
+        Args:
+            utility: A Tensor of shape `(batch_size x) n`, the utility at the MAP point
+            D: D is `(batch_size x) m x n` matrix with all elements being zero in last
+                dimension except at two positions D[..., i] = 1 and D[..., j] = -1
+                respectively, representing item i is preferred over item j.
+
+        Returns:
+            A `(batch_size x) n` Tensor representing the gradient of the negative log
+            likelihood summed over all comparisons (i.e., the m dimension) with
+            respect to each item.
+        """
+        raise NotImplementedError
+
+    def negative_log_hessian_sum(self, utility: Tensor, D: Tensor) -> Tensor:
+        """Calculate the Hessian of the negative log likelihood with respect to each
+            item's latent utility value, summed over all comparisons.
+        Args:
+            utility: A Tensor of shape `(batch_size x) n`, the utility at the MAP point
+            D: D is `(batch_size x) m x n` matrix with all elements being zero in last
+                dimension except at two positions D[..., i] = 1 and D[..., j] = -1
+                respectively, representing item i is preferred over item j.
+
+        Returns:
+            A `(batch_size x) n x n` Tensor representing the Hessian of the negative log
+            likelihood summed over all comparisons (i.e., the m dimension) with
+            respect to each item.
+        """
+        raise NotImplementedError
+
+
+class PairwiseProbitLikelihood(PairwiseLikelihood):
+    # Clamping z for better numerical stability. See self._calc_z for detail.
+    # norm_cdf(z=3) ~= 0.999, i.e., only the top ~0.1 percent is clamped
+    _zlim = 3
+
+    def _calc_z(self, utility: Tensor, D: Tensor) -> Tensor:
+        scaled_util = (utility / math.sqrt(2)).to(D)
+        z = D @ scaled_util
+        z = z.clamp(-self._zlim, self._zlim)
+        return z
+
+    def _calc_z_derived(self, z: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
+        std_norm = torch.distributions.normal.Normal(
+            torch.zeros(1, dtype=z.dtype, device=z.device),
+            torch.ones(1, dtype=z.dtype, device=z.device),
+        )
+        z_logpdf = std_norm.log_prob(z)
+        z_cdf = std_norm.cdf(z)
+        z_logcdf = torch.log(z_cdf)
+        hazard = torch.exp(z_logpdf - z_logcdf)
+        return z_logpdf, z_logcdf, hazard
+
+    def p(self, utility: Tensor, D: Tensor, log: bool = False) -> Tensor:
+        z = self._calc_z(utility=utility, D=D)
+        std_norm = torch.distributions.normal.Normal(
+            torch.zeros(1, dtype=z.dtype, device=z.device),
+            torch.ones(1, dtype=z.dtype, device=z.device),
+        )
+        z_cdf = std_norm.cdf(z)
+        return torch.log(z_cdf) if log else z_cdf
+
+    def negative_log_gradient_sum(self, utility: Tensor, D: Tensor) -> Tensor:
+        # Compute the sum over the m comparisons of the gradient of the negative
+        # log-likelihood with respect to the utility f. The per-comparison gradient
+        # is of dimension m x n, as in (6) from [Chu2005preference]_; summing over
+        # the m dimension yields an n-dimensional gradient for the utility.
+        z = self._calc_z(utility, D)
+        _, _, h = self._calc_z_derived(z)
+        h_factor = h / math.sqrt(2)
+        grad = h_factor @ (-D)
+
+        return grad
+
+    def negative_log_hessian_sum(self, utility: Tensor, D: Tensor) -> Tensor:
+        # The Hessian of the negative log-likelihood with respect to the utility is
+        # of dimension m x n x n, as in (7) from [Chu2005preference]_. Summing over
+        # the first (m) dimension returns a tensor of shape n x n; this is computed
+        # below by weighting D^T with the per-comparison factor h * (h + z) / 2
+        # and multiplying by D.
+        DT = D.T
+        z = self._calc_z(utility, D)
+        _, _, h = self._calc_z_derived(z)
+        mul_factor = h * (h + z) / 2
+        weighted_DT = DT * mul_factor.unsqueeze(-2).expand(*DT.size())
+        hess = weighted_DT @ D
+
+        return hess
+
+
+class PairwiseLogitLikelihood(PairwiseLikelihood):
+    # Clamping the logit for better numerical stability. See self._calc_logit for detail.
+    # logistic(4 - (-4)) = logistic(8) ~= 0.9997, i.e., the top ~0.03 percent
+    _logit_lim = 4
+
+    def _calc_logit(self, utility: Tensor, D: Tensor) -> Tensor:
+        logit = D @ utility.to(D)
+        logit = logit.clamp(-self._logit_lim, self._logit_lim)
+        return logit
+
+    def p(self, utility: Tensor, D: Tensor, log: bool = False) -> Tensor:
+        logit = self._calc_logit(utility=utility, D=D)
+        probs = torch.sigmoid(logit)
+        return torch.log(probs) if log else probs
+
+    def negative_log_gradient_sum(self, utility: Tensor, D: Tensor) -> Tensor:
+        winner_indices = (D == 1).nonzero(as_tuple=True)[-1]
+        loser_indices = (D == -1).nonzero(as_tuple=True)[-1]
+        ex, ey = torch.exp(utility[winner_indices]), torch.exp(utility[loser_indices])
+        unsigned_grad = ey / (ex + ey)
+        grad = unsigned_grad @ (-D)
+        return grad
+
+    def negative_log_hessian_sum(self, utility: Tensor, D: Tensor) -> Tensor:
+        DT = D.T
+        neg_logit = -(D @ utility)
+        term = torch.sigmoid(neg_logit)
+        unsigned_hess = term - (term) ** 2
+        weighted_DT = DT * unsigned_hess.unsqueeze(-2).expand(*DT.size())
+        hess = weighted_DT @ D
+
+        return hess
diff --git a/botorch/models/pairwise_gp.py b/botorch/models/pairwise_gp.py
index 21efd42ed6..eb92459431 100644
--- a/botorch/models/pairwise_gp.py
+++ b/botorch/models/pairwise_gp.py
@@ -20,7 +20,6 @@
 
 from __future__ import annotations
 
-import math
 import warnings
 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -28,6 +27,10 @@
 import numpy as np
 import torch
 from botorch.acquisition.objective import PosteriorTransform
+from botorch.models.likelihoods.pairwise import (
+    PairwiseLikelihood,
+    PairwiseProbitLikelihood,
+)
 from botorch.models.model import Model
 from botorch.models.transforms.input import InputTransform
 from botorch.posteriors.gpytorch import GPyTorchPosterior
@@ -80,6 +83,7 @@ def __init__(
         self,
         datapoints: Tensor,
         comparisons: Tensor,
+        likelihood: Optional[PairwiseLikelihood] = None,
         covar_module: Optional[Module] = None,
         input_transform: Optional[InputTransform] = None,
         **kwargs,
@@ -106,7 +110,9 @@ def __init__(
         # Compatibility variables with fit_gpytorch_*: Dummy likelihood
         # Likelihood is tightly tied with this model and
         # it doesn't make much sense to keep it separate
-        self.likelihood = None
+        self.likelihood = (
+            PairwiseProbitLikelihood() if likelihood is None else likelihood
+        )
 
         for key in self._buffer_names:
             self.register_buffer(key, None)
@@ -126,9 +132,6 @@ def __init__(
         # Set optional parameters
         # jitter to add for numerical stability
         self._jitter = kwargs.get("jitter", 1e-6)
-        # Clamping z lim for better numerical stability. See self._calc_z for detail
-        # norm_cdf(z=3) ~= 0.999, top 0.1% percent
-        self._zlim = kwargs.get("zlim", 3)
         # Stopping creteria in scipy.optimize.fsolve used to find f_map in _update()
         # If None, set to 1e-6 by default in _update
         self._xtol = kwargs.get("xtol")
@@ -299,81 +302,6 @@ def _add_jitter(self, X: Tensor) -> Tensor:
             )
         return X
 
-    def _calc_z(
-        self, utility: Tensor, D: Tensor
-    ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
-        r"""Calculate z score.
-
-        Calculate z score as in [Chu2005preference]_: the standarized difference
-
-        Args:
-            utility: A Tensor of shape `batch_size x n`, the utility at MAP point
-            D: as in self.D
-
-        Returns:
-            z: z score calculated as in [Chu2005preference]_.
- z_logpdf: log PDF of z - z_logcdf: log CDF of z - hazard: hazard function defined as pdf(z)/cdf(z) - """ - - scaled_util = (utility / math.sqrt(2)).unsqueeze(-1).to(D) - z = (D @ scaled_util).squeeze(-1) - std_norm = torch.distributions.normal.Normal( - torch.zeros(1, dtype=z.dtype, device=z.device), - torch.ones(1, dtype=z.dtype, device=z.device), - ) - # Clamp z for stable log transformation. This also prevent extreme values - # from appearing in the hess matrix, which should help with numerical - # stability and avoid extreme fitted hyperparameters - z = z.clamp(-self._zlim, self._zlim) - z_logpdf = std_norm.log_prob(z) - z_cdf = std_norm.cdf(z) - z_logcdf = torch.log(z_cdf) - hazard = torch.exp(z_logpdf - z_logcdf) - return z, z_logpdf, z_logcdf, hazard - - def _grad_likelihood_f_sum(self, utility: Tensor, D: Tensor) -> Tensor: - r"""Compute the sum over of grad. of negative Log-LH wrt utility f. - Original grad should be of dimension m x n, as in (6) from [Chu2005preference]_. - Sum over the first dimension and return a tensor of shape n - Needed for calculating utility value at f_map to fill in torch gradient - - Args: - utility: A Tensor of shape `batch_size x n` - D: A Tensor of shape `batch_size x m x n` as in self.D - - Returns: - The sum over the first dimension of grad. of negative Log-LH wrt utility f - """ - _, _, _, h = self._calc_z(utility, D) - h_factor = (h / math.sqrt(2)).unsqueeze(-2) - grad = (h_factor @ (-D)).squeeze(-2) - - return grad - - def _hess_likelihood_f_sum(self, utility: Tensor, D: Tensor, DT: Tensor) -> Tensor: - r"""Compute the sum over of hessian of neg. Log-LH wrt utility f. - - Original hess should be of dimension m x n x n, as in (7) from - [Chu2005preference]_ Sum over the first dimension and return a tensor of - shape n x n. - - Args: - utility: A Tensor of shape `batch_size x n` - D: A Tensor of shape `batch_size x m x n` as in self.D - DT: Transpose of D. A Tensor of shape `batch_size x n x m` as in self.DT - - Returns: - The sum over the first dimension of hess. of negative Log-LH wrt utility f - """ - z, _, _, h = self._calc_z(utility, D) - mul_factor = h * (h + z) / 2 - weighted_DT = DT * mul_factor.unsqueeze(-2).expand(*DT.size()) - hess = weighted_DT @ D - - return hess - def _grad_posterior_f( self, utility: Union[Tensor, np.ndarray], @@ -405,7 +333,7 @@ def _grad_posterior_f( utility = torch.tensor(utility, dtype=self.datapoints.dtype) prior_mean = prior_mean.cpu() - b = self._grad_likelihood_f_sum(utility, D) + b = self.likelihood.negative_log_gradient_sum(utility=utility, D=D) # g_ = covar_inv x (utility - pred_prior) p = (utility - prior_mean).unsqueeze(-1).to(covar_chol) @@ -445,27 +373,10 @@ def _hess_posterior_f( if ret_np: utility = torch.tensor(utility, dtype=self.datapoints.dtype) - hl = self._hess_likelihood_f_sum(utility, D, DT) + hl = self.likelihood.negative_log_hessian_sum(utility=utility, D=D) hess = hl + covar_inv return hess.numpy() if ret_np else hess - def _posterior_f(self, utility: Union[Tensor, np.ndarray]) -> Tensor: - r"""Calculate the negative of the log posterior, i.e., -log(P(f|D)). - - This is the S loss function as in equation (10) of [Chu2005preference]_. 
- - Args: - utility: A Tensor of shape `batch_size x n` - """ - _, _, z_logcdf, _ = self._calc_z(utility, self.D) - loss1 = -(torch.sum(z_logcdf, dim=-1)) - inv_prod = torch.cholesky_solve(utility.unsqueeze(-1), self.covar_chol) - loss2 = 0.5 * (utility.unsqueeze(-2) @ inv_prod).squeeze(-1).squeeze(-1) - loss = loss1 + loss2 - loss = loss.clamp(min=0) - - return loss - def _update_utility_derived_values(self) -> None: r"""Calculate utility-derived values not needed during optimization @@ -572,7 +483,9 @@ def _update(self, datapoints: Tensor, **kwargs) -> None: # when calling forward() in order to obtain correct gradients # self.likelihood_hess is updated here is for the rare case where we # do not want to call forward() - self.likelihood_hess = self._hess_likelihood_f_sum(f, self.D, self.DT) + self.likelihood_hess = self.likelihood.negative_log_hessian_sum( + utility=f, D=self.D + ) # Lazy update hlcov_eye, which is used in calculating posterior during training self.pred_cov_fac_need_update = True @@ -618,7 +531,7 @@ def _util_newton_updates(self, dp, x0, max_iter=1, xtol=None) -> Tensor: This is used in `forward` to calculate and fill in gradient into tensors. Instead of doing utility -= H^-1 @ g, use substition method. - See more explanation in _update_utility_derived_values.dd + See more explanation in _update_utility_derived_values. By default only need to run one iteration just to fill the the gradients. Args: @@ -641,7 +554,9 @@ def _util_newton_updates(self, dp, x0, max_iter=1, xtol=None) -> Tensor: x = x0 eye = None while i < max_iter and diff > xtol: - hl = self._hess_likelihood_f_sum(x, D, DT) + hl = self.likelihood_hess = self.likelihood.negative_log_hessian_sum( + utility=x, D=D + ) cov_hl = covar @ hl if eye is None: eye = torch.eye( @@ -763,7 +678,7 @@ def set_train_data( self.n = self.datapoints.shape[-2] # num datapoints self.m = self.comparisons.shape[-2] # num pairwise comparisons self.utility = None - # D is batch_sizem x n or num_comparison x num_datapoints. + # D is batch_size x m x n or num_comparison x num_datapoints. # D_k_i is the s_k(x_i) value as in equation (6) in [Chu2005preference]_ # D will usually be very sparse as well # TODO swap out scatter_ so that comparisons could be int instead of long @@ -859,9 +774,7 @@ def forward(self, datapoints: Tensor) -> MultivariateNormal: transformed_dp, self.utility, max_iter=1 ) - hl = self.likelihood_hess = self._hess_likelihood_f_sum( - self.utility, self.D, self.DT - ) + hl = self.likelihood_hess covar = self.covar # Apply matrix inversion lemma on eq. in page 27 of [Brochu2010tutorial]_ # (A + B)^-1 = A^-1 - A^-1 @ (I + BA^-1)^-1 @ BA^-1 @@ -1032,17 +945,10 @@ def condition_on_observations(self, X: Tensor, Y: Tensor, **kwargs: Any) -> Mode class PairwiseLaplaceMarginalLogLikelihood(MarginalLogLikelihood): - def __init__(self, model: PairwiseGP) -> None: - r"""Laplace-approximated marginal log likelihood/evidence for PairwiseGP - - See (12) from [Chu2005preference]_. + r"""Laplace-approximated marginal log likelihood/evidence for PairwiseGP - Args: - model: A model using laplace approximation (currently only supports - `PairwiseGP`) - """ - # Do not use likelihood module here as it's implicitly included in the model - super().__init__(None, model) + See (12) from [Chu2005preference]_. 
+ """ def forward(self, post: Posterior, comp: Tensor) -> Tensor: r"""Calculate approximated log evidence, i.e., log(P(D|theta)) @@ -1056,28 +962,37 @@ def forward(self, post: Posterior, comp: Tensor) -> Tensor: """ model = self.model + likelihood = self.likelihood if comp is not model.comparisons: raise RuntimeError("Must train on training data") - f_max = post.mean - log_posterior = model._posterior_f(f_max) - part1 = -log_posterior + f_map = post.mean.squeeze(-1) + + log_likelihood = likelihood.p(utility=f_map, D=model.D, log=True) + log_likelihood = -(torch.sum(log_likelihood, dim=-1)) + + # 1/2 f_map^T @ covar_inv @ f_map + inv_prod = torch.cholesky_solve(f_map.unsqueeze(-1), model.covar_chol) + log_prior = 0.5 * (f_map.unsqueeze(-2) @ inv_prod).squeeze(-1).squeeze(-1) + log_posterior = log_likelihood + log_prior + # log_posterior is the S loss function in [Chu2005preference]_ + log_posterior = -log_posterior.clamp(min=0) - part2 = model.covar @ model.likelihood_hess + mll = model.covar @ model.likelihood_hess eye = torch.eye( - part2.size(-1), dtype=model.datapoints.dtype, device=model.datapoints.device - ).expand(part2.shape) - part2 = part2 + eye - part2 = -0.5 * torch.logdet(part2) + mll.size(-1), dtype=model.datapoints.dtype, device=model.datapoints.device + ).expand(mll.shape) + mll = mll + eye + mll = -0.5 * torch.logdet(mll) - evidence = part1 + part2 + mll = mll + log_posterior - # Sum up mll first so that when adding prior probs it won't + # Sum up mll first so that when adding parameter prior probs it won't # propagate and double count - evidence = evidence.sum() + mll = mll.sum() # Add log probs of priors on the (functions of) parameters for _, module, prior, closure, _ in self.named_priors(): - evidence = evidence.add(prior.log_prob(closure(module)).sum()) + mll = mll.add(prior.log_prob(closure(module)).sum()) - return evidence + return mll
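
Usage sketch (editor's addition, not part of the patch): the snippet below illustrates how the new likelihood argument is expected to be used once this change lands, swapping PairwiseLogitLikelihood in for the default PairwiseProbitLikelihood and fitting via the Laplace-approximated MLL. The toy data and the comparison-construction code are illustrative assumptions, and fit_gpytorch_model is the standard BoTorch fitting entry point rather than anything introduced by this diff.

    # Illustrative sketch only; the toy data below is assumed for demonstration.
    import torch

    from botorch.fit import fit_gpytorch_model
    from botorch.models.likelihoods.pairwise import PairwiseLogitLikelihood
    from botorch.models.pairwise_gp import (
        PairwiseGP,
        PairwiseLaplaceMarginalLogLikelihood,
    )

    # Toy problem: 6 random 2-d items whose latent utility is the negative norm.
    train_X = torch.rand(6, 2, dtype=torch.double)
    true_utility = -train_X.norm(dim=-1)

    # Build all pairwise comparisons; each row is [winner_index, loser_index].
    pairs = torch.combinations(torch.arange(6), r=2)
    flip = true_utility[pairs[:, 0]] < true_utility[pairs[:, 1]]
    comparisons = torch.where(flip.unsqueeze(-1), pairs.flip(-1), pairs)

    # Pass the new likelihood explicitly; omitting it keeps the probit default.
    model = PairwiseGP(train_X, comparisons, likelihood=PairwiseLogitLikelihood())

    # After this patch the MLL is constructed with the model's likelihood module,
    # using the base MarginalLogLikelihood signature (likelihood, model).
    mll = PairwiseLaplaceMarginalLogLikelihood(model.likelihood, model)
    fit_gpytorch_model(mll)

    # Posterior over latent utilities at new candidate points.
    posterior = model.posterior(torch.rand(3, 2, dtype=torch.double))

For reference, the two likelihoods differ only in the link function: PairwiseProbitLikelihood pushes the utility difference through the standard normal CDF after scaling by 1/sqrt(2) (see _calc_z), while PairwiseLogitLikelihood applies a sigmoid to the raw difference (see _calc_logit).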