From 0836fb51d1c220aaeb02480e2feb1b20bb80f410 Mon Sep 17 00:00:00 2001
From: Jerry Lin
Date: Tue, 26 Apr 2022 00:05:58 -0700
Subject: [PATCH] PairwiseGP modularization and PairwiseLogitLikelihood

Summary:
Modularize PairwiseGP's likelihood and add logit likelihood support in
addition to the original probit likelihood

Differential Revision: D35921953

fbshipit-source-id: 3136a3ed5f81100df8b37368609f73e03af5dc8f
---
 botorch/models/likelihoods/__init__.py |  12 ++
 botorch/models/likelihoods/pairwise.py | 161 +++++++++++++++++++++++
 botorch/models/pairwise_gp.py          | 173 +++++++------------------
 3 files changed, 217 insertions(+), 129 deletions(-)
 create mode 100644 botorch/models/likelihoods/__init__.py
 create mode 100644 botorch/models/likelihoods/pairwise.py

diff --git a/botorch/models/likelihoods/__init__.py b/botorch/models/likelihoods/__init__.py
new file mode 100644
index 0000000000..03e53f0dc6
--- /dev/null
+++ b/botorch/models/likelihoods/__init__.py
@@ -0,0 +1,12 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from botorch.models.likelihoods.pairwise import PairwiseProbitLikelihood
+
+
+__all__ = [
+    "PairwiseProbitLikelihood",
+]
diff --git a/botorch/models/likelihoods/pairwise.py b/botorch/models/likelihoods/pairwise.py
new file mode 100644
index 0000000000..80abc1d44d
--- /dev/null
+++ b/botorch/models/likelihoods/pairwise.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Tuple, Any
+
+import torch
+from gpytorch.likelihoods import Likelihood
+from torch import Tensor
+from torch.distributions import Bernoulli
+
+
+class PairwiseLikelihood(Likelihood):
+    """Pairwise likelihood base class for Laplace approximation-based PairwiseGP class"""
+
+    def forward(self, utility: Tensor, D: Tensor, **kwargs: Any) -> Bernoulli:
+        """Given the difference in (estimated) utility util_diff = f(v) - f(u),
+        return a Bernoulli distribution object representing the likelihood of
+        the user preferring v over u."""
+        return Bernoulli(probs=self.p(utility=utility, D=D, log=False))
+
+    def p(self, utility: Tensor, D: Tensor, log: bool = False) -> Tensor:
+        """Given the difference in (estimated) utility util_diff = f(v) - f(u),
+        return the probability of the user preferring v over u.
+
+        Args:
+            utility: A Tensor of shape `(batch_size x) n`, the utility at the MAP point
+            D: D is `(batch_size x) m x n` matrix with all elements being zero in last
+                dimension except at two positions D[..., i] = 1 and D[..., j] = -1
+                respectively, representing item i is preferred over item j.
+            log: if true, return log probability
+        """
+        raise NotImplementedError
+
+    def negative_log_gradient_sum(self, utility: Tensor, D: Tensor) -> Tensor:
+        """Calculate the gradient of the negative log likelihood with respect to each
+            item's latent utility value, summed over all comparisons.
+        Args:
+            utility: A Tensor of shape `(batch_size x) n`, the utility at the MAP point
+            D: D is `(batch_size x) m x n` matrix with all elements being zero in last
+                dimension except at two positions D[..., i] = 1 and D[..., j] = -1
+                respectively, representing item i is preferred over item j.
+
+        Returns:
+            A `(batch_size x) n` Tensor representing the gradient of the negative log
+            likelihood summed over all comparisons (i.e., the m dimension) with
+            respect to each item.
+        """
+        raise NotImplementedError
+
+    def negative_log_hessian_sum(self, utility: Tensor, D: Tensor) -> Tensor:
+        """Calculate the Hessian of the negative log likelihood with respect to each
+            item's latent utility value, summed over all comparisons.
+        Args:
+            utility: A Tensor of shape `(batch_size x) n`, the utility at the MAP point
+            D: D is `(batch_size x) m x n` matrix with all elements being zero in last
+                dimension except at two positions D[..., i] = 1 and D[..., j] = -1
+                respectively, representing item i is preferred over item j.
+
+        Returns:
+            A `(batch_size x) n x n` Tensor representing the Hessian of the negative log
+            likelihood summed over all comparisons (i.e., the m dimension) with
+            respect to each item.
+        """
+        raise NotImplementedError
+
+
+class PairwiseProbitLikelihood(PairwiseLikelihood):
+    # Clamping z for better numerical stability. See self._calc_z for detail.
+    # norm_cdf(z=3) ~= 0.999, i.e., only the top ~0.1 percent is clamped
+    _zlim = 3
+
+    def _calc_z(self, utility: Tensor, D: Tensor) -> Tensor:
+        scaled_util = (utility / math.sqrt(2)).to(D)
+        z = D @ scaled_util
+        z = z.clamp(-self._zlim, self._zlim)
+        return z
+
+    def _calc_z_derived(self, z: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
+        std_norm = torch.distributions.normal.Normal(
+            torch.zeros(1, dtype=z.dtype, device=z.device),
+            torch.ones(1, dtype=z.dtype, device=z.device),
+        )
+        z_logpdf = std_norm.log_prob(z)
+        z_cdf = std_norm.cdf(z)
+        z_logcdf = torch.log(z_cdf)
+        hazard = torch.exp(z_logpdf - z_logcdf)
+        return z_logpdf, z_logcdf, hazard
+
+    def p(self, utility: Tensor, D: Tensor, log: bool = False) -> Tensor:
+        z = self._calc_z(utility=utility, D=D)
+        std_norm = torch.distributions.normal.Normal(
+            torch.zeros(1, dtype=z.dtype, device=z.device),
+            torch.ones(1, dtype=z.dtype, device=z.device),
+        )
+        z_cdf = std_norm.cdf(z)
+        return torch.log(z_cdf) if log else z_cdf
+
+    def negative_log_gradient_sum(self, utility: Tensor, D: Tensor) -> Tensor:
+        # Compute the sum over the m comparisons of the gradient of the negative
+        # log-likelihood with respect to the utility f. The per-comparison gradient
+        # is of dimension m x n, as in (6) from [Chu2005preference]_; summing over
+        # the m dimension yields an n-dimensional gradient for the utility.
+        z = self._calc_z(utility, D)
+        _, _, h = self._calc_z_derived(z)
+        h_factor = h / math.sqrt(2)
+        grad = h_factor @ (-D)
+
+        return grad
+
+    def negative_log_hessian_sum(self, utility: Tensor, D: Tensor) -> Tensor:
+        # The Hessian of the negative log-likelihood with respect to the utility is
+        # of dimension m x n x n, as in (7) from [Chu2005preference]_. Summing over
+        # the first (m) dimension returns a tensor of shape n x n; this is computed
+        # below by weighting D^T with the per-comparison factor h * (h + z) / 2
+        # and multiplying by D.
+        DT = D.T
+        z = self._calc_z(utility, D)
+        _, _, h = self._calc_z_derived(z)
+        mul_factor = h * (h + z) / 2
+        weighted_DT = DT * mul_factor.unsqueeze(-2).expand(*DT.size())
+        hess = weighted_DT @ D
+
+        return hess
+
+
+class PairwiseLogitLikelihood(PairwiseLikelihood):
+    # Clamping the logit for better numerical stability. See self._calc_logit for detail.
+    # logistic(4 - (-4)) = logistic(8) ~= 0.9997, i.e., the top ~0.03 percent
+    _logit_lim = 4
+
+    def _calc_logit(self, utility: Tensor, D: Tensor) -> Tensor:
+        logit = D @ utility.to(D)
+        logit = logit.clamp(-self._logit_lim, self._logit_lim)
+        return logit
+
+    def p(self, utility: Tensor, D: Tensor, log: bool = False) -> Tensor:
+        logit = self._calc_logit(utility=utility, D=D)
+        probs = torch.sigmoid(logit)
+        return torch.log(probs) if log else probs
+
+    def negative_log_gradient_sum(self, utility: Tensor, D: Tensor) -> Tensor:
+        winner_indices = (D == 1).nonzero(as_tuple=True)[-1]
+        loser_indices = (D == -1).nonzero(as_tuple=True)[-1]
+        ex, ey = torch.exp(utility[winner_indices]), torch.exp(utility[loser_indices])
+        unsigned_grad = ey / (ex + ey)
+        grad = unsigned_grad @ (-D)
+        return grad
+
+    def negative_log_hessian_sum(self, utility: Tensor, D: Tensor) -> Tensor:
+        DT = D.T
+        neg_logit = -(D @ utility)
+        term = torch.sigmoid(neg_logit)
+        unsigned_hess = term - (term) ** 2
+        weighted_DT = DT * unsigned_hess.unsqueeze(-2).expand(*DT.size())
+        hess = weighted_DT @ D
+
+        return hess
diff --git a/botorch/models/pairwise_gp.py b/botorch/models/pairwise_gp.py
index 21efd42ed6..eb92459431 100644
--- a/botorch/models/pairwise_gp.py
+++ b/botorch/models/pairwise_gp.py
@@ -20,7 +20,6 @@
 
 from __future__ import annotations
 
-import math
 import warnings
 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -28,6 +27,10 @@
 import numpy as np
 import torch
 from botorch.acquisition.objective import PosteriorTransform
+from botorch.models.likelihoods.pairwise import (
+    PairwiseLikelihood,
+    PairwiseProbitLikelihood,
+)
 from botorch.models.model import Model
 from botorch.models.transforms.input import InputTransform
 from botorch.posteriors.gpytorch import GPyTorchPosterior
@@ -80,6 +83,7 @@ def __init__(
         self,
         datapoints: Tensor,
         comparisons: Tensor,
+        likelihood: Optional[PairwiseLikelihood] = None,
         covar_module: Optional[Module] = None,
         input_transform: Optional[InputTransform] = None,
         **kwargs,
@@ -106,7 +110,9 @@ def __init__(
         # Compatibility variables with fit_gpytorch_*: Dummy likelihood
         # Likelihood is tightly tied with this model and
         # it doesn't make much sense to keep it separate
-        self.likelihood = None
+        self.likelihood = (
+            PairwiseProbitLikelihood() if likelihood is None else likelihood
+        )
 
         for key in self._buffer_names:
             self.register_buffer(key, None)
@@ -126,9 +132,6 @@ def __init__(
         # Set optional parameters
         # jitter to add for numerical stability
         self._jitter = kwargs.get("jitter", 1e-6)
-        # Clamping z lim for better numerical stability. See self._calc_z for detail
-        # norm_cdf(z=3) ~= 0.999, top 0.1% percent
-        self._zlim = kwargs.get("zlim", 3)
         # Stopping creteria in scipy.optimize.fsolve used to find f_map in _update()
         # If None, set to 1e-6 by default in _update
         self._xtol = kwargs.get("xtol")
@@ -299,81 +302,6 @@ def _add_jitter(self, X: Tensor) -> Tensor:
             )
         return X
 
-    def _calc_z(
-        self, utility: Tensor, D: Tensor
-    ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
-        r"""Calculate z score.
-
-        Calculate z score as in [Chu2005preference]_: the standarized difference
-
-        Args:
-            utility: A Tensor of shape `batch_size x n`, the utility at MAP point
-            D: as in self.D
-
-        Returns:
-            z: z score calculated as in [Chu2005preference]_.
- z_logpdf: log PDF of z - z_logcdf: log CDF of z - hazard: hazard function defined as pdf(z)/cdf(z) - """ - - scaled_util = (utility / math.sqrt(2)).unsqueeze(-1).to(D) - z = (D @ scaled_util).squeeze(-1) - std_norm = torch.distributions.normal.Normal( - torch.zeros(1, dtype=z.dtype, device=z.device), - torch.ones(1, dtype=z.dtype, device=z.device), - ) - # Clamp z for stable log transformation. This also prevent extreme values - # from appearing in the hess matrix, which should help with numerical - # stability and avoid extreme fitted hyperparameters - z = z.clamp(-self._zlim, self._zlim) - z_logpdf = std_norm.log_prob(z) - z_cdf = std_norm.cdf(z) - z_logcdf = torch.log(z_cdf) - hazard = torch.exp(z_logpdf - z_logcdf) - return z, z_logpdf, z_logcdf, hazard - - def _grad_likelihood_f_sum(self, utility: Tensor, D: Tensor) -> Tensor: - r"""Compute the sum over of grad. of negative Log-LH wrt utility f. - Original grad should be of dimension m x n, as in (6) from [Chu2005preference]_. - Sum over the first dimension and return a tensor of shape n - Needed for calculating utility value at f_map to fill in torch gradient - - Args: - utility: A Tensor of shape `batch_size x n` - D: A Tensor of shape `batch_size x m x n` as in self.D - - Returns: - The sum over the first dimension of grad. of negative Log-LH wrt utility f - """ - _, _, _, h = self._calc_z(utility, D) - h_factor = (h / math.sqrt(2)).unsqueeze(-2) - grad = (h_factor @ (-D)).squeeze(-2) - - return grad - - def _hess_likelihood_f_sum(self, utility: Tensor, D: Tensor, DT: Tensor) -> Tensor: - r"""Compute the sum over of hessian of neg. Log-LH wrt utility f. - - Original hess should be of dimension m x n x n, as in (7) from - [Chu2005preference]_ Sum over the first dimension and return a tensor of - shape n x n. - - Args: - utility: A Tensor of shape `batch_size x n` - D: A Tensor of shape `batch_size x m x n` as in self.D - DT: Transpose of D. A Tensor of shape `batch_size x n x m` as in self.DT - - Returns: - The sum over the first dimension of hess. of negative Log-LH wrt utility f - """ - z, _, _, h = self._calc_z(utility, D) - mul_factor = h * (h + z) / 2 - weighted_DT = DT * mul_factor.unsqueeze(-2).expand(*DT.size()) - hess = weighted_DT @ D - - return hess - def _grad_posterior_f( self, utility: Union[Tensor, np.ndarray], @@ -405,7 +333,7 @@ def _grad_posterior_f( utility = torch.tensor(utility, dtype=self.datapoints.dtype) prior_mean = prior_mean.cpu() - b = self._grad_likelihood_f_sum(utility, D) + b = self.likelihood.negative_log_gradient_sum(utility=utility, D=D) # g_ = covar_inv x (utility - pred_prior) p = (utility - prior_mean).unsqueeze(-1).to(covar_chol) @@ -445,27 +373,10 @@ def _hess_posterior_f( if ret_np: utility = torch.tensor(utility, dtype=self.datapoints.dtype) - hl = self._hess_likelihood_f_sum(utility, D, DT) + hl = self.likelihood.negative_log_hessian_sum(utility=utility, D=D) hess = hl + covar_inv return hess.numpy() if ret_np else hess - def _posterior_f(self, utility: Union[Tensor, np.ndarray]) -> Tensor: - r"""Calculate the negative of the log posterior, i.e., -log(P(f|D)). - - This is the S loss function as in equation (10) of [Chu2005preference]_. 
- - Args: - utility: A Tensor of shape `batch_size x n` - """ - _, _, z_logcdf, _ = self._calc_z(utility, self.D) - loss1 = -(torch.sum(z_logcdf, dim=-1)) - inv_prod = torch.cholesky_solve(utility.unsqueeze(-1), self.covar_chol) - loss2 = 0.5 * (utility.unsqueeze(-2) @ inv_prod).squeeze(-1).squeeze(-1) - loss = loss1 + loss2 - loss = loss.clamp(min=0) - - return loss - def _update_utility_derived_values(self) -> None: r"""Calculate utility-derived values not needed during optimization @@ -572,7 +483,9 @@ def _update(self, datapoints: Tensor, **kwargs) -> None: # when calling forward() in order to obtain correct gradients # self.likelihood_hess is updated here is for the rare case where we # do not want to call forward() - self.likelihood_hess = self._hess_likelihood_f_sum(f, self.D, self.DT) + self.likelihood_hess = self.likelihood.negative_log_hessian_sum( + utility=f, D=self.D + ) # Lazy update hlcov_eye, which is used in calculating posterior during training self.pred_cov_fac_need_update = True @@ -618,7 +531,7 @@ def _util_newton_updates(self, dp, x0, max_iter=1, xtol=None) -> Tensor: This is used in `forward` to calculate and fill in gradient into tensors. Instead of doing utility -= H^-1 @ g, use substition method. - See more explanation in _update_utility_derived_values.dd + See more explanation in _update_utility_derived_values. By default only need to run one iteration just to fill the the gradients. Args: @@ -641,7 +554,9 @@ def _util_newton_updates(self, dp, x0, max_iter=1, xtol=None) -> Tensor: x = x0 eye = None while i < max_iter and diff > xtol: - hl = self._hess_likelihood_f_sum(x, D, DT) + hl = self.likelihood_hess = self.likelihood.negative_log_hessian_sum( + utility=x, D=D + ) cov_hl = covar @ hl if eye is None: eye = torch.eye( @@ -763,7 +678,7 @@ def set_train_data( self.n = self.datapoints.shape[-2] # num datapoints self.m = self.comparisons.shape[-2] # num pairwise comparisons self.utility = None - # D is batch_sizem x n or num_comparison x num_datapoints. + # D is batch_size x m x n or num_comparison x num_datapoints. # D_k_i is the s_k(x_i) value as in equation (6) in [Chu2005preference]_ # D will usually be very sparse as well # TODO swap out scatter_ so that comparisons could be int instead of long @@ -859,9 +774,7 @@ def forward(self, datapoints: Tensor) -> MultivariateNormal: transformed_dp, self.utility, max_iter=1 ) - hl = self.likelihood_hess = self._hess_likelihood_f_sum( - self.utility, self.D, self.DT - ) + hl = self.likelihood_hess covar = self.covar # Apply matrix inversion lemma on eq. in page 27 of [Brochu2010tutorial]_ # (A + B)^-1 = A^-1 - A^-1 @ (I + BA^-1)^-1 @ BA^-1 @@ -1032,17 +945,10 @@ def condition_on_observations(self, X: Tensor, Y: Tensor, **kwargs: Any) -> Mode class PairwiseLaplaceMarginalLogLikelihood(MarginalLogLikelihood): - def __init__(self, model: PairwiseGP) -> None: - r"""Laplace-approximated marginal log likelihood/evidence for PairwiseGP - - See (12) from [Chu2005preference]_. + r"""Laplace-approximated marginal log likelihood/evidence for PairwiseGP - Args: - model: A model using laplace approximation (currently only supports - `PairwiseGP`) - """ - # Do not use likelihood module here as it's implicitly included in the model - super().__init__(None, model) + See (12) from [Chu2005preference]_. 
+ """ def forward(self, post: Posterior, comp: Tensor) -> Tensor: r"""Calculate approximated log evidence, i.e., log(P(D|theta)) @@ -1056,28 +962,37 @@ def forward(self, post: Posterior, comp: Tensor) -> Tensor: """ model = self.model + likelihood = self.likelihood if comp is not model.comparisons: raise RuntimeError("Must train on training data") - f_max = post.mean - log_posterior = model._posterior_f(f_max) - part1 = -log_posterior + f_map = post.mean.squeeze(-1) + + log_likelihood = likelihood.p(utility=f_map, D=model.D, log=True) + log_likelihood = -(torch.sum(log_likelihood, dim=-1)) + + # 1/2 f_map^T @ covar_inv @ f_map + inv_prod = torch.cholesky_solve(f_map.unsqueeze(-1), model.covar_chol) + log_prior = 0.5 * (f_map.unsqueeze(-2) @ inv_prod).squeeze(-1).squeeze(-1) + log_posterior = log_likelihood + log_prior + # log_posterior is the S loss function in [Chu2005preference]_ + log_posterior = -log_posterior.clamp(min=0) - part2 = model.covar @ model.likelihood_hess + mll = model.covar @ model.likelihood_hess eye = torch.eye( - part2.size(-1), dtype=model.datapoints.dtype, device=model.datapoints.device - ).expand(part2.shape) - part2 = part2 + eye - part2 = -0.5 * torch.logdet(part2) + mll.size(-1), dtype=model.datapoints.dtype, device=model.datapoints.device + ).expand(mll.shape) + mll = mll + eye + mll = -0.5 * torch.logdet(mll) - evidence = part1 + part2 + mll = mll + log_posterior - # Sum up mll first so that when adding prior probs it won't + # Sum up mll first so that when adding parameter prior probs it won't # propagate and double count - evidence = evidence.sum() + mll = mll.sum() # Add log probs of priors on the (functions of) parameters for _, module, prior, closure, _ in self.named_priors(): - evidence = evidence.add(prior.log_prob(closure(module)).sum()) + mll = mll.add(prior.log_prob(closure(module)).sum()) - return evidence + return mll
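
Usage sketch (editor's addition, not part of the patch): the snippet below illustrates how the new likelihood argument is expected to be used once this change lands, swapping PairwiseLogitLikelihood in for the default PairwiseProbitLikelihood and fitting via the Laplace-approximated MLL. The toy data and the comparison-construction code are illustrative assumptions, and fit_gpytorch_model is the standard BoTorch fitting entry point rather than anything introduced by this diff.

    # Illustrative sketch only; the toy data below is assumed for demonstration.
    import torch

    from botorch.fit import fit_gpytorch_model
    from botorch.models.likelihoods.pairwise import PairwiseLogitLikelihood
    from botorch.models.pairwise_gp import (
        PairwiseGP,
        PairwiseLaplaceMarginalLogLikelihood,
    )

    # Toy problem: 6 random 2-d items whose latent utility is the negative norm.
    train_X = torch.rand(6, 2, dtype=torch.double)
    true_utility = -train_X.norm(dim=-1)

    # Build all pairwise comparisons; each row is [winner_index, loser_index].
    pairs = torch.combinations(torch.arange(6), r=2)
    flip = true_utility[pairs[:, 0]] < true_utility[pairs[:, 1]]
    comparisons = torch.where(flip.unsqueeze(-1), pairs.flip(-1), pairs)

    # Pass the new likelihood explicitly; omitting it keeps the probit default.
    model = PairwiseGP(train_X, comparisons, likelihood=PairwiseLogitLikelihood())

    # After this patch the MLL is constructed with the model's likelihood module,
    # using the base MarginalLogLikelihood signature (likelihood, model).
    mll = PairwiseLaplaceMarginalLogLikelihood(model.likelihood, model)
    fit_gpytorch_model(mll)

    # Posterior over latent utilities at new candidate points.
    posterior = model.posterior(torch.rand(3, 2, dtype=torch.double))

For reference, the two likelihoods differ only in the link function: PairwiseProbitLikelihood pushes the utility difference through the standard normal CDF after scaling by 1/sqrt(2) (see _calc_z), while PairwiseLogitLikelihood applies a sigmoid to the raw difference (see _calc_logit).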