diff --git a/botorch/acquisition/max_value_entropy_search.py b/botorch/acquisition/max_value_entropy_search.py index 7fbc73cfcf..0db0e34a54 100644 --- a/botorch/acquisition/max_value_entropy_search.py +++ b/botorch/acquisition/max_value_entropy_search.py @@ -13,8 +13,8 @@ .. [Moss2021gibbon] Moss, H. B., et al., - GIBBON: General-purpose Information-Based Bayesian OptimisatioN - arXiv:2102.03324, 2021 + GIBBON: General-purpose Information-Based Bayesian OptimisatioN. + Journal of Machine Learning Research, 2021. .. [Takeno2020mfmves] S. Takeno, H. Fukuoka, Y. Tsukada, T. Koyama, M. Shiga, I. Takeuchi, diff --git a/botorch/models/approximate_gp.py b/botorch/models/approximate_gp.py index 7d1f70a834..2b1bad5a2d 100644 --- a/botorch/models/approximate_gp.py +++ b/botorch/models/approximate_gp.py @@ -13,22 +13,25 @@ Journal of Machine Learning Research, 2020, http://jmlr.org/papers/v21/19-1015.html. -.. [chen2018dpp] - Laming Chen and Guoxin Zhang and Hanning Zhou, Fast greedy MAP inference - for determinantal point process to improve recommendation diversity, - Proceedings of the 32nd International Conference on Neural Information - Processing Systems, 2018, https://arxiv.org/abs/1709.05135. - .. [hensman2013svgp] James Hensman and Nicolo Fusi and Neil D. Lawrence, Gaussian Processes for Big Data, Proceedings of the 29th Conference on Uncertainty in Artificial Intelligence, 2013, https://arxiv.org/abs/1309.6835. +.. [moss2023ipa] + Henry B. Moss and Sebastian W. Ober and Victor Picheny, + Inducing Point Allocation for Sparse Gaussian Processes + in High-Throughput Bayesian Optimization,Proceedings of + the 25th International Conference on Artificial Intelligence + and Statistics, 2023, https://arxiv.org/pdf/2301.10123.pdf. + """ from __future__ import annotations import copy +import warnings + from typing import Optional, Type, Union import torch @@ -36,6 +39,10 @@ from botorch.models.transforms.input import InputTransform from botorch.models.transforms.outcome import OutcomeTransform from botorch.models.utils import validate_input_scaling +from botorch.models.utils.inducing_point_allocators import ( + GreedyVarianceReduction, + InducingPointAllocator, +) from botorch.posteriors.gpytorch import GPyTorchPosterior from gpytorch.constraints import GreaterThan from gpytorch.distributions import MultivariateNormal @@ -47,7 +54,6 @@ ) from gpytorch.means import ConstantMean, Mean from gpytorch.models import ApproximateGP -from gpytorch.module import Module from gpytorch.priors import GammaPrior from gpytorch.utils.memoize import clear_cache_hook from gpytorch.variational import ( @@ -57,12 +63,10 @@ IndependentMultitaskVariationalStrategy, VariationalStrategy, ) -from linear_operator.operators import LinearOperator from torch import Tensor MIN_INFERRED_NOISE_LEVEL = 1e-4 -NEG_INF = -(torch.tensor(float("inf"))) class ApproximateGPyTorchModel(GPyTorchModel): @@ -148,7 +152,8 @@ class _SingleTaskVariationalGP(ApproximateGP): Base class wrapper for a stochastic variational Gaussian Process (SVGP) model [hensman2013svgp]_. - Uses pivoted Cholesky initialization for the inducing points. + Uses by default pivoted Cholesky initialization for allocating inducing points, + however, custom inducing point allocators can be provided. 
""" def __init__( @@ -162,6 +167,7 @@ def __init__( variational_distribution: Optional[_VariationalDistribution] = None, variational_strategy: Type[_VariationalStrategy] = VariationalStrategy, inducing_points: Optional[Union[Tensor, int]] = None, + inducing_point_allocator: Optional[InducingPointAllocator] = None, ) -> None: r""" Args: @@ -179,6 +185,9 @@ def __init__( VariationalStrategy). The default setting uses "whitening" of the variational distribution to make training easier. inducing_points: The number or specific locations of the inducing points. + inducing_point_allocator: The `InducingPointAllocator` used to + initialize the inducing point locations. If omitted, + uses `GreedyVarianceReduction`. """ # We use the model subclass wrapper to deal with input / outcome transforms. # The number of outputs will be correct here due to the check in @@ -209,14 +218,17 @@ def __init__( "covar_module.base_kernel.raw_lengthscale": -3, } - # initialize inducing points with a pivoted cholesky init if they are not given + if inducing_point_allocator is None: + inducing_point_allocator = GreedyVarianceReduction() + + # initialize inducing points if they are not given if not isinstance(inducing_points, Tensor): if inducing_points is None: # number of inducing points is 25% the number of data points # as a heuristic inducing_points = int(0.25 * train_X.shape[-2]) - inducing_points = _select_inducing_points( + inducing_points = inducing_point_allocator.allocate_inducing_points( inputs=train_X, covar_module=covar_module, num_inducing=inducing_points, @@ -255,8 +267,14 @@ def forward(self, X) -> MultivariateNormal: class SingleTaskVariationalGP(ApproximateGPyTorchModel): - r"""A single-task variational GP model following [hensman2013svgp]_ with pivoted - Cholesky initialization following [chen2018dpp]_ and [burt2020svgp]_. + r"""A single-task variational GP model following [hensman2013svgp]_. + + By default, the inducing points are initialized though the + `GreedyVarianceReduction` of [burt2020svgp]_, which is known to be + effective for building globally accurate models. However, custom + inducing point allocators designed for specific down-stream tasks can also be + provided (see [moss2023ipa]_ for details), e.g. `GreedyImprovementReduction` + when the goal is to build a model suitable for standard BO. A single-task variational GP using relatively strong priors on the Kernel hyperparameters, which work best when covariates are normalized to the unit @@ -299,6 +317,7 @@ def __init__( inducing_points: Optional[Union[Tensor, int]] = None, outcome_transform: Optional[OutcomeTransform] = None, input_transform: Optional[InputTransform] = None, + inducing_point_allocator: Optional[InducingPointAllocator] = None, ) -> None: r""" Args: @@ -319,6 +338,9 @@ def __init__( VariationalStrategy). The default setting uses "whitening" of the variational distribution to make training easier. inducing_points: The number or specific locations of the inducing points. + inducing_point_allocator: The `InducingPointAllocator` used to + initialize the inducing point locations. If omitted, + uses `GreedyVarianceReduction`. """ with torch.no_grad(): transformed_X = self.transform_inputs( @@ -357,6 +379,19 @@ def __init__( else: self._is_custom_likelihood = True + if learn_inducing_points and (inducing_point_allocator is not None): + warnings.warn( + "After all the effort of specifying an inducing point allocator, " + "you probably want to stop the inducing point locations " + "being further optimized during the model fit. 
If so " + "then set `learn_inducing_points` to False.", + UserWarning, + ) + + if inducing_point_allocator is None: + inducing_point_allocator = GreedyVarianceReduction() + self._inducing_point_allocator = inducing_point_allocator + model = _SingleTaskVariationalGP( train_X=transformed_X, train_Y=train_Y, @@ -367,6 +402,7 @@ def __init__( variational_distribution=variational_distribution, variational_strategy=variational_strategy, inducing_points=inducing_points, + inducing_point_allocator=self._inducing_point_allocator, ) super().__init__(model=model, likelihood=likelihood, num_outputs=num_outputs) @@ -390,7 +426,7 @@ def init_inducing_points( ) -> Tensor: r""" Reinitialize the inducing point locations in-place with the current kernel - applied to `inputs`. + applied to `inputs` through the model's inducing point allocation strategy. The variational distribution and variational strategy caches are reset. Args: @@ -407,7 +443,7 @@ def init_inducing_points( with torch.no_grad(): num_inducing = var_strat.inducing_points.size(-2) - inducing_points = _select_inducing_points( + inducing_points = self._inducing_point_allocator.allocate_inducing_points( inputs=inputs, covar_module=self.model.covar_module, num_inducing=num_inducing, @@ -417,131 +453,3 @@ def init_inducing_points( var_strat.variational_params_initialized.fill_(0) return inducing_points - - -def _select_inducing_points( - inputs: Tensor, - covar_module: Module, - num_inducing: int, - input_batch_shape: torch.Size, -) -> Tensor: - r""" - Utility function that evaluates a kernel at given inputs and selects inducing point - locations based on the pivoted Cholesky heuristic. - - Args: - inputs: A (*batch_shape, n, d)-dim input data tensor. - covar_module: GPyTorch Module returning a LinearOperator kernel matrix. - num_inducing: The maximun number (m) of inducing points (m <= n). - input_batch_shape: The non-task-related batch shape. - - Returns: - A (*batch_shape, m, d)-dim tensor of inducing point locations. 
- """ - - train_train_kernel = covar_module(inputs).evaluate_kernel() - - # base case - if train_train_kernel.ndimension() == 2: - inducing_points = _pivoted_cholesky_init( - train_inputs=inputs, - kernel_matrix=train_train_kernel, - max_length=num_inducing, - ) - # multi-task case - elif train_train_kernel.ndimension() == 3 and len(input_batch_shape) == 0: - input_element = inputs[0] if inputs.ndimension() == 3 else inputs - kernel_element = train_train_kernel[0] - inducing_points = _pivoted_cholesky_init( - train_inputs=input_element, - kernel_matrix=kernel_element, - max_length=num_inducing, - ) - # batched input cases - else: - batched_inputs = ( - inputs.expand(*input_batch_shape, -1, -1) - if inputs.ndimension() == 2 - else inputs - ) - reshaped_inputs = batched_inputs.flatten(end_dim=-3) - inducing_points = [] - for input_element in reshaped_inputs: - # the extra kernel evals are a little wasteful but make it - # easier to infer the task batch size - kernel_element = covar_module(input_element).evaluate_kernel() - # handle extra task batch dimension - kernel_element = ( - kernel_element[0] - if kernel_element.ndimension() == 3 - else kernel_element - ) - inducing_points.append( - _pivoted_cholesky_init( - train_inputs=input_element, - kernel_matrix=kernel_element, - max_length=num_inducing, - ) - ) - inducing_points = torch.stack(inducing_points).view( - *input_batch_shape, num_inducing, -1 - ) - - return inducing_points - - -def _pivoted_cholesky_init( - train_inputs: Tensor, - kernel_matrix: Union[Tensor, LinearOperator], - max_length: int, - epsilon: float = 1e-6, -) -> Tensor: - r""" - A pivoted cholesky initialization method for the inducing points, - originally proposed in [burt2020svgp]_ with the algorithm itself coming from - [chen2018dpp]_. Code is a PyTorch version from [chen2018dpp]_, copied from - https://github.com/laming-chen/fast-map-dpp/blob/master/dpp.py. - - Args: - train_inputs: training inputs (of shape n x d) - kernel_matrix: kernel matrix on the training - inputs - max_length: number of inducing points to initialize - epsilon: numerical jitter for stability. - - Returns: - max_length x d tensor of the training inputs corresponding to the top - max_length pivots of the training kernel matrix - """ - - # this is numerically equivalent to iteratively performing a pivoted cholesky - # while storing the diagonal pivots at each iteration - # TODO: use gpytorch's pivoted cholesky instead once that gets an exposed list - # TODO: ensure this works in batch mode, which it does not currently. 
- - item_size = kernel_matrix.shape[-2] - cis = torch.zeros( - (max_length, item_size), device=kernel_matrix.device, dtype=kernel_matrix.dtype - ) - di2s = kernel_matrix.diag() - selected_items = [] - selected_item = torch.argmax(di2s) - selected_items.append(selected_item) - - while len(selected_items) < max_length: - k = len(selected_items) - 1 - ci_optimal = cis[:k, selected_item] - di_optimal = torch.sqrt(di2s[selected_item]) - elements = kernel_matrix[..., selected_item, :] - eis = (elements - torch.matmul(ci_optimal, cis[:k, :])) / di_optimal - cis[k, :] = eis - di2s = di2s - eis.pow(2.0) - di2s[selected_item] = NEG_INF - selected_item = torch.argmax(di2s) - if di2s[selected_item] < epsilon: - break - selected_items.append(selected_item) - - ind_points = train_inputs[torch.stack(selected_items)] - - return ind_points diff --git a/botorch/models/utils/inducing_point_allocators.py b/botorch/models/utils/inducing_point_allocators.py new file mode 100644 index 0000000000..6269339653 --- /dev/null +++ b/botorch/models/utils/inducing_point_allocators.py @@ -0,0 +1,339 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +r""" +Functionality for allocating the inducing points of sparse Gaussian +process models. + +References + +.. [chen2018dpp] + Laming Chen and Guoxin Zhang and Hanning Zhou, Fast greedy MAP inference + for determinantal point process to improve recommendation diversity, + Proceedings of the 32nd International Conference on Neural Information + Processing Systems, 2018, https://arxiv.org/abs/1709.05135. + +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Union + +import torch +from botorch.models.model import Model + +from botorch.utils.probability.utils import ndtr as Phi, phi +from gpytorch.module import Module +from linear_operator.operators import LinearOperator +from torch import Tensor + +NEG_INF = torch.tensor(float("-inf")) + + +class InducingPointAllocator(ABC): + r""" + This class provides functionality to initialize the inducing point locations + of an inducing point-based model, e.g. a `SingleTaskVariationalGP`. + """ + + @abstractmethod + def _get_quality_function( + self, + ) -> QualityFunction: + """ + Build the quality function required for this inducing point allocation strategy. + + Returns: + A quality function. + """ + + pass # pragma: no cover + + def allocate_inducing_points( + self, + inputs: Tensor, + covar_module: Module, + num_inducing: int, + input_batch_shape: torch.Size, + ) -> Tensor: + r""" + Initialize the `num_inducing` inducing point locations according to a + specific initialization strategy, weighting the candidate input + locations by this allocator's quality function. + + Args: + inputs: A (\*batch_shape, n, d)-dim input data tensor. + covar_module: GPyTorch Module returning a LinearOperator kernel matrix. + num_inducing: The maximum number (m) of inducing points (m <= n). + input_batch_shape: The non-task-related batch shape. + + Returns: + A (\*batch_shape, m, d)-dim tensor of inducing point locations.
+ """ + quality_function = self._get_quality_function() + covar_module = covar_module.to(inputs.device) + + train_train_kernel = covar_module(inputs).evaluate_kernel() + + # base case + if train_train_kernel.ndimension() == 2: + quality_scores = quality_function(inputs) + inducing_points = _pivoted_cholesky_init( + train_inputs=inputs, + kernel_matrix=train_train_kernel, + max_length=num_inducing, + quality_scores=quality_scores, + ) + # multi-task case + elif train_train_kernel.ndimension() == 3 and len(input_batch_shape) == 0: + quality_scores = quality_function(inputs) + input_element = inputs[0] if inputs.ndimension() == 3 else inputs + kernel_element = train_train_kernel[0] + quality_scores = quality_function(input_element) + inducing_points = _pivoted_cholesky_init( + train_inputs=input_element, + kernel_matrix=kernel_element, + max_length=num_inducing, + quality_scores=quality_scores, + ) + # batched input cases + else: + batched_inputs = ( + inputs.expand(*input_batch_shape, -1, -1) + if inputs.ndimension() == 2 + else inputs + ) + reshaped_inputs = batched_inputs.flatten(end_dim=-3) + inducing_points = [] + for input_element in reshaped_inputs: + # the extra kernel evals are a little wasteful but make it + # easier to infer the task batch size + kernel_element = covar_module(input_element).evaluate_kernel() + # handle extra task batch dimension + kernel_element = ( + kernel_element[0] + if kernel_element.ndimension() == 3 + else kernel_element + ) + quality_scores = quality_function(input_element) + inducing_points.append( + _pivoted_cholesky_init( + train_inputs=input_element, + kernel_matrix=kernel_element, + max_length=num_inducing, + quality_scores=quality_scores, + ) + ) + inducing_points = torch.stack(inducing_points).view( + *input_batch_shape, num_inducing, -1 + ) + + return inducing_points + + +class QualityFunction(ABC): + """A function that scores inputs with respect + to a specific criterion.""" + + @abstractmethod + def __call__(self, inputs: Tensor) -> Tensor: # [n, d] -> [n] + """ + Args: + inputs: inputs (of shape n x d) + + Returns: + A tensor of quality scores for each input, of shape [n] + """ + + pass # pragma: no cover + + +class UnitQualityFunction(QualityFunction): + """ + A function returning ones for each element. Using this quality function + for inducing point allocation corresponds to allocating inducing points + with the sole aim of minimizing predictive variance, i.e. the approach + of [burt2020svgp]_. + """ + + @torch.no_grad() + def __call__(self, inputs: Tensor) -> Tensor: # [n, d]-> [n] + """ + Args: + inputs: inputs (of shape n x d) + + Returns: + A tensor of ones for each input, of shape [n] + """ + return torch.ones([inputs.shape[0]], device=inputs.device, dtype=inputs.dtype) + + +class ExpectedImprovementQualityFunction(QualityFunction): + """ + A function measuring the quality of input points as their expected + improvement with respect to a conservative baseline. Expectations + are according to the model from the previous BO step. See [moss2023ipa]_ + for details and justification. + """ + + def __init__(self, model: Model, maximize: bool): + r""" + Args: + model: The model fitted during the previous BO step. For now, this + must be a single task model (i.e. num_outputs=1). + maximize: Set True if we are performing function maximization, else + set False. + """ + if model.num_outputs != 1: + raise NotImplementedError( + "Multi-output models are currently not supported. 
" + ) + self._model = model + self._maximize = maximize + + @torch.no_grad() + def __call__(self, inputs: Tensor) -> Tensor: # [n, d] -> [n] + """ + Args: + inputs: inputs (of shape n x d) + + Returns: + A tensor of quality scores for each input, of shape [n] + """ + + posterior = self._model.posterior(inputs) + mean = posterior.mean.squeeze(-2).squeeze(-1) # removing redundant dimensions + sigma = posterior.variance.clamp_min(1e-12).sqrt().view(mean.shape) + + best_f = torch.max(mean) if self._maximize else torch.min(mean) + u = (mean - best_f) / sigma if self._maximize else -(mean - best_f) / sigma + return sigma * (phi(u) + u * Phi(u)) + + +class GreedyVarianceReduction(InducingPointAllocator): + r""" + The inducing point allocator proposed by [burt2020svgp]_, that + greedily chooses inducing point locations with maximal (conditional) + predictive variance. + """ + + def _get_quality_function( + self, + ) -> QualityFunction: + """ + Build the unit quality function required for the greedy variance + reduction inducing point allocation strategy. + + Returns: + A quality function. + """ + + return UnitQualityFunction() + + +class GreedyImprovementReduction(InducingPointAllocator): + r""" + An inducing point allocator that greedily chooses inducing points with large + predictive variance and that are in promising regions of the search + space (according to the model form the previous BO step), see [moss2023ipa]_. + """ + + def __init__(self, model: Model, maximize: bool): + r""" + + Args: + model: The model fitted during the previous BO step. + maximize: Set True if we are performing function maximization, else + set False. + """ + self._model = model + self._maximize = maximize + + def _get_quality_function( + self, + ) -> QualityFunction: + """ + Build the improvement-based quality function required for the greedy + improvement reduction inducing point allocation strategy. + + Returns: + A quality function. + """ + + return ExpectedImprovementQualityFunction(self._model, self._maximize) + + +def _pivoted_cholesky_init( + train_inputs: Tensor, + kernel_matrix: Union[Tensor, LinearOperator], + max_length: int, + quality_scores: Tensor, + epsilon: float = 1e-6, +) -> Tensor: + r""" + A pivoted Cholesky initialization method for the inducing points, + originally proposed in [burt2020svgp]_ with the algorithm itself coming from + [chen2018dpp]_. Code is a PyTorch version from [chen2018dpp]_, based on + https://github.com/laming-chen/fast-map-dpp/blob/master/dpp.py but with a small + modification to allow the underlying DPP to be defined through its diversity-quality + decomposition,as discussed by [moss2023ipa]_. This method returns a greedy + approximation of the MAP estimate of the specified DPP, i.e. its returns a + set of points that are highly diverse (according to the provided kernel_matrix) + and have high quality (according to the provided quality_scores). + + Args: + train_inputs: training inputs (of shape n x d) + kernel_matrix: kernel matrix on the training inputs + max_length: number of inducing points to initialize + quality_scores: scores representing the quality of each candidate + input (of shape [n]) + epsilon: numerical jitter for stability. 
+ + Returns: + max_length x d tensor of the training inputs corresponding to the top + max_length pivots of the training kernel matrix + """ + + # this is numerically equivalent to iteratively performing a pivoted cholesky + # while storing the diagonal pivots at each iteration + # TODO: use gpytorch's pivoted cholesky instead once that gets an exposed list + # TODO: ensure this works in batch mode, which it does not currently. + + # todo test for shape of quality function + + if quality_scores.shape[0] != train_inputs.shape[0]: + raise ValueError( + "_pivoted_cholesky_init requires a quality score for each of train_inputs" + ) + + item_size = kernel_matrix.shape[-2] + cis = torch.zeros( + (max_length, item_size), device=kernel_matrix.device, dtype=kernel_matrix.dtype + ) + di2s = kernel_matrix.diag() + scores = di2s * (quality_scores**2) + selected_items = [] + selected_item = torch.argmax(scores) + selected_items.append(selected_item) + + while len(selected_items) < max_length: + k = len(selected_items) - 1 + ci_optimal = cis[:k, selected_item] + di_optimal = torch.sqrt(di2s[selected_item]) + elements = kernel_matrix[..., selected_item, :] + eis = (elements - torch.matmul(ci_optimal, cis[:k, :])) / di_optimal + cis[k, :] = eis + di2s = di2s - eis.pow(2.0) + di2s[selected_item] = NEG_INF + scores = di2s * (quality_scores**2) + selected_item = torch.argmax(scores) + if di2s[selected_item] < epsilon: + break + selected_items.append(selected_item) + + ind_points = train_inputs[torch.stack(selected_items)] + + return ind_points[:max_length, :] diff --git a/sphinx/source/models.rst b/sphinx/source/models.rst index 533f6fe52c..c7f3c12730 100644 --- a/sphinx/source/models.rst +++ b/sphinx/source/models.rst @@ -165,6 +165,12 @@ Model Conversion .. automodule:: botorch.models.converter :members: +Inducing Point Allocators +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: botorch.models.utils.inducing_point_allocators + :members: + :private-members: _pivoted_cholesky_init + Other Utilties ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
automodule:: botorch.models.utils.assorted diff --git a/test/models/test_approximate_gp.py b/test/models/test_approximate_gp.py index 5c1b5daabb..40d10ca29d 100644 --- a/test/models/test_approximate_gp.py +++ b/test/models/test_approximate_gp.py @@ -14,6 +14,10 @@ ) from botorch.models.transforms.input import Normalize from botorch.models.transforms.outcome import Log +from botorch.models.utils.inducing_point_allocators import ( + GreedyImprovementReduction, + GreedyVarianceReduction, +) from botorch.posteriors import GPyTorchPosterior, TransformedPosterior from botorch.utils.testing import BotorchTestCase from gpytorch.likelihoods import GaussianLikelihood, MultitaskGaussianLikelihood @@ -181,6 +185,27 @@ def test_initializations(self): else: self.assertFalse(hasattr(model, "outcome_transform")) + # test default inducing point allocator + self.assertIsInstance(model._inducing_point_allocator, GreedyVarianceReduction) + + # test that can specify an inducing point allocator + for ipa in [ + GreedyVarianceReduction(), + GreedyImprovementReduction(model, maximize=True), + ]: + model = SingleTaskVariationalGP(train_X, inducing_point_allocator=ipa) + self.assertTrue(type(model._inducing_point_allocator), type(ipa)) + + # test warning when learning on and custom IPA provided + with self.assertWarnsRegex( + UserWarning, r"set `learn_inducing_points` to False" + ): + SingleTaskVariationalGP( + train_X, + learn_inducing_points=True, + inducing_point_allocator=GreedyVarianceReduction(), + ) + def test_inducing_point_init(self): train_X_1 = torch.rand(15, 1, device=self.device) train_X_2 = torch.rand(15, 1, device=self.device) @@ -193,6 +218,8 @@ def test_inducing_point_init(self): model_2 = SingleTaskVariationalGP(train_X=train_X_2, inducing_points=5) model_2_inducing = model_2.model.variational_strategy.inducing_points + self.assertEqual(model_1_inducing.shape, (5, 1)) + self.assertEqual(model_2_inducing.shape, (5, 1)) self.assertAllClose(model_1_inducing, model_2_inducing) # multi-task @@ -211,6 +238,8 @@ def test_inducing_point_init(self): model_2.model.variational_strategy.base_variational_strategy.inducing_points ) + self.assertEqual(model_1_inducing.shape, (5, 1)) + self.assertEqual(model_2_inducing.shape, (5, 1)) self.assertAllClose(model_1_inducing, model_2_inducing) # batched inputs @@ -223,12 +252,55 @@ def test_inducing_point_init(self): ) model_1.init_inducing_points(train_X_2) model_1_inducing = model_1.model.variational_strategy.inducing_points - model_2 = SingleTaskVariationalGP( train_X=train_X_2, train_Y=train_Y, inducing_points=5 ) model_2_inducing = model_2.model.variational_strategy.inducing_points - self.assertTrue(model_1_inducing.shape == (2, 5, 1)) - self.assertTrue(model_2_inducing.shape == (2, 5, 1)) + self.assertEqual(model_1_inducing.shape, (2, 5, 1)) + self.assertEqual(model_2_inducing.shape, (2, 5, 1)) + self.assertAllClose(model_1_inducing, model_2_inducing) + + def test_custom_inducing_point_init(self): + train_X_0 = torch.rand(15, 1, device=self.device) + train_X_1 = torch.rand(15, 1, device=self.device) + train_X_2 = torch.rand(15, 1, device=self.device) + train_X_3 = torch.rand(15, 1, device=self.device) + + model_from_previous_step = SingleTaskVariationalGP( + train_X=train_X_0, inducing_points=5 + ) + + model_1 = SingleTaskVariationalGP( + train_X=train_X_1, + inducing_points=5, + inducing_point_allocator=GreedyImprovementReduction( + model_from_previous_step, maximize=True + ), + ) + model_1.init_inducing_points(train_X_2) + model_1_inducing = 
model_1.model.variational_strategy.inducing_points + + model_2 = SingleTaskVariationalGP( + train_X=train_X_2, + inducing_points=5, + inducing_point_allocator=GreedyImprovementReduction( + model_from_previous_step, maximize=True + ), + ) + model_2_inducing = model_2.model.variational_strategy.inducing_points + + model_3 = SingleTaskVariationalGP( + train_X=train_X_3, + inducing_points=5, + inducing_point_allocator=GreedyImprovementReduction( + model_from_previous_step, maximize=False + ), + ) + model_3.init_inducing_points(train_X_2) + model_3_inducing = model_3.model.variational_strategy.inducing_points + + self.assertEqual(model_1_inducing.shape, (5, 1)) + self.assertEqual(model_2_inducing.shape, (5, 1)) self.assertAllClose(model_1_inducing, model_2_inducing) + self.assertFalse(model_1_inducing[0, 0] == model_3_inducing[0, 0]) diff --git a/test/models/utils/test_inducing_point_allocators.py b/test/models/utils/test_inducing_point_allocators.py new file mode 100644 index 0000000000..4873629fe5 --- /dev/null +++ b/test/models/utils/test_inducing_point_allocators.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import torch +from botorch.models.approximate_gp import SingleTaskVariationalGP +from botorch.models.utils.inducing_point_allocators import ( + _pivoted_cholesky_init, + ExpectedImprovementQualityFunction, + GreedyImprovementReduction, + GreedyVarianceReduction, + UnitQualityFunction, +) +from botorch.utils.testing import BotorchTestCase + +from gpytorch.kernels import MaternKernel +from gpytorch.likelihoods import GaussianLikelihood +from gpytorch.mlls import VariationalELBO + + +class TestUnitQualityFunction(BotorchTestCase): + def setUp(self): + super().setUp() + self.quality_function = UnitQualityFunction() + + def test_returns_ones_and_correct_shape(self): + train_X = torch.rand(15, 1, device=self.device) + scores = self.quality_function(train_X) + self.assertTrue(torch.equal(scores, torch.ones([15], device=self.device))) + + +class TestExpectedImprovementQualityFunction(BotorchTestCase): + def setUp(self): + super().setUp() + train_X = torch.rand(10, 1, device=self.device) + train_y = torch.sin(train_X) + torch.randn_like(train_X) * 0.2 + + self.previous_model = SingleTaskVariationalGP( + train_X=train_X, likelihood=GaussianLikelihood() + ).to(self.device) + + mll = VariationalELBO( + self.previous_model.likelihood, self.previous_model.model, num_data=10 + ) + loss = -mll( + self.previous_model.likelihood(self.previous_model(train_X)), train_y + ).sum() + loss.backward() + + def test_returns_correct_shape(self): + train_X = torch.rand(15, 1, device=self.device) + for maximize in [True, False]: + quality_function = ExpectedImprovementQualityFunction( + self.previous_model, maximize=maximize + ) + scores = quality_function(train_X) + self.assertEqual(scores.shape, torch.Size([15])) + + def test_raises_for_multi_output_model(self): + train_X = torch.rand(15, 1, device=self.device) + mo_model = SingleTaskVariationalGP( + train_X=train_X, likelihood=GaussianLikelihood(), num_outputs=5 + ).to(self.device) + with self.assertRaises(NotImplementedError): + ExpectedImprovementQualityFunction(mo_model, maximize=True) + + def test_different_for_maximize_and_minimize(self): + train_X = torch.rand(15, 1, device=self.device) + + quality_function_for_max = ExpectedImprovementQualityFunction( + 
self.previous_model, maximize=True + ) + scores_for_max = quality_function_for_max(train_X) + + quality_function_for_min = ExpectedImprovementQualityFunction( + self.previous_model, maximize=False + ) + scores_for_min = quality_function_for_min(train_X) + + self.assertFalse(torch.equal(scores_for_min, scores_for_max)) + + def test_ei_calc_via_monte_carlo(self): + for maximize in [True, False]: + train_X = torch.rand(10, 1, device=self.device) + posterior = self.previous_model.posterior(train_X) + mean = posterior.mean.squeeze(-2).squeeze(-1) + sigma = posterior.variance.sqrt().view(mean.shape) + normal = torch.distributions.Normal(mean, sigma) + samples = normal.sample([1_000_000]) + if maximize: + baseline = torch.min(mean) + ei = torch.clamp(samples - baseline, min=0.0).mean(axis=0) + else: + baseline = torch.max(mean) + ei = torch.clamp(baseline - samples, min=0.0).mean(axis=0) + + quality_function = ExpectedImprovementQualityFunction( + self.previous_model, maximize + ) + + self.assertAllClose(ei, quality_function(train_X), atol=0.01, rtol=0.01) + + +class TestGreedyVarianceReduction(BotorchTestCase): + def setUp(self): + super().setUp() + self.ipa = GreedyVarianceReduction() + + def test_initialization(self): + self.assertIsInstance(self.ipa, GreedyVarianceReduction) + + def test_inducing_points_shape_and_repeatability(self): + + for train_X in [ + torch.rand(15, 1, device=self.device), # single task + torch.rand(2, 15, 1, device=self.device), # batched inputs + ]: + + inducing_points_1 = self.ipa.allocate_inducing_points( + inputs=train_X, + covar_module=MaternKernel(), + num_inducing=5, + input_batch_shape=torch.Size([]), + ) + + inducing_points_2 = self.ipa.allocate_inducing_points( + inputs=train_X, + covar_module=MaternKernel(), + num_inducing=5, + input_batch_shape=torch.Size([]), + ) + + if len(train_X) == 3: # batched inputs + self.assertEqual(inducing_points_1.shape, (2, 5, 1)) + self.assertEqual(inducing_points_2.shape, (2, 5, 1)) + else: + self.assertEqual(inducing_points_1.shape, (5, 1)) + self.assertEqual(inducing_points_2.shape, (5, 1)) + self.assertAllClose(inducing_points_1, inducing_points_2) + + def test_that_we_dont_get_redundant_inducing_points(self): + train_X = torch.rand(15, 1, device=self.device) + stacked_train_X = torch.cat((train_X, train_X), dim=0) + num_inducing = 20 + inducing_points_1 = self.ipa.allocate_inducing_points( + inputs=stacked_train_X, + covar_module=MaternKernel(), + num_inducing=num_inducing, + input_batch_shape=torch.Size([]), + ) + # should not have 20 inducing points when 15 singular dimensions + # are passed + self.assertLess(inducing_points_1.shape[-2], num_inducing) + + +class TestGreedyImprovementReduction(BotorchTestCase): + def setUp(self): + super().setUp() + train_X = torch.rand(10, 1, device=self.device) + train_y = torch.sin(train_X) + torch.randn_like(train_X) * 0.2 + + self.previous_model = SingleTaskVariationalGP( + train_X=train_X, likelihood=GaussianLikelihood() + ).to(self.device) + + mll = VariationalELBO( + self.previous_model.likelihood, self.previous_model.model, num_data=10 + ) + loss = -mll( + self.previous_model.likelihood(self.previous_model(train_X)), train_y + ).sum() + loss.backward() + + self.ipa = GreedyImprovementReduction(self.previous_model, maximize=True) + + def test_initialization(self): + self.assertIsInstance(self.ipa, GreedyImprovementReduction) + self.assertIsInstance(self.ipa._model, SingleTaskVariationalGP) + self.assertEqual(self.ipa._maximize, True) + + def 
test_raises_for_multi_output_model(self): + train_X = torch.rand(10, 1, device=self.device) + model = SingleTaskVariationalGP( + train_X=train_X, likelihood=GaussianLikelihood(), num_outputs=5 + ).to(self.device) + ipa = GreedyImprovementReduction(model, maximize=True) + with self.assertRaises(NotImplementedError): + ipa.allocate_inducing_points( + inputs=train_X, + covar_module=MaternKernel(), + num_inducing=5, + input_batch_shape=torch.Size([]), + ) + + def test_inducing_points_shape_and_repeatability(self): + train_X = torch.rand(15, 1, device=self.device) + + for train_X in [ + torch.rand(15, 1, device=self.device), # single task + torch.rand(2, 15, 1, device=self.device), # batched inputs + ]: + + inducing_points_1 = self.ipa.allocate_inducing_points( + inputs=train_X, + covar_module=MaternKernel(), + num_inducing=5, + input_batch_shape=torch.Size([]), + ) + + inducing_points_2 = self.ipa.allocate_inducing_points( + inputs=train_X, + covar_module=MaternKernel(), + num_inducing=5, + input_batch_shape=torch.Size([]), + ) + + if len(train_X) == 3: # batched inputs + self.assertEqual(inducing_points_1.shape, (2, 5, 1)) + self.assertEqual(inducing_points_2.shape, (2, 5, 1)) + else: + self.assertEqual(inducing_points_1.shape, (5, 1)) + self.assertEqual(inducing_points_2.shape, (5, 1)) + self.assertAllClose(inducing_points_1, inducing_points_2) + + def test_that_we_dont_get_redundant_inducing_points(self): + train_X = torch.rand(15, 1, device=self.device) + stacked_train_X = torch.cat((train_X, train_X), dim=0) + num_inducing = 20 + inducing_points_1 = self.ipa.allocate_inducing_points( + inputs=stacked_train_X, + covar_module=MaternKernel(), + num_inducing=num_inducing, + input_batch_shape=torch.Size([]), + ) + # should not have 20 inducing points when 15 singular dimensions + # are passed + self.assertLess(inducing_points_1.shape[-2], num_inducing) + + def test_inducing_points_different_when_minimizing(self): + ipa_for_max = GreedyImprovementReduction(self.previous_model, maximize=True) + ipa_for_min = GreedyImprovementReduction(self.previous_model, maximize=False) + + train_X = torch.rand(15, 1, device=self.device) + inducing_points_for_max = ipa_for_max.allocate_inducing_points( + inputs=train_X, + covar_module=MaternKernel(), + num_inducing=10, + input_batch_shape=torch.Size([]), + ) + inducing_points_for_min = ipa_for_min.allocate_inducing_points( + inputs=train_X, + covar_module=MaternKernel(), + num_inducing=10, + input_batch_shape=torch.Size([]), + ) + + self.assertFalse(torch.equal(inducing_points_for_min, inducing_points_for_max)) + + +class TestPivotedCholeskyInit(BotorchTestCase): + def test_raises_for_quality_function_with_invalid_shape(self): + with self.assertRaises(ValueError): + inputs = torch.rand(15, 1, device=self.device) + train_train_kernel = ( + MaternKernel().to(self.device)(inputs).evaluate_kernel() + ) + quality_scores = torch.ones([10, 1], device=self.device) + _pivoted_cholesky_init( + train_inputs=inputs, + kernel_matrix=train_train_kernel, + max_length=10, + quality_scores=quality_scores, + )
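Example usage of the allocator API introduced above (a minimal sketch, not part of the patch; the data, shapes, and choice of 10 inducing points are illustrative):

    import torch
    from botorch.models.approximate_gp import SingleTaskVariationalGP
    from botorch.models.utils.inducing_point_allocators import GreedyImprovementReduction

    train_X = torch.rand(50, 2)
    train_Y = torch.sin(train_X.sum(dim=-1, keepdim=True))

    # Defaults: inducing point locations are allocated by GreedyVarianceReduction
    # and then further optimized during fitting (learn_inducing_points=True).
    # Here this model also stands in for the (fitted) model from a previous BO step.
    model = SingleTaskVariationalGP(train_X=train_X, train_Y=train_Y, inducing_points=10)

    # BO-oriented allocation: weight candidate locations by expected improvement
    # under the previous model, and keep the allocated locations fixed during
    # fitting (see the warning added above).
    next_model = SingleTaskVariationalGP(
        train_X=train_X,
        train_Y=train_Y,
        inducing_points=10,
        learn_inducing_points=False,
        inducing_point_allocator=GreedyImprovementReduction(model, maximize=True),
    )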
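An allocator can also be called directly, mirroring the new unit tests (the Matern kernel and tensor shapes below are illustrative):

    import torch
    from botorch.models.utils.inducing_point_allocators import GreedyVarianceReduction
    from gpytorch.kernels import MaternKernel

    inputs = torch.rand(100, 3)
    inducing_points = GreedyVarianceReduction().allocate_inducing_points(
        inputs=inputs,
        covar_module=MaternKernel(),
        num_inducing=20,
        input_batch_shape=torch.Size([]),
    )
    # The quality-weighted pivoted Cholesky selection returns at most
    # `num_inducing` rows of `inputs`, here of shape (20, 3).
    print(inducing_points.shape)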