Fix memory leak in inducing point allocators (pytorch#1890)
Summary:
Fixes pytorch#1788.
## Motivation

`allocate_inducing_points` leaks memory when passed a `kernel_matrix` with `requires_grad=True`. The leak stems from a specific pattern of in-place torch operations in `_pivoted_cholesky_init`; see [this comment](pytorch#1788 (comment)) for a detailed explanation. There is no need for `allocate_inducing_points` to support a `kernel_matrix` with `requires_grad=True`, because the output of `allocate_inducing_points` is not differentiable anyway (due to those in-place operations).
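
As a minimal illustration of that last point (the shapes and kernel choice are made up here, and the call follows the pattern of the unit test added below), the allocated inducing points carry no autograd history, so there is nothing to gain from letting the kernel matrix track gradients:

```python
import torch
from botorch.models.utils.inducing_point_allocators import GreedyVarianceReduction
from gpytorch.kernels import MaternKernel, ScaleKernel

train_X = torch.rand(7, 3)
covar_module = ScaleKernel(MaternKernel())  # hyper-parameters require grad

allocator = GreedyVarianceReduction()
inducing_points = allocator.allocate_inducing_points(
    train_X, covar_module, 4, train_X.shape[:-2]
)

# The selection is built with argmax and in-place updates, so the result has
# no grad_fn; it cannot be back-propagated through the kernel hyper-parameters.
assert inducing_points.grad_fn is None
```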

- [x] Make `_pivoted_cholesky_init` raise an `UnsupportedError` when passed a `kernel_matrix` with `requires_grad=True`. This is mildly BC-breaking, but that seems acceptable since the alternative is a memory leak.
- [x] Evaluate kernels under `torch.no_grad()` wherever their output is only passed to `_pivoted_cholesky_init` (see the usage sketch after this list).
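
A short sketch of the resulting contract for direct callers of `_pivoted_cholesky_init` (sizes are illustrative; the calls mirror the updated unit tests): evaluate the kernel under `torch.no_grad()`, or expect an `UnsupportedError`.

```python
import torch
from botorch.exceptions.errors import UnsupportedError
from botorch.models.utils.inducing_point_allocators import _pivoted_cholesky_init
from gpytorch.kernels import MaternKernel

train_X = torch.rand(15, 1)
quality_scores = torch.ones(15)

# Supported: the kernel matrix is evaluated without tracking gradients.
with torch.no_grad():
    kernel_matrix = MaternKernel()(train_X).evaluate_kernel()
inducing_points = _pivoted_cholesky_init(
    train_inputs=train_X,
    kernel_matrix=kernel_matrix,
    max_length=10,
    quality_scores=quality_scores,
)

# No longer supported: a gradient-tracking kernel matrix now raises instead
# of silently leaking memory.
try:
    _pivoted_cholesky_init(
        train_inputs=train_X,
        kernel_matrix=MaternKernel()(train_X).evaluate_kernel(),
        max_length=10,
        quality_scores=quality_scores,
    )
except UnsupportedError:
    pass
```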

### Have you read the [Contributing Guidelines on pull requests](https://github.com/pytorch/botorch/blob/main/CONTRIBUTING.md#pull-requests)?

Yes

Pull Request resolved: pytorch#1890

Test Plan:
- [x] Unit test for the memory leak
- [x] Unit test for `UnsupportedError`

Reviewed By: saitcakmak, Balandat

Differential Revision: D46803080

Pulled By: esantorella

fbshipit-source-id: 1fb9c6500d4246a3740a9fce4bda290043f8ac3b
esantorella authored and facebook-github-bot committed Jun 17, 2023
1 parent 00fa4a8 commit 0ac6b14
Showing 3 changed files with 106 additions and 40 deletions.
3 changes: 1 addition & 2 deletions botorch/models/approximate_gp.py
@@ -191,7 +191,7 @@ def __init__(
Args:
train_X: Training inputs (due to the ability of the SVGP to sub-sample
this does not have to be all of the training inputs).
train_Y: Training targets (optional).
train_Y: Not used.
num_outputs: Number of output responses per input.
covar_module: Kernel function. If omitted, uses a `MaternKernel`.
mean_module: Mean of GP model. If omitted, uses a `ConstantMean`.
@@ -402,7 +402,6 @@ def __init__(

model = _SingleTaskVariationalGP(
train_X=transformed_X,
train_Y=train_Y,
num_outputs=num_outputs,
learn_inducing_points=learn_inducing_points,
covar_module=covar_module,
93 changes: 59 additions & 34 deletions botorch/models/utils/inducing_point_allocators.py
@@ -24,6 +24,7 @@
from typing import Union

import torch
from botorch.exceptions.errors import UnsupportedError
from botorch.models.model import Model

from botorch.utils.probability.utils import ndtr as Phi, phi
@@ -74,19 +75,33 @@ def allocate_inducing_points(
quality_function = self._get_quality_function()
covar_module = covar_module.to(inputs.device)

train_train_kernel = covar_module(inputs).evaluate_kernel()
# We use 'no_grad' here because `inducing_points` are not
# auto-differentiable with respect to the kernel hyper-parameters,
# because `_pivoted_cholesky_init` does in-place operations.
with torch.no_grad():
# Evaluate lazily because this may only be needed to figure out what
# case we are in
possibly_lazy_kernel = covar_module(inputs)

base_case = possibly_lazy_kernel.ndimension() == 2
multi_task_case = (
possibly_lazy_kernel.ndimension() == 3 and len(input_batch_shape) == 0
)

if base_case or multi_task_case:
train_train_kernel = possibly_lazy_kernel.evaluate_kernel()

# base case
if train_train_kernel.ndimension() == 2:
if base_case:
quality_scores = quality_function(inputs)
inducing_points = _pivoted_cholesky_init(
train_inputs=inputs,
kernel_matrix=train_train_kernel,
max_length=num_inducing,
quality_scores=quality_scores,
)
# multi-task case
elif train_train_kernel.ndimension() == 3 and len(input_batch_shape) == 0:
return inducing_points

if multi_task_case:
input_element = inputs[0] if inputs.ndimension() == 3 else inputs
kernel_element = train_train_kernel[0]
quality_scores = quality_function(input_element)
@@ -96,37 +111,42 @@ def allocate_inducing_points(
max_length=num_inducing,
quality_scores=quality_scores,
)
return inducing_points

# batched input cases
else:
batched_inputs = (
inputs.expand(*input_batch_shape, -1, -1)
if inputs.ndimension() == 2
else inputs
)
reshaped_inputs = batched_inputs.flatten(end_dim=-3)
inducing_points = []
for input_element in reshaped_inputs:
# the extra kernel evals are a little wasteful but make it
# easier to infer the task batch size
batched_inputs = (
inputs.expand(*input_batch_shape, -1, -1)
if inputs.ndimension() == 2
else inputs
)
reshaped_inputs = batched_inputs.flatten(end_dim=-3)
inducing_points = []
for input_element in reshaped_inputs:
# the extra kernel evals are a little wasteful but make it
# easier to infer the task batch size
# We use 'no_grad' here because `inducing_points` are not
# auto-differentiable with respect to the kernel hyper-parameters,
# because `_pivoted_cholesky_init` does in-place operations.
with torch.no_grad():
kernel_element = covar_module(input_element).evaluate_kernel()
# handle extra task batch dimension
kernel_element = (
kernel_element[0]
if kernel_element.ndimension() == 3
else kernel_element
)
quality_scores = quality_function(input_element)
inducing_points.append(
_pivoted_cholesky_init(
train_inputs=input_element,
kernel_matrix=kernel_element,
max_length=num_inducing,
quality_scores=quality_scores,
)
# handle extra task batch dimension
kernel_element = (
kernel_element[0]
if kernel_element.ndimension() == 3
else kernel_element
)
quality_scores = quality_function(input_element)
inducing_points.append(
_pivoted_cholesky_init(
train_inputs=input_element,
kernel_matrix=kernel_element,
max_length=num_inducing,
quality_scores=quality_scores,
)
inducing_points = torch.stack(inducing_points).view(
*input_batch_shape, num_inducing, -1
)
inducing_points = torch.stack(inducing_points).view(
*input_batch_shape, num_inducing, -1
)

return inducing_points

@@ -304,15 +324,20 @@ def _pivoted_cholesky_init(
"_pivoted_cholesky_init requires a quality score for each of train_inputs"
)

if kernel_matrix.requires_grad:
raise UnsupportedError(
"`_pivoted_cholesky_init` does not support using a `kernel_matrix` "
"with `requires_grad=True`."
)

item_size = kernel_matrix.shape[-2]
cis = torch.zeros(
(max_length, item_size), device=kernel_matrix.device, dtype=kernel_matrix.dtype
)
di2s = kernel_matrix.diagonal()
scores = di2s * torch.square(quality_scores)
selected_items = []
selected_item = torch.argmax(scores)
selected_items.append(selected_item)
selected_items = [selected_item]

while len(selected_items) < max_length:
k = len(selected_items) - 1
50 changes: 46 additions & 4 deletions test/models/utils/test_inducing_point_allocators.py
@@ -4,8 +4,10 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import gc

import torch
from botorch.exceptions.errors import UnsupportedError
from botorch.models.approximate_gp import SingleTaskVariationalGP
from botorch.models.utils.inducing_point_allocators import (
_pivoted_cholesky_init,
@@ -16,7 +18,7 @@
)
from botorch.utils.testing import BotorchTestCase

from gpytorch.kernels import MaternKernel
from gpytorch.kernels import MaternKernel, ScaleKernel
from gpytorch.likelihoods import GaussianLikelihood
from gpytorch.mlls import VariationalELBO

@@ -112,6 +114,29 @@ def setUp(self):
def test_initialization(self):
self.assertIsInstance(self.ipa, GreedyVarianceReduction)

def test_allocate_inducing_points_doesnt_leak(self) -> None:
"""
Run 'allocate_inducing_points' and check that all tensors allocated
in that function are garbage-collected.
"""

def _get_n_tensors_tracked_by_gc() -> int:
gc.collect()
return sum(1 for elt in gc.get_objects() if isinstance(elt, torch.Tensor))

def f() -> None:
"""Construct and use a GreedyVarianceReduction allocator."""
x = torch.rand(7, 3).to(self.device)
kernel = ScaleKernel(MaternKernel())
allocator = GreedyVarianceReduction()
allocator.allocate_inducing_points(x, kernel, 4, x.shape[:-2])

n_tensors_before = _get_n_tensors_tracked_by_gc()
f()
n_tensors_after = _get_n_tensors_tracked_by_gc()

self.assertEqual(n_tensors_before, n_tensors_after)

def test_inducing_points_shape_and_repeatability(self):

for train_X in [
@@ -262,12 +287,29 @@ def test_inducing_points_different_when_minimizing(self):

class TestPivotedCholeskyInit(BotorchTestCase):
def test_raises_for_quality_function_with_invalid_shape(self):
with self.assertRaises(ValueError):
inputs = torch.rand(15, 1, device=self.device)
inputs = torch.rand(15, 1, device=self.device)
with torch.no_grad():
train_train_kernel = (
MaternKernel().to(self.device)(inputs).evaluate_kernel()
)
quality_scores = torch.ones([10, 1], device=self.device)
quality_scores = torch.ones([10, 1], device=self.device)
with self.assertRaisesRegex(ValueError, ".*requires a quality score"):
_pivoted_cholesky_init(
train_inputs=inputs,
kernel_matrix=train_train_kernel,
max_length=10,
quality_scores=quality_scores,
)

def test_raises_for_kernel_with_grad(self) -> None:
inputs = torch.rand(15, 1, device=self.device)
train_train_kernel = MaternKernel().to(self.device)(inputs).evaluate_kernel()
quality_scores = torch.ones(15, device=self.device)
with self.assertRaisesRegex(
UnsupportedError,
"`_pivoted_cholesky_init` does not support using a `kernel_matrix` "
"with `requires_grad=True`.",
):
_pivoted_cholesky_init(
train_inputs=inputs,
kernel_matrix=train_train_kernel,
