Fix memory leak in inducing point allocators (pytorch#1890)
Summary:
Fixes pytorch#1788.
## Motivation

`allocate_inducing_points` leaks memory when passed a `kernel_matrix` with `requires_grad=True`. The leak stems from a specific pattern of in-place torch operations in `_pivoted_cholesky_init`; see [this comment](pytorch#1788 (comment)) for a detailed explanation. There is no need for `allocate_inducing_points` to support a `kernel_matrix` with `requires_grad=True`, because the output of `allocate_inducing_points` is not differentiable anyway (due to those in-place operations).
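
As a minimal illustration of that last point (the shapes and kernel choice are made up here, and the call follows the pattern of the unit test added below), the allocated inducing points carry no autograd history, so there is nothing to gain from letting the kernel matrix track gradients:

```python
import torch
from botorch.models.utils.inducing_point_allocators import GreedyVarianceReduction
from gpytorch.kernels import MaternKernel, ScaleKernel

train_X = torch.rand(7, 3)
covar_module = ScaleKernel(MaternKernel())  # hyper-parameters require grad

allocator = GreedyVarianceReduction()
inducing_points = allocator.allocate_inducing_points(
    train_X, covar_module, 4, train_X.shape[:-2]
)

# The selection is built with argmax and in-place updates, so the result has
# no grad_fn; it cannot be back-propagated through the kernel hyper-parameters.
assert inducing_points.grad_fn is None
```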

- [x] Make `_pivoted_cholesky_init` raise an `UnsupportedError` when passed a `kernel_matrix` with `requires_grad=True`. This is mildly BC-breaking, but that seems acceptable since the alternative is a memory leak.
- [x] Evaluate kernels under `torch.no_grad()` wherever their output is only passed to `_pivoted_cholesky_init` (see the usage sketch after this list).
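
A short sketch of the resulting contract for direct callers of `_pivoted_cholesky_init` (sizes are illustrative; the calls mirror the updated unit tests): evaluate the kernel under `torch.no_grad()`, or expect an `UnsupportedError`.

```python
import torch
from botorch.exceptions.errors import UnsupportedError
from botorch.models.utils.inducing_point_allocators import _pivoted_cholesky_init
from gpytorch.kernels import MaternKernel

train_X = torch.rand(15, 1)
quality_scores = torch.ones(15)

# Supported: the kernel matrix is evaluated without tracking gradients.
with torch.no_grad():
    kernel_matrix = MaternKernel()(train_X).evaluate_kernel()
inducing_points = _pivoted_cholesky_init(
    train_inputs=train_X,
    kernel_matrix=kernel_matrix,
    max_length=10,
    quality_scores=quality_scores,
)

# No longer supported: a gradient-tracking kernel matrix now raises instead
# of silently leaking memory.
try:
    _pivoted_cholesky_init(
        train_inputs=train_X,
        kernel_matrix=MaternKernel()(train_X).evaluate_kernel(),
        max_length=10,
        quality_scores=quality_scores,
    )
except UnsupportedError:
    pass
```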

### Have you read the [Contributing Guidelines on pull requests](https://github.com/pytorch/botorch/blob/main/CONTRIBUTING.md#pull-requests)?

Yes

Pull Request resolved: pytorch#1890

Test Plan:
- [x] Unit test for the memory leak
- [x] Unit test for `UnsupportedError`

Reviewed By: saitcakmak, Balandat

Differential Revision: D46803080

Pulled By: esantorella

fbshipit-source-id: 1fb9c6500d4246a3740a9fce4bda290043f8ac3b
esantorella authored and facebook-github-bot committed Jun 17, 2023
1 parent 00fa4a8 commit 0ac6b14
Showing 3 changed files with 106 additions and 40 deletions.
3 changes: 1 addition & 2 deletions botorch/models/approximate_gp.py
@@ -191,7 +191,7 @@ def __init__(
Args:
train_X: Training inputs (due to the ability of the SVGP to sub-sample
this does not have to be all of the training inputs).
train_Y: Training targets (optional).
train_Y: Not used.
num_outputs: Number of output responses per input.
covar_module: Kernel function. If omitted, uses a `MaternKernel`.
mean_module: Mean of GP model. If omitted, uses a `ConstantMean`.
@@ -402,7 +402,6 @@ def __init__(

model = _SingleTaskVariationalGP(
train_X=transformed_X,
train_Y=train_Y,
num_outputs=num_outputs,
learn_inducing_points=learn_inducing_points,
covar_module=covar_module,
93 changes: 59 additions & 34 deletions botorch/models/utils/inducing_point_allocators.py
@@ -24,6 +24,7 @@
from typing import Union

import torch
from botorch.exceptions.errors import UnsupportedError
from botorch.models.model import Model

from botorch.utils.probability.utils import ndtr as Phi, phi
@@ -74,19 +75,33 @@ def allocate_inducing_points(
quality_function = self._get_quality_function()
covar_module = covar_module.to(inputs.device)

train_train_kernel = covar_module(inputs).evaluate_kernel()
# We use 'no_grad' here because `inducing_points` are not
# auto-differentiable with respect to the kernel hyper-parameters,
# because `_pivoted_cholesky_init` does in-place operations.
with torch.no_grad():
# Evaluate lazily because this may only be needed to figure out what
# case we are in
possibly_lazy_kernel = covar_module(inputs)

base_case = possibly_lazy_kernel.ndimension() == 2
multi_task_case = (
possibly_lazy_kernel.ndimension() == 3 and len(input_batch_shape) == 0
)

if base_case or multi_task_case:
train_train_kernel = possibly_lazy_kernel.evaluate_kernel()

# base case
if train_train_kernel.ndimension() == 2:
if base_case:
quality_scores = quality_function(inputs)
inducing_points = _pivoted_cholesky_init(
train_inputs=inputs,
kernel_matrix=train_train_kernel,
max_length=num_inducing,
quality_scores=quality_scores,
)
# multi-task case
elif train_train_kernel.ndimension() == 3 and len(input_batch_shape) == 0:
return inducing_points

if multi_task_case:
input_element = inputs[0] if inputs.ndimension() == 3 else inputs
kernel_element = train_train_kernel[0]
quality_scores = quality_function(input_element)
@@ -96,37 +111,42 @@ def allocate_inducing_points(
max_length=num_inducing,
quality_scores=quality_scores,
)
return inducing_points

# batched input cases
else:
batched_inputs = (
inputs.expand(*input_batch_shape, -1, -1)
if inputs.ndimension() == 2
else inputs
)
reshaped_inputs = batched_inputs.flatten(end_dim=-3)
inducing_points = []
for input_element in reshaped_inputs:
# the extra kernel evals are a little wasteful but make it
# easier to infer the task batch size
batched_inputs = (
inputs.expand(*input_batch_shape, -1, -1)
if inputs.ndimension() == 2
else inputs
)
reshaped_inputs = batched_inputs.flatten(end_dim=-3)
inducing_points = []
for input_element in reshaped_inputs:
# the extra kernel evals are a little wasteful but make it
# easier to infer the task batch size
# We use 'no_grad' here because `inducing_points` are not
# auto-differentiable with respect to the kernel hyper-parameters,
# because `_pivoted_cholesky_init` does in-place operations.
with torch.no_grad():
kernel_element = covar_module(input_element).evaluate_kernel()
# handle extra task batch dimension
kernel_element = (
kernel_element[0]
if kernel_element.ndimension() == 3
else kernel_element
)
quality_scores = quality_function(input_element)
inducing_points.append(
_pivoted_cholesky_init(
train_inputs=input_element,
kernel_matrix=kernel_element,
max_length=num_inducing,
quality_scores=quality_scores,
)
# handle extra task batch dimension
kernel_element = (
kernel_element[0]
if kernel_element.ndimension() == 3
else kernel_element
)
quality_scores = quality_function(input_element)
inducing_points.append(
_pivoted_cholesky_init(
train_inputs=input_element,
kernel_matrix=kernel_element,
max_length=num_inducing,
quality_scores=quality_scores,
)
inducing_points = torch.stack(inducing_points).view(
*input_batch_shape, num_inducing, -1
)
inducing_points = torch.stack(inducing_points).view(
*input_batch_shape, num_inducing, -1
)

return inducing_points

@@ -304,15 +324,20 @@ def _pivoted_cholesky_init(
"_pivoted_cholesky_init requires a quality score for each of train_inputs"
)

if kernel_matrix.requires_grad:
raise UnsupportedError(
"`_pivoted_cholesky_init` does not support using a `kernel_matrix` "
"with `requires_grad=True`."
)

item_size = kernel_matrix.shape[-2]
cis = torch.zeros(
(max_length, item_size), device=kernel_matrix.device, dtype=kernel_matrix.dtype
)
di2s = kernel_matrix.diagonal()
scores = di2s * torch.square(quality_scores)
selected_items = []
selected_item = torch.argmax(scores)
selected_items.append(selected_item)
selected_items = [selected_item]

while len(selected_items) < max_length:
k = len(selected_items) - 1
50 changes: 46 additions & 4 deletions test/models/utils/test_inducing_point_allocators.py
@@ -4,8 +4,10 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import gc

import torch
from botorch.exceptions.errors import UnsupportedError
from botorch.models.approximate_gp import SingleTaskVariationalGP
from botorch.models.utils.inducing_point_allocators import (
_pivoted_cholesky_init,
@@ -16,7 +18,7 @@
)
from botorch.utils.testing import BotorchTestCase

from gpytorch.kernels import MaternKernel
from gpytorch.kernels import MaternKernel, ScaleKernel
from gpytorch.likelihoods import GaussianLikelihood
from gpytorch.mlls import VariationalELBO

@@ -112,6 +114,29 @@ def setUp(self):
def test_initialization(self):
self.assertIsInstance(self.ipa, GreedyVarianceReduction)

def test_allocate_inducing_points_doesnt_leak(self) -> None:
"""
Run 'allocate_inducing_points' and check that all tensors allocated
in that function are garbage-collected.
"""

def _get_n_tensors_tracked_by_gc() -> int:
gc.collect()
return sum(1 for elt in gc.get_objects() if isinstance(elt, torch.Tensor))

def f() -> None:
"""Construct and use a GreedyVarianceReduction allocator."""
x = torch.rand(7, 3).to(self.device)
kernel = ScaleKernel(MaternKernel())
allocator = GreedyVarianceReduction()
allocator.allocate_inducing_points(x, kernel, 4, x.shape[:-2])

n_tensors_before = _get_n_tensors_tracked_by_gc()
f()
n_tensors_after = _get_n_tensors_tracked_by_gc()

self.assertEqual(n_tensors_before, n_tensors_after)

def test_inducing_points_shape_and_repeatability(self):

for train_X in [
@@ -262,12 +287,29 @@ def test_inducing_points_different_when_minimizing(self):

class TestPivotedCholeskyInit(BotorchTestCase):
def test_raises_for_quality_function_with_invalid_shape(self):
with self.assertRaises(ValueError):
inputs = torch.rand(15, 1, device=self.device)
inputs = torch.rand(15, 1, device=self.device)
with torch.no_grad():
train_train_kernel = (
MaternKernel().to(self.device)(inputs).evaluate_kernel()
)
quality_scores = torch.ones([10, 1], device=self.device)
quality_scores = torch.ones([10, 1], device=self.device)
with self.assertRaisesRegex(ValueError, ".*requires a quality score"):
_pivoted_cholesky_init(
train_inputs=inputs,
kernel_matrix=train_train_kernel,
max_length=10,
quality_scores=quality_scores,
)

def test_raises_for_kernel_with_grad(self) -> None:
inputs = torch.rand(15, 1, device=self.device)
train_train_kernel = MaternKernel().to(self.device)(inputs).evaluate_kernel()
quality_scores = torch.ones(15, device=self.device)
with self.assertRaisesRegex(
UnsupportedError,
"`_pivoted_cholesky_init` does not support using a `kernel_matrix` "
"with `requires_grad=True`.",
):
_pivoted_cholesky_init(
train_inputs=inputs,
kernel_matrix=train_train_kernel,
