From 5375ae4619a518105bdea5dba521531d662968d9 Mon Sep 17 00:00:00 2001 From: Thomas-Christie Date: Thu, 13 Jul 2023 16:16:23 +0100 Subject: [PATCH 1/5] Add BO notebook Added introduction to Bayesian optimisation notebook. Also fixed some links which had broken after renaming the old kernels notebook in a previous PR. --- README.md | 4 +- docs/examples/barycentres.py | 2 +- docs/examples/bayesian_optimisation.py | 737 +++++++++++++++++++++++++ docs/examples/deep_kernels.py | 2 +- docs/examples/graph_kernels.py | 2 +- docs/examples/intro_to_gps.py | 4 +- docs/examples/intro_to_kernels.py | 6 +- docs/examples/spatial.py | 2 +- mkdocs.yml | 1 + 9 files changed, 749 insertions(+), 11 deletions(-) create mode 100644 docs/examples/bayesian_optimisation.py diff --git a/README.md b/README.md index a2682e576..f625c4ccb 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ process modelling. > - [**Stochastic Variational Inference**](https://docs.jaxgaussianprocesses.com/examples/uncollapsed_vi/) > - [**BlackJax Integration**](https://docs.jaxgaussianprocesses.com/examples/classification/#mcmc-inference) > - [**Laplace Approximation**](https://docs.jaxgaussianprocesses.com/examples/classification/#laplace-approximation) -> - [**Inference on Non-Euclidean Spaces**](https://docs.jaxgaussianprocesses.com/examples/kernels/#custom-kernel) +> - [**Inference on Non-Euclidean Spaces**](https://docs.jaxgaussianprocesses.com/examples/constructing_new_kernels/#custom-kernel) > - [**Inference on Graphs**](https://docs.jaxgaussianprocesses.com/examples/graph_kernels/) > - [**Pathwise Sampling**](https://docs.jaxgaussianprocesses.com/examples/spatial/) > - [**Learning Gaussian Process Barycentres**](https://docs.jaxgaussianprocesses.com/examples/barycentres/) @@ -63,7 +63,7 @@ process modelling. ## Guides for customisation > -> - [**Custom kernels**](https://docs.jaxgaussianprocesses.com/examples/kernels/#custom-kernel) +> - [**Custom kernels**](https://docs.jaxgaussianprocesses.com/examples/constructing_new_kernels/#custom-kernel) > - [**UCI regression**](https://docs.jaxgaussianprocesses.com/examples/yacht/) ## Conversion between `.ipynb` and `.py` diff --git a/docs/examples/barycentres.py b/docs/examples/barycentres.py index b5235f851..9ab7a2d83 100644 --- a/docs/examples/barycentres.py +++ b/docs/examples/barycentres.py @@ -124,7 +124,7 @@ # optimised. For advice on achieving this, see the # [Regression notebook](https://docs.jaxgaussianprocesses.com/examples/regression/) # for advice on optimisation and the -# [Kernels notebook](https://docs.jaxgaussianprocesses.com/examples/kernels/) for +# [Kernels notebook](https://docs.jaxgaussianprocesses.com/examples/constructing_new_kernels/) for # advice on selecting an appropriate kernel. diff --git a/docs/examples/bayesian_optimisation.py b/docs/examples/bayesian_optimisation.py new file mode 100644 index 000000000..1408a5d9b --- /dev/null +++ b/docs/examples/bayesian_optimisation.py @@ -0,0 +1,737 @@ +# %% [markdown] +# # Introduction to Bayesian Optimisation +# +# In this guide we introduce the Bayesian Optimisation (BO) paradigm for +# optimising black-box functions. We'll assume an understanding of Gaussian processes +# (GPs), so if you're not familiar with them, check out our [GP introduction notebook](https://docs.jaxgaussianprocesses.com/examples/intro_to_gps/). + +# %% +# Enable Float64 for more stable matrix inversions. 
+from jax.config import config + +config.update("jax_enable_x64", True) + +import jax +from jax import jit +import jax.numpy as jnp +import jax.random as jr +from jaxtyping import install_import_hook, Float, Int +import matplotlib as mpl +import matplotlib.pyplot as plt +from matplotlib import cm +import optax as ox +import tensorflow_probability.substrates.jax as tfp +from typing import List + +with install_import_hook("gpjax", "beartype.beartype"): + import gpjax as gpx +from gpjax.typing import Array, FunctionalSample, ScalarFloat +from jaxopt import ScipyBoundedMinimize + +key = jr.PRNGKey(42) +plt.style.use( + "https://raw.githubusercontent.com/JaxGaussianProcesses/GPJax/main/docs/examples/gpjax.mplstyle" +) +cols = mpl.rcParams["axes.prop_cycle"].by_key()["color"] + + +# %% [markdown] +# ## Some Motivating Examples +# +# Countless problems in the physical world involve optimising functions for which the +# explicit functional form is unknown, but which can be expensively queried throughout +# their domain. For example, within the domain of science the task of designing new +# molecules with optimised properties ([Griffiths and Lobato, +# 2020](https://pubs.rsc.org/en/content/articlehtml/2019/sc/c9sc04026a)) is incredibly +# useful. Here, the domain being optimised over is the space of possible molecules, with +# the objective function depending on the property being optimised, for instance within +# drug-design this may be the efficacy of the drug. The function from molecules to +# efficacy is unknown, but can be queried by synthesising a molecule and running an +# experiment to measure its efficacy. This is clearly an expensive procedure! +# +# Within the domain of machine learning, the task of optimising neural network +# architectures is another example of such a problem (commonly referred to as [Neural +# Architecture Search (NAS)](https://en.wikipedia.org/wiki/Neural_architecture_search)). +# Here, the domain is the space of possible neural network architectures, and the +# objective function is a metric such as the accuracy of the trained model. Again, the +# function from neural network architectures to accuracy is unknown, but can be queried by +# training a model with a given architecture and evaluating its accuracy. This is also an +# expensive procedure, as training models can be incredibly time consuming and +# computationally demanding. +# +# Finally, these problems are ubiquitous within the field of climate science, with +# ([Hellan et al., 2023](https://arxiv.org/abs/2306.04343)) providing several excellent +# examples. One such example is the task of deciding where to place wind turbines in a +# wind farm in order to maximise the energy generated. Here, the domain is the space of +# possible locations for the wind turbines, and the objective function is the energy +# generated by the wind farm. The function from locations to energy generated is unknown, +# but could be queried by running a simulation of the wind farm with the turbines placed +# at a given set of locations. Running such simulations can be expensive, particularly if +# they are high-fidelity. +# +# At the heart of all these problems is the task of optimising a function for which we +# don't have the explicit functional form, but which we can (expensively) query at any +# point in its domain. Bayesian optimisation provides a principled framework for solving +# such problems. + +# %% [markdown] +# ## What is Bayesian Optimisation? 
+# +# Bayesian optimisation (BO) ([Močkus, 1974](https://link.springer.com/chapter/10.1007/3-540-07165-2_55)) provides a principled +# method for making decisions under uncertainty. The aim of BO is to find the global +# minimum of a *black-box* objective function, $\min_{\mathbf{x} \in X} +# f(\mathbf{x})$. The function $f$ is said to be a *black-box* function because its +# explicit functional form is unknown. However, it is assumed that one is able to +# ascertain information about the function by evaluating it at points in its domain, +# $X$. However, these evaluations are assumed to be *expensive*, as seen in the +# motivating examples. Therefore, the goal of BO is to minimise $f$ with as few +# evaluations of the black-box function as possible. +# +# As such, BO can be thought of as *sequential decision-making* problem. At each iteration +# one must choose which point (or batch of points) in a function's domain to evaluate +# next, drawing on previously observed values to make optimal decisions. In order to do +# this effectively, we need a way of representing our uncertainty about the black-box +# function $f$, which we can update in light of observing more data. Gaussian processes +# will be an ideal tool for this purpose! +# +# *Surrogate models* lie at the heart of BO, and are used to model the black-box +# function. GPs are a natural choice for this model, as they not only provide point +# estimates for the values taken by the function throughout its domain, but crucially +# provide a full predictive posterior *distribution* of the range of values the function +# may take. This rich quantification of uncertainty enables BO to balance *exploration* +# and *exploitation* in order to efficiently converge upon minima. +# +# Having chosen a surrogate model, which we can use to express our current beliefs about +# the black-box function, ideally we would like a method which can use the surrogate +# model's posterior distribution to automatically decide which point(s) in the black-box +# function's domain to query next. This is where *acquisition functions* come in. The +# acquisition function $\alpha: X \to \mathbb{R}$ is defined over the same domain as the +# surrogate model, and uses the surrogate model's posterior distribution to quantify the +# expected *utility*, $U$, of evaluating the black-box function at a given point. Simply +# put, for each point in the black-box function's domain, $\mathbf{x} \in X$, the +# acquisition function quantifies how useful it would be to evaluate the black-box +# function at $\mathbf{x}$ in order to find the minimum of the black-box function, whilst +# taking into consideration all the datapoints observed so far. Therefore, in order to +# decide which point to query next we simply choose the point which maximises the +# acquisition function, using an optimiser such as L-BFGS ([Liu and Nocedal, +# 1989](https://link.springer.com/article/10.1007/BF01589116)) or Adam ([Kingma and Ba, +# 2014](https://arxiv.org/abs/1412.6980)). +# +# The Bayesian optimisation loop can be summarised as follows, with $i$ denoting the +# current iteration: +# +# 1. Select the next point to query, $\mathbf{x}_{i}$, by maximising the acquisition function $\alpha$, defined using the surrogate model $\mathcal{M}_i$ conditioned on previously observed data $\mathcal{D}_i$: +# +# $$\mathbf{x}_{i} = \arg\max_{\mathbf{x}} \alpha (\mathbf{x}; \mathcal{D}_i, +# \mathcal{M}_i)$$ +# +# 2. 
Evaluate the objective function at $\mathbf{x}_i$, yielding observation $y_i =
# f(\mathbf{x}_i)$.
#
# 3. Append the most recent observation to the dataset, $\mathcal{D}_{i+1} = \mathcal{D}_i
# \cup \{(\mathbf{x}_i, y_i)\}$.
#
# 4. Condition the model on the updated dataset to yield $\mathcal{M}_{i+1}$.
#
# This process is repeated until some stopping criterion is met, such as a function
# evaluation budget being exhausted.
#
# There are a plethora of acquisition functions to choose from, each with their own
# advantages and disadvantages, of which ([Shahriari et al., 2015](https://www.cs.ox.ac.uk/people/nando.defreitas/publications/BayesOptLoop.pdf))
# provides an excellent overview.
#
# In this guide we will focus on *Thompson sampling*, a conceptually simple yet effective
# method for characterising the utility of querying points in a black-box function's
# domain, which will be useful in demonstrating the key aspects of BO.

# %% [markdown]
# ## Thompson Sampling
#
# Thompson sampling ([Thompson, 1933](https://shorturl.at/ejmCM)) is a simple method which
# naturally balances exploration and exploitation. The core idea is to, at each iteration
# of the BO loop, sample a function, $g$, from the posterior distribution of the surrogate
# model $\mathcal{M}_i$, and then evaluate the black-box function at the point(s) which
# minimise this sample. Given a sample $g$ from the posterior distribution given by the model $\mathcal{M}_i$, the Thompson sampling utility function is defined as:
#
# $$U_{\text{TS}}(\mathbf{x}; \mathcal{D}_i, \mathcal{M}_i) = - g(\mathbf{x})$$
#
# Note the negative sign; this is included as we want to maximise the *utility* of
# evaluating the black-box function $f$ at a given point. We are interested in finding the
# minimum of $f$, so we maximise the negative of the sample from the posterior distribution $g$.
#
# As a toy example, we shall be applying BO to the widely used [Forrester
# function](https://www.sfu.ca/~ssurjano/forretal08.html):
#
# $$f(x) = (6x - 2)^2 \sin(12x - 4)$$
#
# treating $f$ as a black-box function. Moreover, we shall restrict the domain of the
# function to $\mathbf{x} \in [0, 1]$. The global minimum of this function is located at
# $x = 0.757$, where $f(x) = -6.021$.


# %%
def forrester(x: Float[Array, "N 1"]) -> Float[Array, "N 1"]:
    return (6 * x - 2) ** 2 * jnp.sin(12 * x - 4)


# %% [markdown]
# We'll first go through one iteration of the BO loop step-by-step, before wrapping this
# up in a loop to perform the full optimisation.

# %% [markdown]
# First we'll specify the domain over which we wish to optimise the function, as well as
# sampling some initial points for fitting our surrogate model using a space-filling design.

# %%
lower_bound = jnp.array([0.0])
upper_bound = jnp.array([1.0])
initial_sample_num = 5

initial_x = tfp.mcmc.sample_halton_sequence(
    dim=1, num_results=initial_sample_num, seed=key, dtype=jnp.float64
).reshape(-1, 1)
initial_y = forrester(initial_x)
D = gpx.Dataset(X=initial_x, y=initial_y)


# %% [markdown]
# Next we'll define our GP model in the usual way, using a Matérn52 kernel, and fit the
# kernel parameters by minimising the negative log-marginal likelihood. We'll wrap this in
# a function as we'll be repeating this process at each iteration of the BO loop. 
+ + +# %% +def generate_optimised_posterior( + data: gpx.Dataset, prior: gpx.Module, key: Array +) -> gpx.Module: + likelihood = gpx.Gaussian( + num_datapoints=data.n, obs_noise=jnp.array(1e-6) + ) # Our function is noise-free, so we set the observation noise to a very small value + likelihood = likelihood.replace_trainable(obs_noise=False) + + posterior = prior * likelihood + + negative_mll = gpx.objectives.ConjugateMLL(negative=True) + negative_mll(posterior, train_data=data) + negative_mll = jit(negative_mll) + + opt_posterior, history = gpx.fit( + model=posterior, + objective=negative_mll, + train_data=data, + optim=ox.adam(learning_rate=0.01), + num_iters=1000, + safe=True, + key=key, + verbose=False, + ) + + return opt_posterior + + +mean = gpx.mean_functions.Zero() +kernel = gpx.kernels.Matern52() +prior = gpx.Prior(mean_function=mean, kernel=kernel) +opt_posterior = generate_optimised_posterior(D, prior, key) + +# %% [markdown] +# We can then sample a function from the posterior distribution of the surrogate model. We +# will do this using the `sample_approx` method, which generates an approximate sample +# from the posterior using decoupled sampling introduced in ([Wilson et al., +# 2020](https://proceedings.mlr.press/v119/wilson20a.html)) and discussed in our [Pathwise +# Sampling Notebook](https://docs.jaxgaussianprocesses.com/examples/spatial/). This method +# is used as it enables us to sample from the posterior in a manner which scales linearly +# with the number of points sampled, $O(N)$, mitigating the cubic cost associated with +# drawing exact samples from a GP posterior, $O(N^3)$. It also generates more accurate +# samples than many other methods for drawing approximate samples from a GP posterior. +# +# Note that we also define a `utility_fn` which calls the approximate +# sample but returns the value returned as a scalar. This is because the `sample_approx` +# function returns an array of shape $[N, B]$, with $N$ being the number of points within +# each sample and $B$ being the number of samples drawn. We'll only be drawing (and +# optimising) one sample at a time, and our optimiser requires the function being +# optimised to return a scalar output (only querying it at $N=1$ points), so we'll remove the axes from the returned value. + +# %% +approx_sample = opt_posterior.sample_approx( + num_samples=1, train_data=D, key=key, num_features=500 +) +utility_fn = lambda x: approx_sample(x)[0][0] + + +# %% [markdown] +# In order to minimise the sample, we'll be using the L-BFGS-B ([Byrd et al., 1995](https://epubs.siam.org/doi/abs/10.1137/0916069)) optimiser from the `jaxopt` +# library. This is a gradient-based optimiser which performs optimisation within a bounded +# domain. In order to perform optimisation, this optimiser requires a point to start from. +# Therefore, we will first query our sample from the posterior at a random set of points, +# and then use the lowest point from this set of points as the starting point for the +# optimiser. In this example we'll sample 100 points from the posterior, due to the simple +# nature of the Forrester function. However, in practice it can be beneficial to +# adopt a more sophisticated approach, and there are several heuristics available in the +# literature (see for example ([Le Riche and Picheny, +# 2021](https://arxiv.org/abs/2103.16649))). 
For instance, one may randomly sample the +# posterior at a number of points proportional to the dimensionality of the input space, +# and one may run gradient-based optimisation from multiple of these points, to reduce the +# risk of converging upon local minima. + + +# %% +def optimise_sample( + sample: FunctionalSample, + key: Int[Array, ""], + lower_bound: Float[Array, "D"], + upper_bound: Float[Array, "D"], + num_initial_sample_points: int, +) -> ScalarFloat: + initial_sample_points = jr.uniform( + key, + shape=(num_initial_sample_points, lower_bound.shape[0]), + dtype=jnp.float64, + minval=lower_bound, + maxval=upper_bound, + ) + initial_sample_y = sample(initial_sample_points) + best_x = jnp.array([initial_sample_points[jnp.argmin(initial_sample_y)]]) + + negative_utility_fn = lambda x: sample(x)[0][0] + lbfgsb = ScipyBoundedMinimize(fun=negative_utility_fn, method="l-bfgs-b") + bounds = (lower_bound, upper_bound) + x_star = lbfgsb.run(best_x, bounds=bounds).params + return x_star + + +x_star = optimise_sample(approx_sample, key, lower_bound, upper_bound, 100) +y_star = forrester(x_star) + + +# %% [markdown] +# Having found the minimum of the sample from the posterior, we can then evaluate the +# black-box objective function at this point, and append the new observation to our dataset. +# +# Below we plot the posterior distribution of the surrogate model, along with the sample +# drawn from the model, and the minimiser of this sample returned from the optimiser, +# which we denote with a star. + + +# %% +def plot_bayes_opt( + posterior: gpx.Module, + sample: FunctionalSample, + dataset: gpx.Dataset, + queried_x: ScalarFloat, +) -> None: + plt_x = jnp.linspace(0, 1, 1000).reshape(-1, 1) + forrester_y = forrester(plt_x) + sample_y = sample(plt_x) + + latent_dist = posterior.predict(plt_x, train_data=dataset) + predictive_dist = posterior.likelihood(latent_dist) + + predictive_mean = predictive_dist.mean() + predictive_std = predictive_dist.stddev() + + fig, ax = plt.subplots() + ax.fill_between( + plt_x.squeeze(), + predictive_mean - 2 * predictive_std, + predictive_mean + 2 * predictive_std, + alpha=0.2, + label="Two sigma", + color=cols[1], + ) + ax.plot( + plt_x, + predictive_mean - 2 * predictive_std, + linestyle="--", + linewidth=1, + color=cols[1], + ) + ax.plot( + plt_x, + predictive_mean + 2 * predictive_std, + linestyle="--", + linewidth=1, + color=cols[1], + ) + ax.plot( + plt_x, + forrester_y, + label="Forrester Function", + color=cols[0], + linestyle="--", + linewidth=2, + ) + ax.plot(plt_x, predictive_mean, label="Predictive Mean", color=cols[1]) + ax.plot(plt_x, sample_y, label="Posterior Sample") + ax.scatter(dataset.X, dataset.y, label="Observations", color=cols[2], zorder=2) + ax.scatter( + queried_x, + sample(queried_x), + label="Posterior Sample Optimum", + marker="*", + color=cols[3], + zorder=3, + ) + ax.legend(loc="center left", bbox_to_anchor=(0.975, 0.5)) + plt.show() + + +plot_bayes_opt(opt_posterior, approx_sample, D, x_star) + +# %% [markdown] +# At this point we can update our model with the newly augmented dataset, and repeat the +# whole process until some stopping criterion is met. Below we repeat this process for 10 +# iterations, printing out the queried point and the value of the black-box function at +# each iteration. 
+ +# %% +bo_iters = 10 + +# Set up initial dataset +initial_x = tfp.mcmc.sample_halton_sequence( + dim=1, num_results=initial_sample_num, seed=key, dtype=jnp.float64 +).reshape(-1, 1) +initial_y = forrester(initial_x) +D = gpx.Dataset(X=initial_x, y=initial_y) + +for i in range(bo_iters): + key, subkey = jr.split(key) + + # Generate optimised posterior using previously observed data + mean = gpx.mean_functions.Zero() + kernel = gpx.kernels.Matern52() + prior = gpx.Prior(mean_function=mean, kernel=kernel) + opt_posterior = generate_optimised_posterior(D, prior, subkey) + + # Draw a sample from the posterior, and find the minimiser of it + approx_sample = opt_posterior.sample_approx( + num_samples=1, train_data=D, key=subkey, num_features=500 + ) + x_star = optimise_sample( + approx_sample, subkey, lower_bound, upper_bound, num_initial_sample_points=100 + ) + + plot_bayes_opt(opt_posterior, approx_sample, D, x_star) + + # Evaluate the black-box function at the best point observed so far, and add it to the dataset + y_star = forrester(x_star) + print(f"Queried Point: {x_star}, Black-Box Function Value: {y_star}") + D = D + gpx.Dataset(X=x_star, y=y_star) + +# %% [markdown] +# Below we plot the best observed black-box function value against the number of times +# the black-box function has been evaluated. Note that the first 5 samples are randomly +# sampled to fit the initial GP model, and we denote the start of using BO to sample with +# the dotted vertical line. +# +# We can see that the BO algorithm quickly converges to the global minimum of the +# black-box function! +# + +# %% +fig, ax = plt.subplots() +fn_evaluations = jnp.arange(1, bo_iters + initial_sample_num + 1) +cumulative_best_y = jax.lax.associative_scan(jax.numpy.minimum, D.y) +ax.plot(fn_evaluations, cumulative_best_y) +ax.axvline(x=initial_sample_num, linestyle=":") +ax.axhline(y=-6.0207, linestyle="--", label="True Minimum") +ax.set_xlabel("Number of Black-Box Function Evaluations") +ax.set_ylabel("Best Observed Value") +ax.legend() +plt.show() + + +# %% [markdown] +# ### A More Challenging Example - The Six-Hump Camel Function + +# %% [markdown] +# We'll now apply BO to a more challenging example, the [Six-Hump Camel +# Function](https://www.sfu.ca/~ssurjano/camel6.html). This is a function of two inputs +# defined as follows: +# +# $$f(x_1, x_2) = (4 - 2.1x_1^2 + \frac{x_1^4}{3})x_1^2 + x_1x_2 + (-4 + 4x_2^2)x_2^2$$ +# +# We'll be evaluating it over the domain $x_1 \in [-2, 2]$ and $x_2 \in [-1, 1]$. The +# global minima of this function are located at $\mathbf{x} = (0.0898, -0.7126)$ and $\mathbf{x} = (-0.0898, 0.7126)$, where the function takes the value $f(\mathbf{x}) = -1.0316$. 
+ + +# %% +def six_hump_camel(x: Float[Array, "N 2"]) -> Float[Array, "N 1"]: + x1 = x[..., :1] + x2 = x[..., 1:] + term1 = (4 - 2.1 * x1**2 + x1**4 / 3) * x1**2 + term2 = x1 * x2 + term3 = (-4 + 4 * x2**2) * x2**2 + return term1 + term2 + term3 + + +# %% [markdown] +# First, we'll visualise the function over the domain of interest: + +# %% +x1 = jnp.linspace(-2, 2, 100) +x2 = jnp.linspace(-1, 1, 100) +x1, x2 = jnp.meshgrid(x1, x2) +x = jnp.stack([x1.flatten(), x2.flatten()], axis=1) +y = six_hump_camel(x) + +fig, ax = plt.subplots(subplot_kw={"projection": "3d"}) +surf = ax.plot_surface( + x1, + x2, + y.reshape(x1.shape[0], x2.shape[0]), + linewidth=0, + cmap=cm.coolwarm, + antialiased=False, +) +ax.set_xlabel("x1") +ax.set_ylabel("x2") +plt.show() + +# %% [markdown] +# For more clarity, we can generate a contour plot of the function which enables us to see +# the global minima of the function more clearly. + +# %% +x_star_one = jnp.array([[0.0898, -0.7126]]) +x_star_two = jnp.array([[-0.0898, 0.7126]]) +fig, ax = plt.subplots() +contour_plot = ax.contourf( + x1, x2, y.reshape(x1.shape[0], x2.shape[0]), cmap=cm.coolwarm, levels=40 +) +ax.scatter( + x_star_one[0][0], x_star_one[0][1], marker="*", color=cols[2], label="Global Minima" +) +ax.scatter(x_star_two[0][0], x_star_two[0][1], marker="*", color=cols[2]) +ax.set_xlabel("x1") +ax.set_ylabel("x2") +fig.colorbar(contour_plot) +ax.legend() +plt.show() + +# %% [markdown] +# Next, we'll run the BO loop using Thompson sampling as before. This time we'll run the +# experiment 5 times in order to see how the algorithm performs on average, with different +# starting points for the initial GP model. This is good practice, as the performance +# obtained is likely to vary between runs depending on the initialisation samples used to +# fit the initial GP model. + +# %% +lower_bound = jnp.array([-2.0, -1.0]) +upper_bound = jnp.array([2.0, 1.0]) +initial_sample_num = 5 +bo_iters = 15 +num_experiments = 5 +bo_experiment_results = [] + +for experiment in range(num_experiments): + print(f"Starting Experiment: {experiment + 1}") + # Set up initial dataset + initial_x = tfp.mcmc.sample_halton_sequence( + dim=2, num_results=initial_sample_num, seed=key, dtype=jnp.float64 + ) + initial_x = jnp.array(lower_bound + (upper_bound - lower_bound) * initial_x) + initial_y = six_hump_camel(initial_x) + D = gpx.Dataset(X=initial_x, y=initial_y) + + for i in range(bo_iters): + key, subkey = jr.split(key) + + # Generate optimised posterior + mean = gpx.mean_functions.Zero() + kernel = gpx.kernels.Matern52( + active_dims=[0, 1], lengthscale=jnp.array([1.0, 1.0]), variance=2.0 + ) + prior = gpx.Prior(mean_function=mean, kernel=kernel) + opt_posterior = generate_optimised_posterior(D, prior, subkey) + + # Draw a sample from the posterior, and find the minimiser of it + approx_sample = opt_posterior.sample_approx( + num_samples=1, train_data=D, key=subkey, num_features=500 + ) + x_star = optimise_sample( + approx_sample, + subkey, + lower_bound, + upper_bound, + num_initial_sample_points=1000, + ) + + # Evaluate the black-box function at the best point observed so far, and add it to the dataset + y_star = six_hump_camel(x_star) + print( + f"BO Iteration: {i + 1}, Queried Point: {x_star}, Black-Box Function Value: {y_star}" + ) + D = D + gpx.Dataset(X=x_star, y=y_star) + bo_experiment_results.append(D) + + +# %% [markdown] +# We'll also run a random benchmark, whereby we randomly sample from the search space for +# 20 iterations. 
This is a useful benchmark to compare the performance of BO against in +# order to ascertain how much of an advantage BO provides over such a simple approach. +# + +# %% +random_experiment_results = [] +for i in range(num_experiments): + key, subkey = jr.split(key) + initial_x = bo_experiment_results[i].X[:5] + initial_y = bo_experiment_results[i].y[:5] + final_x = jr.uniform( + key, shape=(15, 2), dtype=jnp.float64, minval=lower_bound, maxval=upper_bound + ) + final_y = six_hump_camel(final_x) + random_x = jnp.concatenate([initial_x, final_x], axis=0) + random_y = jnp.concatenate([initial_y, final_y], axis=0) + random_experiment_results.append(gpx.Dataset(X=random_x, y=random_y)) + + +# %% [markdown] +# Finally, we'll process the experiment results to find the best observed value of the +# black-box function at each iteration of the experiments. We'll then take the mean and +# standard deviation of these values across the 5 experiments. + + +# %% +def obtain_cumulative_minimum_statistics( + experiment_results: List[gpx.Dataset], +) -> tuple[Float[Array, "N 1"], Float[Array, "N 1"]]: + cumulative_best_observation_results = [] + for exp_result in experiment_results: + observations = exp_result.y + cumulative_best_observations = jax.lax.associative_scan( + jax.numpy.minimum, observations + ) + cumulative_best_observation_results.append(cumulative_best_observations) + + cumulative_best_observation_results = jnp.array(cumulative_best_observation_results) + cumulative_best_observation_mean = jnp.mean( + cumulative_best_observation_results, axis=0 + ) + cumulative_best_observation_std = jnp.std( + cumulative_best_observation_results, axis=0 + ) + return cumulative_best_observation_mean, cumulative_best_observation_std + + +bo_cumulative_min_mean, bo_cumulative_min_std = obtain_cumulative_minimum_statistics( + bo_experiment_results +) +( + random_cumulative_min_mean, + random_cumulative_min_std, +) = obtain_cumulative_minimum_statistics(random_experiment_results) + +# %% [markdown] +# Now, when we plot the mean and standard deviation of the best observed value of the +# black-box function at each iteration, we can see that BO outperforms random sampling, +# consistently converging to the global minimum of the function. + +# %% +fig, ax = plt.subplots() +fn_evaluations = jnp.arange(1, bo_iters + initial_sample_num + 1) +cumulative_best_y = jax.lax.associative_scan(jax.numpy.minimum, D.y) +cumulative_random_y = jax.lax.associative_scan(jax.numpy.minimum, random_y) +ax.plot(fn_evaluations, bo_cumulative_min_mean, label="Bayesian Optimisation") +ax.fill_between( + fn_evaluations, + bo_cumulative_min_mean[:, 0] - bo_cumulative_min_std[:, 0], + bo_cumulative_min_mean[:, 0] + bo_cumulative_min_std[:, 0], + alpha=0.2, +) +ax.plot(fn_evaluations, random_cumulative_min_mean, label="Random Search") +ax.fill_between( + fn_evaluations, + random_cumulative_min_mean[:, 0] - random_cumulative_min_std[:, 0], + random_cumulative_min_mean[:, 0] + random_cumulative_min_std[:, 0], + alpha=0.2, +) +ax.axvline(x=initial_sample_num, linestyle=":") +ax.axhline(y=-1.0316, linestyle="--", label="True Minimum") +ax.set_xlabel("Number of Black-Box Function Evaluations") +ax.set_ylabel("Best Observed Value") +ax.legend() +plt.show() + +# %% [markdown] +# It can also be useful to plot the queried points over the course of a single BO run, in +# order to gain some insight into how the algorithm queries the search space. 
Below
# we do this for the first BO experiment, and can see that the algorithm initially
# performs some exploration of the search space whilst it is uncertain about the black-box
# function, but it then hones in on one of the global minima of the function, as we would hope!

# %%
fig, ax = plt.subplots()
contour_plot = ax.contourf(
    x1, x2, y.reshape(x1.shape[0], x2.shape[0]), cmap=cm.coolwarm, levels=40
)
ax.scatter(
    x_star_one[0][0],
    x_star_one[0][1],
    marker="*",
    color=cols[2],
    label="Global Minimum",
    zorder=2,
)
ax.scatter(x_star_two[0][0], x_star_two[0][1], marker="*", color=cols[2], zorder=2)
ax.scatter(
    bo_experiment_results[0].X[:, 0],
    bo_experiment_results[0].X[:, 1],
    marker="x",
    color=cols[1],
    label="Bayesian Optimisation Queries",
)
ax.set_xlabel("x1")
ax.set_ylabel("x2")
fig.colorbar(contour_plot)
ax.legend()
plt.show()

# %% [markdown]
# ### Other Acquisition Functions and Further Reading
#
# As mentioned previously, there are many acquisition functions which one may use to
# characterise the expected utility of querying the black-box function at a given point.
# We list two of the most popular below:
#
# - **Probability of Improvement (PI)** ([Kushner, 1964](https://asmedigitalcollection.asme.org/fluidsengineering/article/86/1/97/392213/A-New-Method-of-Locating-the-Maximum-Point-of-an)): Given the lowest objective function observation
# so far, $f(\mathbf{x}^*)$, PI calculates the probability that the objective function's
# value at a given point $\mathbf{x}$ is lower than $f(\mathbf{x}^*)$. Given a GP
# surrogate model $\mathcal{M}_i$, PI is defined mathematically as:
# $$
# \alpha_{\text{PI}}(\mathbf{x}; \mathcal{D}_i, \mathcal{M}_i) = \mathbb{P}[\mathcal{M}_i (\mathbf{x}) < f(\mathbf{x}^*)] = \Phi \left(\frac{f(\mathbf{x}^*) - \mu_{\mathcal{M}_i}(\mathbf{x})}{\sigma_{\mathcal{M}_i}(\mathbf{x})}\right)
# $$
#
# with $\Phi(\cdot)$ denoting the standard normal cumulative distribution function.
#
# - **Expected Improvement (EI)** ([Močkus, 1974](https://link.springer.com/chapter/10.1007/3-540-07165-2_55)): EI goes beyond PI by not only considering the
# probability of improving on the current best observed point, but also taking into
# account the *magnitude* of improvement. Mathematically, this is defined as
# follows:
# $$
# \begin{aligned}
# \alpha_{\text{EI}}(\mathbf{x};\mathcal{D}_i, \mathcal{M}_i) &= \mathbb{E}[(f(\mathbf{x}^*) - \mathcal{M}_i(\mathbf{x}))\mathbb{I}(\mathcal{M}_i(\mathbf{x}) < f(\mathbf{x}^*))] \\
# &= \underbrace{(f(\mathbf{x}^*) - \mu_{\mathcal{M}_i}(\mathbf{x}))\Phi
# \left(\frac{f(\mathbf{x}^*) -
# \mu_{\mathcal{M}_i}(\mathbf{x})}{\sigma_{\mathcal{M}_i}(\mathbf{x})}\right)}_\text{exploits
# areas with low mean} \\
# &+ \underbrace{\sigma_{\mathcal{M}_i}(\mathbf{x}) \phi \left(\frac{f(\mathbf{x}^*) - \mu_{\mathcal{M}_i}(\mathbf{x})}{\sigma_{\mathcal{M}_i}(\mathbf{x})}\right)}_\text{explores areas with high variance} \nonumber
# \end{aligned}
# $$
#
# with $\mathbb{I}(\cdot)$ denoting the indicator function and $\phi(\cdot)$ being the
# standard normal probability density function. 
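#
# Although this notebook uses Thompson sampling, both of the acquisition functions above
# are straightforward to compute from the surrogate model's predictive mean and standard
# deviation. The sketch below is purely illustrative (the helper functions are our own,
# not part of GPJax's API), and assumes the noise-free, minimisation setting used
# throughout this notebook:

# %%
from jax.scipy.stats import norm


def probability_of_improvement(
    posterior: gpx.Module, dataset: gpx.Dataset, x: Float[Array, "N D"]
) -> Float[Array, "N"]:
    # Predictive mean and standard deviation of the surrogate model at the candidate points
    latent_dist = posterior.predict(x, train_data=dataset)
    predictive_dist = posterior.likelihood(latent_dist)
    mu = predictive_dist.mean()
    sigma = jnp.maximum(predictive_dist.stddev(), 1e-12)  # Guard against division by zero
    best_y = jnp.min(dataset.y)  # Lowest observation made so far, f(x*)
    return norm.cdf((best_y - mu) / sigma)


def expected_improvement(
    posterior: gpx.Module, dataset: gpx.Dataset, x: Float[Array, "N D"]
) -> Float[Array, "N"]:
    latent_dist = posterior.predict(x, train_data=dataset)
    predictive_dist = posterior.likelihood(latent_dist)
    mu = predictive_dist.mean()
    sigma = jnp.maximum(predictive_dist.stddev(), 1e-12)
    best_y = jnp.min(dataset.y)
    z = (best_y - mu) / sigma
    # The first term exploits regions with low predictive mean, whilst the second term
    # explores regions with high predictive uncertainty
    return (best_y - mu) * norm.cdf(z) + sigma * norm.pdf(z)


# %% [markdown]
# Either of these could then be maximised over the search space in place of the Thompson
# sample, for instance by applying the same `ScipyBoundedMinimize` routine used earlier to
# their negation.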
+# +# For those particularly interested in diving deeper into Bayesian optimisation, be sure +# to check out Shahriari et al.'s "[Taking the Human Out of the Loop: +# A Review of Bayesian +# Optimization](https://www.cs.ox.ac.uk/people/nando.defreitas/publications/BayesOptLoop.pdf)", +# which includes a wide variety of acquisition functions, as well as some examples of more +# exotic BO problems, such as problems which also feature unknown constraints. +# +# ## System Configuration + +# %% +# %reload_ext watermark +# %watermark -n -u -v -iv -w -a 'Thomas Christie' diff --git a/docs/examples/deep_kernels.py b/docs/examples/deep_kernels.py index d36a38d1e..b9d4adb46 100644 --- a/docs/examples/deep_kernels.py +++ b/docs/examples/deep_kernels.py @@ -130,7 +130,7 @@ def __call__( # activation functions between the layers. The first hidden layer contains 64 units, # while the second layer contains 32 units. Finally, we'll make the output of our # network a three units wide. The corresponding kernel that we define will then be of -# [ARD form](https://docs.jaxgaussianprocesses.com/examples/kernels/#active-dimensions) +# [ARD form](https://docs.jaxgaussianprocesses.com/examples/constructing_new_kernels/#active-dimensions) # to allow for different lengthscales in each dimension of the feature space. # Users may wish to design more intricate network structures for more complex tasks, # which functionality is supported well in Haiku. diff --git a/docs/examples/graph_kernels.py b/docs/examples/graph_kernels.py index 1ef5d15ad..82154b3a4 100644 --- a/docs/examples/graph_kernels.py +++ b/docs/examples/graph_kernels.py @@ -5,7 +5,7 @@ # of a graph using a Gaussian process with a Matérn kernel presented in # . For a general discussion of the # kernels supported within GPJax, see the -# [kernels notebook](https://docs.jaxgaussianprocesses.com/examples/kernels). +# [kernels notebook](https://docs.jaxgaussianprocesses.com/examples/constructing_new_kernels). # %% # Enable Float64 for more stable matrix inversions. diff --git a/docs/examples/intro_to_gps.py b/docs/examples/intro_to_gps.py index 13cff8fb6..114c51791 100644 --- a/docs/examples/intro_to_gps.py +++ b/docs/examples/intro_to_gps.py @@ -447,8 +447,8 @@ # that are admissible under the GP prior. A kernel is a positive-definite # function with parameters $\boldsymbol{\theta}$ that maps pairs of inputs # $\mathbf{X}, \mathbf{X}' \in \mathcal{X}$ onto the real line. We dedicate the -# entirety of the [Kernel Guide -# notebook](https://docs.jaxgaussianprocesses.com/examples/kernels) to +# entirety of the [Introduction to Kernels +# notebook](https://docs.jaxgaussianprocesses.com/examples/intro_to_kernels) to # exploring the different GPs each kernel can yield. 
# # ## Gaussian process regression diff --git a/docs/examples/intro_to_kernels.py b/docs/examples/intro_to_kernels.py index d447a4d53..7356202fc 100644 --- a/docs/examples/intro_to_kernels.py +++ b/docs/examples/intro_to_kernels.py @@ -212,11 +212,11 @@ def forrester(x: Float[Array, "N"]) -> Float[Array, "N"]: test_y = forrester(test_x) # %% [markdown] -# First we define our model, using the Matérn32 kernel, and construct our posterior *without* optimising the kernel hyperparameters: +# First we define our model, using the Matérn52 kernel, and construct our posterior *without* optimising the kernel hyperparameters: # %% mean = gpx.mean_functions.Zero() -kernel = gpx.kernels.Matern32( +kernel = gpx.kernels.Matern52( lengthscale=jnp.array(2.0) ) # Initialise our kernel lengthscale to 2.0 @@ -672,7 +672,7 @@ def forrester(x: Float[Array, "N"]) -> Float[Array, "N"]: # # - [Gaussian Processes for Machine Learning](http://www.gaussianprocess.org/gpml/chapters/RW.pdf) - Chapter 4 provides a comprehensive overview of kernels, diving deep into some of the technical details and also providing some kernels defined on non-Euclidean spaces such as strings. # - David Duvenaud's [Kernel Cookbook](https://www.cs.toronto.edu/~duvenaud/cookbook/) is a great resource for learning about kernels, and also provides some information about some of the pitfalls people commonly encounter when using the Matérn family of kernels. His PhD thesis, [Automatic Model Construction with Gaussian Processes](https://www.cs.toronto.edu/~duvenaud/thesis.pdf), also provides some in-depth recipes for how one may incorporate their prior knowledge when constructing kernels. -# - Finally, please check out our [more advanced kernel guide](https://docs.jaxgaussianprocesses.com/examples/kernels/), which details some more kernels available in GPJax as well as how one may combine kernels together to form more complex kernels. +# - Finally, please check out our [more advanced kernel guide](https://docs.jaxgaussianprocesses.com/examples/constructing_new_kernels/), which details some more kernels available in GPJax as well as how one may combine kernels together to form more complex kernels. # # ## System Configuration diff --git a/docs/examples/spatial.py b/docs/examples/spatial.py index 140088baf..72fe15c4f 100644 --- a/docs/examples/spatial.py +++ b/docs/examples/spatial.py @@ -133,7 +133,7 @@ # alone isn't enough to to a decent job at interpolating this data. Therefore, we can also use elevation and optimize # the parameters of our kernel such that more relevance should be given to elevation. This is possible by using a # kernel that has one length-scale parameter per input dimension: an automatic relevance determination (ARD) kernel. -# See our [kernel notebook](https://docs.jaxgaussianprocesses.com/examples/kernels/) for more an introduction to +# See our [kernel notebook](https://docs.jaxgaussianprocesses.com/examples/constructing_new_kernels/) for more an introduction to # kernels in GPJax. 
# %% diff --git a/mkdocs.yml b/mkdocs.yml index b571a70b9..5d00248cb 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -28,6 +28,7 @@ nav: - Sparse GPs: examples/uncollapsed_vi.py - Stochastic sparse GPs: examples/collapsed_vi.py - Pathwise Sampling for Spatial Modelling: examples/spatial.py + - Bayesian Optimisation: examples/bayesian_optimisation.py - 📖 Guides for customisation: - Kernels: examples/constructing_new_kernels.py - Likelihoods: examples/likelihoods_guide.py From 7ec707013cd6e14f789d935c2527c8906fda796e Mon Sep 17 00:00:00 2001 From: Thomas-Christie Date: Wed, 19 Jul 2023 20:08:40 +0100 Subject: [PATCH 2/5] Fix documentation build workflow Added `sudo apt-get update` command before `sudo apt-get install` as recommended in https://docs.github.com/en/actions/using-github-hosted-runners/customizing-github-hosted-runners in order to mitigate package installation failures. --- .github/workflows/build_docs.yml | 4 +++- .github/workflows/test_docs.yml | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index 34ba58b01..c2a4c5887 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -53,7 +53,9 @@ jobs: installer-parallel: true - name: Install LaTex - run: sudo apt-get install texlive-fonts-recommended texlive-fonts-extra texlive-latex-extra dvipng cm-super + run: | + sudo apt-get update + sudo apt-get install texlive-fonts-recommended texlive-fonts-extra texlive-latex-extra dvipng cm-super - name: Build the documentation with MKDocs run: | diff --git a/.github/workflows/test_docs.yml b/.github/workflows/test_docs.yml index 25d5f3191..55dc208af 100644 --- a/.github/workflows/test_docs.yml +++ b/.github/workflows/test_docs.yml @@ -43,7 +43,9 @@ jobs: npm install katex - name: Install LaTex - run: sudo apt-get install texlive-fonts-recommended texlive-fonts-extra texlive-latex-extra dvipng cm-super + run: | + sudo apt-get update + sudo apt-get install texlive-fonts-recommended texlive-fonts-extra texlive-latex-extra dvipng cm-super # Install Poetry and build the documentation - name: Install and configure Poetry From 915cdf63edecd1c3e50c8cc1bbcc522f6eef043e Mon Sep 17 00:00:00 2001 From: Thomas-Christie Date: Wed, 19 Jul 2023 21:21:44 +0100 Subject: [PATCH 3/5] Fix tuple type annotation for Python 3.8 --- docs/examples/bayesian_optimisation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/examples/bayesian_optimisation.py b/docs/examples/bayesian_optimisation.py index 1408a5d9b..36fa09c08 100644 --- a/docs/examples/bayesian_optimisation.py +++ b/docs/examples/bayesian_optimisation.py @@ -21,7 +21,7 @@ from matplotlib import cm import optax as ox import tensorflow_probability.substrates.jax as tfp -from typing import List +from typing import List, Tuple with install_import_hook("gpjax", "beartype.beartype"): import gpjax as gpx @@ -596,7 +596,7 @@ def six_hump_camel(x: Float[Array, "N 2"]) -> Float[Array, "N 1"]: # %% def obtain_cumulative_minimum_statistics( experiment_results: List[gpx.Dataset], -) -> tuple[Float[Array, "N 1"], Float[Array, "N 1"]]: +) -> Tuple[Float[Array, "N 1"], Float[Array, "N 1"]]: cumulative_best_observation_results = [] for exp_result in experiment_results: observations = exp_result.y From fac383c914946859fa99a481242a052836f9fc38 Mon Sep 17 00:00:00 2001 From: Thomas-Christie Date: Mon, 24 Jul 2023 12:27:09 +0100 Subject: [PATCH 4/5] Make minor changes to BO notebook Incorporated feedback into BO notebook and added link 
to BO notebook from project README. --- README.md | 1 + docs/examples/bayesian_optimisation.py | 90 ++++++++++++++------------ 2 files changed, 49 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index f625c4ccb..ea95d3a45 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ process modelling. > - [**Learning Gaussian Process Barycentres**](https://docs.jaxgaussianprocesses.com/examples/barycentres/) > - [**Deep Kernel Regression**](https://docs.jaxgaussianprocesses.com/examples/deep_kernels/) > - [**Poisson Regression**](https://docs.jaxgaussianprocesses.com/examples/poisson/) +> - [**Bayesian Optimisation**](https://docs.jaxgaussianprocesses.com/examples/bayesian_optimisation/) ## Guides for customisation > diff --git a/docs/examples/bayesian_optimisation.py b/docs/examples/bayesian_optimisation.py index 36fa09c08..d1b08e76d 100644 --- a/docs/examples/bayesian_optimisation.py +++ b/docs/examples/bayesian_optimisation.py @@ -114,8 +114,7 @@ # taking into consideration all the datapoints observed so far. Therefore, in order to # decide which point to query next we simply choose the point which maximises the # acquisition function, using an optimiser such as L-BFGS ([Liu and Nocedal, -# 1989](https://link.springer.com/article/10.1007/BF01589116)) or Adam ([Kingma and Ba, -# 2014](https://arxiv.org/abs/1412.6980)). +# 1989](https://link.springer.com/article/10.1007/BF01589116)). # # The Bayesian optimisation loop can be summarised as follows, with $i$ denoting the # current iteration: @@ -147,7 +146,7 @@ # %% [markdown] # ## Thompson Sampling # -# Thompson sampling ([Thompson, 1933](https://shorturl.at/ejmCM)) is a simple method which +# Thompson sampling ([Thompson, 1933](https://www.dropbox.com/s/yhn9prnr5bz0156/1933-thompson.pdf)) is a simple method which # naturally balances exploration and exploitation. The core idea is to, at each iteration # of the BO loop, sample a function, $g$, from the posterior distribution of the surrogate # model $\mathcal{M}_i$, and then evaluate the black-box function at the point(s) which @@ -201,7 +200,7 @@ def forrester(x: Float[Array, "N 1"]) -> Float[Array, "N 1"]: # %% -def generate_optimised_posterior( +def return_optimised_posterior( data: gpx.Dataset, prior: gpx.Module, key: Array ) -> gpx.Module: likelihood = gpx.Gaussian( @@ -232,7 +231,7 @@ def generate_optimised_posterior( mean = gpx.mean_functions.Zero() kernel = gpx.kernels.Matern52() prior = gpx.Prior(mean_function=mean, kernel=kernel) -opt_posterior = generate_optimised_posterior(D, prior, key) +opt_posterior = return_optimised_posterior(D, prior, key) # %% [markdown] # We can then sample a function from the posterior distribution of the surrogate model. We @@ -293,6 +292,7 @@ def optimise_sample( initial_sample_y = sample(initial_sample_points) best_x = jnp.array([initial_sample_points[jnp.argmin(initial_sample_y)]]) + # We want to maximise the utility function, but the optimiser performs minimisation. Since we're minimising the sample drawn, the sample is actually the negative utility function. negative_utility_fn = lambda x: sample(x)[0][0] lbfgsb = ScipyBoundedMinimize(fun=negative_utility_fn, method="l-bfgs-b") bounds = (lower_bound, upper_bound) @@ -385,7 +385,7 @@ def plot_bayes_opt( # each iteration. 
# %% -bo_iters = 10 +bo_iters = 5 # Set up initial dataset initial_x = tfp.mcmc.sample_halton_sequence( @@ -401,7 +401,7 @@ def plot_bayes_opt( mean = gpx.mean_functions.Zero() kernel = gpx.kernels.Matern52() prior = gpx.Prior(mean_function=mean, kernel=kernel) - opt_posterior = generate_optimised_posterior(D, prior, subkey) + opt_posterior = return_optimised_posterior(D, prior, subkey) # Draw a sample from the posterior, and find the minimiser of it approx_sample = opt_posterior.sample_approx( @@ -520,7 +520,7 @@ def six_hump_camel(x: Float[Array, "N 2"]) -> Float[Array, "N 1"]: lower_bound = jnp.array([-2.0, -1.0]) upper_bound = jnp.array([2.0, 1.0]) initial_sample_num = 5 -bo_iters = 15 +bo_iters = 11 num_experiments = 5 bo_experiment_results = [] @@ -543,7 +543,7 @@ def six_hump_camel(x: Float[Array, "N 2"]) -> Float[Array, "N 1"]: active_dims=[0, 1], lengthscale=jnp.array([1.0, 1.0]), variance=2.0 ) prior = gpx.Prior(mean_function=mean, kernel=kernel) - opt_posterior = generate_optimised_posterior(D, prior, subkey) + opt_posterior = return_optimised_posterior(D, prior, subkey) # Draw a sample from the posterior, and find the minimiser of it approx_sample = opt_posterior.sample_approx( @@ -579,7 +579,11 @@ def six_hump_camel(x: Float[Array, "N 2"]) -> Float[Array, "N 1"]: initial_x = bo_experiment_results[i].X[:5] initial_y = bo_experiment_results[i].y[:5] final_x = jr.uniform( - key, shape=(15, 2), dtype=jnp.float64, minval=lower_bound, maxval=upper_bound + key, + shape=(bo_iters, 2), + dtype=jnp.float64, + minval=lower_bound, + maxval=upper_bound, ) final_y = six_hump_camel(final_x) random_x = jnp.concatenate([initial_x, final_x], axis=0) @@ -588,69 +592,71 @@ def six_hump_camel(x: Float[Array, "N 2"]) -> Float[Array, "N 1"]: # %% [markdown] -# Finally, we'll process the experiment results to find the best observed value of the -# black-box function at each iteration of the experiments. We'll then take the mean and -# standard deviation of these values across the 5 experiments. +# Finally, we'll process the experiment results to find the log regret at each iteration +# of the experiments. The regret is defined as the difference between the minimum value of +# the black-box function observed so far and the true global minimum of the black box +# function. Mathematically, at time $t$, with observations $\mathcal{D}_t$, for function +# $f$ with global minimum $f^*$, the regret is defined as: +# +# $$\text{regret}_t = \min_{\mathbf{x} \in \mathcal{D_t}}f(\mathbf{x}) - f^*$$ +# +# We'll then take the mean and standard deviation of the log of the regret values across +# the 5 experiments. 
# %% -def obtain_cumulative_minimum_statistics( +def obtain_log_regret_statistics( experiment_results: List[gpx.Dataset], + global_minimum: ScalarFloat, ) -> Tuple[Float[Array, "N 1"], Float[Array, "N 1"]]: - cumulative_best_observation_results = [] + log_regret_results = [] for exp_result in experiment_results: observations = exp_result.y cumulative_best_observations = jax.lax.associative_scan( jax.numpy.minimum, observations ) - cumulative_best_observation_results.append(cumulative_best_observations) + regret = cumulative_best_observations - global_minimum + log_regret = jnp.log(regret) + log_regret_results.append(log_regret) - cumulative_best_observation_results = jnp.array(cumulative_best_observation_results) - cumulative_best_observation_mean = jnp.mean( - cumulative_best_observation_results, axis=0 - ) - cumulative_best_observation_std = jnp.std( - cumulative_best_observation_results, axis=0 - ) - return cumulative_best_observation_mean, cumulative_best_observation_std + log_regret_results = jnp.array(log_regret_results) + log_regret_mean = jnp.mean(log_regret_results, axis=0) + log_regret_std = jnp.std(log_regret_results, axis=0) + return log_regret_mean, log_regret_std -bo_cumulative_min_mean, bo_cumulative_min_std = obtain_cumulative_minimum_statistics( - bo_experiment_results +bo_log_regret_mean, bo_log_regret_std = obtain_log_regret_statistics( + bo_experiment_results, -1.031625 ) ( - random_cumulative_min_mean, - random_cumulative_min_std, -) = obtain_cumulative_minimum_statistics(random_experiment_results) + random_log_regret_mean, + random_log_regret_std, +) = obtain_log_regret_statistics(random_experiment_results, -1.031625) # %% [markdown] -# Now, when we plot the mean and standard deviation of the best observed value of the -# black-box function at each iteration, we can see that BO outperforms random sampling, -# consistently converging to the global minimum of the function. +# Now, when we plot the mean and standard deviation of the log regret at each iteration, +# we can see that BO outperforms random sampling! 
# %% fig, ax = plt.subplots() fn_evaluations = jnp.arange(1, bo_iters + initial_sample_num + 1) -cumulative_best_y = jax.lax.associative_scan(jax.numpy.minimum, D.y) -cumulative_random_y = jax.lax.associative_scan(jax.numpy.minimum, random_y) -ax.plot(fn_evaluations, bo_cumulative_min_mean, label="Bayesian Optimisation") +ax.plot(fn_evaluations, bo_log_regret_mean, label="Bayesian Optimisation") ax.fill_between( fn_evaluations, - bo_cumulative_min_mean[:, 0] - bo_cumulative_min_std[:, 0], - bo_cumulative_min_mean[:, 0] + bo_cumulative_min_std[:, 0], + bo_log_regret_mean[:, 0] - bo_log_regret_std[:, 0], + bo_log_regret_mean[:, 0] + bo_log_regret_std[:, 0], alpha=0.2, ) -ax.plot(fn_evaluations, random_cumulative_min_mean, label="Random Search") +ax.plot(fn_evaluations, random_log_regret_mean, label="Random Search") ax.fill_between( fn_evaluations, - random_cumulative_min_mean[:, 0] - random_cumulative_min_std[:, 0], - random_cumulative_min_mean[:, 0] + random_cumulative_min_std[:, 0], + random_log_regret_mean[:, 0] - random_log_regret_std[:, 0], + random_log_regret_mean[:, 0] + random_log_regret_std[:, 0], alpha=0.2, ) ax.axvline(x=initial_sample_num, linestyle=":") -ax.axhline(y=-1.0316, linestyle="--", label="True Minimum") ax.set_xlabel("Number of Black-Box Function Evaluations") -ax.set_ylabel("Best Observed Value") +ax.set_ylabel("Log Regret") ax.legend() plt.show() From 2874f6d1337a9f7dbd6d7804fff23630f1b6a72b Mon Sep 17 00:00:00 2001 From: Thomas-Christie Date: Mon, 24 Jul 2023 16:43:29 +0100 Subject: [PATCH 5/5] Minor stylistic changes --- docs/examples/bayesian_optimisation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/examples/bayesian_optimisation.py b/docs/examples/bayesian_optimisation.py index d1b08e76d..1804cd674 100644 --- a/docs/examples/bayesian_optimisation.py +++ b/docs/examples/bayesian_optimisation.py @@ -331,6 +331,7 @@ def plot_bayes_opt( predictive_std = predictive_dist.stddev() fig, ax = plt.subplots() + ax.plot(plt_x, predictive_mean, label="Predictive Mean", color=cols[1]) ax.fill_between( plt_x.squeeze(), predictive_mean - 2 * predictive_std, @@ -353,6 +354,7 @@ def plot_bayes_opt( linewidth=1, color=cols[1], ) + ax.plot(plt_x, sample_y, label="Posterior Sample") ax.plot( plt_x, forrester_y, @@ -361,8 +363,7 @@ def plot_bayes_opt( linestyle="--", linewidth=2, ) - ax.plot(plt_x, predictive_mean, label="Predictive Mean", color=cols[1]) - ax.plot(plt_x, sample_y, label="Posterior Sample") + ax.axvline(x=0.757, linestyle=":", color=cols[3], label="True Optimum") ax.scatter(dataset.X, dataset.y, label="Observations", color=cols[2], zorder=2) ax.scatter( queried_x,