A Flux model #10

Closed · wants to merge 25 commits
Changes from 13 commits
Commits (25)
33d255e
First pass at a Flux model
rossviljoen Jul 10, 2021
d48390b
Added prior, posterior for models and refactor
rossviljoen Jul 14, 2021
d0b1744
Update classification example to use models.
rossviljoen Jul 14, 2021
944f4c0
Fixed Gaussian type specialisation.
rossviljoen Jul 14, 2021
0d4f3bf
Small cleanup
rossviljoen Jul 14, 2021
873c1d8
Update regression example
rossviljoen Jul 14, 2021
72058a0
Merge branch 'base_implementation' into models
rossviljoen Jul 14, 2021
d615c15
Updated example
rossviljoen Jul 14, 2021
717a360
Add version barrier to Cholesky from triangular
rossviljoen Jul 14, 2021
27cb0eb
Removed redundant function and formatting
rossviljoen Jul 14, 2021
67bda0a
Merge branch 'models' of github.com:rossviljoen/SparseGPs.jl into models
rossviljoen Jul 14, 2021
ec776ec
Added eltype parameter
rossviljoen Jul 14, 2021
0e1eb8a
Default eltype
rossviljoen Jul 14, 2021
0dd4cff
Merge remote-tracking branch 'origin/master' into models
rossviljoen Aug 2, 2021
600d332
Remove specialisations for GaussianLikelihood
rossviljoen Aug 2, 2021
4d88ae5
Fix plotting
rossviljoen Aug 2, 2021
f6bfc0c
Add back Gaussian specialisations and update tests
rossviljoen Aug 2, 2021
2d19d67
Simplify equivalence tests
rossviljoen Aug 2, 2021
9f9d35e
Update classification example
rossviljoen Aug 12, 2021
b85d440
Delete classification data
rossviljoen Aug 12, 2021
17c84f1
Formatting
rossviljoen Aug 12, 2021
4c0c205
Update regression example
rossviljoen Aug 12, 2021
dafb782
Fix regression example
rossviljoen Aug 12, 2021
f3a83fb
Merge remote-tracking branch 'origin/master' into models
rossviljoen Aug 12, 2021
a066919
Merge branch 'master' into models
rossviljoen Aug 15, 2021
2 changes: 2 additions & 0 deletions Project.toml
@@ -8,7 +8,9 @@ AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
GPLikelihoods = "6031954c-0455-49d7-b3b9-3e1c99afaf40"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
PDMats = "90014a1f-27ba-587c-ab20-58faa44d9150"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
67 changes: 16 additions & 51 deletions examples/classification.jl
@@ -27,20 +27,22 @@ scatter(x, y)
# %%
# First, create the GP kernel from given parameters k
function make_kernel(k)
return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2])))
return softplus(k[1]) * (Matern52Kernel() ∘ ScaleTransform(softplus(k[2])))
end

k = [10, 0.1]
k = [20.0, 0.5]
M = 15 # number of inducing points
z = x[1:M]

kernel = make_kernel(k)
f = LatentGP(GP(kernel), BernoulliLikelihood(), 0.1)
fx = f(x)
model = SVGPModel(make_kernel, k, z; jitter=1e-3, likelihood=BernoulliLikelihood())

f = prior(model)
fx = f(x)

# %%
# Then, plot some samples from the prior underlying GP
x_plot = 0:0.02:6
prior_f_samples = rand(f.f(x_plot, 1e-6),20)
prior_f_samples = rand(f.f(x_plot), 20)

plt = plot(
x_plot,
@@ -67,72 +69,35 @@ scatter!(plt, x, y; seriescolor="blue", label="Data points")


# %%
# A simple Flux model
# Optimise the model using Flux
using Flux

struct SVGPModel
k # kernel parameters
m # variational mean
A # variational covariance
z # inducing points
end

@Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs

lik = BernoulliLikelihood()
function (m::SVGPModel)(x)
kernel = make_kernel(m.k)
f = LatentGP(GP(kernel), BernoulliLikelihood(), 0.1)
q = MvNormal(m.m, m.A'm.A)
fx = f(x)
fu = f(m.z).fx
return fx, fu, q
end

function flux_loss(x, y; n_data=length(y))
fx, fu, q = model(x)
return -SparseGPs.elbo(fx, y, fu, q; n_data)
end

# %%
M = 15 # number of inducing points

# Initialise the parameters
k = [10, 0.1]
m = zeros(M)
A = Matrix{Float64}(I, M, M)
z = x[1:M]

model = SVGPModel(k, m, A, z)

opt = ADAM(0.1)
parameters = Flux.params(model)
delete!(parameters, model.z) # Don't train the inducing inputs

# %%
# Negative ELBO before training
println(flux_loss(x, y))
println(loss(model, x, y))

# %%
# Train the model
Flux.train!(
(x, y) -> flux_loss(x, y),
(x, y) -> loss(model, x, y),
parameters,
ncycle([(x, y)], 1000), # Train for 1000 epochs
opt
)

# %%
# Negative ELBO after training
println(flux_loss(x, y))
println(loss(model, x, y))

# %%
# After optimisation, plot samples from the underlying posterior GP.
post = SparseGPs.posterior(model)

fu = f(z).fx # want the underlying FiniteGP
post = SparseGPs.approx_posterior(SVGP(), fu, MvNormal(m, A'A))
l_post = LatentGP(post, BernoulliLikelihood(), 0.1)

post_f_samples = rand(l_post.f(x_plot, 1e-6), 20)
post_f_samples = rand(post.f(x_plot), 20)

plt = plot(
x_plot,
@@ -144,7 +109,7 @@ plt = plot(

# %%
# As above, push these samples through a logistic sigmoid to get posterior predictions.
post_y_samples = mean.(l_post.lik.(post_f_samples))
post_y_samples = mean.(post.lik.(post_f_samples))

plt = plot(
x_plot,
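
For reference, here is a condensed end-to-end sketch of the workflow this example now follows. It is not part of the diff: the data generation and the hyperparameter values below are made up for illustration, and it assumes the `SVGPModel`, `loss` and `posterior` API introduced in this PR.

# Sketch only: synthetic data, this PR's SVGPModel API assumed.
using SparseGPs, GPLikelihoods, KernelFunctions, Distributions, Flux, IterTools

make_kernel(k) = softplus(k[1]) * (Matern52Kernel() ∘ ScaleTransform(softplus(k[2])))

x = rand(Uniform(0, 6), 100)                            # hypothetical inputs
y = rand.(Bernoulli.(Flux.sigmoid.(3 .* sin.(2 .* x)))) # hypothetical binary targets

M = 15  # number of inducing points
model = SVGPModel(make_kernel, [20.0, 0.5], x[1:M]; jitter=1e-3, likelihood=BernoulliLikelihood())

parameters = Flux.params(model)
delete!(parameters, model.z)   # keep the inducing inputs fixed
Flux.train!((x, y) -> loss(model, x, y), parameters, ncycle([(x, y)], 1000), ADAM(0.1))

post = SparseGPs.posterior(model)   # LatentGP posterior for downstream prediction
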
66 changes: 13 additions & 53 deletions examples/regression.jl
@@ -6,6 +6,7 @@ using Distributions
using LinearAlgebra
using Optim
using IterTools
using GPLikelihoods

using Plots
default(; legend=:outertopright, size=(700, 400))
@@ -30,88 +31,47 @@ scatter(x, y; xlabel="x", ylabel="y", legend=false)
# A simple Flux model
using Flux

lik_noise = 0.3
jitter = 1e-5

struct SVGPModel
k # kernel parameters
m # variational mean
A # variational covariance
z # inducing points
end

@Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs

function make_kernel(k)
return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2])))
end

# Create the 'model' from the parameters - i.e. return the FiniteGP at inputs x,
# the FiniteGP at inducing inputs z and the variational posterior over inducing
# points - q(u).
function (m::SVGPModel)(x)
kernel = make_kernel(m.k)
f = GP(kernel)
q = MvNormal(m.m, m.A'm.A)
fx = f(x, lik_noise)
fu = f(m.z, jitter)
return fx, fu, q
end

# Create the posterior GP from the model parameters.
function posterior(m::SVGPModel)
kernel = make_kernel(m.k)
f = GP(kernel)
fu = f(m.z, jitter)
q = MvNormal(m.m, m.A'm.A)
return SparseGPs.approx_posterior(SVGP(), fu, q)
end

# Return the loss given data - in this case the negative ELBO.
function flux_loss(x, y; n_data=length(y))
fx, fu, q = model(x)
return -SparseGPs.elbo(fx, y, fu, q; n_data)
end


# %%
M = 50 # number of inducing points

# Select the first M inputs as inducing inputs
z = x[1:M]

# Initialise the parameters
# Initialise the kernel parameters
k = [0.3, 10]
m = zeros(M)
A = Matrix{Float64}(I, M, M)

model = SVGPModel(k, m, A, z)
model = SVGPModel(make_kernel, k, z; likelihood=GaussianLikelihood(0.1))

b = 100 # minibatch size
opt = ADAM(0.001)
parameters = Flux.params(model)
delete!(parameters, model.z) # Don't train the inducing inputs
data_loader = Flux.Data.DataLoader((x, y), batchsize=b)

# %%
# Negative ELBO before training
println(flux_loss(x, y))
println(loss(model, x, y))

# %%
# Train the model
Flux.train!(
(x, y) -> flux_loss(x, y; n_data=N),
(x, y) -> loss(model, x, y, n_data=N),
parameters,
ncycle(data_loader, 300), # Train for 300 epochs
opt
)

# %%
# Negative ELBO after training
println(flux_loss(x, y))
println(loss(model, x, y))

# %%
# Plot samples from the optmimised approximate posterior.
post = posterior(model)
# Plot samples from the optimised approximate posterior.
post = SparseGPs.posterior(model)

scatter(
x,
@@ -143,10 +103,11 @@ function exact_q(fu, fx, y)
return MvNormal(m, S)
end

kernel = make_kernel([0.3, 10])
f = GP(kernel)
fx = f(x, lik_noise)
fu = f(z, jitter)
fx = f(x, 0.1)
fu = f(z, 1e-6)

q_ex = exact_q(fu, fx, y)

scatter(x, y)
@@ -175,4 +136,3 @@ scatter(
plot!(-1:0.001:1, ap_ex; label="SVGP posterior")
plot!(-1:0.001:1, ap_tits; label="Titsias posterior")
vline!(z; label="Pseudo-points")

11 changes: 10 additions & 1 deletion src/SparseGPs.jl
@@ -8,6 +8,8 @@ using StatsBase
using FastGaussQuadrature
using GPLikelihoods
using ChainRulesCore
using PDMats
using Functors

using AbstractGPs:
FiniteGP,
@@ -21,9 +23,16 @@ using AbstractGPs:

export elbo,
approx_posterior,
SVGP
SVGP,
SVGPModel,
prior,
posterior,
loss,
elbo

include("utils.jl")
include("elbo.jl")
include("svgp.jl")
include("models.jl")

end
108 changes: 108 additions & 0 deletions src/models.jl
@@ -0,0 +1,108 @@
const default_jitter = 1e-6

struct SVGPModel{Tlik}
kernel_func # function to construct the kernel from `k`
lik::Tlik # the likelihood function
jitter # the jitter added to covariance matrices

## Trainable parameters
k::AbstractVector # kernel parameters
m::AbstractVector # variational mean
A::AbstractMatrix # variational covariance (sqrt)
z::AbstractVector # inducing points
end

@functor SVGPModel (k, m, A, z,)

function SVGPModel(
kernel_func,
kernel_params,
inducing_inputs;
q_μ::Union{AbstractVector,Nothing}=nothing,
q_Σ_sqrt::Union{AbstractMatrix,Nothing}=nothing,
q_eltype=Float64,
jitter=default_jitter,
likelihood=GaussianLikelihood(jitter)
)
m, A = _init_variational_params(q_μ, q_Σ_sqrt, inducing_inputs; q_eltype)
return SVGPModel(
kernel_func,
likelihood,
jitter,
kernel_params,
m,
A,
inducing_inputs
)
end

function (m::SVGPModel{<:GaussianLikelihood})(x)
f = prior(m)
fx = f(x, m.lik.σ²)
fu = f(m.z, m.jitter)
q = _construct_q(m)
return fx, fu, q
end

function (m::SVGPModel)(x)
f = prior(m)
fx = f(x)
fu = f(m.z).fx
q = _construct_q(m)
return fx, fu, q
end

function posterior(m::SVGPModel{<:GaussianLikelihood})
f = prior(m)
fu = f(m.z, m.jitter)
q = _construct_q(m)
return SparseGPs.approx_posterior(SVGP(), fu, q)
end

function posterior(m::SVGPModel)
f = prior(m)
fu = f(m.z).fx
q = _construct_q(m)
post = SparseGPs.approx_posterior(SVGP(), fu, q)
return LatentGP(post, m.lik, m.jitter) # TODO: should this return `post` instead?
end

function prior(m::SVGPModel{<:GaussianLikelihood})
kernel = m.kernel_func(m.k)
return GP(kernel)
end

function prior(m::SVGPModel)
kernel = m.kernel_func(m.k)
return LatentGP(GP(kernel), m.lik, m.jitter)
end

function loss(m::SVGPModel, x, y; n_data=length(y))
return -elbo(m, x, y; n_data)
end

function elbo(m::SVGPModel, x, y; n_data=length(y))
fx, fu, q = m(x)
return SparseGPs.elbo(fx, y, fu, q; n_data)
end

function _init_variational_params(

[Review thread]
Member: Should we add a keyword argument to enforce the eltype and type of μ and Σ?
Author (rossviljoen): I added an eltype parameter, but I can't think of a reason why you wouldn't want to use Vector / Matrix, so maybe a type parameter isn't needed?
Member: Well, in the crazy case where someone would like to use GPUs, one would need to pass CuArrays, but I guess they would just have to make the effort of passing their own variational parameters.
Author (rossviljoen): Since it's a functor, isn't it easiest to just use Flux.gpu for that after it's initialised?

q_μ::Union{AbstractVector,Nothing},
q_Σ_sqrt::Union{AbstractMatrix,Nothing},
z::AbstractVector;
q_eltype=Float64
)
n = length(z)
if q_μ === nothing
q_μ = zeros(q_eltype, n)
end
if q_Σ_sqrt === nothing
q_Σ_sqrt = Matrix{q_eltype}(I, n, n)
end
return q_μ, q_Σ_sqrt
end

function _construct_q(m::SVGPModel)
S = PDMat(Cholesky(LowerTriangular(m.A)))
return MvNormal(m.m, S)
end
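
Following the review thread above about eltypes and GPUs, here is a minimal usage sketch, not part of the PR diff, showing the two options discussed: passing explicit variational parameters at construction, or relying on the `@functor` declaration and moving the model afterwards with `Flux.gpu`. The kernel helper and all values below are illustrative assumptions.

# Sketch only: assumes this PR's SVGPModel constructor and exports; values are illustrative.
using SparseGPs, KernelFunctions, Flux, LinearAlgebra

make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2])))

z = collect(range(-1.0, 1.0; length=20))   # hypothetical inducing inputs
M = length(z)

# Option 1: pass explicit (here Float32) variational parameters at construction.
m0 = zeros(Float32, M)
A0 = Matrix{Float32}(I, M, M)
model = SVGPModel(make_kernel, [0.3, 10.0], z; q_μ=m0, q_Σ_sqrt=A0)

# Option 2: keep the Float64 defaults and move the whole functor afterwards,
# e.g. `model_gpu = Flux.gpu(model)` on a CUDA-enabled setup.
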
6 changes: 6 additions & 0 deletions src/utils.jl
@@ -0,0 +1,6 @@
# These methods to create a Cholesky directly from the factorisation will be in Julia 1.7
# https://github.com/JuliaLang/julia/pull/39352
if VERSION < v"1.7"
LinearAlgebra.Cholesky(L::LowerTriangular{T}) where {T} = Cholesky{T,typeof(L.data)}(L.data, 'L', 0)
LinearAlgebra.Cholesky(U::UpperTriangular{T}) where {T} = Cholesky{T,typeof(U.data)}(U.data, 'U', 0)
end
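
For context, a small illustrative snippet (not part of the PR; the matrix is made up) of what the backported constructor enables, mirroring how `_construct_q` in src/models.jl builds the variational covariance from a triangular square root:

using LinearAlgebra, PDMats

A = LowerTriangular([1.0 0.0; 0.5 2.0])  # a lower-triangular square root
C = Cholesky(A)                          # uses the method above on Julia < 1.7
S = PDMat(C)                             # behaves like the dense matrix A * A'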