-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Allow to instantiate
DenseKKTSystem
on CUDA GPU (#73)
* support DenseKKTSystem on GPUs * add dedicated GPU kernels for KKT operations * add proper tests for DenseKKTSystem on GPU * move build_qp_dense function in MadNLPTests * rescope PR#73 * add proper API to instantiate DenseKKTSystem on GPU * fix tests on GPU * add GPU implementation for compress_jacobian!
- Loading branch information
Showing
11 changed files
with
410 additions
and
181 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
#= | ||
MadNLP utils | ||
=# | ||
|
||
@kernel function _copy_diag!(dest, src)
    # One work-item per diagonal entry.
    i = @index(Global)
    dest[i] = src[i, i]
end

"""
    MadNLP.diag!(dest::CuVector{T}, src::CuMatrix{T}) where T

Extract the diagonal of the GPU matrix `src` into the GPU vector `dest`.

Throws `DimensionMismatch` if `length(dest)` differs from `size(src, 1)`.
"""
function MadNLP.diag!(dest::CuVector{T}, src::CuMatrix{T}) where T
    # Validate with a real exception: `@assert` is for internal invariants
    # and may be disabled at higher optimization levels.
    length(dest) == size(src, 1) || throw(DimensionMismatch(
        "dest has length $(length(dest)) but src has $(size(src, 1)) rows"
    ))
    ev = _copy_diag!(CUDADevice())(dest, src, ndrange=length(dest))
    # Kernel launch is asynchronous; block until the copy has completed.
    wait(ev)
end
|
||
@kernel function _add_diagonal!(dest, src1, src2)
    # One work-item per diagonal entry: dest[i, i] = src1[i] + src2[i].
    i = @index(Global)
    dest[i, i] = src1[i] + src2[i]
end

"""
    MadNLP.diag_add!(dest::CuMatrix, src1::CuVector, src2::CuVector)

Overwrite the diagonal of `dest` with `src1 .+ src2` on the GPU.

Throws `DimensionMismatch` if either vector does not match `size(dest, 1)`.
"""
function MadNLP.diag_add!(dest::CuMatrix, src1::CuVector, src2::CuVector)
    # Validate sizes up front (consistent with MadNLP.diag!); an undersized
    # input would otherwise cause an out-of-bounds access inside the kernel.
    n = size(dest, 1)
    (length(src1) == n && length(src2) == n) || throw(DimensionMismatch(
        "dest has $n diagonal entries, got vectors of length $(length(src1)) and $(length(src2))"
    ))
    ev = _add_diagonal!(CUDADevice())(dest, src1, src2, ndrange=n)
    wait(ev)
end
|
||
#= | ||
MadNLP kernels | ||
=# | ||
|
||
# Overload is_valid to avoid fallback to default is_valid, slow on GPU
# (presumably the generic fallback iterates entries element-by-element on
# the host — verify against MadNLP's definition).
# NOTE(review): this unconditionally reports device arrays as valid, i.e.
# NaN/Inf checking is skipped entirely for CuArray — confirm this is intended.
MadNLP.is_valid(src::CuArray) = true
|
||
# Constraint scaling
# Fill `con_scale` with per-constraint scaling factors derived from the
# row-wise maximum absolute value of the GPU Jacobian `jac`.
function MadNLP.set_con_scale!(con_scale::AbstractVector, jac::CuMatrix, nlp_scaling_max_gradient)
    # Row-wise max |J[i, :]|, reduced on the device with CUDA.jl's builtin.
    row_max = maximum(abs, jac, dims=2)
    # Bring the (m × 1) device result back into the host buffer.
    copyto!(con_scale, row_max)
    # Cap each factor at 1 so scaled gradients stay below
    # nlp_scaling_max_gradient.
    map!(s -> min(1.0, nlp_scaling_max_gradient / s), con_scale, con_scale)
end
|
||
# Fixed the kernel name typo ("kernell" -> "kernel"); the kernel is private
# to this file and only launched from treat_fixed_variable! below.
@kernel function _treat_fixed_variable_kernel!(dest, ind_fixed)
    # 2-D range: k indexes the fixed variables, j sweeps the full dimension.
    k, j = @index(Global, NTuple)
    i = ind_fixed[k]

    if i == j
        dest[i, i] = 1.0
    else
        # Zero out the whole row and column of the fixed variable.
        dest[i, j] = 0.0
        dest[j, i] = 0.0
    end
end

"""
    MadNLP.treat_fixed_variable!(kkt)

Eliminate fixed variables from the condensed KKT matrix `kkt.aug_com` by
zeroing their rows and columns and placing 1 on the corresponding diagonal.
No-op when there are no fixed variables.
"""
function MadNLP.treat_fixed_variable!(kkt::MadNLP.AbstractKKTSystem{T, MT}) where {T, MT<:CuMatrix{T}}
    length(kkt.ind_fixed) == 0 && return
    aug = kkt.aug_com
    d_ind_fixed = kkt.ind_fixed |> CuVector # TODO: allocate ind_fixed directly on the GPU
    ndrange = (length(d_ind_fixed), size(aug, 1))
    ev = _treat_fixed_variable_kernel!(CUDADevice())(aug, d_ind_fixed, ndrange=ndrange)
    wait(ev)
end
|
||
#= | ||
DenseKKTSystem kernels | ||
=# | ||
"""
    MadNLP.mul!(y, kkt::MadNLP.DenseKKTSystem, x)

Compute `y = kkt.aug_com * x` for a GPU-resident `DenseKKTSystem`.

`x` and `y` may be host arrays; both are staged through device buffers
cached in `kkt.etc`, so no device allocation occurs after the first call.
"""
function MadNLP.mul!(y::AbstractVector, kkt::MadNLP.DenseKKTSystem{T, VT, MT}, x::AbstractVector) where {T, VT<:CuVector{T}, MT<:CuMatrix{T}}
    dim = size(kkt.aug_com, 1)
    # Lazily allocate the device-side work buffers on first use.
    for key in (:hess_w1, :hess_w2)
        haskey(kkt.etc, key) || (kkt.etc[key] = CuVector{T}(undef, dim))
    end
    device_x = kkt.etc[:hess_w1]::VT
    device_y = kkt.etc[:hess_w2]::VT
    # Stage the input on the device, multiply there, copy the result back,
    # leaving the caller's arrays untouched apart from the output write.
    copyto!(device_x, x)
    LinearAlgebra.mul!(device_y, kkt.aug_com, device_x)
    copyto!(y, device_y)
end
|
||
"""
    MadNLP.jtprod!(y, kkt::MadNLP.DenseKKTSystem, x)

Compute the Jacobian-transpose product `y = kkt.jac' * x` on the GPU.

`x` and `y` may be host arrays; device work buffers are cached in
`kkt.etc` under `:jac_w1` / `:jac_w2` and reused across calls.
"""
function MadNLP.jtprod!(y::AbstractVector, kkt::MadNLP.DenseKKTSystem{T, VT, MT}, x::AbstractVector) where {T, VT<:CuVector{T}, MT<:CuMatrix{T}}
    # Device buffers sized to the Jacobian, allocated once.
    if !haskey(kkt.etc, :jac_w1)
        kkt.etc[:jac_w1] = CuVector{T}(undef, size(kkt.jac, 1))
    end
    if !haskey(kkt.etc, :jac_w2)
        kkt.etc[:jac_w2] = CuVector{T}(undef, size(kkt.jac, 2))
    end
    device_x = kkt.etc[:jac_w1]::VT
    device_y = kkt.etc[:jac_w2]::VT
    copyto!(device_x, x)
    # kkt.jac' is a lazy adjoint: no transpose copy is materialized.
    LinearAlgebra.mul!(device_y, kkt.jac', device_x)
    copyto!(y, device_y)
end
|
||
"""
    MadNLP.set_aug_diagonal!(kkt::MadNLP.DenseKKTSystem, ips)

Refresh the primal diagonal `kkt.pr_diag` from the interior-point state
and reset the dual regularization `kkt.du_diag` to zero.
"""
function MadNLP.set_aug_diagonal!(kkt::MadNLP.DenseKKTSystem{T, VT, MT}, ips::MadNLP.InteriorPointSolver) where {T, VT<:CuVector{T}, MT<:CuMatrix{T}}
    # Host staging buffer, cached in kkt.etc and reused across calls.
    if !haskey(kkt.etc, :pr_diag_host)
        kkt.etc[:pr_diag_host] = Vector{T}(undef, length(kkt.pr_diag))
    end
    host_buf = kkt.etc[:pr_diag_host]::Vector{T}
    # The IPS vectors live on the host while kkt.pr_diag lives on the GPU,
    # so fuse the barrier-term update on the host and transfer once.
    @. host_buf = ips.zl / (ips.x - ips.xl) + ips.zu / (ips.xu - ips.x)
    copyto!(kkt.pr_diag, host_buf)
    fill!(kkt.du_diag, 0.0)
end
|
||
# Assemble the condensed KKT matrix `dest` entry by entry:
#   rows/cols 1..n          : Hessian block, with pr_diag + diag_hess on the diagonal
#   rows/cols n+1..n+ns     : slack diagonal (pr_diag)
#   rows n+ns+1..n+ns+m     : Jacobian rows, mirrored into the matching columns,
#                             with du_diag on the diagonal
# Launched over a 2-D range; the companion launcher uses
# ndrange = (n+m+ns, n+ns), so i spans all rows and j only variable+slack
# columns. Branches that ignore j (the two diagonal writes below) run once
# per j value, writing the same value repeatedly — redundant but race-free.
@kernel function _build_dense_kkt_system_kernel!(
    dest, hess, jac, pr_diag, du_diag, diag_hess, n, m, ns
)
    i, j = @index(Global, NTuple)
    if (i <= n)
        # Transfer Hessian: diagonal gets the primal regularization added;
        # off-diagonal entries are copied symmetrically (both triangles of
        # `hess` are read, so no symmetry of `hess` is assumed here).
        if (i == j)
            dest[i, i] = pr_diag[i] + diag_hess[i]
        elseif j <= n
            dest[i, j] = hess[i, j]
            dest[j, i] = hess[j, i]
        end
    elseif i <= n + ns
        # Transfer slack diagonal (does not depend on j; see note above).
        dest[i, i] = pr_diag[i]
    elseif i <= n + ns + m
        # Transfer Jacobian row i_ into row i and, transposed, into column i.
        i_ = i - n - ns
        dest[i, j] = jac[i_, j]
        dest[j, i] = jac[i_, j]
        # Transfer dual regularization onto the constraint diagonal.
        dest[i, i] = du_diag[i_]
    end
end
|
||
"""
    MadNLP._build_dense_kkt_system!(dest, hess, jac, pr_diag, du_diag, diag_hess, n, m, ns)

Launch the GPU kernel that assembles the dense KKT matrix `dest` from its
blocks. `n`, `m` and `ns` are the numbers of variables, constraints and
slacks, respectively. Blocks until the kernel has finished.
"""
function MadNLP._build_dense_kkt_system!(
    dest::CuMatrix, hess::CuMatrix, jac::CuMatrix,
    pr_diag::CuVector, du_diag::CuVector, diag_hess::CuVector, n, m, ns
)
    # Rows span variables + slacks + constraints; columns only variables + slacks.
    nrows = n + m + ns
    ncols = n + ns
    kernel! = _build_dense_kkt_system_kernel!(CUDADevice())
    ev = kernel!(dest, hess, jac, pr_diag, du_diag, diag_hess, n, m, ns, ndrange=(nrows, ncols))
    wait(ev)
end
|
||
"""
    MadNLP.compress_jacobian!(kkt::MadNLP.DenseKKTSystem)

Finalize the GPU Jacobian: write `-1` into the slack-block diagonal for the
inequality constraints, then apply the constraint scaling in place.
"""
function MadNLP.compress_jacobian!(kkt::MadNLP.DenseKKTSystem{T, VT, MT}) where {T, VT<:CuVector{T}, MT<:CuMatrix{T}}
    m = size(kkt.jac, 1)
    n = size(kkt.hess, 1)
    # Extract diagonal terms corresponding to inequalities
    # diagind(kkt.jac) yields linear indices (i-1)*m + i, i.e. entries (i, i)
    # in column-major order; adding n*m shifts them n columns right to
    # (i, n + i) — the slack block. Indexing with ind_ineq keeps only the
    # inequality rows.
    index = (LinearAlgebra.diagind(kkt.jac) .+ n * m)[kkt.ind_ineq]
    # Add slack indexes
    kkt.jac[index] .= -one(T)
    # Scale
    # NOTE(review): assumes jacobian_scaling broadcasts over rows of kkt.jac
    # (per-constraint scaling) — confirm its shape against the CPU path.
    kkt.jac .*= kkt.jacobian_scaling
    return
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
|
||
using CUDA | ||
using MadNLPTests | ||
|
||
"""
    _compare_gpu_with_cpu(n, m, ind_fixed)

Solve the same dense dummy QP (`n` variables, `m` constraints, variables
in `ind_fixed` held fixed) once with the default host KKT system and once
with a `DenseKKTSystem` instantiated on the GPU, and test that both runs
match: same iteration count, objective, primal and dual solutions.
"""
function _compare_gpu_with_cpu(n, m, ind_fixed)
    madnlp_options = Dict{Symbol, Any}(
        :kkt_system=>MadNLP.DENSE_KKT_SYSTEM,
        :linear_solver=>MadNLPLapackGPU,
        :print_level=>MadNLP.ERROR,
    )

    nlp = MadNLPTests.DenseDummyQP(; n=n, m=m, fixed_variables=ind_fixed)

    # Reference solve on the host.
    h_ips = MadNLP.InteriorPointSolver(nlp; option_dict=copy(madnlp_options))
    MadNLP.optimize!(h_ips)

    # NOTE(review): the original bound this result to unused locals
    # (ind_cons / ns); the call is kept in case it refreshes cached state
    # as a side effect — confirm it is a pure getter and delete if so.
    MadNLP.get_index_constraints(nlp)

    # Same problem, with the KKT system instantiated on the GPU.
    TKKTGPU = MadNLP.DenseKKTSystem{Float64, CuVector{Float64}, CuMatrix{Float64}}
    opt = MadNLP.Options(; madnlp_options...)
    d_ips = MadNLP.InteriorPointSolver{TKKTGPU}(nlp, opt; option_linear_solver=copy(madnlp_options))
    MadNLP.optimize!(d_ips)

    # Both solvers should follow exactly the same iteration path.
    @test h_ips.cnt.k == d_ips.cnt.k
    @test h_ips.obj_val ≈ d_ips.obj_val atol=1e-10
    @test h_ips.x ≈ d_ips.x atol=1e-10
    @test h_ips.l ≈ d_ips.l atol=1e-10
end
|
||
@testset "MadNLP: dense versus sparse" begin | ||
@testset "Size: ($n, $m)" for (n, m) in [(10, 0), (10, 5), (50, 10)] | ||
_compare_gpu_with_cpu(n, m, Int[]) | ||
end | ||
@testset "Fixed variables" begin | ||
n, m = 10, 5 | ||
_compare_gpu_with_cpu(10, 5, Int[1, 2]) | ||
end | ||
end | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.