-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Allow to instantiate
DenseKKTSystem
on CUDA GPU (#73)
* support DenseKKTSystem on GPUs * add dedicated GPU kernels for KKT operations * add proper tests for DenseKKTSystem on GPU * move build_qp_dense function in MadNLPTests * rescope PR#73 * add proper API to instantiate DenseKKTSystem on GPU * fix tests on GPU * add GPU implementation for compress_jacobian!
- Loading branch information
Showing
11 changed files
with
410 additions
and
181 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
#= | ||
MadNLP utils | ||
=# | ||
|
||
@kernel function _copy_diag!(dest, src)
    # One work-item per diagonal entry.
    i = @index(Global)
    dest[i] = src[i, i]
end

"""
    MadNLP.diag!(dest::CuVector{T}, src::CuMatrix{T}) where T

Extract the diagonal of the GPU matrix `src` into the GPU vector `dest`.

Throws `DimensionMismatch` if `length(dest)` differs from `size(src, 1)`.
"""
function MadNLP.diag!(dest::CuVector{T}, src::CuMatrix{T}) where T
    # Validate with a real exception: `@assert` is for internal invariants
    # and may be disabled at higher optimization levels.
    length(dest) == size(src, 1) || throw(DimensionMismatch(
        "dest has length $(length(dest)) but src has $(size(src, 1)) rows"
    ))
    ev = _copy_diag!(CUDADevice())(dest, src, ndrange=length(dest))
    # Kernel launch is asynchronous; block until the copy has completed.
    wait(ev)
end
|
||
@kernel function _add_diagonal!(dest, src1, src2)
    # One work-item per diagonal entry: dest[i, i] = src1[i] + src2[i].
    i = @index(Global)
    dest[i, i] = src1[i] + src2[i]
end

"""
    MadNLP.diag_add!(dest::CuMatrix, src1::CuVector, src2::CuVector)

Overwrite the diagonal of `dest` with `src1 .+ src2` on the GPU.

Throws `DimensionMismatch` if either vector does not match `size(dest, 1)`.
"""
function MadNLP.diag_add!(dest::CuMatrix, src1::CuVector, src2::CuVector)
    # Validate sizes up front (consistent with MadNLP.diag!); an undersized
    # input would otherwise cause an out-of-bounds access inside the kernel.
    n = size(dest, 1)
    (length(src1) == n && length(src2) == n) || throw(DimensionMismatch(
        "dest has $n diagonal entries, got vectors of length $(length(src1)) and $(length(src2))"
    ))
    ev = _add_diagonal!(CUDADevice())(dest, src1, src2, ndrange=n)
    wait(ev)
end
|
||
#= | ||
MadNLP kernels | ||
=# | ||
|
||
# Overload is_valid to avoid fallback to default is_valid, slow on GPU
# (presumably the generic fallback iterates entries element-by-element on
# the host — verify against MadNLP's definition).
# NOTE(review): this unconditionally reports device arrays as valid, i.e.
# NaN/Inf checking is skipped entirely for CuArray — confirm this is intended.
MadNLP.is_valid(src::CuArray) = true
|
||
# Constraint scaling
# Fill `con_scale` with per-constraint scaling factors derived from the
# row-wise maximum absolute value of the GPU Jacobian `jac`.
function MadNLP.set_con_scale!(con_scale::AbstractVector, jac::CuMatrix, nlp_scaling_max_gradient)
    # Row-wise max |J[i, :]|, reduced on the device with CUDA.jl's builtin.
    row_max = maximum(abs, jac, dims=2)
    # Bring the (m × 1) device result back into the host buffer.
    copyto!(con_scale, row_max)
    # Cap each factor at 1 so scaled gradients stay below
    # nlp_scaling_max_gradient.
    map!(s -> min(1.0, nlp_scaling_max_gradient / s), con_scale, con_scale)
end
|
||
# Fixed the kernel name typo ("kernell" -> "kernel"); the kernel is private
# to this file and only launched from treat_fixed_variable! below.
@kernel function _treat_fixed_variable_kernel!(dest, ind_fixed)
    # 2-D range: k indexes the fixed variables, j sweeps the full dimension.
    k, j = @index(Global, NTuple)
    i = ind_fixed[k]

    if i == j
        dest[i, i] = 1.0
    else
        # Zero out the whole row and column of the fixed variable.
        dest[i, j] = 0.0
        dest[j, i] = 0.0
    end
end

"""
    MadNLP.treat_fixed_variable!(kkt)

Eliminate fixed variables from the condensed KKT matrix `kkt.aug_com` by
zeroing their rows and columns and placing 1 on the corresponding diagonal.
No-op when there are no fixed variables.
"""
function MadNLP.treat_fixed_variable!(kkt::MadNLP.AbstractKKTSystem{T, MT}) where {T, MT<:CuMatrix{T}}
    length(kkt.ind_fixed) == 0 && return
    aug = kkt.aug_com
    d_ind_fixed = kkt.ind_fixed |> CuVector # TODO: allocate ind_fixed directly on the GPU
    ndrange = (length(d_ind_fixed), size(aug, 1))
    ev = _treat_fixed_variable_kernel!(CUDADevice())(aug, d_ind_fixed, ndrange=ndrange)
    wait(ev)
end
|
||
#= | ||
DenseKKTSystem kernels | ||
=# | ||
"""
    MadNLP.mul!(y, kkt::MadNLP.DenseKKTSystem, x)

Compute `y = kkt.aug_com * x` for a GPU-resident `DenseKKTSystem`.

`x` and `y` may be host arrays; both are staged through device buffers
cached in `kkt.etc`, so no device allocation occurs after the first call.
"""
function MadNLP.mul!(y::AbstractVector, kkt::MadNLP.DenseKKTSystem{T, VT, MT}, x::AbstractVector) where {T, VT<:CuVector{T}, MT<:CuMatrix{T}}
    dim = size(kkt.aug_com, 1)
    # Lazily allocate the device-side work buffers on first use.
    for key in (:hess_w1, :hess_w2)
        haskey(kkt.etc, key) || (kkt.etc[key] = CuVector{T}(undef, dim))
    end
    device_x = kkt.etc[:hess_w1]::VT
    device_y = kkt.etc[:hess_w2]::VT
    # Stage the input on the device, multiply there, copy the result back,
    # leaving the caller's arrays untouched apart from the output write.
    copyto!(device_x, x)
    LinearAlgebra.mul!(device_y, kkt.aug_com, device_x)
    copyto!(y, device_y)
end
|
||
"""
    MadNLP.jtprod!(y, kkt::MadNLP.DenseKKTSystem, x)

Compute the Jacobian-transpose product `y = kkt.jac' * x` on the GPU.

`x` and `y` may be host arrays; device work buffers are cached in
`kkt.etc` under `:jac_w1` / `:jac_w2` and reused across calls.
"""
function MadNLP.jtprod!(y::AbstractVector, kkt::MadNLP.DenseKKTSystem{T, VT, MT}, x::AbstractVector) where {T, VT<:CuVector{T}, MT<:CuMatrix{T}}
    # Device buffers sized to the Jacobian, allocated once.
    if !haskey(kkt.etc, :jac_w1)
        kkt.etc[:jac_w1] = CuVector{T}(undef, size(kkt.jac, 1))
    end
    if !haskey(kkt.etc, :jac_w2)
        kkt.etc[:jac_w2] = CuVector{T}(undef, size(kkt.jac, 2))
    end
    device_x = kkt.etc[:jac_w1]::VT
    device_y = kkt.etc[:jac_w2]::VT
    copyto!(device_x, x)
    # kkt.jac' is a lazy adjoint: no transpose copy is materialized.
    LinearAlgebra.mul!(device_y, kkt.jac', device_x)
    copyto!(y, device_y)
end
|
||
"""
    MadNLP.set_aug_diagonal!(kkt::MadNLP.DenseKKTSystem, ips)

Refresh the primal diagonal `kkt.pr_diag` from the interior-point state
and reset the dual regularization `kkt.du_diag` to zero.
"""
function MadNLP.set_aug_diagonal!(kkt::MadNLP.DenseKKTSystem{T, VT, MT}, ips::MadNLP.InteriorPointSolver) where {T, VT<:CuVector{T}, MT<:CuMatrix{T}}
    # Host staging buffer, cached in kkt.etc and reused across calls.
    if !haskey(kkt.etc, :pr_diag_host)
        kkt.etc[:pr_diag_host] = Vector{T}(undef, length(kkt.pr_diag))
    end
    host_buf = kkt.etc[:pr_diag_host]::Vector{T}
    # The IPS vectors live on the host while kkt.pr_diag lives on the GPU,
    # so fuse the barrier-term update on the host and transfer once.
    @. host_buf = ips.zl / (ips.x - ips.xl) + ips.zu / (ips.xu - ips.x)
    copyto!(kkt.pr_diag, host_buf)
    fill!(kkt.du_diag, 0.0)
end
|
||
# Assemble the condensed KKT matrix `dest` entry by entry:
#   rows/cols 1..n          : Hessian block, with pr_diag + diag_hess on the diagonal
#   rows/cols n+1..n+ns     : slack diagonal (pr_diag)
#   rows n+ns+1..n+ns+m     : Jacobian rows, mirrored into the matching columns,
#                             with du_diag on the diagonal
# Launched over a 2-D range; the companion launcher uses
# ndrange = (n+m+ns, n+ns), so i spans all rows and j only variable+slack
# columns. Branches that ignore j (the two diagonal writes below) run once
# per j value, writing the same value repeatedly — redundant but race-free.
@kernel function _build_dense_kkt_system_kernel!(
    dest, hess, jac, pr_diag, du_diag, diag_hess, n, m, ns
)
    i, j = @index(Global, NTuple)
    if (i <= n)
        # Transfer Hessian: diagonal gets the primal regularization added;
        # off-diagonal entries are copied symmetrically (both triangles of
        # `hess` are read, so no symmetry of `hess` is assumed here).
        if (i == j)
            dest[i, i] = pr_diag[i] + diag_hess[i]
        elseif j <= n
            dest[i, j] = hess[i, j]
            dest[j, i] = hess[j, i]
        end
    elseif i <= n + ns
        # Transfer slack diagonal (does not depend on j; see note above).
        dest[i, i] = pr_diag[i]
    elseif i <= n + ns + m
        # Transfer Jacobian row i_ into row i and, transposed, into column i.
        i_ = i - n - ns
        dest[i, j] = jac[i_, j]
        dest[j, i] = jac[i_, j]
        # Transfer dual regularization onto the constraint diagonal.
        dest[i, i] = du_diag[i_]
    end
end
|
||
"""
    MadNLP._build_dense_kkt_system!(dest, hess, jac, pr_diag, du_diag, diag_hess, n, m, ns)

Launch the GPU kernel that assembles the dense KKT matrix `dest` from its
blocks. `n`, `m` and `ns` are the numbers of variables, constraints and
slacks, respectively. Blocks until the kernel has finished.
"""
function MadNLP._build_dense_kkt_system!(
    dest::CuMatrix, hess::CuMatrix, jac::CuMatrix,
    pr_diag::CuVector, du_diag::CuVector, diag_hess::CuVector, n, m, ns
)
    # Rows span variables + slacks + constraints; columns only variables + slacks.
    nrows = n + m + ns
    ncols = n + ns
    kernel! = _build_dense_kkt_system_kernel!(CUDADevice())
    ev = kernel!(dest, hess, jac, pr_diag, du_diag, diag_hess, n, m, ns, ndrange=(nrows, ncols))
    wait(ev)
end
|
||
"""
    MadNLP.compress_jacobian!(kkt::MadNLP.DenseKKTSystem)

Finalize the GPU Jacobian: write `-1` into the slack-block diagonal for the
inequality constraints, then apply the constraint scaling in place.
"""
function MadNLP.compress_jacobian!(kkt::MadNLP.DenseKKTSystem{T, VT, MT}) where {T, VT<:CuVector{T}, MT<:CuMatrix{T}}
    m = size(kkt.jac, 1)
    n = size(kkt.hess, 1)
    # Extract diagonal terms corresponding to inequalities
    # diagind(kkt.jac) yields linear indices (i-1)*m + i, i.e. entries (i, i)
    # in column-major order; adding n*m shifts them n columns right to
    # (i, n + i) — the slack block. Indexing with ind_ineq keeps only the
    # inequality rows.
    index = (LinearAlgebra.diagind(kkt.jac) .+ n * m)[kkt.ind_ineq]
    # Add slack indexes
    kkt.jac[index] .= -one(T)
    # Scale
    # NOTE(review): assumes jacobian_scaling broadcasts over rows of kkt.jac
    # (per-constraint scaling) — confirm its shape against the CPU path.
    kkt.jac .*= kkt.jacobian_scaling
    return
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
|
||
using CUDA | ||
using MadNLPTests | ||
|
||
"""
    _compare_gpu_with_cpu(n, m, ind_fixed)

Solve the same dense dummy QP (`n` variables, `m` constraints, variables
in `ind_fixed` held fixed) once with the default host KKT system and once
with a `DenseKKTSystem` instantiated on the GPU, and test that both runs
match: same iteration count, objective, primal and dual solutions.
"""
function _compare_gpu_with_cpu(n, m, ind_fixed)
    madnlp_options = Dict{Symbol, Any}(
        :kkt_system=>MadNLP.DENSE_KKT_SYSTEM,
        :linear_solver=>MadNLPLapackGPU,
        :print_level=>MadNLP.ERROR,
    )

    nlp = MadNLPTests.DenseDummyQP(; n=n, m=m, fixed_variables=ind_fixed)

    # Reference solve on the host.
    h_ips = MadNLP.InteriorPointSolver(nlp; option_dict=copy(madnlp_options))
    MadNLP.optimize!(h_ips)

    # NOTE(review): the original bound this result to unused locals
    # (ind_cons / ns); the call is kept in case it refreshes cached state
    # as a side effect — confirm it is a pure getter and delete if so.
    MadNLP.get_index_constraints(nlp)

    # Same problem, with the KKT system instantiated on the GPU.
    TKKTGPU = MadNLP.DenseKKTSystem{Float64, CuVector{Float64}, CuMatrix{Float64}}
    opt = MadNLP.Options(; madnlp_options...)
    d_ips = MadNLP.InteriorPointSolver{TKKTGPU}(nlp, opt; option_linear_solver=copy(madnlp_options))
    MadNLP.optimize!(d_ips)

    # Both solvers should follow exactly the same iteration path.
    @test h_ips.cnt.k == d_ips.cnt.k
    @test h_ips.obj_val ≈ d_ips.obj_val atol=1e-10
    @test h_ips.x ≈ d_ips.x atol=1e-10
    @test h_ips.l ≈ d_ips.l atol=1e-10
end
|
||
@testset "MadNLP: dense versus sparse" begin | ||
@testset "Size: ($n, $m)" for (n, m) in [(10, 0), (10, 5), (50, 10)] | ||
_compare_gpu_with_cpu(n, m, Int[]) | ||
end | ||
@testset "Fixed variables" begin | ||
n, m = 10, 5 | ||
_compare_gpu_with_cpu(10, 5, Int[1, 2]) | ||
end | ||
end | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.