From 99dedd29009fef66278a6b09c142ccb188569d80 Mon Sep 17 00:00:00 2001 From: OsKnoth <50015520+OsKnoth@users.noreply.github.com> Date: Sun, 15 Oct 2023 21:59:39 +0200 Subject: [PATCH] MPI GPU --- Examples/testNHSphere.jl | 13 ++++--- Jobs/JobNHBaroWaveDrySphere | 4 +- src/GPU/OperatorKernel.jl | 75 ++++++++++++++++++++++++++++++++++++- src/Parallel/Exchange.jl | 48 +++++++++++++++++------- 4 files changed, 119 insertions(+), 21 deletions(-) diff --git a/Examples/testNHSphere.jl b/Examples/testNHSphere.jl index a0f3a94..dc3cc29 100644 --- a/Examples/testNHSphere.jl +++ b/Examples/testNHSphere.jl @@ -83,9 +83,9 @@ FloatTypeBackend = parsed_args["FloatTypeBackend"] NumberThreadGPU = parsed_args["NumberThreadGPU"] -if Device == "CPU" +if Device == "CPU" || Device == "CPU_P" backend = CPU() -elseif Device == "GPU" +elseif Device == "GPU" || Device == "GPU_P" if GPUType == "CUDA" backend = CUDABackend() CUDA.allowscalar(true) @@ -285,13 +285,16 @@ end if Device == "CPU" || Device == "GPU" Global.ParallelCom.NumberThreadGPU = NumberThreadGPU - @show "FcnGPU" nT = max(7 + NumTr, NumV + NumTr) - @show Global.Output.Flat CGDycore.TimeStepper!(U,CGDycore.FcnGPU!,CGDycore.FcnPrepareGPU!,CGDycore.JacSchurGPU!, CGDycore.TransSphereX,CG,Metric,Phys,Exchange,Global,Param,DiscType) +elseif Device == "CPU_P" || Device == "GPU_P" + Global.ParallelCom.NumberThreadGPU = NumberThreadGPU + nT = max(7 + NumTr, NumV + NumTr) + CGDycore.InitExchangeData3D(backend,FTB,nz,nT,Exchange) + CGDycore.TimeStepper!(U,CGDycore.FcnGPU_P!,CGDycore.FcnPrepareGPU!,CGDycore.JacSchurGPU!, + CGDycore.TransSphereX,CG,Metric,Phys,Exchange,Global,Param,DiscType) else - @show "Fcn" nT = max(7 + NumTr, NumV + NumTr) CGDycore.InitExchangeData3D(backend,FTB,nz,nT,Exchange) CGDycore.TimeStepper!(U,CGDycore.Fcn!,CGDycore.FcnPrepare!,CGDycore.JacSchurGPU!, diff --git a/Jobs/JobNHBaroWaveDrySphere b/Jobs/JobNHBaroWaveDrySphere index 63d0efd..b3309e3 100755 --- a/Jobs/JobNHBaroWaveDrySphere +++ b/Jobs/JobNHBaroWaveDrySphere @@ -1,6 +1,6 @@ -mpirun -n 2 julia --project Examples/testNHSphere.jl \ +mpirun -n 1 julia --project Examples/testNHSphere.jl \ --Problem="BaroWaveDrySphere" \ - --Device="CPU" \ + --Device="GPU" \ --GPUType="Metal" \ --FloatTypeBackend="Float32" \ --NumV=5 \ diff --git a/src/GPU/OperatorKernel.jl b/src/GPU/OperatorKernel.jl index c2a44e1..0068d47 100644 --- a/src/GPU/OperatorKernel.jl +++ b/src/GPU/OperatorKernel.jl @@ -1404,7 +1404,80 @@ function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,DiscType) KDivRhoTrUpwind3Kernel!(F,U,DS,dXdxI,J,M,Glob,ndrange=ndrange) KernelAbstractions.synchronize(backend) - ExchangeData3DSend(U,Exchange) +end + +function FcnGPU_P!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,DiscType) + + backend = get_backend(F) + FT = eltype(F) + @. 
F = 0 + Glob = FE.Glob + DS = FE.DS + DW = FE.DW + M = FE.M + dXdxI = Metric.dXdxI + X = Metric.X + J = Metric.J + DoF = FE.DoF + N = size(FE.DS,1) + Nz = size(F,1) + NF = size(Glob,2) + Koeff = Global.Model.HyperDDiv + Temp1 = Cache.Temp1 + NumberThreadGPU = Global.ParallelCom.NumberThreadGPU + + + KoeffCurl = Global.Model.HyperDCurl + KoeffGrad = Global.Model.HyperDGrad + KoeffDiv = Global.Model.HyperDDiv + + +# State vector + @views Rho = U[:,:,1] + @views u = U[:,:,2] + @views v = U[:,:,3] + @views w = U[:,:,4] + @views RhoTr = U[:,:,5] +# Cache + @views CacheF = Temp1[:,:,1:6] + @views FRho = F[:,:,1] + @views FRhoTr = F[:,:,5] + @views p = Cache.AuxG[:,:,1] +# Ranges + NzG = min(div(NumberThreadGPU,N*N),Nz) + group = (N, N, NzG, 1) + ndrange = (N, N, Nz, NF) + + KRhoGradKinKernel! = RhoGradKinKernel!(backend,group) + KGradKernel! = GradKernel!(backend,group) + KDivRhoGradKernel! = DivRhoGradKernel!(backend, group) + KHyperViscKernel! = HyperViscKernel!(backend, group) + KHyperViscKernelKoeff! = HyperViscKernelKoeff!(backend, group) + KDivRhoTrUpwind3Kernel! = DivRhoTrUpwind3Kernel!(backend, group) + KMomentumCoriolisKernel! = MomentumCoriolisKernel!(backend, group) +# KMomentumKernel! = MomentumKernel!(backend, group) + + @. CacheF = 0 + @views MRho = CacheF[:,:,6] + KHyperViscKernel!(CacheF,MRho,U,DS,DW,dXdxI,J,M,Glob,ndrange=ndrange) + KernelAbstractions.synchronize(backend) + + @. F = 0 + KHyperViscKernelKoeff!(F,U,CacheF,DS,DW,dXdxI,J,M,Glob,KoeffCurl,KoeffGrad,KoeffDiv,ndrange=ndrange) + KernelAbstractions.synchronize(backend) + + KGradKernel!(F,U,p,DS,dXdxI,J,M,MRho,Glob,Phys,ndrange=ndrange) + KernelAbstractions.synchronize(backend) + + KMomentumCoriolisKernel!(F,U,DS,dXdxI,J,X,MRho,M,Glob,Phys,ndrange=ndrange) +# KMomentumKernel!(F,U,DS,dXdxI,MRho,M,Glob,Phys,ndrange=ndrange) + KernelAbstractions.synchronize(backend) + + KDivRhoTrUpwind3Kernel!(F,U,DS,dXdxI,J,M,Glob,ndrange=ndrange) + KernelAbstractions.synchronize(backend) + + ExchangeData3DSendGPU(U,Exchange) + KernelAbstractions.synchronize(backend) ExchangeData3DRecvGPU!(U,Exchange) KernelAbstractions.synchronize(backend) diff --git a/src/Parallel/Exchange.jl b/src/Parallel/Exchange.jl index 2c3861c..f8bbe4b 100644 --- a/src/Parallel/Exchange.jl +++ b/src/Parallel/Exchange.jl @@ -1,8 +1,9 @@ mutable struct ExchangeStruct{FT<:AbstractFloat, + IT1<:AbstractArray, AT3<:AbstractArray} - IndSendBuffer::Dict{Int,Array{Int,1}} + IndSendBuffer::Dict{Int,IT1} IndSendBufferF::Dict{Int,Array{Int,1}} - IndRecvBuffer::Dict{Int,Array{Int,1}} + IndRecvBuffer::Dict{Int,IT1} IndRecvBufferF::Dict{Int,Array{Int,1}} NeiProc::Array{Int, 1} Proc::Int @@ -44,7 +45,9 @@ function ExchangeStruct{FT}(backend) where FT<:AbstractFloat sreq = MPI.UnsafeMultiRequest(0) rreq = MPI.UnsafeMultiRequest(0) AT3 = KernelAbstractions.zeros(backend,FT,0,0,0) + IT1 = KernelAbstractions.zeros(backend,Int,0) return ExchangeStruct{FT, + typeof(IT1), typeof(AT3)}( IndSendBuffer, IndSendBufferF, @@ -351,13 +354,26 @@ function ExchangeStruct{FT}(backend,SubGrid,OrdPoly,CellToProc,Proc,ProcNumber,H # Copy from CPU to device AT3 = KernelAbstractions.zeros(backend,FT,0,0,0) + IT1 = KernelAbstractions.zeros(backend,Int,0) + + SendBuffer = Dict() + for (key,) in SendBufferN + SendBuffer[key] = KernelAbstractions.zeros(backend,Int,size(SendBufferN[key])) + copyto!(SendBuffer[key],SendBufferN[key]) + end + RecvBuffer = Dict() + for (key,) in RecvBufferN + RecvBuffer[key] = KernelAbstractions.zeros(backend,Int,size(RecvBufferN[key])) + 
copyto!(RecvBuffer[key],RecvBufferN[key]) + end return ExchangeStruct{FT, + typeof(IT1), typeof(AT3)}( - SendBufferN, + SendBuffer, IndSendBufferF, - RecvBufferN, + RecvBuffer, IndRecvBufferF, NeiProcN, Proc, @@ -683,7 +699,7 @@ function InitExchangeData3D(backend,FT,nz,nT,Exchange) NeiProc = Exchange.NeiProc @inbounds for iP in NeiProc Exchange.RecvBuffer3[iP] = KernelAbstractions.zeros(backend,FT,nz,length(IndRecvBuffer[iP]),nT) - Exchange.SendBuffer3[iP] = KernelAbstractions.zeros(backend,FT,nz,length(IndRecvBuffer[iP]),nT) + Exchange.SendBuffer3[iP] = KernelAbstractions.zeros(backend,FT,nz,length(IndSendBuffer[iP]),nT) end end @@ -758,13 +774,15 @@ function ExchangeData3DSend(U,Exchange) end end function ExchangeData3DSendGPU(U,Exchange) + backend = get_backend(U) + FT = eltype(U) IndSendBuffer = Exchange.IndSendBuffer IndRecvBuffer = Exchange.IndRecvBuffer NeiProc = Exchange.NeiProc Proc = Exchange.Proc ProcNumber = Exchange.ProcNumber - nz = size(U,1) + Nz = size(U,1) nT = size(U,3) RecvBuffer3 = Exchange.RecvBuffer3 SendBuffer3 = Exchange.SendBuffer3 @@ -772,23 +790,23 @@ function ExchangeData3DSendGPU(U,Exchange) sreq = Exchange.sreq group = (Nz,5,1) - KExchangeData3DSendKernel! = ExchangeData3DSendKernel!(group) + KExchangeData3DSendKernel! = ExchangeData3DSendKernel!(backend,group) @inbounds for iP in NeiProc ndrange = (Nz,length(IndSendBuffer[iP]),nT) - KExchangeData3DSendKernel!(U,SendBuffer3[iP],IndSendBuffer[iP],ndrange) + KExchangeData3DSendKernel!(U,SendBuffer3[iP],IndSendBuffer[iP],ndrange=ndrange) end i = 0 @inbounds for iP in NeiProc tag = Proc + ProcNumber*iP i += 1 - @views MPI.Irecv!(RecvBuffer3[iP][1:nz,:,1:nT], iP - 1, tag, MPI.COMM_WORLD, rreq[i]) + @views MPI.Irecv!(RecvBuffer3[iP][1:Nz,:,1:nT], iP - 1, tag, MPI.COMM_WORLD, rreq[i]) end i = 0 @inbounds for iP in NeiProc tag = iP + ProcNumber*Proc i += 1 - @views MPI.Isend(SendBuffer3[iP][1:nz,:,1:nT], iP - 1, tag, MPI.COMM_WORLD, sreq[i]) + @views MPI.Isend(SendBuffer3[iP][1:Nz,:,1:nT], iP - 1, tag, MPI.COMM_WORLD, sreq[i]) end end @@ -796,7 +814,8 @@ end Iz,I,IT = @index(Global, NTuple) NumInd = @uniform @ndrange()[2] - NT = @uniform @ndrange()[2] + NT = @uniform @ndrange()[3] + if I <= NumInd && IT <= NT @inbounds Ind = IndSendBuffer[I] @inbounds SendBuffer[Iz,I,IT] = U[Iz,Ind,IT] @@ -851,11 +870,14 @@ function ExchangeData3DRecv!(U,Exchange) end function ExchangeData3DRecvGPU!(U,Exchange) + backend = get_backend(U) + FT = eltype(U) Nz = size(U,1) nT = size(U,3) IndRecvBuffer = Exchange.IndRecvBuffer NeiProc = Exchange.NeiProc + Proc = Exchange.Proc RecvBuffer3 = Exchange.RecvBuffer3 rreq = Exchange.rreq sreq = Exchange.sreq @@ -865,12 +887,12 @@ function ExchangeData3DRecvGPU!(U,Exchange) MPI.Barrier(MPI.COMM_WORLD) group = (Nz,5,1) - KExchangeData3DRecvKernel! = ExchangeData3DRecvKernel!(group) + KExchangeData3DRecvKernel! = ExchangeData3DRecvKernel!(backend,group) #Receive @inbounds for iP in NeiProc ndrange = (Nz,length(IndRecvBuffer[iP]),nT) - KExchangeData3DRecvKernel!(U,RecvBuffer3[iP],IndRecvBuffer[iP],ndrange) + KExchangeData3DRecvKernel!(U,RecvBuffer3[iP],IndRecvBuffer[iP],ndrange=ndrange) end end
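
Usage note (not part of the diff above): the new CPU_P/GPU_P branch in
Examples/testNHSphere.jl drives the added exchange pair through device-side
buffers. A minimal sketch of the intended call sequence, assuming `backend`,
`FTB`, `nz`, `nT`, an `Exchange` built by the backend-aware ExchangeStruct
constructor, and a state array `U` of size (nz, nPoints, nT), where `nPoints`
is a placeholder name for the horizontal point dimension:

    # allocate per-neighbor device send/receive buffers (nz levels, nT fields)
    CGDycore.InitExchangeData3D(backend, FTB, nz, nT, Exchange)

    # pack halo points of U into SendBuffer3 on the device, then post
    # MPI.Irecv!/MPI.Isend on the per-neighbor buffers
    CGDycore.ExchangeData3DSendGPU(U, Exchange)
    KernelAbstractions.synchronize(backend)

    # after the MPI side has completed, unpack RecvBuffer3 back into U on the device
    CGDycore.ExchangeData3DRecvGPU!(U, Exchange)
    KernelAbstractions.synchronize(backend)

FcnGPU_P! issues the same send/receive pair after its kernel launches, while
InitExchangeData3D is called once in Examples/testNHSphere.jl before
TimeStepper!; the index sets IndSendBuffer/IndRecvBuffer used for packing and
unpacking are the backend copies created in the ExchangeStruct constructor.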