Commit 4cd2603

Merge branch 'main' of https://github.com/CliMA/CGDycore.jl

OsKnoth committed Oct 17, 2023
2 parents 84c10f3 + 99dedd2
Showing 4 changed files with 119 additions and 21 deletions.
13 changes: 8 additions & 5 deletions Examples/testNHSphere.jl
@@ -83,9 +83,9 @@ FloatTypeBackend = parsed_args["FloatTypeBackend"]
 NumberThreadGPU = parsed_args["NumberThreadGPU"]
 
 
-if Device == "CPU"
+if Device == "CPU" || Device == "CPU_P"
   backend = CPU()
-elseif Device == "GPU"
+elseif Device == "GPU" || Device == "GPU_P"
   if GPUType == "CUDA"
     backend = CUDABackend()
     CUDA.allowscalar(true)
@@ -285,13 +285,16 @@ end
 if Device == "CPU" || Device == "GPU"
   Global.ParallelCom.NumberThreadGPU = NumberThreadGPU
-  @show "FcnGPU"
   nT = max(7 + NumTr, NumV + NumTr)
-  @show Global.Output.Flat
   CGDycore.TimeStepper!(U,CGDycore.FcnGPU!,CGDycore.FcnPrepareGPU!,CGDycore.JacSchurGPU!,
     CGDycore.TransSphereX,CG,Metric,Phys,Exchange,Global,Param,DiscType)
+elseif Device == "CPU_P" || Device == "GPU_P"
+  Global.ParallelCom.NumberThreadGPU = NumberThreadGPU
+  nT = max(7 + NumTr, NumV + NumTr)
+  CGDycore.InitExchangeData3D(backend,FTB,nz,nT,Exchange)
+  CGDycore.TimeStepper!(U,CGDycore.FcnGPU_P!,CGDycore.FcnPrepareGPU!,CGDycore.JacSchurGPU!,
+    CGDycore.TransSphereX,CG,Metric,Phys,Exchange,Global,Param,DiscType)
 else
-  @show "Fcn"
   nT = max(7 + NumTr, NumV + NumTr)
   CGDycore.InitExchangeData3D(backend,FTB,nz,nT,Exchange)
   CGDycore.TimeStepper!(U,CGDycore.Fcn!,CGDycore.FcnPrepare!,CGDycore.JacSchurGPU!,
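For orientation: the new `CPU_P`/`GPU_P` device strings route the run through the MPI-parallel right-hand side `FcnGPU_P!` (added in `src/GPU/OperatorKernel.jl` below) and pre-size the halo-exchange buffers via `InitExchangeData3D`. A minimal sketch of the buffer-depth formula, with a hypothetical tracer count:

```julia
# Sketch of the nT formula from the hunk above; NumTr = 2 is hypothetical.
# The exchange buffer must be deep enough for whichever field set is larger:
# apparently a 7-slot cache plus tracers, or the NumV-component state plus
# tracers.
NumV  = 5                            # matches --NumV=5 in the job file below
NumTr = 2                            # hypothetical tracer count
nT = max(7 + NumTr, NumV + NumTr)    # -> 9 field slices per exchange buffer
```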
4 changes: 2 additions & 2 deletions Jobs/JobNHBaroWaveDrySphere
@@ -1,6 +1,6 @@
-mpirun -n 2 julia --project Examples/testNHSphere.jl \
+mpirun -n 1 julia --project Examples/testNHSphere.jl \
   --Problem="BaroWaveDrySphere" \
-  --Device="CPU" \
+  --Device="GPU" \
   --GPUType="Metal" \
   --FloatTypeBackend="Float32" \
   --NumV=5 \
75 changes: 74 additions & 1 deletion src/GPU/OperatorKernel.jl
@@ -1404,7 +1404,80 @@ function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,DiscType)
   KDivRhoTrUpwind3Kernel!(F,U,DS,dXdxI,J,M,Glob,ndrange=ndrange)
   KernelAbstractions.synchronize(backend)
 
-  ExchangeData3DSend(U,Exchange)
+end
+
+function FcnGPU_P!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,DiscType)
+
+  backend = get_backend(F)
+  FT = eltype(F)
+  @. F = 0
+  Glob = FE.Glob
+  DS = FE.DS
+  DW = FE.DW
+  M = FE.M
+  dXdxI = Metric.dXdxI
+  X = Metric.X
+  J = Metric.J
+  DoF = FE.DoF
+  N = size(FE.DS,1)
+  Nz = size(F,1)
+  NF = size(Glob,2)
+  Koeff = Global.Model.HyperDDiv
+  Temp1 = Cache.Temp1
+  NumberThreadGPU = Global.ParallelCom.NumberThreadGPU
+
+
+  KoeffCurl = Global.Model.HyperDCurl
+  KoeffGrad = Global.Model.HyperDGrad
+  KoeffDiv = Global.Model.HyperDDiv
+
+
+  # State vector
+  @views Rho = U[:,:,1]
+  @views u = U[:,:,2]
+  @views v = U[:,:,3]
+  @views w = U[:,:,4]
+  @views RhoTr = U[:,:,5]
+  # Cache
+  @views CacheF = Temp1[:,:,1:6]
+  @views FRho = F[:,:,1]
+  @views FRhoTr = F[:,:,5]
+  @views p = Cache.AuxG[:,:,1]
+  # Ranges
+  NzG = min(div(NumberThreadGPU,N*N),Nz)
+  group = (N, N, NzG, 1)
+  ndrange = (N, N, Nz, NF)
+
+  KRhoGradKinKernel! = RhoGradKinKernel!(backend,group)
+  KGradKernel! = GradKernel!(backend,group)
+  KDivRhoGradKernel! = DivRhoGradKernel!(backend, group)
+  KHyperViscKernel! = HyperViscKernel!(backend, group)
+  KHyperViscKernelKoeff! = HyperViscKernelKoeff!(backend, group)
+  KDivRhoTrUpwind3Kernel! = DivRhoTrUpwind3Kernel!(backend, group)
+  KMomentumCoriolisKernel! = MomentumCoriolisKernel!(backend, group)
+  # KMomentumKernel! = MomentumKernel!(backend, group)
+
+  @. CacheF = 0
+  @views MRho = CacheF[:,:,6]
+  KHyperViscKernel!(CacheF,MRho,U,DS,DW,dXdxI,J,M,Glob,ndrange=ndrange)
+  KernelAbstractions.synchronize(backend)
+
+  @. F = 0
+  KHyperViscKernelKoeff!(F,U,CacheF,DS,DW,dXdxI,J,M,Glob,KoeffCurl,KoeffGrad,KoeffDiv,ndrange=ndrange)
+  KernelAbstractions.synchronize(backend)
+
+  KGradKernel!(F,U,p,DS,dXdxI,J,M,MRho,Glob,Phys,ndrange=ndrange)
+  KernelAbstractions.synchronize(backend)
+
+  KMomentumCoriolisKernel!(F,U,DS,dXdxI,J,X,MRho,M,Glob,Phys,ndrange=ndrange)
+  # KMomentumKernel!(F,U,DS,dXdxI,MRho,M,Glob,Phys,ndrange=ndrange)
+  KernelAbstractions.synchronize(backend)
+
+  KDivRhoTrUpwind3Kernel!(F,U,DS,dXdxI,J,M,Glob,ndrange=ndrange)
+  KernelAbstractions.synchronize(backend)
+
+  ExchangeData3DSendGPU(U,Exchange)
+  KernelAbstractions.synchronize(backend)
 
   ExchangeData3DRecvGPU!(U,Exchange)
   KernelAbstractions.synchronize(backend)
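As background for readers new to KernelAbstractions: every kernel above is used in two steps. First the `@kernel` function is instantiated with a backend and workgroup size, then it is launched with an `ndrange` and synchronized before results are consumed. A self-contained sketch with a toy kernel (nothing repo-specific):

```julia
using KernelAbstractions

# Toy kernel: scale every entry of A by c.  The kernels in OperatorKernel.jl
# follow the same structure, with physics payloads instead of a scalar.
@kernel function ScaleKernel!(A, c)
  I = @index(Global, Linear)
  @inbounds A[I] *= c
end

backend = CPU()                          # CUDABackend()/MetalBackend() on GPU
A = KernelAbstractions.ones(backend, Float32, 16)
KScale! = ScaleKernel!(backend, 8)       # bind backend + workgroup size once
KScale!(A, 2.0f0, ndrange = length(A))   # launches asynchronously
KernelAbstractions.synchronize(backend)  # wait before reading A
```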
48 changes: 35 additions & 13 deletions src/Parallel/Exchange.jl
@@ -1,8 +1,9 @@
 mutable struct ExchangeStruct{FT<:AbstractFloat,
+                              IT1<:AbstractArray,
                               AT3<:AbstractArray}
-  IndSendBuffer::Dict{Int,Array{Int,1}}
+  IndSendBuffer::Dict{Int,IT1}
   IndSendBufferF::Dict{Int,Array{Int,1}}
-  IndRecvBuffer::Dict{Int,Array{Int,1}}
+  IndRecvBuffer::Dict{Int,IT1}
   IndRecvBufferF::Dict{Int,Array{Int,1}}
   NeiProc::Array{Int, 1}
   Proc::Int
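A note on the struct change: parameterizing on `IT1` lets the same `ExchangeStruct` hold plain `Vector{Int}` index buffers on the CPU and device arrays on the GPU; the zero-length `KernelAbstractions.zeros(backend,Int,0)` seen in the constructors below exists only to capture that concrete type. A minimal sketch of the idiom, using a hypothetical container:

```julia
using KernelAbstractions

# Hypothetical container, not from the repo: store per-neighbor index lists
# as whatever array type the backend allocates.
struct HaloIndex{IT1<:AbstractArray}
  ind::Dict{Int,IT1}
end

backend = CPU()
proto = KernelAbstractions.zeros(backend, Int, 0)   # zero-length type carrier
ind = Dict{Int,typeof(proto)}()
ind[2] = copyto!(KernelAbstractions.zeros(backend, Int, 3), [4, 5, 6])
halo = HaloIndex{typeof(proto)}(ind)    # concrete type, GPU-ready
```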
@@ -44,7 +45,9 @@ function ExchangeStruct{FT}(backend) where FT<:AbstractFloat
   sreq = MPI.UnsafeMultiRequest(0)
   rreq = MPI.UnsafeMultiRequest(0)
   AT3 = KernelAbstractions.zeros(backend,FT,0,0,0)
+  IT1 = KernelAbstractions.zeros(backend,Int,0)
   return ExchangeStruct{FT,
+    typeof(IT1),
     typeof(AT3)}(
     IndSendBuffer,
     IndSendBufferF,
@@ -351,13 +354,26 @@ function ExchangeStruct{FT}(backend,SubGrid,OrdPoly,CellToProc,Proc,ProcNumber,H
 
   # Copy from CPU to device
   AT3 = KernelAbstractions.zeros(backend,FT,0,0,0)
+  IT1 = KernelAbstractions.zeros(backend,Int,0)
+
+  SendBuffer = Dict()
+  for (key,) in SendBufferN
+    SendBuffer[key] = KernelAbstractions.zeros(backend,Int,size(SendBufferN[key]))
+    copyto!(SendBuffer[key],SendBufferN[key])
+  end
+  RecvBuffer = Dict()
+  for (key,) in RecvBufferN
+    RecvBuffer[key] = KernelAbstractions.zeros(backend,Int,size(RecvBufferN[key]))
+    copyto!(RecvBuffer[key],RecvBufferN[key])
+  end
 
 
   return ExchangeStruct{FT,
+    typeof(IT1),
     typeof(AT3)}(
-    SendBufferN,
+    SendBuffer,
     IndSendBufferF,
-    RecvBufferN,
+    RecvBuffer,
     IndRecvBufferF,
     NeiProcN,
     Proc,
@@ -683,7 +699,7 @@ function InitExchangeData3D(backend,FT,nz,nT,Exchange)
   NeiProc = Exchange.NeiProc
   @inbounds for iP in NeiProc
     Exchange.RecvBuffer3[iP] = KernelAbstractions.zeros(backend,FT,nz,length(IndRecvBuffer[iP]),nT)
-    Exchange.SendBuffer3[iP] = KernelAbstractions.zeros(backend,FT,nz,length(IndRecvBuffer[iP]),nT)
+    Exchange.SendBuffer3[iP] = KernelAbstractions.zeros(backend,FT,nz,length(IndSendBuffer[iP]),nT)
   end
 end
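The one-word fix above matters whenever a rank sends a different number of halo points to a neighbor than it receives: sizing `SendBuffer3` from `IndRecvBuffer` then disagrees with the `ndrange` used in `ExchangeData3DSendGPU` below and can overrun the buffer. A minimal sketch of the corrected sizing, with hypothetical index lists:

```julia
using KernelAbstractions

backend = CPU(); FT = Float64
nz, nT = 4, 9                      # vertical levels x exchanged field slices
IndSend = Dict(2 => [1, 2, 3])     # hypothetical: 3 points go to rank 2
IndRecv = Dict(2 => [7, 8])        # hypothetical: 2 points come back

# Each buffer is sized from its own index list, as in the fixed line above.
SendBuffer3 = Dict(iP => KernelAbstractions.zeros(backend, FT, nz, length(ind), nT)
                   for (iP, ind) in IndSend)
RecvBuffer3 = Dict(iP => KernelAbstractions.zeros(backend, FT, nz, length(ind), nT)
                   for (iP, ind) in IndRecv)
@assert size(SendBuffer3[2], 2) == 3 && size(RecvBuffer3[2], 2) == 2
```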

@@ -758,45 +774,48 @@ function ExchangeData3DSend(U,Exchange)
   end
 end
 function ExchangeData3DSendGPU(U,Exchange)
+  backend = get_backend(U)
+  FT = eltype(U)
+
   IndSendBuffer = Exchange.IndSendBuffer
   IndRecvBuffer = Exchange.IndRecvBuffer
   NeiProc = Exchange.NeiProc
   Proc = Exchange.Proc
   ProcNumber = Exchange.ProcNumber
-  nz = size(U,1)
+  Nz = size(U,1)
   nT = size(U,3)
   RecvBuffer3 = Exchange.RecvBuffer3
   SendBuffer3 = Exchange.SendBuffer3
   rreq = Exchange.rreq
   sreq = Exchange.sreq
 
   group = (Nz,5,1)
-  KExchangeData3DSendKernel! = ExchangeData3DSendKernel!(group)
+  KExchangeData3DSendKernel! = ExchangeData3DSendKernel!(backend,group)
   @inbounds for iP in NeiProc
     ndrange = (Nz,length(IndSendBuffer[iP]),nT)
-    KExchangeData3DSendKernel!(U,SendBuffer3[iP],IndSendBuffer[iP],ndrange)
+    KExchangeData3DSendKernel!(U,SendBuffer3[iP],IndSendBuffer[iP],ndrange=ndrange)
   end
 
   i = 0
   @inbounds for iP in NeiProc
     tag = Proc + ProcNumber*iP
     i += 1
-    @views MPI.Irecv!(RecvBuffer3[iP][1:nz,:,1:nT], iP - 1, tag, MPI.COMM_WORLD, rreq[i])
+    @views MPI.Irecv!(RecvBuffer3[iP][1:Nz,:,1:nT], iP - 1, tag, MPI.COMM_WORLD, rreq[i])
   end
   i = 0
   @inbounds for iP in NeiProc
     tag = iP + ProcNumber*Proc
     i += 1
-    @views MPI.Isend(SendBuffer3[iP][1:nz,:,1:nT], iP - 1, tag, MPI.COMM_WORLD, sreq[i])
+    @views MPI.Isend(SendBuffer3[iP][1:Nz,:,1:nT], iP - 1, tag, MPI.COMM_WORLD, sreq[i])
   end
 end
 
 @kernel function ExchangeData3DSendKernel!(U,SendBuffer,IndSendBuffer)
 
   Iz,I,IT = @index(Global, NTuple)
   NumInd = @uniform @ndrange()[2]
-  NT = @uniform @ndrange()[2]
+  NT = @uniform @ndrange()[3]
 
   if I <= NumInd && IT <= NT
     @inbounds Ind = IndSendBuffer[I]
     @inbounds SendBuffer[Iz,I,IT] = U[Iz,Ind,IT]
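For context, `ExchangeData3DSendGPU` follows the standard nonblocking exchange discipline: pack on-device, post every `MPI.Irecv!` before any `MPI.Isend` so each message finds a matching receive, and complete the requests later (here in `ExchangeData3DRecvGPU!`). A self-contained sketch of that pattern with plain arrays and a hypothetical ring of ranks; the repo instead derives tags from the rank pair and reuses preallocated request arrays:

```julia
using MPI

MPI.Init()
comm = MPI.COMM_WORLD
rank = MPI.Comm_rank(comm)
nprocs = MPI.Comm_size(comm)

dst = mod(rank + 1, nprocs)        # hypothetical ring neighbor to send to
src = mod(rank - 1, nprocs)        # ...and to receive from

sendbuf = fill(Float64(rank), 4)
recvbuf = zeros(Float64, 4)

rreq = MPI.Irecv!(recvbuf, comm; source = src, tag = 0)  # post receive first
sreq = MPI.Isend(sendbuf, comm; dest = dst, tag = 0)

# ...independent GPU work can overlap here...

MPI.Waitall([rreq, sreq])          # complete both before reading recvbuf
MPI.Finalize()
```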
@@ -851,11 +870,14 @@ function ExchangeData3DRecv!(U,Exchange)
 end
 
 function ExchangeData3DRecvGPU!(U,Exchange)
+  backend = get_backend(U)
+  FT = eltype(U)
+
   Nz = size(U,1)
   nT = size(U,3)
   IndRecvBuffer = Exchange.IndRecvBuffer
   NeiProc = Exchange.NeiProc
   Proc = Exchange.Proc
   RecvBuffer3 = Exchange.RecvBuffer3
   rreq = Exchange.rreq
   sreq = Exchange.sreq
@@ -865,12 +887,12 @@ function ExchangeData3DRecvGPU!(U,Exchange)
   MPI.Barrier(MPI.COMM_WORLD)
 
   group = (Nz,5,1)
-  KExchangeData3DRecvKernel! = ExchangeData3DRecvKernel!(group)
+  KExchangeData3DRecvKernel! = ExchangeData3DRecvKernel!(backend,group)
 
   #Receive
   @inbounds for iP in NeiProc
     ndrange = (Nz,length(IndRecvBuffer[iP]),nT)
-    KExchangeData3DRecvKernel!(U,RecvBuffer3[iP],IndRecvBuffer[iP],ndrange)
+    KExchangeData3DRecvKernel!(U,RecvBuffer3[iP],IndRecvBuffer[iP],ndrange=ndrange)
   end
 end

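Two fixes in this file are easy to misread as cosmetic. In KernelAbstractions a kernel must be instantiated with its backend (`Kern!(backend, group)`), and `ndrange` must be passed as a keyword, otherwise it is taken for an extra kernel argument. And in the send kernel, reading `@ndrange()[2]` twice made the field bound `NT` silently equal the point count. A toy pack kernel showing the corrected shapes, with hypothetical sizes:

```julia
using KernelAbstractions

# Dimension 1: vertical level, 2: halo point, 3: field/tracer slice,
# mirroring ExchangeData3DSendKernel! above.
@kernel function PackKernel!(buf, @Const(U), @Const(ind))
  Iz, I, IT = @index(Global, NTuple)
  NumInd = @uniform @ndrange()[2]   # number of halo points
  NT = @uniform @ndrange()[3]       # number of field slices (was [2] pre-fix)
  if I <= NumInd && IT <= NT
    @inbounds buf[Iz, I, IT] = U[Iz, ind[I], IT]
  end
end

backend = CPU()
U   = KernelAbstractions.ones(backend, Float64, 4, 10, 9)
ind = [2, 5, 7]                                     # hypothetical halo points
buf = KernelAbstractions.zeros(backend, Float64, 4, length(ind), 9)
KPack! = PackKernel!(backend, (4, 5, 1))            # backend + workgroup size
KPack!(buf, U, ind, ndrange = (4, length(ind), 9))  # ndrange as a keyword
KernelAbstractions.synchronize(backend)
```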
