Skip to content

Commit

Permalink
New kernels in FcnGPU
Browse files Browse the repository at this point in the history
  • Loading branch information
OsKnoth committed Oct 10, 2024
1 parent 5d96ad3 commit 0032b2f
Show file tree
Hide file tree
Showing 6 changed files with 155 additions and 17 deletions.
2 changes: 1 addition & 1 deletion BatchScripts/Mac/HeldSuarezMoistSphere_32Elem.sh
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
export JuliaDevice="CPU"
mpirun -n 1 ./Jobs/NHSphere/HeldSuarezMoistSphere_32Elem
mpirun -n 6 ./Jobs/NHSphere/HeldSuarezMoistSphere_32Elem
2 changes: 1 addition & 1 deletion BatchScripts/levante/TestKernels.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#SBATCH --cpus-per-task=64
#SBATCH --exclusive
#SBATCH --mem=0 # Request all memory available on all nodes
#SBATCH --time=00:30:00 # Set a limit on the total run time
#SBATCH --time=00:10:00 # Set a limit on the total run time
#SBATCH --mail-type=FAIL # Notify user by email in case of job failure
#SBATCH --account=bb1143 # Charge resources on this project account
#SBATCH --output=ErgKernelCUDA # File name for standard output
Expand Down
2 changes: 1 addition & 1 deletion BatchScripts/lumi/TestKernels.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash
#SBATCH --job-name=benchmark
#SBATCH --account=project_465000863
#SBATCH --time=00:20:00
#SBATCH --time=00:10:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
Expand Down
15 changes: 15 additions & 0 deletions TestKernels/testKernels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,21 @@ KernelAbstractions.synchronize(backend)
KernelAbstractions.synchronize(backend)
end

# --- Check and timing of the fused kernel DivRhoTrUpwind3New2Kernel! ---
@show "Upwind Tracer New2"
# Reset the tendency array: the kernel accumulates into F (atomic +=) instead of overwriting it.
@. F = 0
# Fill the first two slices of Tr with Th.
# NOTE(review): neither Tr nor FTr is passed to the kernel below; presumably they are
# views into U and F respectively -- confirm against their definitions earlier in this script.
@. Tr[:,:,1] = Th
@. Tr[:,:,2] = Th
# Instantiate the kernel for the selected backend and workgroup size.
KDivRhoTrUpwind3New2Kernel! = GPU.DivRhoTrUpwind3New2Kernel!(backend,group)
# Single launch used for the printed check values (also triggers compilation before timing).
KDivRhoTrUpwind3New2Kernel!(F,NumV,NumTr,U,D,dXdxI,J,M,Glob,ndrange=ndrange)
# Kernel launches are asynchronous: wait before reading results on the host.
KernelAbstractions.synchronize(backend)
# Sums of absolute values, printed so the result can be compared by eye with other kernel variants.
@show sum(abs.(FTr[:,:,1]))
@show sum(abs.(FTr[:,:,2]))
@show sum(abs.(F[:,:,5]))
# Timing loop: TestIter launches, each followed by a device synchronisation.
# F is not reset between iterations, so its contents keep accumulating here.
@time for iter = 1 : TestIter
KDivRhoTrUpwind3New2Kernel!(F,NumV,NumTr,U,D,dXdxI,J,M,Glob,ndrange=ndrange)
KernelAbstractions.synchronize(backend)
end

KHyperViscKoeffKernel! = GPU.HyperViscKoeffKernel!(backend,group)
KHyperViscKoeffKernel!(F,U,CacheF,D,DW,dXdxI,J,M,Glob,KoeffCurl,KoeffGrad,KoeffDiv,ndrange=ndrange)
KernelAbstractions.synchronize(backend)
Expand Down
27 changes: 13 additions & 14 deletions src/GPU/FcnGPU.jl
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,7 @@ NVTX.@annotate function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,E
KHyperViscTracerKernel! = HyperViscTracerKernel!(backend, groupTr)
KHyperViscTracerKoeffKernel! = HyperViscTracerKoeffKernel!(backend, groupTr)
KDivRhoTrUpwind3Kernel! = DivRhoTrUpwind3Kernel!(backend, groupTr)
KDivRhoTrUpwind3New2Kernel! = DivRhoTrUpwind3New2Kernel!(backend, groupTr)
KDivRhoTrUpwind3LimKernel! = DivRhoTrUpwind3LimKernel!(backend, groupTr)
KLimitKernel! = LimitKernel!(backend, groupL)

Expand Down Expand Up @@ -396,8 +397,8 @@ NVTX.@annotate function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,E
for iT = 1 : NumTr
@views KHyperViscTracerKoeffKernel!(FTr[:,:,iT],CacheTr[:,:,iT],Rho,DS,DW,dXdxI,J,M,Glob,
KoeffDiv,ndrange=ndrangeB)
@views KDivRhoTrUpwind3Kernel!(FTr[:,:,iT],UTr[:,:,iT],U,DS,
dXdxI,J,M,Glob,ndrange=ndrangeB)
# @views KDivRhoTrUpwind3Kernel!(FTr[:,:,iT],UTr[:,:,iT],U,DS,
# dXdxI,J,M,Glob,ndrange=ndrangeB)
end
else
for iT = 1 : NumTr
Expand All @@ -408,6 +409,7 @@ NVTX.@annotate function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,E
if TkePos > 0
@views KHyperViscTracerKoeffKernel!(FTke,CacheTke,Rho,DS,DW,dXdxI,J,M,Glob,
KoeffDiv,ndrange=ndrangeB)
@views KDivRhoTrUpwind3Kernel!(FTke,Tke,U,DS, dXdxI,J,M,Glob,ndrange=ndrangeB)
end
if KoeffDivW > 0
KHyperViscWKoeffKernel! = HyperViscWKoeffKernel!(backend, groupTr)
Expand All @@ -422,13 +424,11 @@ NVTX.@annotate function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,E
KMomentumCoriolisKernel!(F,U,DS,dXdxI,J,X,M,Glob,CoriolisFun,ndrange=ndrangeB)
KGradFullKernel!(F,U,p,DS,dXdxI,X,J,M,Glob,GravitationFun,ndrange=ndrangeB)
if State == "Dry" || State == "ShallowWater" || State == "Moist"
KDivRhoThUpwind3Kernel!(F,U,DS,dXdxI,J,M,Glob,ndrange=ndrangeB)
# KDivRhoThUpwind3Kernel!(F,U,DS,dXdxI,J,M,Glob,ndrange=ndrangeB)
KDivRhoTrUpwind3New2Kernel!(F,NumV,NumTr,U,DS,dXdxI,J,M,Glob,ndrange=ndrangeB)
elseif State == "DryEnergy" || State == "MoistEnergy"
KDivRhoKEUpwind3Kernel!(F,U,p,DS,dXdxI,J,M,Glob,ndrange=ndrangeB)
end
if TkePos > 0
@views KDivRhoTrUpwind3Kernel!(FTke,Tke,U,DS, dXdxI,J,M,Glob,ndrange=ndrangeB)
end
if EDMF
KMomentumCoriolisDraftKernel! = MomentumVectorInvariantCoriolisDraftKernel!(backend,group)
KMomentumCoriolisDraftKernel!(F,U,wEDMF,aRhoEDMF,DS,dXdxI,J,X,M,Glob,CoriolisFun,ndrange=ndrangeBEDMF)
Expand All @@ -450,10 +450,10 @@ NVTX.@annotate function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,E
@views KHyperViscTracerKoeffKernel!(FTr[:,:,iT],CacheTr[:,:,iT],Rho,DS,DW,dXdxI_I,J_I,M,Glob_I,
KoeffDiv,ndrange=ndrangeI)
end
for iT = 1 : NumTr
@views KDivRhoTrUpwind3Kernel!(FTr[:,:,iT],UTr[:,:,iT],U,DS,
dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI)
end
# for iT = 1 : NumTr
# @views KDivRhoTrUpwind3Kernel!(FTr[:,:,iT],UTr[:,:,iT],U,DS,
# dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI)
# end
else
for iT = 1 : NumTr
@views KDivRhoTrUpwind3LimKernel!(FTr[:,:,iT],UTr[:,:,iT],U,DS,
Expand All @@ -463,6 +463,7 @@ NVTX.@annotate function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,E
if TkePos > 0
@views KHyperViscTracerKoeffKernel!(FTke,CacheTke,Rho,DS,DW,dXdxI_I,J_I,M,Glob_I,
KoeffDiv,ndrange=ndrangeI)
KDivRhoTrUpwind3Kernel!(FTke,Tke,U,DS,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI)
end
if KoeffDivW > 0
@views KHyperViscWKoeffKernel!(F[:,:,4],Cachew,DS,DW,dXdxI_I,J_I,M,Glob_I,KoeffDivW,ndrange=ndrangeI)
Expand All @@ -481,14 +482,12 @@ NVTX.@annotate function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,E
KGradFullKernel!(F,U,p,DS,dXdxI_I,X_I,J_I,M,Glob_I,GravitationFun,ndrange=ndrangeI)

if State == "Dry" || State == "ShallowWater" || State == "Moist"
KDivRhoThUpwind3Kernel!(F,U,DS,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI)
# KDivRhoThUpwind3Kernel!(F,U,DS,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI)
KDivRhoTrUpwind3New2Kernel!(F,NumV,NumTr,U,DS,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI)
elseif State == "DryEnergy"
KDivRhoKEUpwind3Kernel!(F,U,p,DS,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI)
end

if TkePos > 0
KDivRhoTrUpwind3Kernel!(FTke,Tke,U,DS,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI)
end
if EDMF
KMomentumCoriolisDraftKernel!(F,U,wEDMF,aRhoEDMF,DS,dXdxI_I,J_I,X_I,M,Glob_I,CoriolisFun,ndrange=ndrangeIEDMF)
KRhoGradKinEDMFKernel!(F,U,wEDMF,aRhoEDMF,DS,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeIEDMF)
Expand Down
124 changes: 124 additions & 0 deletions src/GPU/OperatorKernel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -634,6 +634,130 @@ end
end
end
end
# DivRhoTrUpwind3New2Kernel!(F, NumV, NumTr, U, D, dXdxI, JJ, M, Glob)
#
# Fused flux-divergence kernel. In a single launch it adds to F
#   * F[:,:,1]  : divergence built from U[:,:,1:4] (U[:,:,1] is used as density,
#                 U[:,:,2:4] as the three velocity components),
#   * F[:,:,5]  : upwind-reconstructed flux divergence of the scalar U[:,:,5] / U[:,:,1],
#   * F[:,:,iT] : the same for every iT in NumV+1 : NumV+NumTr.
# All updates of F are atomic additions, so F must be initialised by the caller.
#
# Arguments
#   F        tendency array, indexed [level, global node, variable]; updated in place
#   NumV     index offset; the additional scalars start at variable NumV+1
#   NumTr    number of additional scalars processed by the loop at the end
#   U        state array, indexed like F (read only)
#   D        N x N matrix applied along both horizontal directions of an element
#   dXdxI    metric array handed to Contra3 / Contra12 (defined elsewhere in the project)
#   JJ       indexed [ID, 1:2, level, element]; the two entries are summed per level
#   M        indexed [level, global node, 1:2]; the two entries are summed and used as divisor
#   Glob     maps (local node ID, element) to the global node index
#
# Launch layout: workgroup = (N, N, ColumnTilesDim), ndrange = (N, N, Nz, number of elements).
#
# Work-items exchange data through workgroup-local arrays. Each array region that is
# re-used for a different quantity is protected by a barrier *before* it is overwritten;
# without these barriers a fast work-item can clobber values a neighbour still has to read.
# ID and ind are recomputed after every barrier because plain locals are not carried across
# @synchronize on the KernelAbstractions CPU backend.
@kernel inbounds = true function DivRhoTrUpwind3New2Kernel!(F,NumV,NumTr,@Const(U),@Const(D),@Const(dXdxI),
  @Const(JJ),@Const(M),@Const(Glob))

  I, J, iz   = @index(Local, NTuple)
  _,_,Iz,IF = @index(Global, NTuple)

  ColumnTilesDim = @uniform @groupsize()[3]
  N = @uniform @groupsize()[1]
  Nz = @uniform @ndrange()[3]

  ID = I + (J - 1) * N
  ind = Glob[ID,IF]

  # Workgroup-local tiles. cCol/JCol carry one extra level below and two above the tile
  # (slot iz+1 holds level Iz); RhoCol/uCol/vCol/MCCol carry one extra level above.
  cCol = @localmem eltype(F) (N,N, ColumnTilesDim+3)
  RhoCol = @localmem eltype(F) (N,N, ColumnTilesDim+1)
  uCol = @localmem eltype(F) (N,N, ColumnTilesDim+1)
  vCol = @localmem eltype(F) (N,N, ColumnTilesDim+1)
  wCol = @localmem eltype(F) (N,N, ColumnTilesDim)
  JCol = @localmem eltype(F) (N,N, ColumnTilesDim+3)
  MCCol = @localmem eltype(F) (N,N, ColumnTilesDim+1)

  # Every work-item loads its own level.
  if Iz <= Nz
    cCol[I,J,iz+1] = U[Iz,ind,5] / U[Iz,ind,1]
    JCol[I,J,iz+1] = JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]
    RhoCol[I,J,iz] = U[Iz,ind,1]
    uCol[I,J,iz] = U[Iz,ind,2]
    vCol[I,J,iz] = U[Iz,ind,3]
    wCol[I,J,iz] = U[Iz,ind,4]
    MCCol[I,J,iz] = M[Iz,ind,1] + M[Iz,ind,2]
  end
  # Lowest work-item of the tile loads the level below (clamped to level 1).
  if iz == 1
    Izm1 = max(Iz - 1,1)
    cCol[I,J,iz] = U[Izm1,ind,5] / U[Izm1,ind,1]
    JCol[I,J,iz] = JJ[ID,1,Izm1,IF] + JJ[ID,2,Izm1,IF]
  end
  # Highest active work-item of the tile loads the levels above (clamped to level Nz).
  if iz == ColumnTilesDim || Iz == Nz
    Izp1 = min(Iz + 1,Nz)
    cCol[I,J,iz+2] = U[Izp1,ind,5] / U[Izp1,ind,1]
    JCol[I,J,iz+2] = JJ[ID,1,Izp1,IF] + JJ[ID,2,Izp1,IF]
    Izp2 = min(Iz + 2,Nz)
    cCol[I,J,iz+3] = U[Izp2,ind,5] / U[Izp2,ind,1]
    JCol[I,J,iz+3] = JJ[ID,1,Izp2,IF] + JJ[ID,2,Izp2,IF]
    RhoCol[I,J,iz+1] = U[Izp1,ind,1]
    uCol[I,J,iz+1] = U[Izp1,ind,2]
    vCol[I,J,iz+1] = U[Izp1,ind,3]
    MCCol[I,J,iz+1] = M[Izp1,ind,1] + M[Izp1,ind,2]
  end
  @synchronize

  ID = I + (J - 1) * N
  ind = Glob[ID,IF]

  # Vertical part: flux through the interface between levels Iz and Iz+1.
  if Iz < Nz
    # Contra3 is defined elsewhere; presumably it returns the contravariant vertical
    # mass flux at the interface -- it reads uCol/vCol at BOTH iz and iz+1.
    @views wCon = Contra3(RhoCol[I,J,iz:iz+1],uCol[I,J,iz:iz+1],vCol[I,J,iz:iz+1],
      wCol[I,J,iz],dXdxI[3,:,:,ID,Iz:Iz+1,IF])
    # Keep the interface flux for the additional scalars handled in the loop below.
    wCol[I,J,iz] = wCon
    cFL, cFR = RecU4(cCol[I,J,iz],cCol[I,J,iz+1],cCol[I,J,iz+2],cCol[I,J,iz+3],
      JCol[I,J,iz],JCol[I,J,iz+1],JCol[I,J,iz+2],JCol[I,J,iz+3])
    # Upwind selection: only cFL contributes for wCon > 0, only cFR for wCon < 0.
    Flux = eltype(F)(0.25) * ((abs(wCon) + wCon) * cFL +
      (-abs(wCon) + wCon) * cFR)
    @atomic :monotonic F[Iz,ind,5] += -Flux / MCCol[I,J,iz]
    @atomic :monotonic F[Iz+1,ind,5] += Flux / MCCol[I,J,iz+1]
    Flux = wCon
    @atomic :monotonic F[Iz,ind,1] += -Flux / MCCol[I,J,iz]
    @atomic :monotonic F[Iz+1,ind,1] += Flux / MCCol[I,J,iz+1]
  end

  # Barrier: the Contra3 call above reads uCol/vCol at iz+1, which is the slot the
  # work-item one level up overwrites below. All reads must finish first.
  @synchronize

  ID = I + (J - 1) * N

  # Replace the velocity components in uCol/vCol by the Contra12 result, in place.
  if Iz <= Nz
    uCol[I,J,iz], vCol[I,J,iz] = Contra12(-RhoCol[I,J,iz],uCol[I,J,iz],vCol[I,J,iz],view(dXdxI,1:2,1:2,:,ID,Iz,IF))
  end
  @synchronize

  ID = I + (J - 1) * N
  ind = Glob[ID,IF]
  # Horizontal part: apply D along both element directions; reads neighbours' tile entries.
  if Iz <= Nz
    DivRhoTr = D[I,1] * uCol[1,J,iz] * cCol[1,J,iz+1] + D[J,1] * vCol[I,1,iz] * cCol[I,1,iz+1]
    DivRho = D[I,1] * uCol[1,J,iz] + D[J,1] * vCol[I,1,iz]
    for k = 2 : N
      DivRhoTr += D[I,k] * uCol[k,J,iz] * cCol[k,J,iz+1] + D[J,k] * vCol[I,k,iz] * cCol[I,k,iz+1]
      DivRho += D[I,k] * uCol[k,J,iz] + D[J,k] * vCol[I,k,iz]
    end
    @atomic :monotonic F[Iz,ind,5] += DivRhoTr / MCCol[I,J,iz]
    @atomic :monotonic F[Iz,ind,1] += DivRho / MCCol[I,J,iz]
  end

  # Additional scalars: re-use cCol, wCol (interface flux) and uCol/vCol from above.
  for iT = NumV+1 : NumV + NumTr
    # Barrier: neighbours may still be reading the previous scalar from cCol in the
    # horizontal sums (above, or at the end of the previous iteration).
    @synchronize

    ID = I + (J - 1) * N
    ind = Glob[ID,IF]
    if Iz <= Nz
      cCol[I,J,iz+1] = U[Iz,ind,iT] / U[Iz,ind,1]
    end
    if iz == 1
      Izm1 = max(Iz - 1,1)
      cCol[I,J,iz] = U[Izm1,ind,iT] / U[Izm1,ind,1]
    end
    if iz == ColumnTilesDim || Iz == Nz
      Izp1 = min(Iz + 1,Nz)
      cCol[I,J,iz+2] = U[Izp1,ind,iT] / U[Izp1,ind,1]
      Izp2 = min(Iz + 2,Nz)
      cCol[I,J,iz+3] = U[Izp2,ind,iT] / U[Izp2,ind,1]
    end
    @synchronize

    ID = I + (J - 1) * N
    ind = Glob[ID,IF]
    if Iz < Nz
      wCon = wCol[I,J,iz]
      cFL, cFR = RecU4(cCol[I,J,iz],cCol[I,J,iz+1],cCol[I,J,iz+2],cCol[I,J,iz+3],
        JCol[I,J,iz],JCol[I,J,iz+1],JCol[I,J,iz+2],JCol[I,J,iz+3])
      Flux = eltype(F)(0.25) * ((abs(wCon) + wCon) * cFL +
        (-abs(wCon) + wCon) * cFR)
      @atomic :monotonic F[Iz,ind,iT] += -Flux / MCCol[I,J,iz]
      @atomic :monotonic F[Iz+1,ind,iT] += Flux / MCCol[I,J,iz+1]
    end
    if Iz <= Nz
      DivRhoTr = D[I,1] * uCol[1,J,iz] * cCol[1,J,iz+1] + D[J,1] * vCol[I,1,iz] * cCol[I,1,iz+1]
      for k = 2 : N
        DivRhoTr += D[I,k] * uCol[k,J,iz] * cCol[k,J,iz+1] + D[J,k] * vCol[I,k,iz] * cCol[I,k,iz+1]
      end
      @atomic :monotonic F[Iz,ind,iT] += DivRhoTr / MCCol[I,J,iz]
    end
  end
end

@kernel inbounds = true function DivRhoTrUpwind3NewKernel!(FTr,@Const(Tr),@Const(U),@Const(D),@Const(dXdxI),
@Const(JJ),@Const(M),@Const(Glob))
Expand Down

0 comments on commit 0032b2f

Please sign in to comment.