From 0032b2f365424c93ff43de0384f914fe1b396166 Mon Sep 17 00:00:00 2001 From: OsKnoth <50015520+OsKnoth@users.noreply.github.com> Date: Thu, 10 Oct 2024 14:00:08 +0200 Subject: [PATCH] New kernels in FcnGPU --- .../Mac/HeldSuarezMoistSphere_32Elem.sh | 2 +- BatchScripts/levante/TestKernels.sh | 2 +- BatchScripts/lumi/TestKernels.sh | 2 +- TestKernels/testKernels.jl | 15 +++ src/GPU/FcnGPU.jl | 27 ++-- src/GPU/OperatorKernel.jl | 124 ++++++++++++++++++ 6 files changed, 155 insertions(+), 17 deletions(-) diff --git a/BatchScripts/Mac/HeldSuarezMoistSphere_32Elem.sh b/BatchScripts/Mac/HeldSuarezMoistSphere_32Elem.sh index 316052c..326a8c5 100755 --- a/BatchScripts/Mac/HeldSuarezMoistSphere_32Elem.sh +++ b/BatchScripts/Mac/HeldSuarezMoistSphere_32Elem.sh @@ -1,2 +1,2 @@ export JuliaDevice="CPU" -mpirun -n 1 ./Jobs/NHSphere/HeldSuarezMoistSphere_32Elem +mpirun -n 6 ./Jobs/NHSphere/HeldSuarezMoistSphere_32Elem diff --git a/BatchScripts/levante/TestKernels.sh b/BatchScripts/levante/TestKernels.sh index 4b21ff0..571dd6b 100755 --- a/BatchScripts/levante/TestKernels.sh +++ b/BatchScripts/levante/TestKernels.sh @@ -7,7 +7,7 @@ #SBATCH --cpus-per-task=64 #SBATCH --exclusive #SBATCH --mem=0 # Request all memory available on all nodes -#SBATCH --time=00:30:00 # Set a limit on the total run time +#SBATCH --time=00:10:00 # Set a limit on the total run time #SBATCH --mail-type=FAIL # Notify user by email in case of job failure #SBATCH --account=bb1143 # Charge resources on this project account #SBATCH --output=ErgKernelCUDA # File name for standard output diff --git a/BatchScripts/lumi/TestKernels.sh b/BatchScripts/lumi/TestKernels.sh index 6e5a602..1956c9e 100755 --- a/BatchScripts/lumi/TestKernels.sh +++ b/BatchScripts/lumi/TestKernels.sh @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --job-name=benchmark #SBATCH --account=project_465000863 -#SBATCH --time=00:20:00 +#SBATCH --time=00:10:00 #SBATCH --nodes=1 #SBATCH --ntasks=1 #SBATCH --cpus-per-task=1 diff --git a/TestKernels/testKernels.jl 
b/TestKernels/testKernels.jl index 683a5b5..19d0417 100644 --- a/TestKernels/testKernels.jl +++ b/TestKernels/testKernels.jl @@ -154,6 +154,21 @@ KernelAbstractions.synchronize(backend) KernelAbstractions.synchronize(backend) end +@show "Upwind Tracer New2" +@. F = 0 +@. Tr[:,:,1] = Th +@. Tr[:,:,2] = Th +KDivRhoTrUpwind3New2Kernel! = GPU.DivRhoTrUpwind3New2Kernel!(backend,group) +KDivRhoTrUpwind3New2Kernel!(F,NumV,NumTr,U,D,dXdxI,J,M,Glob,ndrange=ndrange) +KernelAbstractions.synchronize(backend) +@show sum(abs.(FTr[:,:,1])) +@show sum(abs.(FTr[:,:,2])) +@show sum(abs.(F[:,:,5])) +@time for iter = 1 : TestIter + KDivRhoTrUpwind3New2Kernel!(F,NumV,NumTr,U,D,dXdxI,J,M,Glob,ndrange=ndrange) + KernelAbstractions.synchronize(backend) +end + KHyperViscKoeffKernel! = GPU.HyperViscKoeffKernel!(backend,group) KHyperViscKoeffKernel!(F,U,CacheF,D,DW,dXdxI,J,M,Glob,KoeffCurl,KoeffGrad,KoeffDiv,ndrange=ndrange) KernelAbstractions.synchronize(backend) diff --git a/src/GPU/FcnGPU.jl b/src/GPU/FcnGPU.jl index f59401c..b399972 100644 --- a/src/GPU/FcnGPU.jl +++ b/src/GPU/FcnGPU.jl @@ -312,6 +312,7 @@ NVTX.@annotate function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,E KHyperViscTracerKernel! = HyperViscTracerKernel!(backend, groupTr) KHyperViscTracerKoeffKernel! = HyperViscTracerKoeffKernel!(backend, groupTr) KDivRhoTrUpwind3Kernel! = DivRhoTrUpwind3Kernel!(backend, groupTr) + KDivRhoTrUpwind3New2Kernel! = DivRhoTrUpwind3New2Kernel!(backend, groupTr) KDivRhoTrUpwind3LimKernel! = DivRhoTrUpwind3LimKernel!(backend, groupTr) KLimitKernel! 
= LimitKernel!(backend, groupL) @@ -396,8 +397,8 @@ NVTX.@annotate function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,E for iT = 1 : NumTr @views KHyperViscTracerKoeffKernel!(FTr[:,:,iT],CacheTr[:,:,iT],Rho,DS,DW,dXdxI,J,M,Glob, KoeffDiv,ndrange=ndrangeB) - @views KDivRhoTrUpwind3Kernel!(FTr[:,:,iT],UTr[:,:,iT],U,DS, - dXdxI,J,M,Glob,ndrange=ndrangeB) +# @views KDivRhoTrUpwind3Kernel!(FTr[:,:,iT],UTr[:,:,iT],U,DS, +# dXdxI,J,M,Glob,ndrange=ndrangeB) end else for iT = 1 : NumTr @@ -408,6 +409,7 @@ NVTX.@annotate function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,E if TkePos > 0 @views KHyperViscTracerKoeffKernel!(FTke,CacheTke,Rho,DS,DW,dXdxI,J,M,Glob, KoeffDiv,ndrange=ndrangeB) + @views KDivRhoTrUpwind3Kernel!(FTke,Tke,U,DS, dXdxI,J,M,Glob,ndrange=ndrangeB) end if KoeffDivW > 0 KHyperViscWKoeffKernel! = HyperViscWKoeffKernel!(backend, groupTr) @@ -422,13 +424,11 @@ NVTX.@annotate function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,E KMomentumCoriolisKernel!(F,U,DS,dXdxI,J,X,M,Glob,CoriolisFun,ndrange=ndrangeB) KGradFullKernel!(F,U,p,DS,dXdxI,X,J,M,Glob,GravitationFun,ndrange=ndrangeB) if State == "Dry" || State == "ShallowWater" || State == "Moist" - KDivRhoThUpwind3Kernel!(F,U,DS,dXdxI,J,M,Glob,ndrange=ndrangeB) +# KDivRhoThUpwind3Kernel!(F,U,DS,dXdxI,J,M,Glob,ndrange=ndrangeB) + KDivRhoTrUpwind3New2Kernel!(F,NumV,NumTr,U,DS,dXdxI,J,M,Glob,ndrange=ndrangeB) elseif State == "DryEnergy" || State == "MoistEnergy" KDivRhoKEUpwind3Kernel!(F,U,p,DS,dXdxI,J,M,Glob,ndrange=ndrangeB) end - if TkePos > 0 - @views KDivRhoTrUpwind3Kernel!(FTke,Tke,U,DS, dXdxI,J,M,Glob,ndrange=ndrangeB) - end if EDMF KMomentumCoriolisDraftKernel! 
= MomentumVectorInvariantCoriolisDraftKernel!(backend,group) KMomentumCoriolisDraftKernel!(F,U,wEDMF,aRhoEDMF,DS,dXdxI,J,X,M,Glob,CoriolisFun,ndrange=ndrangeBEDMF) @@ -450,10 +450,10 @@ NVTX.@annotate function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,E @views KHyperViscTracerKoeffKernel!(FTr[:,:,iT],CacheTr[:,:,iT],Rho,DS,DW,dXdxI_I,J_I,M,Glob_I, KoeffDiv,ndrange=ndrangeI) end - for iT = 1 : NumTr - @views KDivRhoTrUpwind3Kernel!(FTr[:,:,iT],UTr[:,:,iT],U,DS, - dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI) - end +# for iT = 1 : NumTr +# @views KDivRhoTrUpwind3Kernel!(FTr[:,:,iT],UTr[:,:,iT],U,DS, +# dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI) +# end else for iT = 1 : NumTr @views KDivRhoTrUpwind3LimKernel!(FTr[:,:,iT],UTr[:,:,iT],U,DS, @@ -463,6 +463,7 @@ NVTX.@annotate function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,E if TkePos > 0 @views KHyperViscTracerKoeffKernel!(FTke,CacheTke,Rho,DS,DW,dXdxI_I,J_I,M,Glob_I, KoeffDiv,ndrange=ndrangeI) + KDivRhoTrUpwind3Kernel!(FTke,Tke,U,DS,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI) end if KoeffDivW > 0 @views KHyperViscWKoeffKernel!(F[:,:,4],Cachew,DS,DW,dXdxI_I,J_I,M,Glob_I,KoeffDivW,ndrange=ndrangeI) @@ -481,14 +482,12 @@ NVTX.@annotate function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,E KGradFullKernel!(F,U,p,DS,dXdxI_I,X_I,J_I,M,Glob_I,GravitationFun,ndrange=ndrangeI) if State == "Dry" || State == "ShallowWater" || State == "Moist" - KDivRhoThUpwind3Kernel!(F,U,DS,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI) +# KDivRhoThUpwind3Kernel!(F,U,DS,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI) + KDivRhoTrUpwind3New2Kernel!(F,NumV,NumTr,U,DS,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI) elseif State == "DryEnergy" KDivRhoKEUpwind3Kernel!(F,U,p,DS,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI) end - if TkePos > 0 - KDivRhoTrUpwind3Kernel!(FTke,Tke,U,DS,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI) - end if EDMF KMomentumCoriolisDraftKernel!(F,U,wEDMF,aRhoEDMF,DS,dXdxI_I,J_I,X_I,M,Glob_I,CoriolisFun,ndrange=ndrangeIEDMF) 
# DivRhoTrUpwind3New2Kernel!(F, NumV, NumTr, U, D, dXdxI, JJ, M, Glob)
#
# Fused flux-divergence kernel. One launch accumulates into F
#   * slot 1            : divergence of the contravariant mass flux,
#   * slot 5            : divergence of the flux of c = U[:,:,5] / U[:,:,1],
#   * slots NumV+1 .. NumV+NumTr : the same operator applied to
#                         c = U[:,:,iT] / U[:,:,1] for every iT in that range.
# The contravariant velocities (wCol, uCol, vCol) are computed once and reused
# for every advected scalar.
#
# Arguments
#   F      : tendency array, updated atomically at F[Iz,ind,slot].
#   NumV   : the slots advected in the loop start at NumV+1.
#   NumTr  : number of slots advected in the loop.
#   U      : state array indexed [Iz,ind,slot]; slots 1..5 are read here.
#   D      : N x N matrix used for the horizontal derivative sums.
#   dXdxI  : metric terms handed to Contra3 / Contra12.
#   JJ     : indexed [ID,1:2,Iz,IF]; the two entries are summed per level.
#   M      : indexed [Iz,ind,1:2]; the two entries are summed per level and
#            divide every contribution.
#   Glob   : maps (local node ID, element IF) to the global column index `ind`.
#
# Launch layout (from the @index / @groupsize / @ndrange use below): the
# workgroup is (N, N, ColumnTilesDim); ndrange dimension 3 is the number of
# levels Nz and dimension 4 the number of elements.
#
# NOTE(review): because this kernel already advects every slot in
# NumV+1 : NumV+NumTr, a caller must not launch another advection kernel for
# the same slots in the same right-hand-side evaluation — confirm the call
# sites (in particular any branch that still uses a per-tracer kernel).
#
# NOTE(review): ID and ind are deliberately recomputed after every
# @synchronize, as in the original code; presumably locals do not survive a
# barrier on the CPU backend — confirm before "simplifying" this.
@kernel inbounds = true function DivRhoTrUpwind3New2Kernel!(F,NumV,NumTr,@Const(U),@Const(D),@Const(dXdxI),
  @Const(JJ),@Const(M),@Const(Glob))

  I, J, iz   = @index(Local, NTuple)
  _,_,Iz,IF = @index(Global, NTuple)

  ColumnTilesDim = @uniform @groupsize()[3]
  N = @uniform @groupsize()[1]
  Nz = @uniform @ndrange()[3]
  NF = @uniform @ndrange()[4]

  ID = I + (J - 1) * N
  ind = Glob[ID,IF]

  # Workgroup-local tiles. cCol/JCol carry one extra level below and two above
  # the tile (4-point vertical stencil); RhoCol/uCol/vCol/MCCol carry one
  # extra level above (needed at the upper face of the tile).
  cCol = @localmem eltype(F) (N,N, ColumnTilesDim+3)
  RhoCol = @localmem eltype(F) (N,N, ColumnTilesDim+1)
  uCol = @localmem eltype(F) (N,N, ColumnTilesDim+1)
  vCol = @localmem eltype(F) (N,N, ColumnTilesDim+1)
  wCol = @localmem eltype(F) (N,N, ColumnTilesDim)
  JCol = @localmem eltype(F) (N,N, ColumnTilesDim+3)
  MCCol = @localmem eltype(F) (N,N, ColumnTilesDim+1)

  # Every thread loads its own level; cCol/JCol are shifted by one so that
  # index iz holds the level below.
  if Iz <= Nz
    cCol[I,J,iz+1] = U[Iz,ind,5] / U[Iz,ind,1]
    JCol[I,J,iz+1] = JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]
    RhoCol[I,J,iz] = U[Iz,ind,1]
    uCol[I,J,iz] = U[Iz,ind,2]
    vCol[I,J,iz] = U[Iz,ind,3]
    wCol[I,J,iz] = U[Iz,ind,4]
    MCCol[I,J,iz] = M[Iz,ind,1] + M[Iz,ind,2]
  end
  # Lower halo, loaded by the first thread of the tile (clamped to level 1).
  if iz == 1
    Izm1 = max(Iz - 1,1)
    cCol[I,J,iz] = U[Izm1,ind,5] / U[Izm1,ind,1]
    JCol[I,J,iz] = JJ[ID,1,Izm1,IF] + JJ[ID,2,Izm1,IF]
  end
  # Upper halo, loaded by the last thread of the tile or of the column
  # (clamped to level Nz).
  if iz == ColumnTilesDim || Iz == Nz
    Izp1 = min(Iz + 1,Nz)
    cCol[I,J,iz+2] = U[Izp1,ind,5] / U[Izp1,ind,1]
    JCol[I,J,iz+2] = JJ[ID,1,Izp1,IF] + JJ[ID,2,Izp1,IF]
    Izp2 = min(Iz + 2,Nz)
    cCol[I,J,iz+3] = U[Izp2,ind,5] / U[Izp2,ind,1]
    JCol[I,J,iz+3] = JJ[ID,1,Izp2,IF] + JJ[ID,2,Izp2,IF]
    RhoCol[I,J,iz+1] = U[Izp1,ind,1]
    uCol[I,J,iz+1] = U[Izp1,ind,2]
    vCol[I,J,iz+1] = U[Izp1,ind,3]
    MCCol[I,J,iz+1] = M[Izp1,ind,1] + M[Izp1,ind,2]
  end
  @synchronize

  ID = I + (J - 1) * N
  ind = Glob[ID,IF]

  # Vertical flux through the face between levels Iz and Iz+1.
  if Iz < Nz
    @views wCon = Contra3(RhoCol[I,J,iz:iz+1],uCol[I,J,iz:iz+1],vCol[I,J,iz:iz+1],
      wCol[I,J,iz],dXdxI[3,:,:,ID,Iz:Iz+1,IF])
    # Keep the face velocity for the tracer loop; only this thread reads or
    # writes wCol[I,J,iz].
    wCol[I,J,iz] = wCon
    cFL, cFR = RecU4(cCol[I,J,iz],cCol[I,J,iz+1],cCol[I,J,iz+2],cCol[I,J,iz+3],
      JCol[I,J,iz],JCol[I,J,iz+1],JCol[I,J,iz+2],JCol[I,J,iz+3])
    # The weights select cFL where wCon > 0 and cFR where wCon < 0.
    Flux = eltype(F)(0.25) * ((abs(wCon) + wCon) * cFL + (-abs(wCon) + wCon) * cFR)
    @atomic :monotonic F[Iz,ind,5] += -Flux / MCCol[I,J,iz]
    @atomic :monotonic F[Iz+1,ind,5] += Flux / MCCol[I,J,iz+1]
    Flux = wCon
    @atomic :monotonic F[Iz,ind,1] += -Flux / MCCol[I,J,iz]
    @atomic :monotonic F[Iz+1,ind,1] += Flux / MCCol[I,J,iz+1]
  end
  # FIX: Contra3 above reads uCol/vCol at tile levels iz AND iz+1, while the
  # thread owning level iz+1 overwrites exactly those entries below. All reads
  # of the untransformed velocities must finish before anyone overwrites them.
  @synchronize

  ID = I + (J - 1) * N
  ind = Glob[ID,IF]

  # Replace (u,v) in local memory by the output of Contra12 (note the negated
  # RhoCol argument); each thread touches only its own entry.
  if Iz <= Nz
    uCol[I,J,iz], vCol[I,J,iz] = Contra12(-RhoCol[I,J,iz],uCol[I,J,iz],vCol[I,J,iz],view(dXdxI,1:2,1:2,:,ID,Iz,IF))
  end
  @synchronize

  ID = I + (J - 1) * N
  ind = Glob[ID,IF]
  # Horizontal part: derivative sums along both element directions, reading
  # the entries written by the other threads of the same level.
  if Iz <= Nz
    DivRhoTr = D[I,1] * uCol[1,J,iz] * cCol[1,J,iz+1] + D[J,1] * vCol[I,1,iz] * cCol[I,1,iz+1]
    DivRho = D[I,1] * uCol[1,J,iz] + D[J,1] * vCol[I,1,iz]
    for k = 2 : N
      DivRhoTr += D[I,k] * uCol[k,J,iz] * cCol[k,J,iz+1] + D[J,k] * vCol[I,k,iz] * cCol[I,k,iz+1]
      DivRho += D[I,k] * uCol[k,J,iz] + D[J,k] * vCol[I,k,iz]
    end
    @atomic :monotonic F[Iz,ind,5] += DivRhoTr / MCCol[I,J,iz]
    @atomic :monotonic F[Iz,ind,1] += DivRho / MCCol[I,J,iz]
  end

  # Remaining advected scalars: reuse wCol/uCol/vCol/JCol/MCCol and only
  # refill cCol. NumV and NumTr are kernel arguments, so every thread of the
  # workgroup runs the same number of iterations and reaches every barrier.
  for iT = NumV+1 : NumV + NumTr
    # FIX: other threads may still be reading the previous scalar's cCol
    # (horizontal sums above / at the end of the previous iteration, and the
    # vertical reconstruction); wait for them before overwriting the tile.
    @synchronize

    ID = I + (J - 1) * N
    ind = Glob[ID,IF]
    if Iz <= Nz
      cCol[I,J,iz+1] = U[Iz,ind,iT] / U[Iz,ind,1]
    end
    if iz == 1
      Izm1 = max(Iz - 1,1)
      cCol[I,J,iz] = U[Izm1,ind,iT] / U[Izm1,ind,1]
    end
    if iz == ColumnTilesDim || Iz == Nz
      Izp1 = min(Iz + 1,Nz)
      cCol[I,J,iz+2] = U[Izp1,ind,iT] / U[Izp1,ind,1]
      Izp2 = min(Iz + 2,Nz)
      cCol[I,J,iz+3] = U[Izp2,ind,iT] / U[Izp2,ind,1]
    end
    @synchronize

    ID = I + (J - 1) * N
    ind = Glob[ID,IF]
    if Iz < Nz
      # Face velocity stored by the first vertical pass.
      wCon = wCol[I,J,iz]
      cFL, cFR = RecU4(cCol[I,J,iz],cCol[I,J,iz+1],cCol[I,J,iz+2],cCol[I,J,iz+3],
        JCol[I,J,iz],JCol[I,J,iz+1],JCol[I,J,iz+2],JCol[I,J,iz+3])
      Flux = eltype(F)(0.25) * ((abs(wCon) + wCon) * cFL + (-abs(wCon) + wCon) * cFR)
      @atomic :monotonic F[Iz,ind,iT] += -Flux / MCCol[I,J,iz]
      @atomic :monotonic F[Iz+1,ind,iT] += Flux / MCCol[I,J,iz+1]
    end
    if Iz <= Nz
      DivRhoTr = D[I,1] * uCol[1,J,iz] * cCol[1,J,iz+1] + D[J,1] * vCol[I,1,iz] * cCol[I,1,iz+1]
      for k = 2 : N
        DivRhoTr += D[I,k] * uCol[k,J,iz] * cCol[k,J,iz+1] + D[J,k] * vCol[I,k,iz] * cCol[I,k,iz+1]
      end
      @atomic :monotonic F[Iz,ind,iT] += DivRhoTr / MCCol[I,J,iz]
    end
  end
end