From fec93ef22c0ffa9f7e3d1dd4c90a56af1dda5afd Mon Sep 17 00:00:00 2001 From: OsKnoth <50015520+OsKnoth@users.noreply.github.com> Date: Sun, 7 Jul 2024 11:22:31 +0100 Subject: [PATCH] Horizontal limiter included --- AA | 174 +++++++++++++++++ BB | 190 +++++++++++++++++++ Examples/testNHSphere.jl | 2 +- Jobs/NHSphere/JobNHHeldSuarezMoistMOSTSphere | 4 +- src/GPU/FcnGPU.jl | 91 ++++++--- src/GPU/HorLimiterKernel.jl | 1 + src/Integration/RosenbrockSchur.jl | 1 + 7 files changed, 433 insertions(+), 30 deletions(-) create mode 100644 AA create mode 100644 BB diff --git a/AA b/AA new file mode 100644 index 0000000..512946e --- /dev/null +++ b/AA @@ -0,0 +1,174 @@ +@kernel inbounds = true function DivRhoTrUpwind3LimKernel!(FTr,@Const(Tr),@Const(U),@Const(D),@Const(dXdxI), + @Const(JJ),@Const(M),@Const(Glob),dt,@Const(w),@Const(qMin),@Const(qMax),@Const(Stencil)) + +# gi, gj, gz, gF = @index(Group, NTuple) + I, J, iz = @index(Local, NTuple) + _,_,Iz,IF = @index(Global, NTuple) + + ColumnTilesDim = @uniform @groupsize()[3] + N = @uniform @groupsize()[1] + Nz = @uniform @ndrange()[3] + NF = @uniform @ndrange()[4] + + @uniform l0 = eltype(FTr)(0) + @uniform eta = eltype(FTr)(1.e-12) + @uniform dlFD = eltype(FTr)(1.e-8) + + + cCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+3) + uConCol = @localmem eltype(FTr) (N,N, ColumnTilesDim) + vConCol = @localmem eltype(FTr) (N,N, ColumnTilesDim) + DivRhoTr = @localmem eltype(FTr) (N,N, ColumnTilesDim) + DivRho = @localmem eltype(FTr) (N,N, ColumnTilesDim) + RhoTrColS = @localmem eltype(FTr) (N,N, ColumnTilesDim) + RhoColS = @localmem eltype(FTr) (N,N, ColumnTilesDim) + q = @localmem eltype(FTr) (N,N, ColumnTilesDim) + resp = @localmem eltype(FTr) (ColumnTilesDim) + resc = @localmem eltype(FTr) (ColumnTilesDim) + alpha = @localmem eltype(FTr) (ColumnTilesDim) + lp = @localmem eltype(FTr) (ColumnTilesDim) + lc = @localmem eltype(FTr) (ColumnTilesDim) + sumJ = @localmem eltype(FTr) (ColumnTilesDim) + qMinS = @localmem eltype(FTr) (ColumnTilesDim) + qMaxS = @localmem eltype(FTr) (ColumnTilesDim) + conv = @localmem (Bool) (ColumnTilesDim) + if Iz <= Nz + ID = I + (J - 1) * N + ind = Glob[ID,IF] + cCol[I,J,iz+1] = Tr[Iz,ind] / U[Iz,ind,1] + @views (uCon, vCon) = Contra12(-U[Iz,ind,1],U[Iz,ind,2],U[Iz,ind,3],dXdxI[1:2,1:2,:,ID,Iz,IF]) + uConCol[I,J,iz] = uCon + vConCol[I,J,iz] = vCon + if ID == 1 + resp[iz] = eltype(FTr)(0) + resc[iz] = eltype(FTr)(0) + sumJ[iz] = eltype(FTr)(0) + conv[iz] = true + qMinS[iz] = qMin[Iz,Stencil[IF,1]] + qMaxS[iz] = qMax[Iz,Stencil[IF,1]] + for iS = 2 : 13 + qMinS[iz] = min(qMin[Iz,Stencil[IF,iS]],qMinS[iz]) + qMaxS[iz] = max(qMax[Iz,Stencil[IF,iS]],qMaxS[iz]) + end + end + end + if iz == 1 + Izm1 = max(Iz - 1,1) + cCol[I,J,iz] = Tr[Izm1,ind] / U[Izm1,ind,1] + end + if iz == ColumnTilesDim || Iz == Nz + Izp1 = min(Iz + 1,Nz) + cCol[I,J,iz+2] = Tr[Izp1,ind] / U[Izp1,ind,1] + Izp2 = min(Iz + 2,Nz) + cCol[I,J,iz+3] = Tr[Izp2,ind] / U[Izp2,ind,1] + end + @synchronize + + if Iz <= Nz + ID = I + (J - 1) * N + @atomic :monotonic sumJ[iz] += JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF] + end + @synchronize + + if Iz < Nz + ID = I + (J - 1) * N + ind = Glob[ID,IF] + cLL = cCol[I,J,iz] + cL = cCol[I,J,iz+1] + cR = cCol[I,J,iz+2] + cRR = cCol[I,J,iz+3] + + @views wCon = Contra3(U[Iz:Iz+1,ind,1],U[Iz:Iz+1,ind,2],U[Iz:Iz+1,ind,3], + U[Iz,ind,4],dXdxI[3,:,:,ID,Iz:Iz+1,IF]) + + Izm1 = max(Iz - 1,1) + Izp2 = min(Iz + 2, Nz) + JLL = JJ[ID,1,Izm1,IF] + JJ[ID,2,Izm1,IF] + JL = JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF] + JR = JJ[ID,1,Iz+1,IF] + JJ[ID,2,Iz+1,IF] + JRR = JJ[ID,1,Izp2,IF] + JJ[ID,2,Izp2,IF] + cFL, cFR = RecU4(cLL,cL,cR,cRR,JLL,JL,JR,JRR) + Flux = eltype(FTr)(0.25) * ((abs(wCon) + wCon) * cFL + (-abs(wCon) + wCon) * cFR) + @atomic :monotonic FTr[Iz,ind] += -Flux / M[Iz,ind] + @atomic :monotonic FTr[Iz+1,ind] += Flux / M[Iz+1,ind] + end + + if Iz <= Nz + ID = I + (J - 1) * N + DivRhoTr[I,J,iz] = D[I,1] * uConCol[1,J,iz] * cCol[1,J,iz+1] + DivRhoTr[I,J,iz] += D[J,1] * vConCol[I,1,iz] * cCol[I,1,iz+1] + DivRho[I,J,iz] = D[I,1] * uConCol[1,J,iz] + DivRho[I,J,iz] += D[J,1] * vConCol[I,1,iz] + for k = 2 : N + DivRhoTr[I,J,iz] += D[I,k] * uConCol[k,J,iz] * cCol[k,J,iz+1] + DivRhoTr[I,J,iz] += D[J,k] * vConCol[I,k,iz] * cCol[I,k,iz+1] + DivRho[I,J,iz] += D[I,k] * uConCol[k,J,iz] + DivRho[I,J,iz] += D[J,k] * vConCol[I,k,iz] + end + ind = Glob[ID,IF] + RhoTrColS[I,J,iz] = Tr[Iz,ind] + dt * DivRhoTr[I,J,iz] / (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) + RhoColS[I,J,iz] = U[Iz,ind,1] + dt * DivRho[I,J,iz] / (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) + # Finite difference step + q[I,J,iz] = medianGPU(qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + + l0, qMaxS[iz]) + @atomic :monotonic resp[iz] += (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] * + (q[I,J,iz] * RhoColS[I,J,iz] - RhoTrColS[I,J,iz]) + end + @synchronize + if Iz <= Nz + ID = I + (J - 1) * N + if abs(resp[iz]) <= eta + if ID == 1 + conv[iz] = false + end + else + qLoc = medianGPU(qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + + (l0 + dlFD), qMaxS[iz]) + @atomic :monotonic resc[iz] += (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] * + (qLoc * RhoColS[I,J,iz] - RhoTrColS[I,J,iz]) + end + end + @synchronize + + if Iz <= Nz && I == 1 && J == 1 && conv[iz] + if abs(resc[iz] - resp[iz]) <= eltype(FTr)(1.e-13) + conv[iz] = false + else + alpha[iz] = dlFD / (resc[iz] - resp[iz]) + lp[iz] = l0 + lc[iz] = lp[iz] - alpha[iz] * resp[iz] + resp[iz] = eltype(FTr)(0) + resc[iz] = eltype(FTr)(0) + end + end + @synchronize + for iTer = 1 : 8 + if Iz <= Nz && conv[iz] + ID = I + (J - 1) * N + q[I,J,iz] = medianGPU(qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + + lc[iz], qMaxS[iz]) + @atomic :monotonic resc[iz] += (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] * + (q[I,J,iz] * RhoColS[I,J,iz] - RhoTrColS[I,J,iz]) + end + @synchronize + if Iz <= Nz && I == 1 && J == 1 && conv[iz] + if abs(resc[iz] - resp[iz]) <= eltype(FTr)(1.e-13) + conv[iz] = false + else + alpha[iz] = (lp[iz] - lc[iz]) / (resp[iz] - resc[iz]) + resp[iz] = resc[iz] + lp[iz] = lc[iz] + lc[iz] = lc[iz] - alpha[iz] * resc[iz] + resc[iz] = eltype(FTr)(0) + end + end + @synchronize + end + if Iz <= Nz + ID = I + (J - 1) * N + ind = Glob[ID,IF] + @show Iz,ind,q[I,J,iz],RhoColS[I,J,iz] + @atomic :monotonic FTr[Iz,ind] += (q[I,J,iz] * RhoColS[I,J,iz] - Tr[Iz,ind]) * + (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) / dt / M[Iz,ind] + end +end diff --git a/BB b/BB new file mode 100644 index 0000000..4b01774 --- /dev/null +++ b/BB @@ -0,0 +1,190 @@ +@kernel inbounds = true function DivRhoTrViscUpwind3LimKernel!(FTr,@Const(Tr),@Const(U),@Const(Cache),@Const(D),@Const(DW),@Const(dXdxI), + @Const(JJ),@Const(M),@Const(Glob),Koeff,dt,@Const(w),@Const(qMin),@Const(qMax),@Const(Stencil)) + + I, J, iz = @index(Local, NTuple) + _,_,Iz,IF = @index(Global, NTuple) + + ColumnTilesDim = @uniform @groupsize()[3] + N = @uniform @groupsize()[1] + Nz = @uniform @ndrange()[3] + NF = @uniform @ndrange()[4] + + @uniform l0 = eltype(FTr)(0) + @uniform eta = eltype(FTr)(1.e-12) + @uniform dlFD = eltype(FTr)(1.e-8) + + cCol = @localmem eltype(FTr) (N,N, ColumnTilesDim) + CacheCol = @localmem eltype(FTr) (N,N, ColumnTilesDim) + RhoCol = @localmem eltype(FTr) (N,N, ColumnTilesDim) + uCol = @localmem eltype(FTr) (N,N, ColumnTilesDim) + vCol = @localmem eltype(FTr) (N,N, ColumnTilesDim) + wCol = @localmem eltype(FTr) (N,N, ColumnTilesDim) + DivRhoTr = @localmem eltype(FTr) (N,N, ColumnTilesDim) + DivRho = @localmem eltype(FTr) (N,N, ColumnTilesDim) + RhoTrColS = @localmem eltype(FTr) (N,N, ColumnTilesDim) + RhoColS = @localmem eltype(FTr) (N,N, ColumnTilesDim) + q = @localmem eltype(FTr) (N,N, ColumnTilesDim) + resp = @localmem eltype(FTr) (ColumnTilesDim) + resc = @localmem eltype(FTr) (ColumnTilesDim) + alpha = @localmem eltype(FTr) (ColumnTilesDim) + lp = @localmem eltype(FTr) (ColumnTilesDim) + lc = @localmem eltype(FTr) (ColumnTilesDim) + sumJ = @localmem eltype(FTr) (ColumnTilesDim) + qMinS = @localmem eltype(FTr) (ColumnTilesDim) + qMaxS = @localmem eltype(FTr) (ColumnTilesDim) + conv = @localmem (Bool) (ColumnTilesDim) + if Iz <= Nz + ID = I + (J - 1) * N + ind = Glob[ID,IF] + CacheCol[I,J,iz] = Cache[Iz,ind] + wCol[I,J,iz] = U[Iz,ind,4] + RhoCol[I,J,iz] = U[Iz,ind,1] + cCol[I,J,iz] = Tr[Iz,ind] / RhoCol[I,J,iz] + uCol[I,J,iz] = U[Iz,ind,2] + vCol[I,J,iz] = U[Iz,ind,3] + DivRho[I,J,iz] = eltype(FTr)(0) + DivRhoTr[I,J,iz] = eltype(FTr)(0) + if ID == 1 + resp[iz] = eltype(FTr)(0) + resc[iz] = eltype(FTr)(0) + sumJ[iz] = eltype(FTr)(0) + conv[iz] = true + qMinS[iz] = minimum(qMin[Iz,Stencil[IF,:]]) + qMaxS[iz] = maximum(qMax[Iz,Stencil[IF,:]]) + end + end + @synchronize + if Iz < Nz + ID = I + (J - 1) * N + ind = Glob[ID,IF] + ind = Glob[ID,IF] + cL = cCol[I,J,iz] + cR = cCol[I,J,iz+1] + if iz > 1 + cLL = cCol[I,J,iz-1] + else + Izm1 = max(Iz - 1,1) + cLL = U[Izm1,ind,5] / U[Izm1,ind,1] + end + if iz < ColumnTilesDim - 1 + cRR = cCol[I,J,iz+2] + else + Izp2 = min(Iz + 2, Nz) + cRR = U[Izp2,ind,5] / U[Izp2,ind,1] + end + + @views wCon = Contra3(U[Iz:Iz+1,ind,1],U[Iz:Iz+1,ind,2],U[Iz:Iz+1,ind,3], + U[Iz,ind,4],dXdxI[3,:,:,ID,Iz:Iz+1,IF]) + + Izm1 = max(Iz - 1,1) + Izp2 = min(Iz + 2, Nz) + JLL = JJ[ID,1,Izm1,IF] + JJ[ID,2,Izm1,IF] + JL = JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF] + JR = JJ[ID,1,Iz+1,IF] + JJ[ID,2,Iz+1,IF] + JRR = JJ[ID,1,Izp2,IF] + JJ[ID,2,Izp2,IF] + cFL, cFR = RecU4(cLL,cL,cR,cRR,JLL,JL,JR,JRR) + Flux = 0.25 * ((abs(wCon) + wCon) * cFL + (-abs(wCon) + wCon) * cFR) + @atomic :monotonic FTr[Iz,ind] += -Flux / M[Iz,ind] + @atomic :monotonic FTr[Iz+1,ind] += Flux / M[Iz+1,ind] + end + + if Iz <= Nz + ID = I + (J - 1) * N + Dxc = 0 + Dyc = 0 + for k = 1 : N + Dxc = Dxc + D[I,k] * CacheCol[k,J,iz] + Dyc = Dyc + D[J,k] * CacheCol[I,k,iz] + end + + @views (GradDx, GradDy) = Grad12(RhoCol[I,J,iz],Dxc,Dyc,dXdxI[1:2,1:2,:,ID,Iz,IF],JJ[ID,:,Iz,IF]) + @views (tempx, tempy) = Contra12(-Koeff,GradDx,GradDy,dXdxI[1:2,1:2,:,ID,Iz,IF]) + for k = 1 : N + @atomic :monotonic DivRhoTr[k,J,iz] += DW[k,I] * tempx + @atomic :monotonic DivRhoTr[I,k,iz] += DW[k,J] * tempy + end + + @views (tempxRho, tempyRho) = Contra12(-RhoCol[I,J,iz],uCol[I,J,iz],vCol[I,J,iz],dXdxI[1:2,1:2,:,ID,Iz,IF]) + for k = 1 : N + @atomic :monotonic DivRho[k,J,iz] += D[k,I] * tempxRho + @atomic :monotonic DivRho[I,k,iz] += D[k,J] * tempyRho + end + tempxTr = tempxRho * cCol[I,J,iz] + tempyTr = tempyRho * cCol[I,J,iz] + for k = 1 : N + @atomic :monotonic DivRhoTr[k,J,iz] += D[k,I] * tempxTr + @atomic :monotonic DivRhoTr[I,k,iz] += D[k,J] * tempyTr + end + @atomic :monotonic sumJ[iz] += JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF] + end + @synchronize + + if Iz <=Nz + ID = I + (J - 1) * N + ind = Glob[ID,IF] + RhoTrColS[I,J,iz] = Tr[Iz,ind] + dt * DivRhoTr[I,J,iz] / (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) + RhoColS[I,J,iz] = U[Iz,ind,1] + dt * DivRho[I,J,iz] / (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) + # Finite difference step + q[I,J,iz] = medianGPU(qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + + l0, qMaxS[iz]) + @atomic :monotonic resp[iz] += (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] * + (q[I,J,iz] * RhoColS[I,J,iz] - RhoTrColS[I,J,iz]) + end + @synchronize + if Iz <= Nz + ID = I + (J - 1) * N + if abs(resp[iz]) <= eta + if ID == 1 + conv[iz] = false + end + else + qLoc = medianGPU(qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + + (l0 + dlFD), qMaxS[iz]) + @atomic :monotonic resc[iz] += (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] * + (qLoc * RhoColS[I,J,iz] - RhoTrColS[I,J,iz]) + end + end + @synchronize + + if Iz <= Nz && I == 1 && J == 1 && conv[iz] + if abs(resc[iz] - resp[iz]) <= eltype(FTr)(1.e-13) + conv[iz] = false + else + alpha[iz] = dlFD / (resc[iz] - resp[iz]) + lp[iz] = l0 + lc[iz] = lp[iz] - alpha[iz] * resp[iz] + resp[iz] = eltype(FTr)(0) + resc[iz] = eltype(FTr)(0) + end + end + @synchronize + for iTer = 1 : 5 + if Iz <= Nz && conv[iz] + ID = I + (J - 1) * N + q[I,J,iz] = medianGPU(qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + + lc[iz], qMaxS[iz]) + @atomic :monotonic resc[iz] += (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] * + (q[I,J,iz] * RhoColS[I,J,iz] - RhoTrColS[I,J,iz]) + end + @synchronize + if Iz <= Nz && I == 1 && J == 1 && conv[iz] + if abs(resc[iz] - resp[iz]) <= eltype(FTr)(1.e-13) + conv[iz] = false + else + alpha[iz] = (lp[iz] - lc[iz]) / (resp[iz] - resc[iz]) + resp[iz] = resc[iz] + lp[iz] = lc[iz] + lc[iz] = lc[iz] - alpha[iz] * resc[iz] + resc[iz] = eltype(FTr)(0) + end + end + @synchronize + end + if Iz <= Nz + ID = I + (J - 1) * N + ind = Glob[ID,IF] + @show Iz,ind,q[I,J,iz],RhoColS[I,J,iz] + @atomic :monotonic FTr[Iz,ind] += (q[I,J,iz] * RhoColS[I,J,iz] - Tr[Iz,ind]) * + (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) / dt / M[Iz,ind] + end +end diff --git a/Examples/testNHSphere.jl b/Examples/testNHSphere.jl index 2cc6c1e..9d786f1 100644 --- a/Examples/testNHSphere.jl +++ b/Examples/testNHSphere.jl @@ -414,7 +414,7 @@ if ModelType == "VectorInvariant" || ModelType == "Advection" "v", "wB", "Th", - "Vort", +# "Vort", "Tr1", "Tr2", ] diff --git a/Jobs/NHSphere/JobNHHeldSuarezMoistMOSTSphere b/Jobs/NHSphere/JobNHHeldSuarezMoistMOSTSphere index 13ef3d3..d92fcad 100755 --- a/Jobs/NHSphere/JobNHHeldSuarezMoistMOSTSphere +++ b/Jobs/NHSphere/JobNHHeldSuarezMoistMOSTSphere @@ -1,4 +1,4 @@ -mpirun -n 1 julia --project Examples/testNHSphere.jl \ +mpirun -n 6 julia --project Examples/testNHSphere.jl \ --Problem="HeldSuarezMoistSphere" \ --Device="CPU" \ --GPUType="Metal" \ @@ -23,7 +23,7 @@ mpirun -n 1 julia --project Examples/testNHSphere.jl \ --SurfaceScheme="MOST" \ --Coriolis=true \ --Upwind=true \ - --HorLimit=false \ + --HorLimit=true \ --Equation="CompressibleShallow" \ --State="Moist" \ --Microphysics=true \ diff --git a/src/GPU/FcnGPU.jl b/src/GPU/FcnGPU.jl index 62374b3..15e0549 100644 --- a/src/GPU/FcnGPU.jl +++ b/src/GPU/FcnGPU.jl @@ -12,8 +12,6 @@ function FcnAdvectionGPU!(F,U,time,FE,Metric,Phys,Cache,Exchange,Global,Param,Pr dXdxI = Metric.dXdxI X = Metric.X J = Metric.J - JC = Metric.JC - JCW = Metric.JCW N = FE.OrdPoly+1 ww = FE.w Nz = size(F,1) @@ -143,15 +141,19 @@ function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,Equation::Models backend = get_backend(F) FT = eltype(F) + dtau = Global.TimeStepper.dtauStage Glob = FE.Glob DS = FE.DS DW = FE.DW M = FE.M + Stencil = FE.Stencil dXdxI = Metric.dXdxI nS = Metric.nS nSS = Metric.nSS X = Metric.X J = Metric.J + N = FE.OrdPoly+1 + ww = FE.w NF = Global.Grid.NumFaces NBF = Global.Grid.NumBoundaryFaces @views dXdxI_B = dXdxI[:,:,:,:,:,1:NBF] @@ -165,6 +167,7 @@ function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,Equation::Models @views X_I = X[:,:,:,:,NBF+1:NF] @views nS_I = nS[:,:,NBF+1:NF] @views Glob_I = Glob[:,NBF+1:NF] + @views Stencil_I = Stencil[NBF+1:NF,:] xS = Metric.xS dz = Metric.dz zP = Metric.zP @@ -184,6 +187,7 @@ function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,Equation::Models MicrophysicsSource = Global.Model.MicrophysicsSource CoriolisFun = Global.Model.CoriolisFun GravitationFun = Global.Model.GravitationFun + HorLimit = Global.Model.HorLimit KoeffCurl = Global.Model.HyperDCurl KoeffGrad = Global.Model.HyperDGrad @@ -253,7 +257,7 @@ function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,Equation::Models LenTemp1 += 1 @views CacheTke = Temp1[:,:,LenTemp1] end - if NumTr > 0 + if ~HorLimit && NumTr > 0 @views CacheTr = Temp1[:,:,LenTemp1+1:LenTemp1+NumTr] LenTemp1 += NumTr end @@ -286,6 +290,9 @@ function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,Equation::Models groupw = (N, N, NzG, 1) ndrangewB = (Nz-1, NBF) ndrangewI = (Nz-1, NF-NBF) + NFG = min(div(NumberThreadGPU,Nz),NF) + groupL = (Nz, NFG, 1) + ndrangeL = (Nz, NF, NumTr) KRhoGradKinKernel! = RhoGradKinKernel!(backend,group) KGradKernel! = GradKernel!(backend,group) @@ -296,6 +303,16 @@ function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,Equation::Models KHyperViscTracerKernel! = HyperViscTracerKernel!(backend, groupTr) KHyperViscTracerKoeffKernel! = HyperViscTracerKoeffKernel!(backend, groupTr) KDivRhoTrUpwind3Kernel! = DivRhoTrUpwind3Kernel!(backend, groupTr) + KDivRhoTrUpwind3LimKernel! = DivRhoTrUpwind3LimKernel!(backend, groupTr) + KLimitKernel! = LimitKernel!(backend, groupL) + + if HorLimit + @views qMin = Cache.qMin[:,:,1:NumTr] + @views qMax = Cache.qMax[:,:,1:NumTr] + @views KLimitKernel!(DoF,qMin,qMax,UTr,Rho,Glob,ndrange=ndrangeL) + KernelAbstractions.synchronize(backend) + Parallels.ExchangeDataFSendGPU(qMin,qMax,Exchange) + end #### @@ -304,9 +321,11 @@ function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,Equation::Models Temp1 .= FT(0) KHyperViscKernel!(CacheF,U,DS,DW,dXdxI,J,M,Glob,ndrange=ndrangeB) KernelAbstractions.synchronize(backend) - for iT = 1 : NumTr - @views KHyperViscTracerKernel!(CacheTr[:,:,iT],UTr[:,:,iT],Rho,DS,DW,dXdxI,J,M,Glob,ndrange=ndrangeB) - KernelAbstractions.synchronize(backend) + if ~HorLimit + for iT = 1 : NumTr + @views KHyperViscTracerKernel!(CacheTr[:,:,iT],UTr[:,:,iT],Rho,DS,DW,dXdxI,J,M,Glob,ndrange=ndrangeB) + KernelAbstractions.synchronize(backend) + end end if TkePos > 0 @views KHyperViscTracerKernel!(CacheTke,Tke,Rho,DS,DW,dXdxI,J,M,Glob,ndrange=ndrangeB) @@ -334,13 +353,18 @@ function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,Equation::Models end end + if HorLimit + Parallels.ExchangeDataFRecvGPU!(qMin,qMax,Exchange) + end @views Parallels.ExchangeData3DSendGPU(Temp1[:,:,1:LenTemp1],Exchange) KHyperViscKernel!(CacheF,U,DS,DW,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI) KernelAbstractions.synchronize(backend) - for iT = 1 : NumTr - @views KHyperViscTracerKernel!(CacheTr[:,:,iT],UTr[:,:,iT],Rho,DS,DW,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI) - KernelAbstractions.synchronize(backend) + if ~HorLimit + for iT = 1 : NumTr + @views KHyperViscTracerKernel!(CacheTr[:,:,iT],UTr[:,:,iT],Rho,DS,DW,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI) + KernelAbstractions.synchronize(backend) + end end if TkePos > 0 @views KHyperViscTracerKernel!(CacheTke,Tke,Rho,DS,DW,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI) @@ -372,10 +396,20 @@ function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,Equation::Models F .= FT(0) KHyperViscKoeffKernel!(F,U,CacheF,DS,DW,dXdxI,J,M,Glob,KoeffCurl,KoeffGrad,KoeffDiv,ndrange=ndrangeB) KernelAbstractions.synchronize(backend) - for iT = 1 : NumTr - @views KHyperViscTracerKoeffKernel!(FTr[:,:,iT],CacheTr[:,:,iT],Rho,DS,DW,dXdxI,J,M,Glob, - KoeffDiv,ndrange=ndrangeB) - KernelAbstractions.synchronize(backend) + if ~HorLimit + for iT = 1 : NumTr + @views KHyperViscTracerKoeffKernel!(FTr[:,:,iT],CacheTr[:,:,iT],Rho,DS,DW,dXdxI,J,M,Glob, + KoeffDiv,ndrange=ndrangeB) + KernelAbstractions.synchronize(backend) + @views KDivRhoTrUpwind3Kernel!(FTr[:,:,iT],UTr[:,:,iT],U,DS, + dXdxI,J,M,Glob,ndrange=ndrangeB) + KernelAbstractions.synchronize(backend) + end + else + for iT = 1 : NumTr + @views KDivRhoTrUpwind3LimKernel!(FTr[:,:,iT],UTr[:,:,iT],U,DS, + dXdxI,J,M,Glob,dtau,ww,qMin[:,:,iT],qMax[:,:,iT],Stencil,ndrange=ndrangeB) + end end if TkePos > 0 @views KHyperViscTracerKoeffKernel!(FTke,CacheTke,Rho,DS,DW,dXdxI,J,M,Glob, @@ -407,11 +441,6 @@ function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,Equation::Models KernelAbstractions.synchronize(backend) KDivRhoThUpwind3Kernel!(F,U,DS,dXdxI,J,M,Glob,ndrange=ndrangeB) KernelAbstractions.synchronize(backend) - for iT = 1 : NumTr - @views KDivRhoTrUpwind3Kernel!(FTr[:,:,iT],UTr[:,:,iT],U,DS, - dXdxI,J,M,Glob,ndrange=ndrangeB) - KernelAbstractions.synchronize(backend) - end if TkePos > 0 @views KDivRhoTrUpwind3Kernel!(FTke,Tke,U,DS, dXdxI,J,M,Glob,ndrange=ndrangeB) KernelAbstractions.synchronize(backend) @@ -436,10 +465,23 @@ function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,Equation::Models KHyperViscKoeffKernel!(F,U,CacheF,DS,DW,dXdxI_I,J_I,M,Glob_I,KoeffCurl,KoeffGrad,KoeffDiv,ndrange=ndrangeI) KernelAbstractions.synchronize(backend) - for iT = 1 : NumTr - @views KHyperViscTracerKoeffKernel!(FTr[:,:,iT],CacheTr[:,:,iT],Rho,DS,DW,dXdxI_I,J_I,M,Glob_I, - KoeffDiv,ndrange=ndrangeI) - KernelAbstractions.synchronize(backend) + if ~HorLimit + for iT = 1 : NumTr + @views KHyperViscTracerKoeffKernel!(FTr[:,:,iT],CacheTr[:,:,iT],Rho,DS,DW,dXdxI_I,J_I,M,Glob_I, + KoeffDiv,ndrange=ndrangeI) + KernelAbstractions.synchronize(backend) + end + for iT = 1 : NumTr + @views KDivRhoTrUpwind3Kernel!(FTr[:,:,iT],UTr[:,:,iT],U,DS, + dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI) + KernelAbstractions.synchronize(backend) + end + else + for iT = 1 : NumTr + @views KDivRhoTrUpwind3LimKernel!(FTr[:,:,iT],UTr[:,:,iT],U,DS, + dXdxI_I,J_I,M,Glob_I,dtau,ww,qMin[:,:,iT],qMax[:,:,iT],Stencil_I,ndrange=ndrangeI) + KernelAbstractions.synchronize(backend) + end end if TkePos > 0 @views KHyperViscTracerKoeffKernel!(FTke,CacheTke,Rho,DS,DW,dXdxI_I,J_I,M,Glob_I, @@ -472,11 +514,6 @@ function FcnGPU!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,Equation::Models KDivRhoThUpwind3Kernel!(F,U,DS,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI) KernelAbstractions.synchronize(backend) - for iT = 1 : NumTr - @views KDivRhoTrUpwind3Kernel!(FTr[:,:,iT],UTr[:,:,iT],U,DS, - dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI) - KernelAbstractions.synchronize(backend) - end if TkePos > 0 KDivRhoTrUpwind3Kernel!(FTke,Tke,U,DS,dXdxI_I,J_I,M,Glob_I,ndrange=ndrangeI) KernelAbstractions.synchronize(backend) diff --git a/src/GPU/HorLimiterKernel.jl b/src/GPU/HorLimiterKernel.jl index b443f98..6716210 100644 --- a/src/GPU/HorLimiterKernel.jl +++ b/src/GPU/HorLimiterKernel.jl @@ -380,6 +380,7 @@ end if Iz <= Nz ID = I + (J - 1) * N ind = Glob[ID,IF] + @show Iz,ind,q[I,J,iz],RhoColS[I,J,iz] @atomic :monotonic FTr[Iz,ind] += (q[I,J,iz] * RhoColS[I,J,iz] - Tr[Iz,ind]) * (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) / dt / M[Iz,ind] end diff --git a/src/Integration/RosenbrockSchur.jl b/src/Integration/RosenbrockSchur.jl index facc0ca..19d35cd 100644 --- a/src/Integration/RosenbrockSchur.jl +++ b/src/Integration/RosenbrockSchur.jl @@ -6,6 +6,7 @@ function RosenbrockSchur!(V,dt,Fcn!,FcnPrepare!,Jac,CG,Metric,Phys,Cache,JCache, k = Cache.k fV = Cache.fV Vn = Cache.Vn + Global.TimeStepper.dtauStage = dt # Oswald JCache.CompTri = true @. Vn = V