From 78d19f69851d9a311f780716c5fc581bc0426b8f Mon Sep 17 00:00:00 2001
From: OsKnoth <50015520+OsKnoth@users.noreply.github.com>
Date: Mon, 20 Nov 2023 20:50:44 +0100
Subject: [PATCH] @Unroll

---
 Examples/testNHSphere.jl        |    3 +-
 src/GPU/GPU.jl                  |    1 +
 src/GPU/OperatorKernel.jl       |    8 +-
 src/GPU/OperatorUnrollKernel.jl | 1424 +++++++++++++++++++++++++++++++
 4 files changed, 1431 insertions(+), 5 deletions(-)
 create mode 100644 src/GPU/OperatorUnrollKernel.jl

diff --git a/Examples/testNHSphere.jl b/Examples/testNHSphere.jl
index c2fa1ef..9ee86a8 100644
--- a/Examples/testNHSphere.jl
+++ b/Examples/testNHSphere.jl
@@ -70,6 +70,7 @@ Stretch = parsed_args["Stretch"]
 StretchType = parsed_args["StretchType"]
 TopoS = parsed_args["TopoS"]
 GridType = parsed_args["GridType"]
+RadEarth = parsed_args["RadEarth"]
 # CG Element
 OrdPoly = parsed_args["OrdPoly"]
 # Viscosity
@@ -206,7 +207,7 @@ if RadEarth == 0.0
   RadEarth = Phys.RadEarth
 end
 
-Topography = (TopoS=TopoS,H=H,Rad=Phys.RadEarth)
+Topography = (TopoS=TopoS,H=H,Rad=RadEarth)
 
 @show "InitSphere"
 (CG, Metric, Exchange, Global) = DyCore.InitSphere(backend,FTB,OrdPoly,OrdPolyZ,nz,nPanel,H,
diff --git a/src/GPU/GPU.jl b/src/GPU/GPU.jl
index 7dec706..1d81173 100644
--- a/src/GPU/GPU.jl
+++ b/src/GPU/GPU.jl
@@ -6,6 +6,7 @@ import ..Grids
 using StaticArrays
 using KernelAbstractions
 using KernelAbstractions: @atomic, @atomicswap, @atomicreplace
+using KernelAbstractions.Extras
 
 include("OperatorKernel.jl")
 include("FcnGPU.jl")
diff --git a/src/GPU/OperatorKernel.jl b/src/GPU/OperatorKernel.jl
index f0b29e3..abb8a86 100644
--- a/src/GPU/OperatorKernel.jl
+++ b/src/GPU/OperatorKernel.jl
@@ -1012,7 +1012,6 @@ end
 @kernel function DivRhoKernel!(F,@Const(U),@Const(D),@Const(dXdxI),
   @Const(JJ),@Const(M),@Const(Glob))
 
-# gi, gj, gz, gF = @index(Group, NTuple)
   I, J, iz   = @index(Local, NTuple)
   _,_,Iz,IF = @index(Global, NTuple)
 
@@ -1046,9 +1045,10 @@ end
 
   if Iz <= Nz
     ID = I + (J - 1) * N  
-    @inbounds DivRho = D[I,1] * uConCol[1,J,iz] 
-    @inbounds DivRho += D[J,1] * vConCol[I,1,iz] 
-    for k = 2 : N
+#   @inbounds DivRho = D[I,1] * uConCol[1,J,iz] 
+#   @inbounds DivRho += D[J,1] * vConCol[I,1,iz] 
+    DivRho = eltype(F)(0)
+    @unroll for k = 1 : N
       @inbounds DivRho += D[I,k] * uConCol[k,J,iz] 
       @inbounds DivRho += D[J,k] * vConCol[I,k,iz] 
     end
diff --git a/src/GPU/OperatorUnrollKernel.jl b/src/GPU/OperatorUnrollKernel.jl
new file mode 100644
index 0000000..0ad43c3
--- /dev/null
+++ b/src/GPU/OperatorUnrollKernel.jl
@@ -0,0 +1,1424 @@
+@kernel function MomentumCoriolisKernel!(F,@Const(U),@Const(D),@Const(dXdxI),
+  @Const(JJ),@Const(X),@Const(MRho),@Const(M),@Const(Glob),Phys)
+# Momentum kernel with a Coriolis term: atomically accumulates into F[:,:,2], F[:,:,3] and F[:,:,4].
+  I, J, iz   = @index(Local, NTuple)
+  _,_,Iz,IF = @index(Global, NTuple)
+
+  ColumnTilesDim = @uniform @groupsize()[3]
+  N = @uniform @groupsize()[1]
+  Nz = @uniform @ndrange()[3]
+  NF = @uniform @ndrange()[4]
+  # Local-memory tiles; wCol has one extra level: slot 1 is the w below the tile, slot iz+1 the thread's own w.
+  RhoCol = @localmem eltype(F) (N,N,ColumnTilesDim)
+  uCol = @localmem eltype(F) (N,N,ColumnTilesDim)
+  vCol = @localmem eltype(F) (N,N,ColumnTilesDim)
+  wCol = @localmem eltype(F) (N,N,ColumnTilesDim+1)
+
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds RhoCol[I,J,iz] = U[Iz,ind,1]
+    @inbounds uCol[I,J,iz] = U[Iz,ind,2]
+    @inbounds vCol[I,J,iz] = U[Iz,ind,3]
+    @inbounds wCol[I,J,iz+1] = U[Iz,ind,4]
+    if Iz == 1
+      wCol[I,J,1] = -(dXdxI[3,1,1,ID,1,IF] * U[Iz,ind,2] + 
+        dXdxI[3,2,1,ID,1,IF] * U[Iz,ind,3]) / dXdxI[3,3,1,ID,1,IF]
+    elseif iz == 1
+      wCol[I,J,1] = U[Iz-1,ind,4] 
+    end    
+  end  
+
+  @synchronize
+  # uCon/vCon/wCon: metric-weighted (u,v,w) scaled by -Rho; suffix 1 pairs with wCol[iz], suffix 2 with wCol[iz+1].
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds uCon1 = -RhoCol[I,J,iz] * (dXdxI[1,1,1,ID,Iz,IF] * uCol[I,J,iz] +
+      dXdxI[1,2,1,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[1,3,1,ID,Iz,IF] * wCol[I,J,iz])
+    @inbounds uCon2 = -RhoCol[I,J,iz] * (dXdxI[1,1,2,ID,Iz,IF] * uCol[I,J,iz] +
+      dXdxI[1,2,2,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[1,3,2,ID,Iz,IF] * wCol[I,J,iz+1])
+    @inbounds vCon1 = -RhoCol[I,J,iz] * (dXdxI[2,1,1,ID,Iz,IF] * uCol[I,J,iz] +
+      dXdxI[2,2,1,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[2,3,1,ID,Iz,IF] * wCol[I,J,iz])
+    @inbounds vCon2 = -RhoCol[I,J,iz] * (dXdxI[2,1,2,ID,Iz,IF] * uCol[I,J,iz] +
+      dXdxI[2,2,2,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[2,3,2,ID,Iz,IF] * wCol[I,J,iz+1])
+    @inbounds wCon1 = -RhoCol[I,J,iz] * (dXdxI[3,1,1,ID,Iz,IF] * uCol[I,J,iz] +
+      dXdxI[3,2,1,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[3,3,1,ID,Iz,IF] * wCol[I,J,iz])
+    @inbounds wCon2 = -RhoCol[I,J,iz] * (dXdxI[3,1,2,ID,Iz,IF] * uCol[I,J,iz] +
+      dXdxI[3,2,2,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[3,3,2,ID,Iz,IF] * wCol[I,J,iz+1])
+    # Horizontal derivatives: rows I and J of D applied along the tile; the k = 1 term seeds each sum.
+    @inbounds Dxu = D[I,1] * uCol[1,J,iz]
+    @inbounds Dyu = D[J,1] * uCol[I,1,iz]
+    @inbounds Dxv = D[I,1] * vCol[1,J,iz]
+    @inbounds Dyv = D[J,1] * vCol[I,1,iz]
+    @inbounds Dxw1 = D[I,1] * wCol[1,J,iz]
+    @inbounds Dyw1 = D[J,1] * wCol[I,1,iz]
+    @inbounds Dxw2 = D[I,1] * wCol[1,J,iz+1]
+    @inbounds Dyw2 = D[J,1] * wCol[I,1,iz+1]
+    Izp = min(Iz+1,Nz)
+    Izm = max(Iz-1,1)
+    ind = Glob[ID,IF]
+    Dzu2 = eltype(F)(0.5) * (U[Izp,ind,2] - uCol[I,J,iz])
+    Dzv2 = eltype(F)(0.5) * (U[Izp,ind,3] - vCol[I,J,iz])
+    Dzu1 = eltype(F)(0.5) * (uCol[I,J,iz] - U[Izm,ind,2])
+    Dzv1 = eltype(F)(0.5) * (vCol[I,J,iz] - U[Izm,ind,3])
+    Dzw = eltype(F)(0.5) * (wCol[I,J,iz+1] - wCol[I,J,iz]) 
+    @unroll for k = 2 : N
+      @inbounds Dxu += D[I,k] * uCol[k,J,iz]
+      @inbounds Dyu += D[J,k] * uCol[I,k,iz]
+      @inbounds Dxv += D[I,k] * vCol[k,J,iz]
+      @inbounds Dyv += D[J,k] * vCol[I,k,iz]
+      @inbounds Dxw1 += D[I,k] * wCol[k,J,iz]
+      @inbounds Dyw1 += D[J,k] * wCol[I,k,iz]
+      @inbounds Dxw2 += D[I,k] * wCol[k,J,iz+1]
+      @inbounds Dyw2 += D[J,k] * wCol[I,k,iz+1]
+    end  
+    x = eltype(F)(0.5) * (X[ID,1,1,Iz,IF] + X[ID,2,1,Iz,IF])
+    y = eltype(F)(0.5) * (X[ID,1,2,Iz,IF] + X[ID,2,2,Iz,IF])
+    z = eltype(F)(0.5) * (X[ID,1,3,Iz,IF] + X[ID,2,3,Iz,IF])
+    r = sqrt(x^2 + y^2 + z^2)
+    sinlat = z / r
+    W = -eltype(F)(2) * Phys.Omega * sinlat * (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF])
+    FuCoriolis = -RhoCol[I,J,iz] * vCol[I,J,iz] * W
+    FvCoriolis = RhoCol[I,J,iz] * uCol[I,J,iz] * W
+    # Atomic accumulation: several elements can map to the same global node ind through Glob.
+    @inbounds @atomic F[Iz,ind,2] += ((uCon1 + uCon2) * Dxu + (vCon1 + vCon2) * Dyu + 
+    wCon1 * Dzu1 + wCon2 * Dzu2 + FuCoriolis) / M[Iz,ind] / RhoCol[I,J,iz]
+    @inbounds @atomic F[Iz,ind,3] += ((uCon1 + uCon2) * Dxv + (vCon1 + vCon2) * Dyv +
+    wCon1 * Dzv1 + wCon2 * Dzv2 + FvCoriolis) / M[Iz,ind] / RhoCol[I,J,iz]
+  end  
+  if Iz > 1
+    @inbounds @atomic F[Iz-1,ind,4] += (uCon1 * Dxw1 + vCon1 * Dyw1 + wCon1 * Dzw) / MRho[Iz-1,ind] 
+  end  
+  if Iz < Nz
+    @inbounds @atomic F[Iz,ind,4] += (uCon2 * Dxw2 + vCon2 * Dyw2 + wCon2 * Dzw) / MRho[Iz,ind]
+  end  
+end  
+
+@kernel function MomentumKernel!(F,@Const(U),@Const(D),@Const(dXdxI),
+  @Const(MRho),@Const(M),@Const(Glob),Phys)
+
+# Same momentum terms as MomentumCoriolisKernel! without the Coriolis force; Phys is not read in this body.
+  I, J, iz   = @index(Local, NTuple)
+  _,_,Iz,IF = @index(Global, NTuple)
+
+  ColumnTilesDim = @uniform @groupsize()[3]
+  N = @uniform @groupsize()[1]
+  Nz = @uniform @ndrange()[3]
+  NF = @uniform @ndrange()[4]
+  # wCol needs ColumnTilesDim+1 levels: slot 1 is the w below the tile, slot iz+1 the thread's own w.
+  RhoCol = @localmem eltype(F) (N,N,ColumnTilesDim)
+  uCol = @localmem eltype(F) (N,N,ColumnTilesDim)
+  vCol = @localmem eltype(F) (N,N,ColumnTilesDim)
+  wCol = @localmem eltype(F) (N,N,ColumnTilesDim+1)
+
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds RhoCol[I,J,iz] = U[Iz,ind,1]
+    @inbounds uCol[I,J,iz] = U[Iz,ind,2]
+    @inbounds vCol[I,J,iz] = U[Iz,ind,3]
+    @inbounds wCol[I,J,iz+1] = U[Iz,ind,4]
+    if Iz == 1
+      wCol[I,J,1] = -(dXdxI[3,1,1,ID,1,IF] * U[Iz,ind,2] + 
+        dXdxI[3,2,1,ID,1,IF] * U[Iz,ind,3]) / dXdxI[3,3,1,ID,1,IF]
+    elseif iz == 1
+      wCol[I,J,1] = U[Iz-1,ind,4] 
+    end    
+  end  
+
+  @synchronize
+  # uCon/vCon/wCon: metric-weighted (u,v,w) scaled by -Rho; suffix 1 pairs with wCol[iz], suffix 2 with wCol[iz+1].
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds uCon1 = -RhoCol[I,J,iz] * (dXdxI[1,1,1,ID,Iz,IF] * uCol[I,J,iz] +
+      dXdxI[1,2,1,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[1,3,1,ID,Iz,IF] * wCol[I,J,iz])
+    @inbounds uCon2 = -RhoCol[I,J,iz] * (dXdxI[1,1,2,ID,Iz,IF] * uCol[I,J,iz] +
+      dXdxI[1,2,2,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[1,3,2,ID,Iz,IF] * wCol[I,J,iz+1])
+    @inbounds vCon1 = -RhoCol[I,J,iz] * (dXdxI[2,1,1,ID,Iz,IF] * uCol[I,J,iz] +
+      dXdxI[2,2,1,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[2,3,1,ID,Iz,IF] * wCol[I,J,iz])
+    @inbounds vCon2 = -RhoCol[I,J,iz] * (dXdxI[2,1,2,ID,Iz,IF] * uCol[I,J,iz] +
+      dXdxI[2,2,2,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[2,3,2,ID,Iz,IF] * wCol[I,J,iz+1])
+    @inbounds wCon1 = -RhoCol[I,J,iz] * (dXdxI[3,1,1,ID,Iz,IF] * uCol[I,J,iz] +
+      dXdxI[3,2,1,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[3,3,1,ID,Iz,IF] * wCol[I,J,iz])
+    @inbounds wCon2 = -RhoCol[I,J,iz] * (dXdxI[3,1,2,ID,Iz,IF] * uCol[I,J,iz] +
+      dXdxI[3,2,2,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[3,3,2,ID,Iz,IF] * wCol[I,J,iz+1])
+    # Horizontal derivatives: rows I and J of D applied along the tile; the k = 1 term seeds each sum.
+    @inbounds Dxu = D[I,1] * uCol[1,J,iz]
+    @inbounds Dyu = D[J,1] * uCol[I,1,iz]
+    @inbounds Dxv = D[I,1] * vCol[1,J,iz]
+    @inbounds Dyv = D[J,1] * vCol[I,1,iz]
+    @inbounds Dxw1 = D[I,1] * wCol[1,J,iz]
+    @inbounds Dyw1 = D[J,1] * wCol[I,1,iz]
+    @inbounds Dxw2 = D[I,1] * wCol[1,J,iz+1]
+    @inbounds Dyw2 = D[J,1] * wCol[I,1,iz+1]
+    Izp = min(Iz+1,Nz)
+    Izm = max(Iz-1,1)
+    ind = Glob[ID,IF]
+    Dzu2 = eltype(F)(0.5) * (U[Izp,ind,2] - uCol[I,J,iz])
+    Dzv2 = eltype(F)(0.5) * (U[Izp,ind,3] - vCol[I,J,iz])
+    Dzu1 = eltype(F)(0.5) * (uCol[I,J,iz] - U[Izm,ind,2])
+    Dzv1 = eltype(F)(0.5) * (vCol[I,J,iz] - U[Izm,ind,3])
+    Dzw = eltype(F)(0.5) * (wCol[I,J,iz+1] - wCol[I,J,iz]) 
+    @unroll for k = 2 : N
+      @inbounds Dxu += D[I,k] * uCol[k,J,iz]
+      @inbounds Dyu += D[J,k] * uCol[I,k,iz]
+      @inbounds Dxv += D[I,k] * vCol[k,J,iz]
+      @inbounds Dyv += D[J,k] * vCol[I,k,iz]
+      @inbounds Dxw1 += D[I,k] * wCol[k,J,iz]
+      @inbounds Dyw1 += D[J,k] * wCol[I,k,iz]
+      @inbounds Dxw2 += D[I,k] * wCol[k,J,iz+1]
+      @inbounds Dyw2 += D[J,k] * wCol[I,k,iz+1]
+    end  
+    # Atomic accumulation: several elements can map to the same global node ind through Glob.
+    @inbounds @atomic F[Iz,ind,2] += ((uCon1 + uCon2) * Dxu + (vCon1 + vCon2) * Dyu + 
+    wCon1 * Dzu1 + wCon2 * Dzu2) / M[Iz,ind] / RhoCol[I,J,iz]
+    @inbounds @atomic F[Iz,ind,3] += ((uCon1 + uCon2) * Dxv + (vCon1 + vCon2) * Dyv +
+    wCon1 * Dzv1 + wCon2 * Dzv2) / M[Iz,ind] / RhoCol[I,J,iz]
+  end  
+  if Iz > 1
+    @inbounds @atomic F[Iz-1,ind,4] += (uCon1 * Dxw1 + vCon1 * Dyw1 + wCon1 * Dzw) / MRho[Iz-1,ind] 
+  end  
+  if Iz < Nz
+    @inbounds @atomic F[Iz,ind,4] += (uCon2 * Dxw2 + vCon2 * Dyw2 + wCon2 * Dzw) / MRho[Iz,ind]
+  end  
+end  
+
+@kernel function GradKernel!(F,@Const(U),@Const(p),@Const(D),@Const(dXdxI),
+  @Const(JJ),@Const(M),@Const(MRho),@Const(Glob),Phys,::Val{BANK}=Val(1)) where BANK
+
+# Pressure-gradient and gravity terms: atomically adds to F[:,:,2], F[:,:,3] and F[:,:,4]; BANK is unused in the body.
+  I, J, iz   = @index(Local, NTuple)
+  _,_,Iz,IF = @index(Global, NTuple)
+
+  ColumnTilesDim = @uniform @groupsize()[3]
+  N = @uniform @groupsize()[1]
+  Nz = @uniform @ndrange()[3]
+  NF = @uniform @ndrange()[4]
+  # Pres holds p for the tile plus one extra slot, filled with p[Iz+1] by the thread at iz == ColumnTilesDim.
+  Pres = @localmem eltype(F) (N,N,ColumnTilesDim+1)
+
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds Pres[I,J,iz] = p[Iz,ind]
+  end
+  if iz == ColumnTilesDim && Iz < Nz
+    @inbounds Pres[I,J,iz+1] = p[Iz+1,ind]
+  end  
+
+  @synchronize
+  # Horizontal derivatives of Pres via D, then passed to the helpers Grad12/Grad3 (defined outside this view).
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds DXPres = D[I,1] * Pres[1,J,iz]
+    @inbounds DYPres = D[J,1] * Pres[I,1,iz]
+    @unroll for k = 2 : N
+      @inbounds DXPres += D[I,k] * Pres[k,J,iz]
+      @inbounds DYPres += D[J,k] * Pres[I,k,iz]
+    end
+    @views @inbounds Gradu, Gradv = Grad12(DXPres,DYPres,dXdxI[1:2,1:2,:,ID,Iz,IF]) 
+    @views @inbounds Gradw1, Gradw2 = Grad3(DXPres,DYPres,dXdxI[1:3,1:3,:,ID,Iz,IF]) 
+    # Phys.Grav term weighted by U[Iz,ind,1] and the JJ / dXdxI[3,3,..] sums, added to the horizontal parts.
+    @inbounds ind = Glob[ID,IF]
+    @inbounds GradZ = -Phys.Grav * U[Iz,ind,1] *
+        (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) / (dXdxI[3,3,1,ID,Iz,IF] + dXdxI[3,3,2,ID,Iz,IF])
+    @inbounds Gradu += GradZ * (dXdxI[3,1,1,ID,Iz,IF] + dXdxI[3,1,2,ID,Iz,IF])
+    @inbounds Gradv += GradZ * (dXdxI[3,2,1,ID,Iz,IF] + dXdxI[3,2,2,ID,Iz,IF])
+    @inbounds @atomic F[Iz,ind,2] += -Gradu / M[Iz,ind] / U[Iz,ind,1]
+    @inbounds @atomic F[Iz,ind,3] += -Gradv / M[Iz,ind] / U[Iz,ind,1]
+    if Iz > 1
+      @inbounds @atomic F[Iz-1,ind,4] += -Gradw1 / MRho[Iz-1,ind]
+    end  
+  end  
+  # Term between levels Iz and Iz+1; reuses ID, ind and Gradw2 assigned in the block above.
+  if Iz < Nz
+    @inbounds GradZ = eltype(F)(0.5) * (Pres[I,J,iz+1] - Pres[I,J,iz])  
+    @inbounds Gradw =  GradZ* (dXdxI[3,3,2,ID,Iz,IF] + dXdxI[3,3,1,ID,Iz+1,IF])
+    @inbounds @atomic F[Iz,ind,4] += -(Gradw + Gradw2 +
+      Phys.Grav * (U[Iz,ind,1] * JJ[ID,2,Iz,IF] + U[Iz+1,ind,1] * JJ[ID,1,Iz+1,IF])) /
+      MRho[Iz,ind]
+  end      
+   
+end
+
+@kernel function RhoGradKinKernel!(F,@Const(U),@Const(D),@Const(dXdxI),
+  @Const(JJ),@Const(M),@Const(Glob))
+
+# Builds the tile KinF = (u^2 + v^2 + w^2)/2 per half level, applies D to it and adds the Rho-weighted result to F.
+  I, J, iz   = @index(Local, NTuple)
+  _,_,Iz,IF = @index(Global, NTuple)
+
+  ColumnTilesDim = @uniform @groupsize()[3]
+  N = @uniform @groupsize()[1]
+  Nz = @uniform @ndrange()[3]
+  NF = @uniform @ndrange()[4]
+
+  RhoCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  uCol = @localmem eltype(F) (N,N,ColumnTilesDim)
+  vCol = @localmem eltype(F) (N,N,ColumnTilesDim)
+  wCol = @localmem eltype(F) (N,N,ColumnTilesDim+1)
+  GraduF = @localmem eltype(F) (N,N,2,ColumnTilesDim)
+  GradvF = @localmem eltype(F) (N,N,2,ColumnTilesDim)
+  GradwF = @localmem eltype(F) (N,N,2,ColumnTilesDim)
+  KinF = @localmem eltype(F) (N,N,2,ColumnTilesDim)
+  # NOTE(review): wCol[I,J,1] is read below for iz == 1 but never written in this kernel — confirm the lower w value.
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds RhoCol[I,J,iz] = U[Iz,ind,1]
+    @inbounds uCol[I,J,iz] = U[Iz,ind,2]
+    @inbounds vCol[I,J,iz] = U[Iz,ind,3]
+    @inbounds wCol[I,J,iz+1] = U[Iz,ind,4]
+    @inbounds @views @. GraduF[I,J,:,iz] = 0
+    @inbounds @views @. GradvF[I,J,:,iz] = 0
+    @inbounds @views @. GradwF[I,J,:,iz] = 0
+  end
+
+  @synchronize
+
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    KinF[I,J,1,iz] = eltype(F)(0.5) * (uCol[I,J,iz] * uCol[I,J,iz] + vCol[I,J,iz] * vCol[I,J,iz])  
+    KinF[I,J,2,iz] = KinF[I,J,1,iz] + eltype(F)(0.5) * wCol[I,J,iz+1] * wCol[I,J,iz+1]
+    KinF[I,J,1,iz] +=  eltype(F)(0.5) * wCol[I,J,iz] * wCol[I,J,iz]
+  end  
+
+  @synchronize
+  # NOTE(review): the plain += on GraduF/GradvF below and the @atomic neighbour updates share no barrier — confirm.
+  DXKinF1 = eltype(F)(0)
+  DYKinF1 = eltype(F)(0)
+  DXKinF2 = eltype(F)(0)
+  DYKinF2 = eltype(F)(0)
+  I = mod(ID-1,N) + 1
+  J = div(ID-I,N) + 1
+  @unroll for k = 1 : N
+    @inbounds DXKinF1 += D[I,k] * KinF[k,J,1,iz]
+    @inbounds DYKinF1 += D[J,k] * KinF[I,k,1,iz]
+    @inbounds DXKinF2 += D[I,k] * KinF[k,J,2,iz]
+    @inbounds DYKinF2 += D[J,k] * KinF[I,k,2,iz]
+  end
+  @inbounds GraduF[I,J,1,iz] +=
+      -RhoCol[I,J,iz] * (dXdxI[1,1,1,ID,Iz,IF]  * DXKinF1 + dXdxI[2,1,1,ID,Iz,IF]  * DYKinF1)
+  @inbounds GradvF[I,J,1,iz] +=
+      -RhoCol[I,J,iz] * (dXdxI[1,2,1,ID,Iz,IF]  * DXKinF1 + dXdxI[2,2,1,ID,Iz,IF]  * DYKinF1)
+  @inbounds GradwF[I,J,1,iz] +=
+      -RhoCol[I,J,iz] * (dXdxI[1,3,1,ID,Iz,IF]  * DXKinF1 + dXdxI[2,3,1,ID,Iz,IF]  * DYKinF1)
+  @inbounds GraduF[I,J,2,iz] +=
+      -RhoCol[I,J,iz] * (dXdxI[1,1,2,ID,Iz,IF]  * DXKinF2 + dXdxI[2,1,2,ID,Iz,IF]  * DYKinF2)
+  @inbounds GradvF[I,J,2,iz] +=
+      -RhoCol[I,J,iz] * (dXdxI[1,2,2,ID,Iz,IF]  * DXKinF2 + dXdxI[2,2,2,ID,Iz,IF]  * DYKinF2)
+  @inbounds GradwF[I,J,2,iz] +=
+      -RhoCol[I,J,iz] * (dXdxI[1,3,2,ID,Iz,IF]  * DXKinF2 + dXdxI[2,3,2,ID,Iz,IF]  * DYKinF2)
+  if iz > 1    
+    @inbounds GraduZ11 = eltype(F)(0.5) * KinF[I,J,1,iz] * dXdxI[3,1,2,ID,Iz-1,IF]
+    @inbounds GradvZ11 = eltype(F)(0.5) * KinF[I,J,1,iz] * dXdxI[3,2,2,ID,Iz-1,IF]
+    @inbounds @atomic GraduF[I,J,2,iz-1] += -RhoCol[I,J,iz-1] * GraduZ11
+    @inbounds @atomic GradvF[I,J,2,iz-1] += -RhoCol[I,J,iz-1] * GradvZ11
+  end  
+  @inbounds GraduZ12 = eltype(F)(0.5) * KinF[I,J,1,iz] * dXdxI[3,1,1,ID,Iz,IF]
+  @inbounds GradvZ12 = eltype(F)(0.5) * KinF[I,J,1,iz] * dXdxI[3,2,1,ID,Iz,IF]
+  @inbounds GraduZ21 = eltype(F)(0.5) * KinF[I,J,2,iz] * dXdxI[3,1,2,ID,Iz,IF]
+  @inbounds GradvZ21 = eltype(F)(0.5) * KinF[I,J,2,iz] * dXdxI[3,2,2,ID,Iz,IF]
+  @inbounds @atomic GraduF[I,J,1,iz] += RhoCol[I,J,iz] * GraduZ12 
+  @inbounds @atomic GraduF[I,J,1,iz] += -RhoCol[I,J,iz] * GraduZ21 
+  @inbounds @atomic GradvF[I,J,1,iz] += RhoCol[I,J,iz] * GradvZ12 
+  @inbounds @atomic GradvF[I,J,1,iz] += -RhoCol[I,J,iz] * GradvZ21 
+  if iz < ColumnTilesDim && Iz < Nz  # slot iz+1 must exist in the tile and hold a loaded level
+    @inbounds GraduZ22 = eltype(F)(0.5) * KinF[I,J,2,iz] * dXdxI[3,1,1,ID,Iz+1,IF]
+    @inbounds GradvZ22 = eltype(F)(0.5) * KinF[I,J,2,iz] * dXdxI[3,2,1,ID,Iz+1,IF]
+    @inbounds @atomic GraduF[I,J,2,iz+1] += RhoCol[I,J,iz+1] * GraduZ22
+    @inbounds @atomic GradvF[I,J,2,iz+1] += RhoCol[I,J,iz+1] * GradvZ22
+  end  
+
+  @inbounds GradZ = eltype(F)(0.5) * RhoCol[I,J,iz] * (KinF[I,J,2,iz] - KinF[I,J,1,iz])
+  @inbounds GraduF[I,J,2,iz] += -GradZ * dXdxI[3,1,2,ID,Iz,IF]
+  @inbounds GraduF[I,J,1,iz] += -GradZ * dXdxI[3,1,1,ID,Iz,IF]
+  @inbounds GradvF[I,J,2,iz] += -GradZ * dXdxI[3,2,2,ID,Iz,IF]
+  @inbounds GradvF[I,J,1,iz] += -GradZ * dXdxI[3,2,1,ID,Iz,IF]
+  @inbounds GradwF[I,J,2,iz] += -GradZ * dXdxI[3,3,2,ID,Iz,IF]
+  @inbounds GradwF[I,J,1,iz] += -GradZ * dXdxI[3,3,1,ID,Iz,IF]
+
+  @synchronize
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds @atomic F[Iz,ind,2] += (GraduF[I,J,1,iz] + GraduF[I,J,2,iz]) / M[Iz,ind] / U[Iz,ind,1]
+    @inbounds @atomic F[Iz,ind,3] += (GradvF[I,J,1,iz] + GradvF[I,J,2,iz]) / M[Iz,ind] / U[Iz,ind,1]
+    if iz > 1
+      @inbounds @atomic F[Iz,ind,4] += (GradwF[I,J,2,iz-1] + GradwF[I,J,1,iz]) / 
+        (M[Iz,ind] * U[Iz,ind,1] + M[Iz-1,ind] * U[Iz-1,ind,1])
+    end  
+  end
+end
+
+@kernel function DivRhoGradKernel!(F,@Const(U),@Const(D),@Const(DW),@Const(dXdxI),
+  @Const(JJ),@Const(M),@Const(Glob),::Val{BANK}=Val(1)) where BANK
+# Going by the name, a div(grad) of c = U[:,:,5]/U[:,:,1]; adds to F[:,:,5]. BANK pads cCol's first dimension.
+  I, J, iz   = @index(Local, NTuple)
+  _,_,Iz,IF = @index(Global, NTuple)
+
+  ColumnTilesDim = @uniform @groupsize()[3]
+  N = @uniform @groupsize()[1]
+  Nz = @uniform @ndrange()[3]
+  NF = @uniform @ndrange()[4]
+
+  cCol = @localmem eltype(F) (N+BANK,N, ColumnTilesDim)
+  FCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  if Iz <= Nz
+    ID = I + (J - 1) * N  # flattened (I,J) node index; recomputed per section like the sibling kernels
+    @inbounds ind = Glob[ID,IF]
+    @inbounds cCol[I,J,iz] = U[Iz,ind,5] / U[Iz,ind,1]
+    @inbounds FCol[I,J,iz] = eltype(F)(0)
+  end
+  @synchronize
+  if Iz <= Nz
+    ID = I + (J - 1) * N
+    @inbounds Dxc = D[I,1] * cCol[1,J,iz]
+    @inbounds Dyc = D[J,1] * cCol[I,1,iz]
+    @unroll for k = 2 : N
+      @inbounds Dxc = Dxc + D[I,k] * cCol[k,J,iz]
+      @inbounds Dyc = Dyc + D[J,k] * cCol[I,k,iz] 
+    end
+    @views @inbounds (GradDx, GradDy) = Grad12(Dxc,Dyc,dXdxI[1:2,1:2,:,ID,Iz,IF],JJ[ID,:,Iz,IF])
+    @views @inbounds (tempx, tempy) = Contra12(GradDx,GradDy,dXdxI[1:2,1:2,:,ID,Iz,IF])
+    @unroll for k = 1 : N
+      @inbounds @atomic FCol[k,J,iz] += DW[k,I] * tempx
+      @inbounds @atomic FCol[I,k,iz] += DW[k,J] * tempy
+    end
+  end
+  @synchronize
+  if Iz <= Nz
+    ID = I + (J - 1) * N
+    @inbounds ind = Glob[ID,IF]
+    @inbounds @atomic F[Iz,ind,5] += FCol[I,J,iz] / M[Iz,ind]
+  end
+end
+
+@kernel function HyperViscKernel!(F,MRho,@Const(U),@Const(D),@Const(DW),@Const(dXdxI),
+  @Const(JJ),@Const(M),@Const(Glob)) 
+
+# Atomically adds curl-, divergence- and scalar-based terms to F[:,:,1:5] and accumulates U[:,:,1]*JJ into MRho.
+  I, J, iz   = @index(Local, NTuple)
+  _,_,Iz,IF = @index(Global, NTuple)
+
+  ColumnTilesDim = @uniform @groupsize()[3]
+  N = @uniform @groupsize()[1]
+  Nz = @uniform @ndrange()[3]
+  NF = @uniform @ndrange()[4]
+  # Local-memory tiles: ThCol = U[:,:,5]/U[:,:,1]; uC/vC and uD/vD are (u,v) after Curl12 and Contra12.
+  ThCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  uCCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  vCCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  uDCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  vDCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  Curl = @localmem eltype(F) (N,N, ColumnTilesDim)
+  Div = @localmem eltype(F) (N,N, ColumnTilesDim)
+  ThCxCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  ThCyCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @views @inbounds uC, vC = Curl12(U[Iz,ind,2],U[Iz,ind,3],dXdxI[1:2,1:2,:,ID,Iz,IF])
+    @inbounds uCCol[I,J,iz] = uC
+    @inbounds vCCol[I,J,iz] = vC
+    @views @inbounds uD, vD = Contra12(U[Iz,ind,2],U[Iz,ind,3],dXdxI[1:2,1:2,:,ID,Iz,IF])
+    @inbounds uDCol[I,J,iz] = uD
+    @inbounds vDCol[I,J,iz] = vD
+    @inbounds ThCol[I,J,iz] = U[Iz,ind,5] / U[Iz,ind,1]
+  end
+  @synchronize
+  # Apply D in both tile directions; Curl and Div are then divided by JJ[ID,1,..] + JJ[ID,2,..].
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds Dxc = D[I,1] * ThCol[1,J,iz]
+    @inbounds Dyc = D[J,1] * ThCol[I,1,iz]
+    @inbounds Curl[I,J,iz] = D[I,1] * uCCol[1,J,iz] + D[J,1] * vCCol[I,1,iz] 
+    @inbounds Div[I,J,iz] = D[I,1] * uDCol[1,J,iz] + D[J,1] * vDCol[I,1,iz] 
+    @unroll for k = 2 : N
+      @inbounds Dxc += D[I,k] * ThCol[k,J,iz]
+      @inbounds Dyc += D[J,k] * ThCol[I,k,iz] 
+      @inbounds Curl[I,J,iz] += D[I,k] * uCCol[k,J,iz] + D[J,k] * vCCol[I,k,iz] 
+      @inbounds Div[I,J,iz] += D[I,k] * uDCol[k,J,iz] + D[J,k] * vDCol[I,k,iz] 
+    end
+    @inbounds Curl[I,J,iz] /= (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF])
+    @inbounds Div[I,J,iz] /= (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF])
+    @views @inbounds (GradDx, GradDy) = Grad12(Dxc,Dyc,dXdxI[1:2,1:2,:,ID,Iz,IF],JJ[ID,:,Iz,IF])
+    @views @inbounds (tempx, tempy) = Contra12(GradDx,GradDy,dXdxI[1:2,1:2,:,ID,Iz,IF])
+    @inbounds ThCxCol[I,J,iz] = tempx
+    @inbounds ThCyCol[I,J,iz] = tempy
+  end
+  # Apply DW to the staged tiles, map back with Rot12/Grad12 and accumulate atomically.
+  @synchronize 
+  if Iz <= Nz
+    @inbounds DxCurl = DW[I,1] * Curl[1,J,iz]
+    @inbounds DyCurl = DW[J,1] * Curl[I,1,iz]
+    @inbounds DxDiv = DW[I,1] * Div[1,J,iz]
+    @inbounds DyDiv = DW[J,1] * Div[I,1,iz]
+    @inbounds DivTh = DW[I,1] * ThCxCol[1,J,iz] + DW[J,1] * ThCyCol[I,1,iz]
+    @unroll for k = 2 : N
+      @inbounds DxCurl += DW[I,k] * Curl[k,J,iz]
+      @inbounds DyCurl += DW[J,k] * Curl[I,k,iz]
+      @inbounds DxDiv += DW[I,k] * Div[k,J,iz]
+      @inbounds DyDiv += DW[J,k] * Div[I,k,iz]
+      @inbounds DivTh += DW[I,k] * ThCxCol[k,J,iz] + DW[J,k] * ThCyCol[I,k,iz]
+    end
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @views @inbounds FuC, FvC = Rot12(DxCurl,DyCurl,dXdxI[1:2,1:2,:,ID,Iz,IF])
+    @views @inbounds FuD, FvD = Grad12(DxDiv,DyDiv,dXdxI[1:2,1:2,:,ID,Iz,IF]) 
+    @inbounds @atomic F[Iz,ind,1] += FuC / M[Iz,ind]
+    @inbounds @atomic F[Iz,ind,2] += FvC / M[Iz,ind]
+    @inbounds @atomic F[Iz,ind,3] += FuD / M[Iz,ind]
+    @inbounds @atomic F[Iz,ind,4] += FvD / M[Iz,ind]
+    @inbounds @atomic F[Iz,ind,5] += DivTh / M[Iz,ind]
+    if Iz < Nz
+      @inbounds @atomic MRho[Iz,ind] += U[Iz,ind,1] * JJ[ID,2,Iz,IF] 
+    end  
+    if Iz > 1
+      @inbounds @atomic MRho[Iz-1,ind] += U[Iz,ind,1] * JJ[ID,1,Iz,IF] 
+    end  
+  end
+end
+
+@kernel function HyperViscTracerKernel!(FTr,@Const(Tr),@Const(Rho),@Const(D),@Const(DW),@Const(dXdxI),
+  @Const(JJ),@Const(M),@Const(Glob)) 
+# Tracer counterpart of the scalar part of HyperViscKernel!: works on Tr/Rho and adds atomically to FTr.
+  I, J, iz   = @index(Local, NTuple)
+  _,_,Iz,IF = @index(Global, NTuple)
+
+  ColumnTilesDim = @uniform @groupsize()[3]
+  N = @uniform @groupsize()[1]
+  Nz = @uniform @ndrange()[3]
+  NF = @uniform @ndrange()[4]
+
+  TrCol = @localmem eltype(FTr) (N,N, ColumnTilesDim)
+  TrCxCol = @localmem eltype(FTr) (N,N, ColumnTilesDim)
+  TrCyCol = @localmem eltype(FTr) (N,N, ColumnTilesDim)
+  if Iz <= Nz && IF <= NF
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds TrCol[I,J,iz] = Tr[Iz,ind] / Rho[Iz,ind]
+  end
+  @synchronize
+  # D-derivatives of the tile, passed through Grad12 and Contra12 (helpers defined outside this view).
+  if Iz <= Nz && IF <= NF
+    ID = I + (J - 1) * N  
+    @inbounds Dxc = D[I,1] * TrCol[1,J,iz]
+    @inbounds Dyc = D[J,1] * TrCol[I,1,iz]
+    @unroll for k = 2 : N
+      @inbounds Dxc += D[I,k] * TrCol[k,J,iz]
+      @inbounds Dyc += D[J,k] * TrCol[I,k,iz] 
+    end
+    @views @inbounds (GradDx, GradDy) = Grad12(Dxc,Dyc,dXdxI[1:2,1:2,:,ID,Iz,IF],JJ[ID,:,Iz,IF])
+    @views @inbounds (tempx, tempy) = Contra12(GradDx,GradDy,dXdxI[1:2,1:2,:,ID,Iz,IF])
+    @inbounds TrCxCol[I,J,iz] = tempx
+    @inbounds TrCyCol[I,J,iz] = tempy
+  end
+  # DW applied to the staged x/y tiles gives DivTr, which is divided by M and accumulated.
+  @synchronize 
+  if Iz <= Nz && IF <= NF
+    @inbounds DivTr = DW[I,1] * TrCxCol[1,J,iz] + DW[J,1] * TrCyCol[I,1,iz]
+    @unroll for k = 2 : N
+      @inbounds DivTr += DW[I,k] * TrCxCol[k,J,iz] + DW[J,k] * TrCyCol[I,k,iz]
+    end
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds @atomic FTr[Iz,ind] += DivTr / M[Iz,ind]
+  end
+end
+
+@kernel function HyperViscKoeffKernel!(F,@Const(U),@Const(Cache),@Const(D),@Const(DW),@Const(dXdxI),
+  @Const(JJ),@Const(M),@Const(Glob),KoeffCurl,KoeffGrad,KoeffDiv) 
+
+# Reads staged fields from Cache[:,:,1:5] and adds the results to F[:,:,2], [:,:,3], [:,:,5] scaled by -Koeff*.
+  I, J, iz   = @index(Local, NTuple)
+  _,_,Iz,IF = @index(Global, NTuple)
+
+  ColumnTilesDim = @uniform @groupsize()[3]
+  N = @uniform @groupsize()[1]
+  Nz = @uniform @ndrange()[3]
+  NF = @uniform @ndrange()[4]
+
+  ThCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  uCCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  vCCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  uDCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  vDCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  Curl = @localmem eltype(F) (N,N, ColumnTilesDim)
+  Div = @localmem eltype(F) (N,N, ColumnTilesDim)
+  ThCxCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  ThCyCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @views @inbounds uC, vC = Curl12(Cache[Iz,ind,1],Cache[Iz,ind,2],dXdxI[1:2,1:2,:,ID,Iz,IF])
+    @inbounds uCCol[I,J,iz] = uC
+    @inbounds vCCol[I,J,iz] = vC
+    @views @inbounds uD, vD = Contra12(Cache[Iz,ind,3],Cache[Iz,ind,4],dXdxI[1:2,1:2,:,ID,Iz,IF])
+    @inbounds uDCol[I,J,iz] = uD
+    @inbounds vDCol[I,J,iz] = vD
+    @inbounds ThCol[I,J,iz] = Cache[Iz,ind,5] 
+  end
+  @synchronize
+  # Same D stage as HyperViscKernel!, except Contra12 here also receives U[Iz,ind,1] as its first argument.
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds Dxc = D[I,1] * ThCol[1,J,iz]
+    @inbounds Dyc = D[J,1] * ThCol[I,1,iz]
+    @inbounds Curl[I,J,iz] = D[I,1] * uCCol[1,J,iz] + D[J,1] * vCCol[I,1,iz] 
+    @inbounds Div[I,J,iz] = D[I,1] * uDCol[1,J,iz] + D[J,1] * vDCol[I,1,iz] 
+    @unroll for k = 2 : N
+      @inbounds Dxc += D[I,k] * ThCol[k,J,iz]
+      @inbounds Dyc += D[J,k] * ThCol[I,k,iz] 
+      @inbounds Curl[I,J,iz] += D[I,k] * uCCol[k,J,iz] + D[J,k] * vCCol[I,k,iz] 
+      @inbounds Div[I,J,iz] += D[I,k] * uDCol[k,J,iz] + D[J,k] * vDCol[I,k,iz] 
+    end
+    @inbounds Curl[I,J,iz] /= (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF])
+    @inbounds Div[I,J,iz] /= (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF])
+    @views @inbounds (GradDx, GradDy) = Grad12(Dxc,Dyc,dXdxI[1:2,1:2,:,ID,Iz,IF],JJ[ID,:,Iz,IF])
+    @inbounds ind = Glob[ID,IF]
+    @views @inbounds (tempx, tempy) = Contra12(U[Iz,ind,1],GradDx,GradDy,dXdxI[1:2,1:2,:,ID,Iz,IF])
+    @inbounds ThCxCol[I,J,iz] = tempx
+    @inbounds ThCyCol[I,J,iz] = tempy
+  end
+  # Apply DW, map back with Rot12/Grad12, then accumulate with the coefficient weights.
+  @synchronize 
+  if Iz <= Nz
+    @inbounds DxCurl = DW[I,1] * Curl[1,J,iz]
+    @inbounds DyCurl = DW[J,1] * Curl[I,1,iz]
+    @inbounds DxDiv = DW[I,1] * Div[1,J,iz]
+    @inbounds DyDiv = DW[J,1] * Div[I,1,iz]
+    @inbounds DivTh = DW[I,1] * ThCxCol[1,J,iz] + DW[J,1] * ThCyCol[I,1,iz]
+    @unroll for k = 2 : N
+      @inbounds DxCurl += DW[I,k] * Curl[k,J,iz]
+      @inbounds DyCurl += DW[J,k] * Curl[I,k,iz]
+      @inbounds DxDiv += DW[I,k] * Div[k,J,iz]
+      @inbounds DyDiv += DW[J,k] * Div[I,k,iz]
+      @inbounds DivTh += DW[I,k] * ThCxCol[k,J,iz] + DW[J,k] * ThCyCol[I,k,iz]
+    end
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @views @inbounds FuC, FvC = Rot12(DxCurl,DyCurl,dXdxI[1:2,1:2,:,ID,Iz,IF])
+    @views @inbounds FuD, FvD = Grad12(DxDiv,DyDiv,dXdxI[1:2,1:2,:,ID,Iz,IF]) 
+    @inbounds @atomic F[Iz,ind,2] += -(KoeffCurl * FuC + KoeffGrad * FuD) / M[Iz,ind]
+    @inbounds @atomic F[Iz,ind,3] += -(KoeffCurl * FvC + KoeffGrad * FvD) / M[Iz,ind]
+    @inbounds @atomic F[Iz,ind,5] += -KoeffDiv * DivTh / M[Iz,ind]
+  end
+end
+
+@kernel function HyperViscTracerKoeffKernel!(FTr,@Const(Cache),@Const(Rho),@Const(D),@Const(DW),@Const(dXdxI),
+  @Const(JJ),@Const(M),@Const(Glob),KoeffDiv) 
+# Tracer variant: reads the staged field from Cache and adds -KoeffDiv * DivTr / M to FTr atomically.
+  I, J, iz   = @index(Local, NTuple)
+  _,_,Iz,IF = @index(Global, NTuple)
+
+  ColumnTilesDim = @uniform @groupsize()[3]
+  N = @uniform @groupsize()[1]
+  Nz = @uniform @ndrange()[3]
+  NF = @uniform @ndrange()[4]
+
+  TrCol = @localmem eltype(FTr) (N,N, ColumnTilesDim)
+  TrCxCol = @localmem eltype(FTr) (N,N, ColumnTilesDim)
+  TrCyCol = @localmem eltype(FTr) (N,N, ColumnTilesDim)
+  if Iz <= Nz && IF <= NF
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds TrCol[I,J,iz] = Cache[Iz,ind] 
+  end
+  @synchronize
+  # D-derivatives of the tile; Contra12 is called here with Rho[Iz,ind] as its first argument.
+  if Iz <= Nz && IF <= NF
+    ID = I + (J - 1) * N  
+    @inbounds Dxc = D[I,1] * TrCol[1,J,iz]
+    @inbounds Dyc = D[J,1] * TrCol[I,1,iz]
+    @unroll for k = 2 : N
+      @inbounds Dxc += D[I,k] * TrCol[k,J,iz]
+      @inbounds Dyc += D[J,k] * TrCol[I,k,iz] 
+    end
+    @views @inbounds (GradDx, GradDy) = Grad12(Dxc,Dyc,dXdxI[1:2,1:2,:,ID,Iz,IF],JJ[ID,:,Iz,IF])
+    @inbounds ind = Glob[ID,IF]
+    @views @inbounds (tempx, tempy) = Contra12(Rho[Iz,ind],GradDx,GradDy,dXdxI[1:2,1:2,:,ID,Iz,IF])
+    @inbounds TrCxCol[I,J,iz] = tempx
+    @inbounds TrCyCol[I,J,iz] = tempy
+  end
+  # DW applied to the staged x/y tiles gives DivTr for the final accumulation.
+  @synchronize 
+  if Iz <= Nz && IF <= NF
+    @inbounds DivTr = DW[I,1] * TrCxCol[1,J,iz] + DW[J,1] * TrCyCol[I,1,iz]
+    @unroll for k = 2 : N
+      @inbounds DivTr += DW[I,k] * TrCxCol[k,J,iz] + DW[J,k] * TrCyCol[I,k,iz]
+    end
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds @atomic FTr[Iz,ind] += -KoeffDiv * DivTr / M[Iz,ind]
+  end
+end
+
+@kernel function DivRhoGradKernel1!(F,@Const(U),@Const(D),@Const(DW),@Const(dXdxI),
+  @Const(JJ),@Const(M),@Const(Glob))
+# Variant of DivRhoGradKernel!; only FThCol is written back (to F[:,:,5]); FuCol/FvCol are computed but never stored.
+# NOTE(review): looks unfinished — ID, uC and vC are read without being defined here, and J is indexed like an array.
+  I, J, iz   = @index(Local, NTuple)
+  _,_,Iz,IF = @index(Global, NTuple)
+
+  ColumnTilesDim = @uniform @groupsize()[3]
+  N = @uniform @groupsize()[1]
+  Nz = @uniform @ndrange()[3]
+  NF = @uniform @ndrange()[4]
+  # NOTE(review): tiles are sized ColumnTilesDim but filled at iz+1 and read at iz below — confirm the offset.
+  RhoCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  uCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  vCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  wCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  ThCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  FRhoCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  FuCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  FvCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  FThCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  CurlCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  if Iz <= Nz
+    I = mod(ID-1,N) + 1
+    J = div(ID-I,N) + 1
+    @inbounds ind = Glob[ID,IF]
+    @inbounds RhoCol[I,J,iz+1] = U[Iz,ind,1]
+    @inbounds uCol[I,J,iz+1] = U[Iz,ind,2]
+    @inbounds vCol[I,J,iz+1] = U[Iz,ind,3]
+    @inbounds wCol[I,J,iz+1] = U[Iz,ind,4]
+    @inbounds ThCol[I,J,iz+1] = U[Iz,ind,5] / RhoCol[I,J,iz+1]
+    @inbounds FRhoCol[I,J,iz+1] = 0
+    @inbounds FThCol[I,J,iz+1] = 0
+  end
+  @synchronize
+  if Iz <= Nz
+#   DivGrad Th (note: dXdxI is indexed with seven subscripts here, unlike the six used by the other kernels)
+    ID = I + (J - 1) * N  
+    Dxc = 0
+    Dyc = 0
+    @unroll for k = 1 : N
+      @inbounds Dxc = Dxc + D[I,k] * ThCol[k,J,iz]
+      @inbounds Dyc = Dyc + D[J,k] * ThCol[I,k,iz] 
+    end
+    @inbounds GradDx = ((dXdxI[I,J,1,Iz,1,1,IF] + dXdxI[I,J,2,Iz,1,1,IF]) * Dxc +
+      (dXdxI[I,J,1,Iz,2,1,IF] + dXdxI[I,J,2,Iz,2,1,IF]) * Dyc) / (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF])
+    @inbounds GradDy = ((dXdxI[I,J,1,Iz,1,2,IF] + dXdxI[I,J,2,Iz,1,2,IF]) * Dxc +
+      (dXdxI[I,J,1,Iz,2,2,IF] + dXdxI[I,J,2,Iz,2,2,IF]) * Dyc) / (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF])
+    @inbounds tempx = (dXdxI[I,J,1,Iz,1,1,IF] + dXdxI[I,J,2,Iz,1,1,IF]) * GradDx +
+      (dXdxI[I,J,1,Iz,1,2,IF] + dXdxI[I,J,2,Iz,1,2,IF]) * GradDy
+    @inbounds tempy = (dXdxI[I,J,1,Iz,2,1,IF] + dXdxI[I,J,2,Iz,2,1,IF]) * GradDx +
+      (dXdxI[I,J,1,Iz,2,2,IF] + dXdxI[I,J,2,Iz,2,2,IF]) * GradDy
+    @unroll for k = 1 : N
+      @inbounds @atomic FThCol[k,J,iz] += DW[k,I] * tempx
+      @inbounds @atomic FThCol[I,k,iz] += DW[k,J] * tempy
+    end
+#   Curl (u,v) — NOTE(review): on the next line the bracket closes before IF, which forms a tuple; confirm intent.
+    @inbounds tempx = (dXdxI[I,J,1,iz,1,1,IF] + dXdxI[I,J,2,iz,1,1],IF) * vC[I,J,iz] -
+      (dXdxI[I,J,1,iz,1,2,IF] + dXdxI[I,J,2,iz,1,2,IF]) * uC[I,J,iz]
+    @views @. tempy = (dXdxI[I,J,1,iz,2,1,IF] + dXdxI[I,J,2,iz,2,1,IF]) * vC[I,J,iz] -
+      (dXdxI[I,J,1,iz,2,2,IF] + dXdxI[I,J,2,iz,2,2,IF]) * uC[I,J,iz]
+    @unroll for k = 1 : N
+      @inbounds @atomic CurlCol[k,J,iz] += D[k,I] * tempx + D[k,J] * tempy
+    end
+  end
+  # NOTE(review): CurlCol is accumulated above without being zeroed anywhere in this kernel.
+  @synchronize
+  if Iz <= Nz
+#   CurlCurl (u,v)
+    I = mod(ID-1,N) + 1
+    J = div(ID-I,N) + 1
+    CurlCol[I,J,iz] /= (J[I,J,1,iz] + J[I,J,2,iz])
+    DxCurl = eltype(F)(0)
+    DyCurl = eltype(F)(0)
+    @unroll for k = 1 : N
+      @inbounds DxCurl += DW[I,k] * CurlCol[k,J,iz]
+      @inbounds DyCurl += DW[J,k] * CurlCol[I,k,iz] 
+    end
+
+    @inbounds FvCol[I,J,iz] = (-(dXdxI[I,J,1,iz,1,1,IF] + dXdxI[I,J,2,iz,1,1,IF]) * DxCurl -
+      (dXdxI[I,J,1,iz,2,1,IF] + dXdxI[I,J,2,iz,2,1,IF]) * DyCurl)
+    @inbounds FuCol[I,J,iz] = ((dXdxI[I,J,1,iz,1,2,IF] + dXdxI[I,J,2,iz,1,2,IF]) * DxCurl +
+      (dXdxI[I,J,1,iz,2,2,IF] + dXdxI[I,J,2,iz,2,2,IF]) * DyCurl)
+  end
+
+  @synchronize
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds @atomic F[Iz,ind,5] += FThCol[I,J,iz] / M[Iz,ind]
+  end
+end
+
+# Transport tendency of a scalar `c` with centred vertical face values.
+#
+# For every node of the launch grid the kernel adds to `F`
+#   * the difference of the vertical face fluxes 0.5 * wCon * cF, where cF is the
+#     JJ-weighted mean of `c` in the two cells sharing the face, and
+#   * the products of `D` with -c times the metric-weighted velocity.
+# Both parts are gathered in the local tile FCol and divided by `M` on output.
+#
+# Indexing as used in the body:
+#   * local index (I, J, iz), global index (_, _, Iz, IF), ind = Glob[I + (J-1)*N, IF]
+#   * F, c, uC, vC, w and M are addressed as [Iz, ind]
+#   * dXdxI is addressed as [I, J, 1|2, Iz, row, col, IF], JJ as [ID, 1|2, Iz, IF]
+#   * BANK pads the first tile dimension (presumably against shared-memory bank
+#     conflicts -- TODO confirm)
+@kernel function DivRhoTrCentralKernel!(F,@Const(c),@Const(uC),@Const(vC),@Const(w),
+  @Const(D),@Const(dXdxI),
+  @Const(JJ),@Const(M),@Const(Glob),::Val{BANK}=Val(1)) where BANK
+
+  I, J, iz   = @index(Local, NTuple)
+  _,_,Iz,IF = @index(Global, NTuple)
+
+  ColumnTilesDim = @uniform @groupsize()[3]
+  N = @uniform @groupsize()[1]
+  Nz = @uniform @ndrange()[3]
+
+  # Local tiles: slot iz+1 holds level Iz, slot 1 the level below the tile and
+  # slot ColumnTilesDim+2 the level above it. wCol has no upper halo slot.
+  cCol = @localmem eltype(F) (N+BANK,N, ColumnTilesDim+2)
+  uCol = @localmem eltype(F) (N+BANK,N, ColumnTilesDim+2)
+  vCol = @localmem eltype(F) (N+BANK,N, ColumnTilesDim+2)
+  wCol = @localmem eltype(F) (N+BANK,N, ColumnTilesDim+1)
+  FCol = @localmem eltype(F) (N+BANK,N, ColumnTilesDim+2)
+  if Iz <= Nz
+    ID = I + (J - 1) * N
+    @inbounds ind = Glob[ID,IF]
+    @inbounds wCol[I,J,iz+1] = w[Iz,ind]
+    @inbounds cCol[I,J,iz+1] = c[Iz,ind]
+    @inbounds uCol[I,J,iz+1] = uC[Iz,ind]
+    @inbounds vCol[I,J,iz+1] = vC[Iz,ind]
+    @inbounds FCol[I,J,iz+1] = 0
+    if iz == 1 && Iz > 1
+      # Lower halo: values of level Iz-1. The w slot below level Iz must hold
+      # w[Iz-1] (slot iz+1 already holds w[Iz]).
+      @inbounds cCol[I,J,1] = c[Iz-1,ind]
+      @inbounds uCol[I,J,1] = uC[Iz-1,ind]
+      @inbounds vCol[I,J,1] = vC[Iz-1,ind]
+      @inbounds wCol[I,J,1] = w[Iz-1,ind]
+      @inbounds FCol[I,J,1] = 0
+    elseif iz == 1 && Iz == 1
+      # Bottom of the column: repeat c and use zero w below the first level
+      @inbounds cCol[I,J,1] = c[1,ind]
+      @inbounds wCol[I,J,1] = 0
+      @inbounds FCol[I,J,1] = 0
+    end
+    if iz == ColumnTilesDim && Iz < Nz
+      # Upper halo: values of level Iz+1
+      @inbounds cCol[I,J,ColumnTilesDim+2] = c[Iz+1,ind]
+      @inbounds uCol[I,J,ColumnTilesDim+2] = uC[Iz+1,ind]
+      @inbounds vCol[I,J,ColumnTilesDim+2] = vC[Iz+1,ind]
+      @inbounds FCol[I,J,ColumnTilesDim+2] = 0
+    elseif iz == ColumnTilesDim && Iz == Nz
+      # Top of the column: repeat c
+      @inbounds cCol[I,J,ColumnTilesDim+2] = c[Nz,ind]
+      @inbounds FCol[I,J,ColumnTilesDim+2] = 0
+    end
+  end
+  @synchronize
+  if Iz < Nz 
+    # Recomputed after the barrier, as in every other section of this kernel
+    ID = I + (J - 1) * N
+    # Face between levels Iz and Iz+1: half 2 of cell Iz and half 1 of cell Iz+1
+    @inbounds wCon = dXdxI[I,J,2,Iz,3,1,IF] * uCol[I,J,iz+1] + 
+      dXdxI[I,J,2,Iz,3,2,IF] * vCol[I,J,iz+1] + 
+      dXdxI[I,J,1,Iz+1,3,1,IF] * uCol[I,J,iz+2] + 
+      dXdxI[I,J,1,Iz+1,3,2,IF] * vCol[I,J,iz+2] + 
+      (dXdxI[I,J,2,Iz,3,3,IF] + dXdxI[I,J,1,Iz+1,3,3,IF]) * wCol[I,J,iz+1]
+    @inbounds cF = (JJ[ID,2,Iz,IF] * cCol[I,J,iz+1] + JJ[ID,1,Iz+1,IF] * cCol[I,J,iz+2]) /
+      (JJ[ID,2,Iz,IF] + JJ[ID,1,Iz+1,IF])
+    Flux = eltype(F)(0.5) * wCon * cF
+    # The same flux leaves level Iz and enters level Iz+1
+    @inbounds @atomic FCol[I,J,iz+1] += -Flux
+    @inbounds @atomic FCol[I,J,iz+2] += Flux
+  end 
+
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds tempx = -cCol[I,J,iz+1] * ((dXdxI[I,J,1,Iz,1,1,IF] + dXdxI[I,J,2,Iz,1,1,IF]) * uCol[I,J,iz+1] +
+      (dXdxI[I,J,1,Iz,1,2,IF] + dXdxI[I,J,2,Iz,1,2,IF]) * vCol[I,J,iz+1] +
+      dXdxI[I,J,1,Iz,1,3,IF] * wCol[I,J,iz] + dXdxI[I,J,2,Iz,1,3,IF] * wCol[I,J,iz+1])
+    @inbounds tempy = -cCol[I,J,iz+1] * ((dXdxI[I,J,1,Iz,2,1,IF] + dXdxI[I,J,2,Iz,2,1,IF]) * uCol[I,J,iz+1] +
+      (dXdxI[I,J,1,Iz,2,2,IF] + dXdxI[I,J,2,Iz,2,2,IF]) * vCol[I,J,iz+1] +
+      dXdxI[I,J,1,Iz,2,3,IF] * wCol[I,J,iz] + dXdxI[I,J,2,Iz,2,3,IF] * wCol[I,J,iz+1])
+    # Scatter to all nodes of row J / column I; atomics because several
+    # work-items update the same FCol entry
+    @unroll for k = 1 : N
+      @inbounds @atomic FCol[k,J,iz+1] += D[k,I] * tempx
+      @inbounds @atomic FCol[I,k,iz+1] += D[k,J] * tempy
+    end
+  end
+  @synchronize
+  if Iz <= Nz 
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds @atomic F[Iz,ind] += FCol[I,J,iz+1] / M[Iz,ind]
+    # Halo slots carry contributions that belong to the neighbouring levels
+    if iz == 1 && Iz >  1
+      @inbounds @atomic F[Iz-1,ind] += FCol[I,J,iz] / M[Iz-1,ind]
+    end
+    if iz == ColumnTilesDim && Iz <  Nz
+      @inbounds @atomic F[Iz+1,ind] += FCol[I,J,iz+2] / M[Iz+1,ind]
+    end
+  end
+end
+
+# Transport tendency of a tracer with first-order upwind vertical face values.
+#
+# `c` is divided by `Rho` on load, so the tiles hold the ratio c / Rho. The kernel
+# adds to `F`
+#   * the difference of the vertical face fluxes, which take the ratio from the
+#     lower cell where wCon > 0 and from the upper cell where wCon < 0, and
+#   * the products of `D` with -Rho * (c / Rho) times the metric-weighted velocity.
+# Both parts are gathered in the local tile FCol and divided by `M` on output.
+#
+# Indexing as used in the body:
+#   * local index (I, J, iz), global index (_, _, Iz, IF), ind = Glob[I + (J-1)*N, IF]
+#   * F, c, Rho, uC, vC, w and M are addressed as [Iz, ind]
+#   * dXdxI is addressed as [I, J, 1|2, Iz, row, col, IF]
+#   * BANK pads the first tile dimension (presumably against shared-memory bank
+#     conflicts -- TODO confirm)
+# NOTE(review): JJ is accepted but not read by this kernel.
+@kernel function DivRhoTrUpwindKernel!(F,@Const(c),@Const(Rho),@Const(uC),@Const(vC),@Const(w),
+  @Const(D),@Const(dXdxI),
+  @Const(JJ),@Const(M),@Const(Glob),::Val{BANK}=Val(1)) where BANK
+
+  I, J, iz   = @index(Local, NTuple)
+  _,_,Iz,IF = @index(Global, NTuple)
+
+  ColumnTilesDim = @uniform @groupsize()[3]
+  N = @uniform @groupsize()[1]
+  Nz = @uniform @ndrange()[3]
+
+  # Local tiles: slot iz+1 holds level Iz, slot 1 the level below the tile and
+  # slot ColumnTilesDim+2 the level above it. wCol has no upper halo slot.
+  cCol = @localmem eltype(F) (N+BANK,N, ColumnTilesDim+2)
+  uCol = @localmem eltype(F) (N+BANK,N, ColumnTilesDim+2)
+  vCol = @localmem eltype(F) (N+BANK,N, ColumnTilesDim+2)
+  RhoCol = @localmem eltype(F) (N+BANK,N, ColumnTilesDim+2)
+  wCol = @localmem eltype(F) (N+BANK,N, ColumnTilesDim+1)
+  FCol = @localmem eltype(F) (N+BANK,N, ColumnTilesDim+2)
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds wCol[I,J,iz+1] = w[Iz,ind]
+    @inbounds RhoCol[I,J,iz+1] = Rho[Iz,ind]
+    @inbounds cCol[I,J,iz+1] = c[Iz,ind] / RhoCol[I,J,iz+1]
+    @inbounds uCol[I,J,iz+1] = uC[Iz,ind]
+    @inbounds vCol[I,J,iz+1] = vC[Iz,ind]
+    @inbounds FCol[I,J,iz+1] = 0
+    if iz == 1 && Iz > 1
+      # Lower halo: values of level Iz-1. The w slot below level Iz must hold
+      # w[Iz-1] (slot iz+1 already holds w[Iz]).
+      @inbounds RhoCol[I,J,1] = Rho[Iz-1,ind]
+      @inbounds cCol[I,J,1] = c[Iz-1,ind] / RhoCol[I,J,1]
+      @inbounds uCol[I,J,1] = uC[Iz-1,ind]
+      @inbounds vCol[I,J,1] = vC[Iz-1,ind]
+      @inbounds wCol[I,J,1] = w[Iz-1,ind]
+      @inbounds FCol[I,J,1] = 0
+    elseif iz == 1 && Iz == 1
+      # Bottom of the column: repeat the first level and use zero w below it
+      @inbounds RhoCol[I,J,1] = Rho[1,ind]
+      @inbounds cCol[I,J,1] = c[1,ind] / RhoCol[I,J,1]
+      @inbounds wCol[I,J,1] = 0
+      @inbounds FCol[I,J,1] = 0
+    end
+    if iz == ColumnTilesDim && Iz < Nz
+      # Upper halo: values of level Iz+1
+      @inbounds RhoCol[I,J,ColumnTilesDim+2] = Rho[Iz+1,ind]
+      @inbounds cCol[I,J,ColumnTilesDim+2] = c[Iz+1,ind] / RhoCol[I,J,ColumnTilesDim+2]
+      @inbounds uCol[I,J,ColumnTilesDim+2] = uC[Iz+1,ind]
+      @inbounds vCol[I,J,ColumnTilesDim+2] = vC[Iz+1,ind]
+      @inbounds FCol[I,J,ColumnTilesDim+2] = 0
+    elseif iz == ColumnTilesDim && Iz == Nz
+      # Top of the column: repeat the last level
+      @inbounds RhoCol[I,J,ColumnTilesDim+2] = Rho[Nz,ind]
+      @inbounds cCol[I,J,ColumnTilesDim+2] = c[Nz,ind] / RhoCol[I,J,ColumnTilesDim+2]
+      @inbounds FCol[I,J,ColumnTilesDim+2] = 0
+    end
+  end
+  @synchronize
+  if Iz < Nz 
+    # Face between levels Iz and Iz+1: half 2 of cell Iz and half 1 of cell Iz+1
+    @inbounds wCon = RhoCol[I,J,iz+1] * (dXdxI[I,J,2,Iz,3,1,IF] * uCol[I,J,iz+1] + 
+      dXdxI[I,J,2,Iz,3,2,IF] * vCol[I,J,iz+1] + dXdxI[I,J,2,Iz,3,3,IF] * wCol[I,J,iz+1]) +
+      RhoCol[I,J,iz+2] * (dXdxI[I,J,1,Iz+1,3,1,IF] * uCol[I,J,iz+2] + 
+      dXdxI[I,J,1,Iz+1,3,2,IF] * vCol[I,J,iz+2] + dXdxI[I,J,1,Iz+1,3,3,IF] * wCol[I,J,iz+1])
+    @inbounds cL = cCol[I,J,iz+1]
+    @inbounds cR = cCol[I,J,iz+2]
+    # (|wCon| + wCon) selects cL for upward flow, (wCon - |wCon|) selects cR for
+    # downward flow; the factor is converted so Float32 runs are not promoted
+    Flux = eltype(F)(0.25) * ((abs(wCon) + wCon) * cL + (-abs(wCon) + wCon) * cR)
+    @inbounds @atomic FCol[I,J,iz+1] += -Flux
+    @inbounds @atomic FCol[I,J,iz+2] += Flux
+  end 
+
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds tempx = -RhoCol[I,J,iz+1] * cCol[I,J,iz+1] * ((dXdxI[I,J,1,Iz,1,1,IF] + dXdxI[I,J,2,Iz,1,1,IF]) * uCol[I,J,iz+1] +
+      (dXdxI[I,J,1,Iz,1,2,IF] + dXdxI[I,J,2,Iz,1,2,IF]) * vCol[I,J,iz+1] +
+      dXdxI[I,J,1,Iz,1,3,IF] * wCol[I,J,iz] + dXdxI[I,J,2,Iz,1,3,IF] * wCol[I,J,iz+1])
+    @inbounds tempy = -RhoCol[I,J,iz+1] * cCol[I,J,iz+1] * ((dXdxI[I,J,1,Iz,2,1,IF] + dXdxI[I,J,2,Iz,2,1,IF]) * uCol[I,J,iz+1] +
+      (dXdxI[I,J,1,Iz,2,2,IF] + dXdxI[I,J,2,Iz,2,2,IF]) * vCol[I,J,iz+1] +
+      dXdxI[I,J,1,Iz,2,3,IF] * wCol[I,J,iz] + dXdxI[I,J,2,Iz,2,3,IF] * wCol[I,J,iz+1])
+    # Scatter to all nodes of row J / column I; atomics because several
+    # work-items update the same FCol entry
+    @unroll for k = 1 : N
+      @inbounds @atomic FCol[k,J,iz+1] += D[k,I] * tempx
+      @inbounds @atomic FCol[I,k,iz+1] += D[k,J] * tempy
+    end
+  end
+  @synchronize
+  if Iz <= Nz 
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds @atomic F[Iz,ind] += FCol[I,J,iz+1] / M[Iz,ind]
+    # Halo slots carry contributions that belong to the neighbouring levels
+    if iz == 1 && Iz >  1
+      @inbounds @atomic F[Iz-1,ind] += FCol[I,J,iz] / M[Iz-1,ind]
+    end
+    if iz == ColumnTilesDim && Iz <  Nz
+      @inbounds @atomic F[Iz+1,ind] += FCol[I,J,iz+2] / M[Iz+1,ind]
+    end
+  end
+end
+
+# Tendencies of U[:,:,1] and U[:,:,5] with a four-point upwind-biased vertical
+# reconstruction (RecU4) of the ratio U[:,:,5] / U[:,:,1].
+#
+# The kernel adds to F[:,:,1] and F[:,:,5]
+#   * vertical face fluxes built from Contra3 and, for variable 5, the RecU4
+#     face values selected by the sign of wCon, and
+#   * the products of `D` with the horizontal components returned by Contra12
+#     (called with -U[:,:,1] as its scaling argument).
+# Every contribution is divided by `M`.
+#
+# Indexing as used in the body:
+#   * local index (I, J, iz), global index (_, _, Iz, IF), ind = Glob[I + (J-1)*N, IF]
+#   * U and F are addressed as [Iz, ind, variable], M as [Iz, ind]
+#   * dXdxI is addressed as [row, col, 1|2, ID, Iz, IF], JJ as [ID, 1|2, Iz, IF]
+@kernel function DivRhoThUpwind3Kernel!(F,@Const(U),@Const(D),@Const(dXdxI),
+  @Const(JJ),@Const(M),@Const(Glob))
+
+  I, J, iz   = @index(Local, NTuple)
+  _,_,Iz,IF = @index(Global, NTuple)
+
+  ColumnTilesDim = @uniform @groupsize()[3]
+  N = @uniform @groupsize()[1]
+  Nz = @uniform @ndrange()[3]
+
+  # cCol: slot iz+1 holds level Iz, slot 1 is the halo below the tile and the two
+  # slots after the last level are the halo above it
+  cCol = @localmem eltype(F) (N,N, ColumnTilesDim+3)
+  uConCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  vConCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds cCol[I,J,iz+1] = U[Iz,ind,5] / U[Iz,ind,1]
+    @views @inbounds (uCon, vCon) = Contra12(-U[Iz,ind,1],U[Iz,ind,2],U[Iz,ind,3],dXdxI[1:2,1:2,:,ID,Iz,IF])
+    @inbounds uConCol[I,J,iz] = uCon
+    @inbounds vConCol[I,J,iz] = vCon
+    # The halo loads need `ind`, so they live inside the guard that defines it.
+    # Level indices are clamped to 1:Nz, which repeats the boundary value.
+    if iz == 1
+      Izm1 = max(Iz - 1,1)
+      @inbounds cCol[I,J,iz] = U[Izm1,ind,5] / U[Izm1,ind,1]
+    end
+    if iz == ColumnTilesDim || Iz == Nz
+      Izp1 = min(Iz + 1,Nz)
+      @inbounds cCol[I,J,iz+2] = U[Izp1,ind,5] / U[Izp1,ind,1]
+      Izp2 = min(Iz + 2,Nz)
+      @inbounds cCol[I,J,iz+3] = U[Izp2,ind,5] / U[Izp2,ind,1]
+    end
+  end
+  @synchronize
+
+  if Iz < Nz 
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    # Four-point stencil around the face between levels Iz and Iz+1
+    @inbounds cLL = cCol[I,J,iz]
+    @inbounds cL = cCol[I,J,iz+1]
+    @inbounds cR = cCol[I,J,iz+2]
+    @inbounds cRR = cCol[I,J,iz+3]
+
+    @views @inbounds wCon = Contra3(U[Iz:Iz+1,ind,1],U[Iz:Iz+1,ind,2],U[Iz:Iz+1,ind,3],
+      U[Iz,ind,4],dXdxI[3,:,:,ID,Iz:Iz+1,IF])
+
+    Izm1 = max(Iz - 1,1)
+    Izp2 = min(Iz + 2, Nz)
+    @inbounds JLL = JJ[ID,1,Izm1,IF] + JJ[ID,2,Izm1,IF]
+    @inbounds JL = JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]
+    @inbounds JR = JJ[ID,1,Iz+1,IF] + JJ[ID,2,Iz+1,IF]
+    @inbounds JRR = JJ[ID,1,Izp2,IF] + JJ[ID,2,Izp2,IF]
+    cFL, cFR = RecU4(cLL,cL,cR,cRR,JLL,JL,JR,JRR) 
+    # (|wCon| + wCon) selects cFL for upward flow, (wCon - |wCon|) selects cFR
+    Flux = eltype(F)(0.25) * ((abs(wCon) + wCon) * cFL + (-abs(wCon) + wCon) * cFR)
+    @inbounds @atomic F[Iz,ind,5] += -Flux / M[Iz,ind]
+    @inbounds @atomic F[Iz+1,ind,5] += Flux / M[Iz+1,ind]
+    Flux = eltype(F)(0.5) * wCon
+    @inbounds @atomic F[Iz,ind,1] += -Flux / M[Iz,ind]
+    @inbounds @atomic F[Iz+1,ind,1] += Flux / M[Iz+1,ind]
+  end 
+
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    # First term peeled off so the accumulators take the type of the products
+    @inbounds DivRho = D[I,1] * uConCol[1,J,iz] 
+    @inbounds DivRho += D[J,1] * vConCol[I,1,iz] 
+    @inbounds DivRhoTr = D[I,1] * uConCol[1,J,iz] * cCol[1,J,iz+1] 
+    @inbounds DivRhoTr += D[J,1] * vConCol[I,1,iz] * cCol[I,1,iz+1]
+    @unroll for k = 2 : N
+      @inbounds DivRho += D[I,k] * uConCol[k,J,iz] 
+      @inbounds DivRho += D[J,k] * vConCol[I,k,iz] 
+      @inbounds DivRhoTr += D[I,k] * uConCol[k,J,iz] * cCol[k,J,iz+1] 
+      @inbounds DivRhoTr += D[J,k] * vConCol[I,k,iz] * cCol[I,k,iz+1]
+    end
+    @inbounds ind = Glob[ID,IF]
+    @inbounds @atomic F[Iz,ind,1] += DivRho / M[Iz,ind]
+    @inbounds @atomic F[Iz,ind,5] += DivRhoTr / M[Iz,ind]
+  end
+end
+
+# Tendency of U[:,:,1] only: vertical face fluxes from Contra3 plus the products
+# of `D` with the horizontal components returned by Contra12 (called with
+# -U[:,:,1] as its scaling argument). Every contribution is divided by `M` and
+# added atomically to F[:,:,1].
+#
+# U and F are addressed as [level, node, variable], M as [level, node] and
+# dXdxI as [row, col, 1|2, ID, level, face]. JJ is accepted but not read here.
+@kernel function DivRhoKernel!(F,@Const(U),@Const(D),@Const(dXdxI),
+  @Const(JJ),@Const(M),@Const(Glob))
+
+  iLoc, jLoc, kLoc = @index(Local, NTuple)
+  _,_,kLev,iFace = @index(Global, NTuple)
+
+  nTile = @uniform @groupsize()[3]
+  nLoc = @uniform @groupsize()[1]
+  nLev = @uniform @ndrange()[3]
+  nFace = @uniform @ndrange()[4]
+
+  # Horizontal flux components of every node handled by this work-group
+  fluxXCol = @localmem eltype(F) (nLoc,nLoc, nTile)
+  fluxYCol = @localmem eltype(F) (nLoc,nLoc, nTile)
+  if kLev <= nLev
+    node = iLoc + (jLoc - 1) * nLoc
+    @inbounds gInd = Glob[node,iFace]
+    @views @inbounds (fluxX, fluxY) = Contra12(-U[kLev,gInd,1],U[kLev,gInd,2],U[kLev,gInd,3],dXdxI[1:2,1:2,:,node,kLev,iFace])
+    @inbounds fluxXCol[iLoc,jLoc,kLoc] = fluxX
+    @inbounds fluxYCol[iLoc,jLoc,kLoc] = fluxY
+  end
+  @synchronize
+
+  # Vertical part: one flux per face between level kLev and kLev+1
+  if kLev < nLev
+    node = iLoc + (jLoc - 1) * nLoc
+    @inbounds gInd = Glob[node,iFace]
+
+    @views @inbounds wFace = Contra3(U[kLev:kLev+1,gInd,1],U[kLev:kLev+1,gInd,2],U[kLev:kLev+1,gInd,3],
+      U[kLev,gInd,4],dXdxI[3,:,:,node,kLev:kLev+1,iFace])
+
+    faceFlux = eltype(F)(0.5) * wFace
+    @inbounds @atomic F[kLev,gInd,1] += -faceFlux / M[kLev,gInd]
+    @inbounds @atomic F[kLev+1,gInd,1] += faceFlux / M[kLev+1,gInd]
+  end
+
+  # Horizontal part: rows of D applied along the first and second tile direction
+  if kLev <= nLev
+    node = iLoc + (jLoc - 1) * nLoc
+    acc = eltype(F)(0)
+    @unroll for m = 1 : nLoc
+      @inbounds acc += D[iLoc,m] * fluxXCol[m,jLoc,kLoc]
+      @inbounds acc += D[jLoc,m] * fluxYCol[iLoc,m,kLoc]
+    end
+    gInd = Glob[node,iFace]
+    @inbounds @atomic F[kLev,gInd,1] += acc / M[kLev,gInd]
+  end
+end
+
+# Tendency of a tracer `Tr` with a four-point upwind-biased vertical
+# reconstruction (RecU4) of the ratio Tr / U[:,:,1].
+#
+# The kernel adds to `FTr`
+#   * vertical face fluxes built from Contra3 and the RecU4 face values selected
+#     by the sign of wCon, and
+#   * the products of `D` with the Contra12 components (called with -U[:,:,1] as
+#     its scaling argument) times the ratio.
+# Every contribution is divided by `M`.
+#
+# Indexing as used in the body:
+#   * local index (I, J, iz), global index (_, _, Iz, IF), ind = Glob[I + (J-1)*N, IF]
+#   * Tr, FTr and M are addressed as [Iz, ind], U as [Iz, ind, variable]
+#   * dXdxI is addressed as [row, col, 1|2, ID, Iz, IF], JJ as [ID, 1|2, Iz, IF]
+@kernel function DivRhoTrUpwind3Kernel!(FTr,@Const(Tr),@Const(U),@Const(D),@Const(dXdxI),
+  @Const(JJ),@Const(M),@Const(Glob))
+
+  I, J, iz   = @index(Local, NTuple)
+  _,_,Iz,IF = @index(Global, NTuple)
+
+  ColumnTilesDim = @uniform @groupsize()[3]
+  N = @uniform @groupsize()[1]
+  Nz = @uniform @ndrange()[3]
+
+  # cCol: slot iz+1 holds level Iz, slot 1 is the halo below the tile and the two
+  # slots after the last level are the halo above it
+  cCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+3)
+  uConCol = @localmem eltype(FTr) (N,N, ColumnTilesDim)
+  vConCol = @localmem eltype(FTr) (N,N, ColumnTilesDim)
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds cCol[I,J,iz+1] = Tr[Iz,ind] / U[Iz,ind,1]
+    @views @inbounds (uCon, vCon) = Contra12(-U[Iz,ind,1],U[Iz,ind,2],U[Iz,ind,3],dXdxI[1:2,1:2,:,ID,Iz,IF])
+    @inbounds uConCol[I,J,iz] = uCon
+    @inbounds vConCol[I,J,iz] = vCon
+    # The halo loads need `ind`, so they live inside the guard that defines it.
+    # Level indices are clamped to 1:Nz, which repeats the boundary value.
+    if iz == 1
+      Izm1 = max(Iz - 1,1)
+      @inbounds cCol[I,J,iz] = Tr[Izm1,ind] / U[Izm1,ind,1]
+    end
+    if iz == ColumnTilesDim || Iz == Nz
+      Izp1 = min(Iz + 1,Nz)
+      @inbounds cCol[I,J,iz+2] = Tr[Izp1,ind] / U[Izp1,ind,1]
+      Izp2 = min(Iz + 2,Nz)
+      @inbounds cCol[I,J,iz+3] = Tr[Izp2,ind] / U[Izp2,ind,1]
+    end
+  end
+  @synchronize
+
+  if Iz < Nz 
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    # Four-point stencil around the face between levels Iz and Iz+1
+    @inbounds cLL = cCol[I,J,iz]
+    @inbounds cL = cCol[I,J,iz+1]
+    @inbounds cR = cCol[I,J,iz+2]
+    @inbounds cRR = cCol[I,J,iz+3]
+
+    @views @inbounds wCon = Contra3(U[Iz:Iz+1,ind,1],U[Iz:Iz+1,ind,2],U[Iz:Iz+1,ind,3],
+      U[Iz,ind,4],dXdxI[3,:,:,ID,Iz:Iz+1,IF])
+
+    Izm1 = max(Iz - 1,1)
+    Izp2 = min(Iz + 2, Nz)
+    @inbounds JLL = JJ[ID,1,Izm1,IF] + JJ[ID,2,Izm1,IF]
+    @inbounds JL = JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]
+    @inbounds JR = JJ[ID,1,Iz+1,IF] + JJ[ID,2,Iz+1,IF]
+    @inbounds JRR = JJ[ID,1,Izp2,IF] + JJ[ID,2,Izp2,IF]
+    cFL, cFR = RecU4(cLL,cL,cR,cRR,JLL,JL,JR,JRR) 
+    # (|wCon| + wCon) selects cFL for upward flow, (wCon - |wCon|) selects cFR
+    Flux = eltype(FTr)(0.25) * ((abs(wCon) + wCon) * cFL + (-abs(wCon) + wCon) * cFR)
+    @inbounds @atomic FTr[Iz,ind] += -Flux / M[Iz,ind]
+    @inbounds @atomic FTr[Iz+1,ind] += Flux / M[Iz+1,ind]
+  end 
+
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    # First term peeled off so the accumulator takes the type of the products
+    @inbounds DivRhoTr = D[I,1] * uConCol[1,J,iz] * cCol[1,J,iz+1] 
+    @inbounds DivRhoTr += D[J,1] * vConCol[I,1,iz] * cCol[I,1,iz+1]
+    @unroll for k = 2 : N
+      @inbounds DivRhoTr += D[I,k] * uConCol[k,J,iz] * cCol[k,J,iz+1] 
+      @inbounds DivRhoTr += D[J,k] * vConCol[I,k,iz] * cCol[I,k,iz+1]
+    end
+    @inbounds ind = Glob[ID,IF]
+    @inbounds @atomic FTr[Iz,ind] += DivRhoTr / M[Iz,ind]
+  end
+end
+
+# Combined tendency of U[:,:,1] and U[:,:,5]: upwind-biased vertical transport
+# (RecU4), horizontal transport, and a horizontal second-derivative term built
+# from `Cache`, scaled by `Koeff`.
+#
+# The kernel adds to F[:,:,1] and F[:,:,5]
+#   * vertical face fluxes (written straight to `F`),
+#   * Grad12 of D * Cache, passed through Contra12 with -Koeff and then through
+#     `DW` (gathered in FTrCol), and
+#   * the products of `D` with the Contra12 components of (u, v), for variable 1
+#     (FRhoCol) and, multiplied by the ratio U5 / U1, for variable 5 (FTrCol).
+# Every contribution is divided by `M`.
+#
+# Indexing as used in the body:
+#   * local index (I, J, iz), global index (_, _, Iz, IF), ind = Glob[I + (J-1)*N, IF]
+#   * U and F are addressed as [Iz, ind, variable], Cache and M as [Iz, ind]
+#   * dXdxI is addressed as [row, col, 1|2, ID, Iz, IF], JJ as [ID, 1|2, Iz, IF]
+@kernel function DivRhoTrUpwind3Kernel!(F,@Const(U),@Const(Cache),@Const(D),@Const(DW),@Const(dXdxI),
+  @Const(JJ),@Const(M),@Const(Glob),Koeff)
+
+  I, J, iz   = @index(Local, NTuple)
+  _,_,Iz,IF = @index(Global, NTuple)
+
+  ColumnTilesDim = @uniform @groupsize()[3]
+  N = @uniform @groupsize()[1]
+  Nz = @uniform @ndrange()[3]
+
+  # Tiles without halo: slot iz holds level Iz
+  cCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  CacheCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  RhoCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  uCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  vCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  FTrCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  FRhoCol = @localmem eltype(F) (N,N, ColumnTilesDim)
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds CacheCol[I,J,iz] = Cache[Iz,ind]
+    @inbounds RhoCol[I,J,iz] = U[Iz,ind,1]
+    @inbounds cCol[I,J,iz] = U[Iz,ind,5] / RhoCol[I,J,iz]
+    @inbounds uCol[I,J,iz] = U[Iz,ind,2]
+    @inbounds vCol[I,J,iz] = U[Iz,ind,3]
+    @inbounds FRhoCol[I,J,iz] = 0
+    @inbounds FTrCol[I,J,iz] = 0
+  end
+  @synchronize
+  if Iz < Nz 
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    Izm1 = max(Iz - 1,1)
+    Izp2 = min(Iz + 2, Nz)
+    # Four-point stencil around the face between levels Iz and Iz+1. A neighbour
+    # is taken from the tile only when its slot exists and was filled by a
+    # work-item of this group; otherwise it is recomputed from U with the level
+    # index clamped to 1:Nz.
+    @inbounds cL = cCol[I,J,iz]
+    if iz < ColumnTilesDim
+      @inbounds cR = cCol[I,J,iz+1]
+    else
+      @inbounds cR = U[Iz+1,ind,5] / U[Iz+1,ind,1]
+    end
+    if iz > 1
+      @inbounds cLL = cCol[I,J,iz-1]
+    else
+      @inbounds cLL = U[Izm1,ind,5] / U[Izm1,ind,1]
+    end
+    if iz < ColumnTilesDim - 1 && Iz + 2 <= Nz
+      @inbounds cRR = cCol[I,J,iz+2]
+    else
+      @inbounds cRR = U[Izp2,ind,5] / U[Izp2,ind,1]
+    end
+
+    @views @inbounds wCon = Contra3(U[Iz:Iz+1,ind,1],U[Iz:Iz+1,ind,2],U[Iz:Iz+1,ind,3],
+      U[Iz,ind,4],dXdxI[3,:,:,ID,Iz:Iz+1,IF])
+
+    @inbounds JLL = JJ[ID,1,Izm1,IF] + JJ[ID,2,Izm1,IF]
+    @inbounds JL = JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]
+    @inbounds JR = JJ[ID,1,Iz+1,IF] + JJ[ID,2,Iz+1,IF]
+    @inbounds JRR = JJ[ID,1,Izp2,IF] + JJ[ID,2,Izp2,IF]
+    cFL, cFR = RecU4(cLL,cL,cR,cRR,JLL,JL,JR,JRR) 
+    # (|wCon| + wCon) selects cFL for upward flow, (wCon - |wCon|) selects cFR;
+    # the factor is converted so Float32 runs are not promoted
+    Flux = eltype(F)(0.25) * ((abs(wCon) + wCon) * cFL + (-abs(wCon) + wCon) * cFR)
+    @inbounds @atomic F[Iz,ind,5] += -Flux / M[Iz,ind]
+    @inbounds @atomic F[Iz+1,ind,5] += Flux / M[Iz+1,ind]
+    Flux = eltype(F)(0.5) * wCon
+    @inbounds @atomic F[Iz,ind,1] += -Flux / M[Iz,ind]
+    @inbounds @atomic F[Iz+1,ind,1] += Flux / M[Iz+1,ind]
+  end 
+
+  if Iz <= Nz
+    ID = I + (J - 1) * N  
+    # Derivatives of Cache along the two tile directions
+    Dxc = zero(eltype(F))
+    Dyc = zero(eltype(F))
+    @unroll for k = 1 : N
+      @inbounds Dxc = Dxc + D[I,k] * CacheCol[k,J,iz]
+      @inbounds Dyc = Dyc + D[J,k] * CacheCol[I,k,iz]
+    end
+    
+    @views @inbounds (GradDx, GradDy) = Grad12(RhoCol[I,J,iz],Dxc,Dyc,dXdxI[1:2,1:2,:,ID,Iz,IF],JJ[ID,:,Iz,IF])
+    @views @inbounds (tempx, tempy) = Contra12(-Koeff,GradDx,GradDy,dXdxI[1:2,1:2,:,ID,Iz,IF])
+    # Scatter loops use atomics because several work-items update the same entry
+    @unroll for k = 1 : N
+      @inbounds @atomic FTrCol[k,J,iz] += DW[k,I] * tempx
+      @inbounds @atomic FTrCol[I,k,iz] += DW[k,J] * tempy
+    end
+
+    @views @inbounds (tempxRho, tempyRho) = Contra12(-RhoCol[I,J,iz],uCol[I,J,iz],vCol[I,J,iz],dXdxI[1:2,1:2,:,ID,Iz,IF])
+    @unroll for k = 1 : N
+      @inbounds @atomic FRhoCol[k,J,iz] += D[k,I] * tempxRho
+      @inbounds @atomic FRhoCol[I,k,iz] += D[k,J] * tempyRho
+    end
+    @inbounds tempxTr = tempxRho * cCol[I,J,iz]
+    @inbounds tempyTr = tempyRho * cCol[I,J,iz]
+    @unroll for k = 1 : N
+      @inbounds @atomic FTrCol[k,J,iz] += D[k,I] * tempxTr
+      @inbounds @atomic FTrCol[I,k,iz] += D[k,J] * tempyTr
+    end
+  end
+  @synchronize
+
+  if Iz <= Nz 
+    ID = I + (J - 1) * N  
+    @inbounds ind = Glob[ID,IF]
+    @inbounds @atomic F[Iz,ind,1] += FRhoCol[I,J,iz] / M[Iz,ind]
+    @inbounds @atomic F[Iz,ind,5] += FTrCol[I,J,iz] / M[Iz,ind]
+  end
+end
+
+
+# Apply the 2x2 block of `dXdxI`, summed over its third index, to (u, v) and
+# scale both results by `Rho`. Row 1 gives the first return value, row 2 the second.
+@inline function Contra12(Rho,u,v,dXdxI)
+  @inbounds begin
+    a11 = dXdxI[1,1,1] + dXdxI[1,1,2]
+    a12 = dXdxI[1,2,1] + dXdxI[1,2,2]
+    a21 = dXdxI[2,1,1] + dXdxI[2,1,2]
+    a22 = dXdxI[2,2,1] + dXdxI[2,2,2]
+  end
+  return Rho * (a11 * u + a12 * v), Rho * (a21 * u + a22 * v)
+end
+
+# Apply the 2x2 block of `dXdxI`, summed over its third index, to (u, v).
+# Row 1 gives the first return value, row 2 the second.
+@inline function Contra12(u,v,dXdxI)
+  @inbounds begin
+    a11 = dXdxI[1,1,1] + dXdxI[1,1,2]
+    a12 = dXdxI[1,2,1] + dXdxI[1,2,2]
+    a21 = dXdxI[2,1,1] + dXdxI[2,1,2]
+    a22 = dXdxI[2,2,1] + dXdxI[2,2,2]
+  end
+  return a11 * u + a12 * v, a21 * u + a22 * v
+end
+
+# Apply the transposed 2x2 block of `dXdxI`, summed over its third index, to
+# (u, v), scale by `Rho` and divide by J[1] + J[2].
+@inline function Grad12(Rho,u,v,dXdxI,J)
+  @inbounds begin
+    a11 = dXdxI[1,1,1] + dXdxI[1,1,2]
+    a21 = dXdxI[2,1,1] + dXdxI[2,1,2]
+    a12 = dXdxI[1,2,1] + dXdxI[1,2,2]
+    a22 = dXdxI[2,2,1] + dXdxI[2,2,2]
+    gx = Rho * (a11 * u + a21 * v) / (J[1] + J[2])
+    gy = Rho * (a12 * u + a22 * v) / (J[1] + J[2])
+  end
+  return gx, gy
+end
+
+# Apply the transposed 2x2 block of `dXdxI`, summed over its third index, to
+# (u, v) and divide by J[1] + J[2].
+@inline function Grad12(u,v,dXdxI,J)
+  @inbounds begin
+    a11 = dXdxI[1,1,1] + dXdxI[1,1,2]
+    a21 = dXdxI[2,1,1] + dXdxI[2,1,2]
+    a12 = dXdxI[1,2,1] + dXdxI[1,2,2]
+    a22 = dXdxI[2,2,1] + dXdxI[2,2,2]
+    gx = (a11 * u + a21 * v) / (J[1] + J[2])
+    gy = (a12 * u + a22 * v) / (J[1] + J[2])
+  end
+  return gx, gy
+end
+
+# Apply the transposed 2x2 block of `dXdxI`, summed over its third index, to (u, v).
+@inline function Grad12(u,v,dXdxI)
+  @inbounds begin
+    a11 = dXdxI[1,1,1] + dXdxI[1,1,2]
+    a21 = dXdxI[2,1,1] + dXdxI[2,1,2]
+    a12 = dXdxI[1,2,1] + dXdxI[1,2,2]
+    a22 = dXdxI[2,2,1] + dXdxI[2,2,2]
+  end
+  return a11 * u + a21 * v, a12 * u + a22 * v
+end
+
+# Combine (u, v) with column 3 of `dXdxI`, separately for third index 1 and 2.
+@inline function Grad3(u,v,dXdxI)
+  @inbounds begin
+    first = dXdxI[1,3,1] * u + dXdxI[2,3,1] * v
+    second = dXdxI[1,3,2] * u + dXdxI[2,3,2] * v
+  end
+  return first, second
+end
+
+# For each row r of the 2x2 block of `dXdxI` (summed over its third index) return
+# a[r,1] * v - a[r,2] * u.
+@inline function Curl12(u,v,dXdxI)
+  @inbounds begin
+    a11 = dXdxI[1,1,1] + dXdxI[1,1,2]
+    a12 = dXdxI[1,2,1] + dXdxI[1,2,2]
+    a21 = dXdxI[2,1,1] + dXdxI[2,1,2]
+    a22 = dXdxI[2,2,1] + dXdxI[2,2,2]
+  end
+  return a11 * v - a12 * u, a21 * v - a22 * u
+end
+
+# With a = 2x2 block of `dXdxI` summed over its third index, return
+# (a[1,2] * u + a[2,2] * v, -a[1,1] * u - a[2,1] * v).
+@inline function Rot12(u,v,dXdxI)
+  @inbounds begin
+    a11 = dXdxI[1,1,1] + dXdxI[1,1,2]
+    a21 = dXdxI[2,1,1] + dXdxI[2,1,2]
+    a12 = dXdxI[1,2,1] + dXdxI[1,2,2]
+    a22 = dXdxI[2,2,1] + dXdxI[2,2,2]
+  end
+  return a12 * u + a22 * v, -a11 * u - a21 * v
+end
+ 
+# Face value built from two neighbouring entries: entry 1 of Rho, u, v is combined
+# with dXdxI[:,2,1], entry 2 with dXdxI[:,1,2]; the single `w` enters both parts.
+@inline function Contra3(Rho,u,v,w,dXdxI)
+  @inbounds begin
+    below = Rho[1] * (dXdxI[1,2,1] * u[1] + dXdxI[2,2,1] * v[1] + dXdxI[3,2,1] * w)
+    above = Rho[2] * (dXdxI[1,1,2] * u[2] + dXdxI[2,1,2] * v[2] + dXdxI[3,1,2] * w)
+  end
+  return below + above
+end
+  
+# Two three-point face values from four cell values.
+#   cFL uses (cLL, cL, cR), cFR uses (cL, cR, cRR); the weights are built from
+#   JLL, JL, JR, JRR and sum to one in each formula.
+# Returns the tuple (cFL, cFR).
+@inline function RecU4(cLL,cL,cR,cRR,JLL,JL,JR,JRR)
+  # Left-biased value
+  wUp = (JL / (JL + JR)) * ((JLL + JL) / (JLL + JL + JR))
+  wFar = -(JL / (JLL + JL)) * (JR / (JLL + JL + JR))
+  cFL = wFar * cLL + (1 - wFar - wUp)*cL + wUp * cR
+
+  # Right-biased value
+  wDown = (JR / (JR + JL)) * ((JRR + JR)/(JL + JR + JRR))
+  wFarR = -(JR /(JRR + JR)) *(JL /(JL + JR + JRR))
+  cFR = wDown * cL + (1 - wDown - wFarR) * cR + wFarR * cRR
+
+  return (cFL,cFR)
+end
+
+
+# Point-wise forcing: calls `Force` with the first five variables of U at
+# (level, node), p at (level, node) and lat at the node, and adds the five
+# returned values to variables 1:5 of F.
+@kernel function ForceKernel!(Force,F,U,p,lat)
+  lev,node = @index(Global, NTuple)
+  nNode = @uniform @ndrange()[2]
+
+  if node <= nNode
+    @inbounds begin
+      dRho,du,dv,dw,dRhoTh = Force(view(U,lev,node,1:5),p[lev,node],lat[node])
+      F[lev,node,1] += dRho
+      F[lev,node,2] += du
+      F[lev,node,3] += dv
+      F[lev,node,4] += dw
+      F[lev,node,5] += dRhoTh
+    end
+  end
+end  
+
+# Point-wise source term: calls `Source` with all variables of U at (level, node)
+# and p at (level, node), and adds the four returned values to variables
+# 1, 5, 6 and 7 of F.
+@kernel function MicrophysicsKernel!(Source,F,U,p)
+  lev,node = @index(Global, NTuple)
+  nNode = @uniform @ndrange()[2]
+
+  if node <= nNode
+    @inbounds begin
+      dRho,dRhoTh,dRhoV,dRhoC = Source(view(U,lev,node,:),p[lev,node])
+      F[lev,node,1] += dRho
+      F[lev,node,5] += dRhoTh
+      F[lev,node,6] += dRhoV
+      F[lev,node,7] += dRhoC
+    end
+  end
+end  
+
+# Vertical diffusion of the ratio Tr / Rho.
+#
+# For every face between levels Iz and Iz+1 the kernel forms
+#   grad = (K[Iz] + K[Iz+1]) * (c[Iz+1] - c[Iz]) * (metric sum) / (JJ sum)
+# and adds +dXdxI * grad / M to level Iz and -dXdxI * grad / M to level Iz+1.
+#
+# Indexing as used in the body:
+#   * local index (I, J, iz), global index (_, _, Iz, IF), ind = Glob[I + (J-1)*N, IF]
+#   * Tr, Rho, FTr and M are addressed as [Iz, ind], K as [ID, Iz, IF]
+#   * dXdxI is addressed as [3, 3, 1|2, ID, Iz, IF], JJ as [ID, 1|2, Iz, IF]
+@kernel function VerticalDiffusionScalarKernel!(FTr,@Const(Tr),@Const(Rho),@Const(K),
+  @Const(dXdxI),@Const(JJ),@Const(M),@Const(Glob))
+  I, J, iz   = @index(Local, NTuple)
+  _,_,Iz,IF = @index(Global, NTuple)
+
+  ColumnTilesDim = @uniform @groupsize()[3]
+  N = @uniform @groupsize()[1]
+  Nz = @uniform @ndrange()[3]
+
+  # Slot iz holds level Iz; one extra slot holds the level above the tile
+  cCol = @localmem eltype(FTr) (N,N,ColumnTilesDim+1)
+
+  if Iz <= Nz
+    ID = I + (J - 1) * N
+    @inbounds ind = Glob[ID,IF]
+    @inbounds cCol[I,J,iz] = Tr[Iz,ind] / Rho[Iz,ind]
+    # The halo load needs `ind`, so it lives inside the guard that defines it.
+    # The level index is clamped to Nz.
+    if iz == ColumnTilesDim || Iz == Nz
+      Izp1 = min(Iz + 1,Nz)
+      @inbounds cCol[I,J,iz+1] = Tr[Izp1,ind] / Rho[Izp1,ind]
+    end
+  end
+  @synchronize
+
+  if Iz < Nz
+    ID = I + (J - 1) * N
+    @inbounds ind = Glob[ID,IF]
+    @inbounds grad = (K[ID,Iz,IF] + K[ID,Iz+1,IF]) * (cCol[I,J,iz+1] - cCol[I,J,iz]) *
+       (dXdxI[3,3,2,ID,Iz,IF] + dXdxI[3,3,1,ID,Iz+1,IF]) / ( JJ[ID,2,Iz,IF] + JJ[ID,1,Iz+1,IF])
+    @inbounds @atomic FTr[Iz,ind] +=  dXdxI[3,3,2,ID,Iz,IF] * grad / M[Iz,ind]  
+    @inbounds @atomic FTr[Iz+1,ind] += - dXdxI[3,3,1,ID,Iz+1,IF] * grad / M[Iz+1,ind]     
+  end  
+end  
+
+
+# Surface Flux F = nS * grad (c - cS) * FS
+
+# Vertical diffusion of the ratio Tr / Rho on a column layout addressed as
+# [Iz, IG].
+#
+# For every face between levels Iz and Iz+1 the kernel forms
+#   Grad = (K[Iz] + K[Iz+1]) * (Tr/Rho at Iz+1 - Tr/Rho at Iz) / (dz[Iz+1] + dz[Iz])
+# and adds +Grad / dz[Iz] to level Iz and -Grad / dz[Iz+1] to level Iz+1.
+@kernel function VerticalDiffusionCScalarKernel!(FTr,@Const(Tr),@Const(Rho),@Const(K),@Const(dz))
+  Iz,IG = @index(Global, NTuple)
+  NG = @uniform @ndrange()[2]
+  # Number of levels; the guard below refers to this name
+  Nz = @uniform @ndrange()[1]
+
+  if Iz < Nz && IG <= NG
+    @inbounds Grad = (K[Iz,IG] + K[Iz+1,IG]) * (Tr[Iz+1,IG] / Rho[Iz+1,IG]  - Tr[Iz,IG] / Rho[Iz,IG]) /
+      (dz[Iz+1,IG] + dz[Iz,IG])
+    @inbounds @atomic FTr[Iz,IG] += Grad / dz[Iz,IG]   
+    @inbounds @atomic FTr[Iz+1,IG] += -Grad / dz[Iz+1,IG]  
+  end
+end
+
+# Surface fluxes for the lowest level (level index 1).
+#
+# For every surface node (ID, IF) with ind = Glob[ID, IF] the kernel
+#   * reads Rho = U[1,ind,1], RhoTh = U[1,ind,5] and RhoV = U[1,ind,6],
+#   * forms LatFlux from CT, uStar and (RhoV - RhoVSurf) and SensFlux from CH,
+#     uStar and (T - TSurf), both scaled by dXdxI[3,3,1,ID,1,IF] / M[1,ind],
+#   * adds LatFlux to variables 1 and 6 of F and the combined FRhoTh to variable 5.
+# `Phys` supplies Rd, Rv, Cpd, Cpv and p0.
+@kernel function SurfaceFluxScalarsKernel(F,U,p,TSurf,RhoVSurf,uStar,CT,CH,dXdxI,Glob,M,Phys)
+  ID,IF = @index(Global, NTuple)
+
+  NF = @uniform @ndrange()[2]
+
+  if IF <= NF
+    @inbounds ind = Glob[ID,IF]  
+    @inbounds Rho = U[1,ind,1]
+    @inbounds RhoTh = U[1,ind,5]
+    @inbounds RhoV = U[1,ind,6]
+    RhoD = Rho - RhoV
+    # Density-weighted gas constant and heat capacity
+    Rm = Phys.Rd * RhoD + Phys.Rv * RhoV
+    Cpml = Phys.Cpd * RhoD + Phys.Cpv * RhoV
+    @inbounds T = p[1,ind] / Rm
+    # RhoV is the scalar loaded above, so it is used directly. Integer literals
+    # keep the element type of the operands (no promotion to Float64).
+    @inbounds LatFlux = - 2 * CT[ID,IF] * uStar[ID,IF] * dXdxI[3,3,1,ID,1,IF] * 
+      (RhoV - RhoVSurf[ID,IF]) / M[1,ind]
+    @inbounds SensFlux = - 2 * CH[ID,IF] * uStar[ID,IF] * dXdxI[3,3,1,ID,1,IF] * 
+      (T - TSurf[ID,IF]) / M[1,ind]
+    FRho = LatFlux
+    FRhoV = LatFlux
+    @inbounds PrePi=(p[1,ind] / Phys.p0)^(Rm / Cpml)
+    FRhoTh = RhoTh * (SensFlux / T + ((Phys.Rv / Rm) - 1 / Rho - 
+      log(PrePi)*(Phys.Rv / Rm - Phys.Cpv / Cpml)) *  LatFlux)
+    @inbounds @atomic F[1,ind,1] += FRho 
+    @inbounds @atomic F[1,ind,5] += FRhoTh 
+    @inbounds @atomic F[1,ind,6] += FRhoV 
+  end  
+end
+