From 11214ecbbde85ed124e45fd8f80cb0a979f86d2f Mon Sep 17 00:00:00 2001 From: OsKnoth <50015520+OsKnoth@users.noreply.github.com> Date: Wed, 15 Nov 2023 19:24:48 +0100 Subject: [PATCH] Advection test on the sphere --- Examples/testAdvectionCart.jl | 79 ++++-- Examples/testAdvectionCartGPU.jl | 252 ------------------- Examples/testAdvectionCartMPI.jl | 225 ----------------- Examples/testAdvectionSphere.jl | 242 +++++++++++++----- Examples/testNHSphere.jl | 6 +- Jobs/JobAdvectionCubeCartCPU | 2 +- Jobs/JobAdvectionSlottedCylinder | 7 +- src/DyCore/DiscretizationCG.jl | 4 +- src/DyCore/FiniteElement.jl | 6 +- src/DyCore/InitDriver.jl | 12 +- src/Examples/initial.jl | 42 +++- src/Examples/parameters.jl | 26 +- src/GPU/FcnGPU.jl | 16 +- src/GPU/GPU.jl | 1 - src/GPU/HorLimiterKernel.jl | 57 +++-- src/GPU/HorLimiterKernel.jl_Old | 417 +++++++++++++++++++++++++++++++ src/Grids/Trans.jl | 6 +- 17 files changed, 778 insertions(+), 622 deletions(-) delete mode 100644 Examples/testAdvectionCartGPU.jl delete mode 100644 Examples/testAdvectionCartMPI.jl create mode 100644 src/GPU/HorLimiterKernel.jl_Old diff --git a/Examples/testAdvectionCart.jl b/Examples/testAdvectionCart.jl index 6c5a2b0..5875b39 100644 --- a/Examples/testAdvectionCart.jl +++ b/Examples/testAdvectionCart.jl @@ -1,9 +1,16 @@ -using CGDycore +import CGDycore: + Examples, Parallels, Models, Grids, Outputs, Integration, GPU, DyCore using MPI using Base - +using CUDA +using AMDGPU +using Metal +using KernelAbstractions +using StaticArrays +using ArgParse +using MPI # Model -parsed_args = CGDycore.parse_commandline() +parsed_args = DyCore.parse_commandline() Problem = parsed_args["Problem"] ProfRho = parsed_args["ProfRho"] ProfTheta = parsed_args["ProfTheta"] @@ -76,8 +83,41 @@ PrintHours = parsed_args["PrintHours"] PrintMinutes = parsed_args["PrintMinutes"] PrintSeconds = parsed_args["PrintSeconds"] PrintTime = parsed_args["PrintTime"] +# Device +Device = parsed_args["Device"] +GPUType = parsed_args["GPUType"] +FloatTypeBackend = parsed_args["FloatTypeBackend"] + +if Device == "CPU" + backend = CPU() +elseif Device == "GPU" + if GPUType == "CUDA" + backend = CUDABackend() + CUDA.allowscalar(false) +# CUDA.device!(MPI.Comm_rank(MPI.COMM_WORLD)) + elseif GPUType == "AMD" + backend = ROCBackend() + AMDGPU.allowscalar(false) + elseif GPUType == "Metal" + backend = MetalBackend() + Metal.allowscalar(true) + end +else + backend = CPU() +end + +if FloatTypeBackend == "Float64" + FTB = Float64 +elseif FloatTypeBackend == "Float32" + FTB = Float32 +else + @show "False FloatTypeBackend" + stop +end + +Param = Examples.Parameters(FTB,Problem) -Param = CGDycore.Parameters(Problem) +KernelAbstractions.synchronize(backend) MPI.Init() @@ -85,10 +125,10 @@ OrdPolyZ=1 Parallel = true # Physical parameters -Phys=CGDycore.PhysParameters() +Phys=DyCore.PhysParameters{FTB}() #ModelParameters -Model = CGDycore.Model() +Model = DyCore.ModelStruct{FTB}() # Initial conditions Model.Equation="Compressible" Model.NumV=NumV @@ -151,7 +191,7 @@ Model.HyperDDiv = HyperDDiv # =7.e15 -Boundary = CGDycore.Boundary() +Boundary = Grids.Boundary() Boundary.WE = BoundaryWE Boundary.SN = BoundarySN Boundary.BT = BoundaryBT @@ -161,20 +201,17 @@ Topography=(TopoS=TopoS, P2=P2, P3=P3, P4=P4, - ) + ) -Global = CGDycore.InitCart(OrdPoly,OrdPolyZ,nx,ny,Lx,Ly,x0,y0,nz,H, + @show "vor InitCart" + (CG, Metric, Exchange, Global) = DyCore.InitCart(backend,FTB,OrdPoly,OrdPolyZ,nx,ny,Lx,Ly,x0,y0,nz,H, Boundary,GridType,Topography,Decomp,Model,Phys) -if TopoS == "EarthOrography" - (CG,Global)=CGDycore.DiscretizationCG(OrdPoly,OrdPolyZ,CGDycore.JacobiDG3Neu,Global,zS) -else - (CG,Global)=CGDycore.DiscretizationCG(OrdPoly,OrdPolyZ,CGDycore.JacobiDG3Neu,Global) -end + Profile = Examples.RotationalCartExample()(Param,Phys) + U = GPU.InitialConditionsAdvection(backend,FTB,CG,Metric,Phys,Global,Profile,Param) - U = CGDycore.InitialConditionsAdvection(CG,Global,Param) # Output Global.Output.vtkFileName=string(Problem*"_") @@ -190,7 +227,8 @@ end Global.Output.PrintTime = PrintTime Global.Output.PrintStartTime = 0 Global.Output.OrdPrint=CG.OrdPoly - Global.vtkCache = CGDycore.vtkStruct(Global.Output.OrdPrint,CGDycore.TransCartX,CG,Global) + @show "vor Global.vtkCache" + Global.vtkCache = Outputs.vtkStruct{FTB}(backend,Global.Output.OrdPrint,Grids.TransCartX!,CG,Metric,Global) # TimeStepper @@ -205,5 +243,10 @@ end Global.TimeStepper.SimTime = SimTime nT = NumV + NumTr - CGDycore.InitExchangeData3D(nz,nT,Global.Exchange) - CGDycore.TimeStepperAdvection!(U,CGDycore.TransCartX,CG,Global,Param) + Parallels.InitExchangeData3D(backend,FTB,nz,nT,Exchange) + @show "vor TimeStepperGPUAdvection!" + if Device == "CPU" || Device == "GPU" + Integration.TimeStepperGPUAdvection!(U,GPU.FcnAdvectionGPU!,Grids.TransCartX,CG,Metric,Phys,Exchange,Global,Param,Profile) + else + Integration.TimeStepperAdvection!(U,DyCore.FcnTracer!,Grids.TransCartX,CG,Metric,Phys,Exchange,Global,Param,Profile) + end diff --git a/Examples/testAdvectionCartGPU.jl b/Examples/testAdvectionCartGPU.jl deleted file mode 100644 index d832012..0000000 --- a/Examples/testAdvectionCartGPU.jl +++ /dev/null @@ -1,252 +0,0 @@ -import CGDycore: - Examples, Parallels, Models, Grids, Outputs, Integration, GPU, DyCore -using MPI -using Base -using CUDA -using AMDGPU -using Metal -using KernelAbstractions -using StaticArrays -using ArgParse -using MPI -# Model -parsed_args = DyCore.parse_commandline() -Problem = parsed_args["Problem"] -ProfRho = parsed_args["ProfRho"] -ProfTheta = parsed_args["ProfTheta"] -ProfVel = parsed_args["ProfVel"] -ProfVelW = parsed_args["ProfVelW"] -ProfpBGrd = parsed_args["ProfpBGrd"] -ProfRhoBGrd = parsed_args["ProfRhoBGrd"] -ProfTr = parsed_args["ProfTr"] -HorLimit = parsed_args["HorLimit"] -Upwind = parsed_args["Upwind"] -Damping = parsed_args["Damping"] -Relax = parsed_args["Relax"] -StrideDamp = parsed_args["StrideDamp"] -NumV = parsed_args["NumV"] -NumTr = parsed_args["NumTr"] -BoundaryWE = parsed_args["BoundaryWE"] -BoundarySN = parsed_args["BoundarySN"] -BoundaryBT = parsed_args["BoundaryBT"] -Thermo = parsed_args["Thermo"] -RefProfile = parsed_args["RefProfile"] -Profile = parsed_args["Profile"] -Curl = parsed_args["Curl"] -ModelType = parsed_args["ModelType"] -#Orography -TopoS = parsed_args["TopoS"] -P1 = parsed_args["P1"] -P2 = parsed_args["P2"] -P3 = parsed_args["P3"] -P4 = parsed_args["P4"] - -# Parallel -Decomp = parsed_args["Decomp"] -SimDays = parsed_args["SimDays"] -SimHours = parsed_args["SimHours"] -SimMinutes = parsed_args["SimMinutes"] -SimSeconds = parsed_args["SimSeconds"] -SimTime = parsed_args["SimTime"] -dtau = parsed_args["dtau"] -IntMethod = parsed_args["IntMethod"] -Table = parsed_args["Table"] -GridType = parsed_args["GridType"] -Coriolis = parsed_args["Coriolis"] -CoriolisType = parsed_args["CoriolisType"] -Microphysics = parsed_args["Microphysics"] -Source = parsed_args["Source"] -VerticalDiffusion = parsed_args["VerticalDiffusion"] -VerticalDiffusionMom = parsed_args["VerticalDiffusionMom"] -SurfaceFlux = parsed_args["SurfaceFlux"] -SurfaceFluxMom = parsed_args["SurfaceFluxMom"] -# Grid -nx = parsed_args["nx"] -ny = parsed_args["ny"] -nz = parsed_args["nz"] -H = parsed_args["H"] -Stretch = parsed_args["Stretch"] -StretchType = parsed_args["StretchType"] -OrdPoly = parsed_args["OrdPoly"] -Lx = parsed_args["Lx"] -Ly = parsed_args["Ly"] -x0 = parsed_args["x0"] -y0 = parsed_args["y0"] -# Viscosity -HyperVisc = parsed_args["HyperVisc"] -HyperDCurl = parsed_args["HyperDCurl"] -HyperDGrad = parsed_args["HyperDGrad"] -HyperDDiv = parsed_args["HyperDDiv"] -# Output -PrintDays = parsed_args["PrintDays"] -PrintHours = parsed_args["PrintHours"] -PrintMinutes = parsed_args["PrintMinutes"] -PrintSeconds = parsed_args["PrintSeconds"] -PrintTime = parsed_args["PrintTime"] -# Device -Device = parsed_args["Device"] -GPUType = parsed_args["GPUType"] -FloatTypeBackend = parsed_args["FloatTypeBackend"] - -if Device == "CPU" - backend = CPU() -elseif Device == "GPU" - if GPUType == "CUDA" - backend = CUDABackend() - CUDA.allowscalar(false) -# CUDA.device!(MPI.Comm_rank(MPI.COMM_WORLD)) - elseif GPUType == "AMD" - backend = ROCBackend() - AMDGPU.allowscalar(false) - elseif GPUType == "Metal" - backend = MetalBackend() - Metal.allowscalar(true) - end -else - backend = CPU() -end - -if FloatTypeBackend == "Float64" - FTB = Float64 -elseif FloatTypeBackend == "Float32" - FTB = Float32 -else - @show "False FloatTypeBackend" - stop -end - -Param = Examples.Parameters(FTB,Problem) - -KernelAbstractions.synchronize(backend) - -MPI.Init() - -OrdPolyZ=1 -Parallel = true - -# Physical parameters -Phys=DyCore.PhysParameters{FTB}() - -#ModelParameters -Model = DyCore.ModelStruct{FTB}() -# Initial conditions -Model.Equation="Compressible" -Model.NumV=NumV -Model.NumTr=NumTr -Model.Problem=Problem -if ProfRho == "" - Model.ProfRho = Problem -else - Model.ProfRho = ProfRho -end -if ProfTheta == "" - Model.ProfTheta = Problem -else - Model.ProfTheta = ProfTheta -end -if ProfVel == "" - Model.ProfVel = Problem -else - Model.ProfVel = ProfVel -end -if ProfVelW == "" - Model.ProfVelW = Problem -else - Model.ProfVelW = ProfVelW -end -Model.ProfpBGrd = ProfpBGrd -Model.ProfRhoBGrd = ProfRhoBGrd -Model.ProfTr = ProfTr -Model.RefProfile = RefProfile -Model.Profile = Profile -Model.RhoPos=1 -Model.uPos=2 -Model.vPos=3 -Model.wPos=4 -Model.ThPos=5 -Model.HorLimit = HorLimit -Model.Upwind = Upwind -Model.Damping = Damping -Model.StrideDamp = StrideDamp -Model.Relax = Relax -Model.Coriolis = Coriolis -Model.CoriolisType = CoriolisType -Model.VerticalDiffusion = VerticalDiffusion -Model.VerticalDiffusionMom = VerticalDiffusionMom -Model.Source = Source -Model.Microphysics = Microphysics -Model.Source = Source -Model.SurfaceFlux = SurfaceFlux -Model.SurfaceFluxMom = SurfaceFluxMom -Model.Thermo = Thermo -Model.Curl = Curl -Model.Stretch = Stretch -Model.StretchType = StretchType -Model.ModelType = ModelType -Model.HyperVisc = HyperVisc -Model.HyperDCurl = HyperDCurl # =7.e15 -Model.HyperDGrad = HyperDGrad # =7.e15 -Model.HyperDDiv = HyperDDiv # =7.e15 - - - - -Boundary = Grids.Boundary() -Boundary.WE = BoundaryWE -Boundary.SN = BoundarySN -Boundary.BT = BoundaryBT -Topography=(TopoS=TopoS, - H=H, - P1=P1, - P2=P2, - P3=P3, - P4=P4, - ) - - @show "vor InitCart" - (CG, Metric, Exchange, Global) = DyCore.InitCart(backend,FTB,OrdPoly,OrdPolyZ,nx,ny,Lx,Ly,x0,y0,nz,H, - Boundary,GridType,Topography,Decomp,Model,Phys) - - Profile = Examples.RotationalCartExample()(Param,Phys) - - - U = GPU.InitialConditionsAdvection(backend,FTB,CG,Metric,Phys,Global,Profile,Param) - - -# Output - Global.Output.vtkFileName=string(Problem*"_") - Global.Output.vtk=0 - Global.Output.Flat=true - Global.Output.H=H - Global.Output.cNames = [ - "Rho", - "Tr1", - ] - Global.Output.PrintDays = PrintDays - Global.Output.PrintSeconds = PrintSeconds - Global.Output.PrintTime = PrintTime - Global.Output.PrintStartTime = 0 - Global.Output.OrdPrint=CG.OrdPoly - @show "vor Global.vtkCache" - Global.vtkCache = Outputs.vtkStruct{FTB}(backend,Global.Output.OrdPrint,Grids.TransCartX!,CG,Metric,Global) - - - # TimeStepper - time=[0.0] - Global.TimeStepper.IntMethod = IntMethod - Global.TimeStepper.Table = Table - Global.TimeStepper.dtau = dtau - Global.TimeStepper.SimDays = SimDays - Global.TimeStepper.SimHours = SimHours - Global.TimeStepper.SimMinutes = SimMinutes - Global.TimeStepper.SimSeconds = SimSeconds - Global.TimeStepper.SimTime = SimTime - - nT = NumV + NumTr - Parallels.InitExchangeData3D(backend,FTB,nz,nT,Exchange) - @show "vor TimeStepperGPUAdvection!" - if Device == "CPU" || Device == "CPU" - Integration.TimeStepperGPUAdvection!(U,GPU.FcnAdvectionGPU!,Grids.TransCartX,CG,Metric,Phys,Exchange,Global,Param,Profile) - else - Integration.TimeStepperAdvection!(U,DyCore.FcnTracer!,Grids.TransCartX,CG,Metric,Phys,Exchange,Global,Param,Profile) - end diff --git a/Examples/testAdvectionCartMPI.jl b/Examples/testAdvectionCartMPI.jl deleted file mode 100644 index b23d912..0000000 --- a/Examples/testAdvectionCartMPI.jl +++ /dev/null @@ -1,225 +0,0 @@ -using CGDycore -using MPI -using Base - -# Model -parsed_args = CGDycore.parse_commandline() -Problem = parsed_args["Problem"] -ProfRho = parsed_args["ProfRho"] -ProfTheta = parsed_args["ProfTheta"] -ProfVel = parsed_args["ProfVel"] -ProfVelW = parsed_args["ProfVelW"] -ProfTr = parsed_args["ProfTr"] -HorLimit = parsed_args["HorLimit"] -Upwind = parsed_args["Upwind"] -Damping = parsed_args["Damping"] -Relax = parsed_args["Relax"] -StrideDamp = parsed_args["StrideDamp"] -NumV = parsed_args["NumV"] -NumTr = parsed_args["NumTr"] -# Parallel -Decomp = parsed_args["Decomp"] -SimDays = parsed_args["SimDays"] -SimHours = parsed_args["SimHours"] -SimMinutes = parsed_args["SimMinutes"] -SimSeconds = parsed_args["SimSeconds"] -SimTime = parsed_args["SimTime"] -dtau = parsed_args["dtau"] -IntMethod = parsed_args["IntMethod"] -Table = parsed_args["Table"] -TopoS = parsed_args["TopoS"] -GridType = parsed_args["GridType"] -Coriolis = parsed_args["Coriolis"] -CoriolisType = parsed_args["CoriolisType"] -Microphysics = parsed_args["Microphysics"] -Source = parsed_args["Source"] -VerticalDiffusion = parsed_args["VerticalDiffusion"] -SurfaceFlux = parsed_args["SurfaceFlux"] -# Grid -nx = parsed_args["nx"] -ny = parsed_args["ny"] -nz = parsed_args["nz"] -nPanel = parsed_args["nPanel"] -H = parsed_args["H"] -stretch = parsed_args["stretch"] -OrdPoly = parsed_args["OrdPoly"] -Lx = parsed_args["Lx"] -Ly = parsed_args["Ly"] -x0 = parsed_args["x0"] -y0 = parsed_args["y0"] -BoundaryWE = parsed_args["BoundaryWE"] -BoundarySN = parsed_args["BoundarySN"] -BoundaryBT = parsed_args["BoundaryBT"] -# Viscosity -HyperVisc = parsed_args["HyperVisc"] -HyperDCurl = parsed_args["HyperDCurl"] -HyperDGrad = parsed_args["HyperDGrad"] -HyperDDiv = parsed_args["HyperDDiv"] -# Output -vtkFileName = parsed_args["vtkFileName"] -PrintDays = parsed_args["PrintDays"] -PrintHours = parsed_args["PrintHours"] -PrintMinutes = parsed_args["PrintMinutes"] -PrintSeconds = parsed_args["PrintSeconds"] -PrintTime = parsed_args["PrintTime"] -Flat = parsed_args["Flat"] -#Orography -TopoS = parsed_args["TopoS"] - - -Param = CGDycore.Parameters(Problem) - -MPI.Init() -comm = MPI.COMM_WORLD -Proc = MPI.Comm_rank(comm) + 1 -ProcNumber = MPI.Comm_size(comm) -ParallelCom = CGDycore.ParallelCom() -ParallelCom.Proc = Proc -ParallelCom.ProcNumber = ProcNumber -print("$Proc: \n") -print("$ProcNumber: \n") - -OrdPolyZ=1 -Parallel = true - -# Physical parameters -Phys=CGDycore.PhysParameters() - -#ModelParameters -Model = CGDycore.Model() -# Initial conditions - Model.Equation="Compressible" - Model.NumV=NumV - Model.NumTr=NumTr - Model.Problem=Problem - if ProfRho == "" - Model.ProfRho = Problem - else - Model.ProfRho = ProfRho - end - if ProfTheta == "" - Model.ProfTheta = Problem - else - Model.ProfTheta = ProfTheta - end - if ProfVel == "" - Model.ProfVel = Problem - else - Model.ProfVel = ProfVel - end - if ProfVelW == "" - Model.ProfVelW = Problem - else - Model.ProfVelW = ProfVelW - end - if ProfTr == "" - Model.ProfTr = Problem - else - Model.ProfTr = ProfTr - end - Model.RhoPos=1 - Model.uPos=0 - Model.vPos=0 - Model.wPos=0 - Model.HorLimit = HorLimit - Model.Upwind = Upwind - - -# Grid -TimeStepper=CGDycore.TimeStepper() - - -Boundary = CGDycore.Boundary() -Boundary.WE = BoundaryWE -Boundary.SN = BoundarySN -Boundary.BT = BoundaryBT - -Topography=(TopoS=TopoS, - H=H, - ) - -Grid=CGDycore.Grid(nz,Topography) -Grid=CGDycore.CartGrid(nx,ny,Lx,Ly,x0,y0,CGDycore.OrientFaceCart,Boundary,Grid) - -if Parallel - CellToProc = CGDycore.Decompose(Grid,ProcNumber) - SubGrid = CGDycore.ConstructSubGrid(Grid,CellToProc,Proc) - - if stretch - sigma = 1.0 - lambda = 3.16 - CGDycore.AddStretchICONVerticalGrid!(SubGrid,nz,H,sigma,lambda) - else - CGDycore.AddVerticalGrid!(SubGrid,nz,H) - end - Exchange = CGDycore.InitExchangeCG(SubGrid,OrdPoly,CellToProc,Proc,ProcNumber,Parallel,HorLimit) - Output=CGDycore.Output(Topography) - Global = CGDycore.Global(SubGrid,Model,TimeStepper,ParallelCom,Phys,Output,Exchange,OrdPoly+1,nz,NumV,NumTr,()) - Global.Metric=CGDycore.Metric(OrdPoly+1,OrdPolyZ+1,SubGrid.NumFaces,nz) - (CG,Global)=CGDycore.DiscretizationCG(OrdPoly,OrdPolyZ,CGDycore.JacobiDG3Neu,Global) - # Output partition - nzTemp = Global.Grid.nz - Global.Grid.nz = 1 - vtkCachePart = CGDycore.vtkInit3D(1,CGDycore.TransCartX,CG,Global) - CGDycore.unstructured_vtkPartition(vtkCachePart, Global.Grid.NumFaces, Proc, ProcNumber) - Global.Grid.nz = nzTemp - -else - CellToProc=zeros(0) - Proc = 0 - ProcNumber = 0 - sigma = 1.0 - lambda = 3.16 - CGDycore.AddStretchICONVerticalGrid!(Grid,nz,H,sigma,lambda) - Exchange = CGDycore.InitExchange(Grid,OrdPoly,CellToProc,Proc,ProcNumber,Parallel) - Output=CGDycore.Output(Topography) - Global = CGDycore.Global(Grid,Model,Phys,Output,Exchange,OrdPoly+1,nz,NumV,NumTr,()) - Global.Metric=CGDycore.Metric(OrdPoly+1,OrdPolyZ+1,Grid.NumFaces,nz) -end - (CG,Global)=CGDycore.DiscretizationCG(OrdPoly,OrdPolyZ,CGDycore.JacobiDG3Neu,Global) - -Model.HyperVisc = HyperVisc -Model.HyperDCurl = HyperDCurl # =7.e15 -Model.HyperDGrad = HyperDGrad # =7.e15 -Model.HyperDDiv = HyperDDiv # =7.e15 - - -U = CGDycore.InitialConditionsAdvection(CG,Global,Param) - -# Output partition - nzTemp = Global.Grid.nz - Global.Grid.nz = 1 - vtkCachePart = CGDycore.vtkInit2D(1,CGDycore.TransCartX,CG,Global) - Global.Grid.nz = nzTemp - CGDycore.unstructured_vtkPartition(vtkCachePart, Global.Grid.NumFaces, Proc, ProcNumber) - -# Output - Output.vtkFileName=string(vtkFileName*"_") - Output.vtk=0 - Output.Flat=Flat - Output.nPanel=nPanel - Output.RadPrint=H - Output.H=H - Output.cNames = [ - "Rho", - "Tr1", -] - Output.PrintDays = PrintDays - Output.PrintHours = PrintHours - Output.PrintSeconds = PrintSeconds - Output.PrintTime = PrintTime - Output.PrintStartDays = 0 - Output.OrdPrint=CG.OrdPoly - Global.vtkCache = CGDycore.vtkInit3D(Output.OrdPrint,CGDycore.TransCartX,CG,Global) - - # TimeStepper - time=[0.0] - TimeStepper.IntMethod = IntMethod - TimeStepper.Table = Table - TimeStepper.dtau = dtau - TimeStepper.SimDays = SimDays - TimeStepper.SimHours = SimHours - TimeStepper.SimMinutes = SimMinutes - TimeStepper.SimSeconds = SimSeconds - TimeStepper.SimTime = SimTime - CGDycore.TimeStepperAdvection!(U,CGDycore.TransCartX,CG,Global,Param) diff --git a/Examples/testAdvectionSphere.jl b/Examples/testAdvectionSphere.jl index 9f6281f..ac0f357 100644 --- a/Examples/testAdvectionSphere.jl +++ b/Examples/testAdvectionSphere.jl @@ -1,14 +1,23 @@ -using CGDycore +import CGDycore: + Examples, Parallels, Models, Grids, Outputs, Integration, GPU, DyCore using MPI using Base - +using CUDA +using AMDGPU +using Metal +using KernelAbstractions +using StaticArrays +using ArgParse +using MPI # Model -parsed_args = CGDycore.parse_commandline() +parsed_args = DyCore.parse_commandline() Problem = parsed_args["Problem"] ProfRho = parsed_args["ProfRho"] ProfTheta = parsed_args["ProfTheta"] ProfVel = parsed_args["ProfVel"] ProfVelW = parsed_args["ProfVelW"] +ProfpBGrd = parsed_args["ProfpBGrd"] +ProfRhoBGrd = parsed_args["ProfRhoBGrd"] ProfTr = parsed_args["ProfTr"] HorLimit = parsed_args["HorLimit"] Upwind = parsed_args["Upwind"] @@ -17,7 +26,21 @@ Relax = parsed_args["Relax"] StrideDamp = parsed_args["StrideDamp"] NumV = parsed_args["NumV"] NumTr = parsed_args["NumTr"] -RadEarth = parsed_args["RadEarth"] +BoundaryWE = parsed_args["BoundaryWE"] +BoundarySN = parsed_args["BoundarySN"] +BoundaryBT = parsed_args["BoundaryBT"] +Thermo = parsed_args["Thermo"] +RefProfile = parsed_args["RefProfile"] +Profile = parsed_args["Profile"] +Curl = parsed_args["Curl"] +ModelType = parsed_args["ModelType"] +#Orography +TopoS = parsed_args["TopoS"] +P1 = parsed_args["P1"] +P2 = parsed_args["P2"] +P3 = parsed_args["P3"] +P4 = parsed_args["P4"] + # Parallel Decomp = parsed_args["Decomp"] SimDays = parsed_args["SimDays"] @@ -28,119 +51,195 @@ SimTime = parsed_args["SimTime"] dtau = parsed_args["dtau"] IntMethod = parsed_args["IntMethod"] Table = parsed_args["Table"] -TopoS = parsed_args["TopoS"] GridType = parsed_args["GridType"] Coriolis = parsed_args["Coriolis"] CoriolisType = parsed_args["CoriolisType"] Microphysics = parsed_args["Microphysics"] Source = parsed_args["Source"] VerticalDiffusion = parsed_args["VerticalDiffusion"] +VerticalDiffusionMom = parsed_args["VerticalDiffusionMom"] SurfaceFlux = parsed_args["SurfaceFlux"] +SurfaceFluxMom = parsed_args["SurfaceFluxMom"] # Grid -nz = parsed_args["nz"] +RadEarth = parsed_args["RadEarth"] nPanel = parsed_args["nPanel"] +nz = parsed_args["nz"] H = parsed_args["H"] +Stretch = parsed_args["Stretch"] +StretchType = parsed_args["StretchType"] OrdPoly = parsed_args["OrdPoly"] +Lx = parsed_args["Lx"] +Ly = parsed_args["Ly"] +x0 = parsed_args["x0"] +y0 = parsed_args["y0"] # Viscosity HyperVisc = parsed_args["HyperVisc"] HyperDCurl = parsed_args["HyperDCurl"] HyperDGrad = parsed_args["HyperDGrad"] HyperDDiv = parsed_args["HyperDDiv"] # Output -vtkFileName = parsed_args["vtkFileName"] PrintDays = parsed_args["PrintDays"] PrintHours = parsed_args["PrintHours"] PrintMinutes = parsed_args["PrintMinutes"] PrintSeconds = parsed_args["PrintSeconds"] PrintTime = parsed_args["PrintTime"] -Flat = parsed_args["Flat"] +# Device +Device = parsed_args["Device"] +GPUType = parsed_args["GPUType"] +FloatTypeBackend = parsed_args["FloatTypeBackend"] + +if Device == "CPU" + backend = CPU() +elseif Device == "GPU" + if GPUType == "CUDA" + backend = CUDABackend() + CUDA.allowscalar(false) +# CUDA.device!(MPI.Comm_rank(MPI.COMM_WORLD)) + elseif GPUType == "AMD" + backend = ROCBackend() + AMDGPU.allowscalar(false) + elseif GPUType == "Metal" + backend = MetalBackend() + Metal.allowscalar(true) + end +else + backend = CPU() +end + +if FloatTypeBackend == "Float64" + FTB = Float64 +elseif FloatTypeBackend == "Float32" + FTB = Float32 +else + @show "False FloatTypeBackend" + stop +end + +Param = Examples.Parameters(FTB,Problem) -Param = CGDycore.Parameters(Problem) +KernelAbstractions.synchronize(backend) MPI.Init() OrdPolyZ=1 +Parallel = true # Physical parameters -Phys=CGDycore.PhysParameters() +Phys=DyCore.PhysParameters{FTB}() #ModelParameters -Model = CGDycore.Model() +Model = DyCore.ModelStruct{FTB}() # Initial conditions - Model.Equation="Compressible" - Model.NumV=NumV - Model.NumTr=NumTr - Model.Problem=Problem - if ProfRho == "" - Model.ProfRho = Problem - else - Model.ProfRho = ProfRho - end - if ProfTheta == "" - Model.ProfTheta = Problem - else - Model.ProfTheta = ProfTheta - end - if ProfVel == "" - Model.ProfVel = Problem - else - Model.ProfVel = ProfVel - end - if ProfVelW == "" - Model.ProfVelW = Problem - else - Model.ProfVelW = ProfVelW - end - if ProfTr == "" - Model.ProfTr = Problem - else - Model.ProfTr = ProfTr - end - Model.RhoPos=1 - Model.uPos=0 - Model.vPos=0 - Model.wPos=0 - Model.HorLimit = HorLimit - Model.Upwind = Upwind +Model.Equation="Compressible" +Model.NumV=NumV +Model.NumTr=NumTr +Model.Problem=Problem +if ProfRho == "" + Model.ProfRho = Problem +else + Model.ProfRho = ProfRho +end +if ProfTheta == "" + Model.ProfTheta = Problem +else + Model.ProfTheta = ProfTheta +end +if ProfVel == "" + Model.ProfVel = Problem +else + Model.ProfVel = ProfVel +end +if ProfVelW == "" + Model.ProfVelW = Problem +else + Model.ProfVelW = ProfVelW +end +Model.ProfpBGrd = ProfpBGrd +Model.ProfRhoBGrd = ProfRhoBGrd +Model.ProfTr = ProfTr +Model.RefProfile = RefProfile +Model.Profile = Profile +Model.RhoPos=1 +Model.uPos=2 +Model.vPos=3 +Model.wPos=4 +Model.ThPos=5 +Model.HorLimit = HorLimit +Model.Upwind = Upwind +Model.Damping = Damping +Model.StrideDamp = StrideDamp +Model.Relax = Relax +Model.Coriolis = Coriolis +Model.CoriolisType = CoriolisType +Model.VerticalDiffusion = VerticalDiffusion +Model.VerticalDiffusionMom = VerticalDiffusionMom +Model.Source = Source +Model.Microphysics = Microphysics +Model.Source = Source +Model.SurfaceFlux = SurfaceFlux +Model.SurfaceFluxMom = SurfaceFluxMom +Model.Thermo = Thermo +Model.Curl = Curl +Model.Stretch = Stretch +Model.StretchType = StretchType +Model.ModelType = ModelType +Model.HyperVisc = HyperVisc +Model.HyperDCurl = HyperDCurl # =7.e15 +Model.HyperDGrad = HyperDGrad # =7.e15 +Model.HyperDDiv = HyperDDiv # =7.e15 + - if RadEarth == 0.0 - RadEarth = Phys.RadEarth - end +Boundary = Grids.Boundary() +Boundary.WE = BoundaryWE +Boundary.SN = BoundarySN +Boundary.BT = BoundaryBT +Topography=(TopoS=TopoS, + H=H, + P1=P1, + P2=P2, + P3=P3, + P4=P4, + ) + # Grid +if RadEarth == 0.0 + RadEarth = Phys.RadEarth +end Topography=(TopoS=TopoS,H=H,Rad=RadEarth) OrdPolyZ = 1 +@show "vor InitSphere" +(CG,Metric,Exchange,Global) = DyCore.InitSphere(backend,FTB,OrdPoly,OrdPolyZ,nz,nPanel,H,GridType, + Topography,Decomp,Model,Phys,RadEarth) -(CG,Global) = CGDycore.InitSphere(OrdPoly,OrdPolyZ,nz,nPanel,H,GridType,Topography,Decomp,Model,Phys) - -Model.HyperVisc = HyperVisc -Model.HyperDCurl = HyperDCurl # =7.e15 -Model.HyperDGrad = HyperDGrad # =7.e15 -Model.HyperDDiv = HyperDDiv # =7.e15 +if Problem == "DCMIPAdvection" + Profile = Examples.DCMIPAdvectionExample()(Param,Phys) +elseif Problem == "AdvectionSphereSlottedCylinder" + Profile = Examples.DivergentSphereExample()(Param,Phys) +end - -U = CGDycore.InitialConditionsAdvection(CG,Global,Param) +U = GPU.InitialConditionsAdvection(backend,FTB,CG,Metric,Phys,Global,Profile,Param) # Output - Global.Output.vtkFileName=string(vtkFileName*"_") + Global.Output.vtkFileName=string(Problem*"_") Global.Output.vtk=0 - Global.Output.Flat=Flat - Global.Output.nPanel=nPanel - Global.Output.RadPrint=H + Global.Output.Flat=true + Global.Output.nPanel = nPanel Global.Output.H=H Global.Output.cNames = [ - "Rho", - "Tr1", -] + "Rho", + "Tr1", + ] Global.Output.PrintDays = PrintDays - Global.Output.PrintHours = PrintHours Global.Output.PrintSeconds = PrintSeconds Global.Output.PrintTime = PrintTime Global.Output.PrintStartTime = 0 Global.Output.OrdPrint=CG.OrdPoly - Global.vtkCache = CGDycore.vtkStruct(Global.Output.OrdPrint,CGDycore.TransSphereX,CG,Global) + @show "vor Global.vtkCache" + Global.vtkCache = Outputs.vtkStruct{FTB}(backend,Global.Output.OrdPrint,Grids.TransSphereX!,CG,Metric,Global) + # TimeStepper time=[0.0] @@ -152,7 +251,12 @@ U = CGDycore.InitialConditionsAdvection(CG,Global,Param) Global.TimeStepper.SimMinutes = SimMinutes Global.TimeStepper.SimSeconds = SimSeconds Global.TimeStepper.SimTime = SimTime - + nT = NumV + NumTr - CGDycore.InitExchangeData3D(nz,nT,Global.Exchange) - CGDycore.TimeStepperAdvection!(U,CGDycore.TransSphereX,CG,Global,Param) + Parallels.InitExchangeData3D(backend,FTB,nz,nT,Exchange) + @show "vor TimeStepperGPUAdvection!" + if Device == "CPU" || Device == "GPU" + Integration.TimeStepperGPUAdvection!(U,GPU.FcnAdvectionGPU!,Grids.TransSphereX,CG,Metric,Phys,Exchange,Global,Param,Profile) + else + Integration.TimeStepperAdvection!(U,DyCore.FcnTracer!,Grids.TransSphereX,CG,Metric,Phys,Exchange,Global,Param,Profile) + end diff --git a/Examples/testNHSphere.jl b/Examples/testNHSphere.jl index 2585713..c2fa1ef 100644 --- a/Examples/testNHSphere.jl +++ b/Examples/testNHSphere.jl @@ -202,11 +202,15 @@ Model.HyperDRhoDiv = HyperDRhoDiv Model.HyperDDiv = HyperDDiv OrdPolyZ = 1 +if RadEarth == 0.0 + RadEarth = Phys.RadEarth +end Topography = (TopoS=TopoS,H=H,Rad=Phys.RadEarth) @show "InitSphere" -(CG, Metric, Exchange, Global) = DyCore.InitSphere(backend,FTB,OrdPoly,OrdPolyZ,nz,nPanel,H,GridType,Topography,Decomp,Model,Phys) +(CG, Metric, Exchange, Global) = DyCore.InitSphere(backend,FTB,OrdPoly,OrdPolyZ,nz,nPanel,H, + GridType,Topography,Decomp,Model,Phys,RadEarth) # Initial values if Problem == "Galewski" diff --git a/Jobs/JobAdvectionCubeCartCPU b/Jobs/JobAdvectionCubeCartCPU index bd43afe..1e17d51 100755 --- a/Jobs/JobAdvectionCubeCartCPU +++ b/Jobs/JobAdvectionCubeCartCPU @@ -11,7 +11,7 @@ mpirun -n 1 julia --project Examples/testAdvectionCartGPU.jl \ --vtkFileName="AdvectionCubeRotCart" \ --SimTime=1000.0 \ --PrintTime=100.0 \ - --dtau=.25\ + --dtau=0.25\ --IntMethod="SSPRungeKutta" \ --Table="SSP32" \ --Lx=1000.0 \ diff --git a/Jobs/JobAdvectionSlottedCylinder b/Jobs/JobAdvectionSlottedCylinder index cac885d..56755a3 100755 --- a/Jobs/JobAdvectionSlottedCylinder +++ b/Jobs/JobAdvectionSlottedCylinder @@ -1,6 +1,8 @@ -mpirun -n 6 julia --project Examples/testAdvectionSphere.jl \ +mpirun -n 1 julia --project Examples/testAdvectionSphere.jl \ --Problem="AdvectionSphereSlottedCylinder" \ - --NumV=1 \ + --Device="CPU" \ + --FloatTypeBackend="Float64" \ + --NumV=5 \ --NumTr=1 \ --ProfRho="" \ --ProfTheta="" \ @@ -20,6 +22,7 @@ mpirun -n 6 julia --project Examples/testAdvectionSphere.jl \ --TopoS="" \ --GridType="CubedSphere" \ --nz=1 \ + --RadEarth=1.0 \ --nPanel=32 \ --RadEarth=1 \ --H=1.0 \ diff --git a/src/DyCore/DiscretizationCG.jl b/src/DyCore/DiscretizationCG.jl index e54f5c1..666351f 100644 --- a/src/DyCore/DiscretizationCG.jl +++ b/src/DyCore/DiscretizationCG.jl @@ -151,8 +151,8 @@ end if Iz <= Nz && IF <= NF ind = Glob[ID,IF] - @atomic zP[Iz,ind] = eltype(X)(0.5) * (X[ID,1,3,Iz,IF] + X[ID,2,3,Iz,IF]) - @atomic dz[Iz,ind] = X[ID,2,3,Iz,IF] - X[ID,1,3,Iz,IF] + @inbounds zP[Iz,ind] = eltype(X)(0.5) * (X[ID,1,3,Iz,IF] + X[ID,2,3,Iz,IF]) + @inbounds dz[Iz,ind] = X[ID,2,3,Iz,IF] - X[ID,1,3,Iz,IF] end end diff --git a/src/DyCore/FiniteElement.jl b/src/DyCore/FiniteElement.jl index 7911ea3..9a378b5 100644 --- a/src/DyCore/FiniteElement.jl +++ b/src/DyCore/FiniteElement.jl @@ -36,7 +36,7 @@ mutable struct CGStruct{FT<:AbstractFloat, DWT::Array{FT, 2} DS::AT2 DST::Array{FT, 2} - DSZ::Array{FT, 2} + DSZ::AT2 S::Array{FT, 2} M::AT2 MMass::AT2 @@ -96,7 +96,9 @@ function CGStruct{FT}(backend,OrdPoly,OrdPolyZ,Grid) where FT<:AbstractFloat Q = diagm(wCPU) * DSCPU S = Q - Q' - (DWZ,DSZ)=DG.DerivativeMatrixSingle(OrdPolyZ) + (DWZ,DSZCPU)=DG.DerivativeMatrixSingle(OrdPolyZ) + DSZ = KernelAbstractions.zeros(backend,FT,size(DSZCPU)) + copyto!(DSZ,DSZCPU) (GlobCPU,NumG,NumI,StencilCPU,MasterSlaveCPU) = NumberingFemCG(Grid,OrdPoly) diff --git a/src/DyCore/InitDriver.jl b/src/DyCore/InitDriver.jl index 3be7bf5..6523eb0 100644 --- a/src/DyCore/InitDriver.jl +++ b/src/DyCore/InitDriver.jl @@ -1,4 +1,4 @@ -function InitSphere(backend,FT,OrdPoly,OrdPolyZ,nz,nPanel,H,GridType,Topography,Decomp,Model,Phys) +function InitSphere(backend,FT,OrdPoly,OrdPolyZ,nz,nPanel,H,GridType,Topography,Decomp,Model,Phys,RadEarth) comm = MPI.COMM_WORLD Proc = MPI.Comm_rank(comm) + 1 @@ -13,13 +13,13 @@ function InitSphere(backend,FT,OrdPoly,OrdPolyZ,nz,nPanel,H,GridType,Topography, if GridType == "HealPix" # Grid=CGDycore.InputGridH("Grid/mesh_H12_no_pp.nc", # CGDycore.OrientFaceSphere,Phys.RadEarth,Grid) - Grid=Grids.InputGridH("Grid/mesh_H24_no_pp.nc", OrientFaceSphere,Phys.RadEarth,Grid) + Grid=Grids.InputGridH("Grid/mesh_H24_no_pp.nc", OrientFaceSphere,RadEarth,Grid) elseif GridType == "SQuadGen" - Grid = Grids.InputGrid("Grid/baroclinic_wave_2deg_x4.g",OrientFaceSphere,Phys.RadEarth,Grid) + Grid = Grids.InputGrid("Grid/baroclinic_wave_2deg_x4.g",OrientFaceSphere,RadEarth,Grid) elseif GridType == "Msh" - Grid = Grids.InputGridMsh("Grid/Quad.msh",OrientFaceSphere,Phys.RadEarth,Grid) + Grid = Grids.InputGridMsh("Grid/Quad.msh",OrientFaceSphere,RadEarth,Grid) elseif GridType == "CubedSphere" - Grid = Grids.CubedGrid(nPanel,Grids.OrientFaceSphere,Phys.RadEarth,Grid) + Grid = Grids.CubedGrid(nPanel,Grids.OrientFaceSphere,RadEarth,Grid) elseif GridType == "TriangularSphere" IcosahedronGrid = Grids.CreateIcosahedronGrid() RefineLevel = 0 @@ -28,7 +28,7 @@ function InitSphere(backend,FT,OrdPoly,OrdPolyZ,nz,nPanel,H,GridType,Topography, Grids.RefineFaceTriangularGrid!(IcosahedronGrid) end Grids.NumberingTriangularGrid!(IcosahedronGrid) - Grid = Grids.TriangularGridToGrid(IcosahedronGrid,Rad,Grid) + Grid = Grids.TriangularGridToGrid(IcosahedronGrid,RadEarth,Grid) end if Decomp == "Hilbert" diff --git a/src/Examples/initial.jl b/src/Examples/initial.jl index eddb9ca..27d25ed 100644 --- a/src/Examples/initial.jl +++ b/src/Examples/initial.jl @@ -1,7 +1,42 @@ abstract type Example end -Base.@kwdef struct RotationalCartExample <: Example end +Base.@kwdef struct DivergentSphereExample <: Example end + +function (profile::DivergentSphereExample)(Param,Phys) + function local_profile(x,time) + FT = eltype(x) + Rho = FT(1) + Lon,Lat,R = Grids.cart2sphere(x[1],x[2],x[3]) + lonP = Lon - FT(2) * pi * time / Param.EndTime + uS = FT(10) / Param.EndTime * sin(lonP) * sin(lonP) * + sin(FT(2) * Lat) * cos(pi * time / Param.EndTime) + FT(2) * pi / Param.EndTime * cos(Lat) + vS = FT(10) / Param.EndTime * sin(FT(2) * lonP) * cos(Lat) * cos(pi * time / Param.EndTime) + w = FT(0) + lon1 = Param.lon1 + lat1 = Param.lat1 + lon2 = Param.lon2 + lat2 = Param.lat2 + R = FT(1) + r = FT(0.5) * R + r1 = R * Grids.GreatCircle(Lon,Lat,lon1,lat1) + r2 = R * Grids.GreatCircle(Lon,Lat,lon2,lat2) + if r1 <= r && abs(Lon - lon1) >= r / (FT(6.0) * R) + Tr = FT(1.0) + elseif r2 <= r && abs(Lon - lon2) >= r / (FT(6.0) * R) + Tr = FT(1.0) + elseif r1 <= r && abs(Lon - lon1) < r / (FT(6.0) * R) && Lat - lat1 < FT(-5.0 / 12.0) * r / R + Tr = FT(1.0) + elseif r2 <= r && abs(Lon - lon2) < r / (FT(6.0) * R) && Lat - lat2 > FT(5.0 / 12.0) * r / R + Tr = FT(1.0) + else + Tr = FT(.1) + end + return (Rho,uS,vS,w,Tr) + end + return local_profile +end +Base.@kwdef struct RotationalCartExample <: Example end function (profile::RotationalCartExample)(Param,Phys) function local_profile(x,time) @@ -11,7 +46,9 @@ function (profile::RotationalCartExample)(Param,Phys) v = Param.vMax w = sinpi(x[3] / Param.H) * cospi(time / Param.EndTime) w = FT(0) - if x[1] >= Param.x1 && x[1] <= Param.x2 && x[3] >= Param.z1 && x[3] <= Param.z2 + if x[1] >= Param.x1 && x[1] <= Param.x2 && + x[2] >= Param.y1 && x[2] <= Param.y2 && + x[3] >= Param.z1 && x[3] <= Param.z2 Tr = FT(1) else Tr = FT(0) @@ -20,7 +57,6 @@ function (profile::RotationalCartExample)(Param,Phys) end return local_profile end - Base.@kwdef struct WarmBubbleCartExample <: Example end function (profile::WarmBubbleCartExample)(Param,Phys) diff --git a/src/Examples/parameters.jl b/src/Examples/parameters.jl index c655c01..e78d334 100644 --- a/src/Examples/parameters.jl +++ b/src/Examples/parameters.jl @@ -232,18 +232,20 @@ Base.@kwdef struct ParamAdvectionCubeCart y2::Float64 = 601.0 end -Base.@kwdef struct ParamAdvectionCubeRotCart +Base.@kwdef struct ParamAdvectionCubeRotCart{FT} StreamFun::Bool = false - uMax::Float64 = 1.0 - vMax::Float64 = 0.0 - xC::Float64 = 500.0 - zC::Float64 = 500.0 - x1::Float64 = 299.0 - x2::Float64 = 501.0 - z1::Float64 = 299.0 - z2::Float64 = 501.0 - EndTime::Float64 = 1000.0 - H::Float64 = 1000.0 + uMax::FT = 1.0 + vMax::FT = 1.0 + xC::FT = 500.0 + zC::FT = 500.0 + x1::FT = 299.0 + x2::FT = 501.0 + y1::FT = 299.0 + y2::FT = 501.0 + z1::FT = 299.0 + z2::FT = 501.0 + EndTime::FT = 1000.0 + H::FT = 1000.0 end Base.@kwdef struct ParamAdvectionCart @@ -293,7 +295,7 @@ function Parameters(FT,Problem::String) Param = ParamAdvectionCubeCart() elseif Problem == "AdvectionCubeRotCart" @show Problem - Param = ParamAdvectionCubeRotCart() + Param = ParamAdvectionCubeRotCart{FT}() elseif Problem == "WarmBubble2DXCart" @show Problem Param = ParamWarmBubble2DXCart() diff --git a/src/GPU/FcnGPU.jl b/src/GPU/FcnGPU.jl index 0641b28..994edb8 100644 --- a/src/GPU/FcnGPU.jl +++ b/src/GPU/FcnGPU.jl @@ -7,6 +7,7 @@ function FcnAdvectionGPU!(F,U,time,FE,Metric,Phys,Cache,Exchange,Global,Param,Pr DS = FE.DS DW = FE.DW M = FE.M + DoF = FE.DoF Stencil = FE.Stencil dXdxI = Metric.dXdxI X = Metric.X @@ -54,7 +55,7 @@ function FcnAdvectionGPU!(F,U,time,FE,Metric,Phys,Cache,Exchange,Global,Param,Pr KDivRhoTrViscUpwind3LimKernel! = DivRhoTrViscUpwind3LimKernel!(backend, group) if Global.Model.HorLimit - @views KLimitKernel!(qMin,qMax,U[:,:,NumV+1:NumV+NumTr],Rho,Glob,ndrange=ndrangeL) + @views KLimitKernel!(DoF,qMin,qMax,U[:,:,NumV+1:NumV+NumTr],Rho,Glob,ndrange=ndrangeL) KernelAbstractions.synchronize(backend) end @@ -81,15 +82,12 @@ function FcnAdvectionGPU!(F,U,time,FE,Metric,Phys,Cache,Exchange,Global,Param,Pr KernelAbstractions.synchronize(backend) if Global.Model.HorLimit -# @views KDivRhoTrUpwind3LimKernel!(F[:,:,1+NumV],U[:,:,1+NumV],U,DS, -# dXdxI,J,M,Glob,dtau,ww,qMin[:,:,1],qMax[:,:,1],Stencil,ndrange=ndrange) -# KernelAbstractions.synchronize(backend) - @views KDivRhoTrViscUpwind3LimKernel!(F[:,:,1+NumV],U[:,:,1+NumV],U,CacheTr,DS,DW, - dXdxI,J,M,Glob,KoeffDiv,dtau,ww,qMin[:,:,1],qMax[:,:,1],Stencil,ndrange=ndrange) + @views KDivRhoTrUpwind3LimKernel!(F[:,:,1+NumV],U[:,:,1+NumV],U,DS, + dXdxI,J,M,Glob,dtau,ww,qMin[:,:,1],qMax[:,:,1],Stencil,ndrange=ndrange) KernelAbstractions.synchronize(backend) -# for i = 1 : size(F[:,:,1+NumV],2) -# @show i,F[1,i,1+NumV] -# end +# @views KDivRhoTrViscUpwind3LimKernel!(F[:,:,1+NumV],U[:,:,1+NumV],U,CacheTr,DS,DW, +# dXdxI,J,M,Glob,KoeffDiv,dtau,ww,qMin[:,:,1],qMax[:,:,1],Stencil,ndrange=ndrange) +# KernelAbstractions.synchronize(backend) else @views KHyperViscTracerKoeffKernel!(F[:,:,1+NumV],CacheTr,Rho,DS,DW,dXdxI,J,M,Glob, KoeffDiv,ndrange=ndrange) diff --git a/src/GPU/GPU.jl b/src/GPU/GPU.jl index 355de51..7dec706 100644 --- a/src/GPU/GPU.jl +++ b/src/GPU/GPU.jl @@ -6,7 +6,6 @@ import ..Grids using StaticArrays using KernelAbstractions using KernelAbstractions: @atomic, @atomicswap, @atomicreplace -using Statistics: median include("OperatorKernel.jl") include("FcnGPU.jl") diff --git a/src/GPU/HorLimiterKernel.jl b/src/GPU/HorLimiterKernel.jl index 6cbd8f6..2184a62 100644 --- a/src/GPU/HorLimiterKernel.jl +++ b/src/GPU/HorLimiterKernel.jl @@ -1,5 +1,5 @@ -@kernel function LimitKernel!(qMin,qMax,@Const(Rhoq),@Const(Rho),@Const(Glob)) +@kernel function LimitKernel!(DoF,qMin,qMax,@Const(Rhoq),@Const(Rho),@Const(Glob)) iz = @index(Local, NTuple) Iz,IF,IT = @index(Global, NTuple) @@ -13,7 +13,7 @@ @inbounds qMax[Iz,IF,IT] = eltype(Rhoq)(-1/0) if Iz <= Nz && IF <= NF && IT <= NT - for ID in eachindex(Glob[:,IF]) + for ID = 1 : DoF @inbounds ind = Glob[ID,IF] @inbounds qMin[Iz,IF,IT] = min(qMin[Iz,IF,IT],Rhoq[Iz,ind,IT] / Rho[Iz,ind]) @inbounds qMax[Iz,IF,IT] = max(qMax[Iz,IF,IT],Rhoq[Iz,ind,IT] / Rho[Iz,ind]) @@ -67,8 +67,12 @@ end resc[iz] = eltype(FTr)(0) sumJ[iz] = eltype(FTr)(0) conv[iz] = true - qMinS[iz] = minimum(qMin[Iz,Stencil[IF,:]]) - qMaxS[iz] = maximum(qMax[Iz,Stencil[IF,:]]) + qMinS[iz] = qMin[Iz,Stencil[IF,1]] + qMaxS[iz] = qMax[Iz,Stencil[IF,1]] + for iS = 2 : 13 + qMinS[iz] = min(qMin[Iz,Stencil[IF,iS]],qMinS[iz]) + qMaxS[iz] = max(qMax[Iz,Stencil[IF,iS]],qMaxS[iz]) + end end end if iz == 1 @@ -128,8 +132,8 @@ end @inbounds RhoTrColS[I,J,iz] = Tr[Iz,ind] + dt * DivRhoTr[I,J,iz] / (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) @inbounds RhoColS[I,J,iz] = U[Iz,ind,1] + dt * DivRho[I,J,iz] / (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) # Finite difference step - @inbounds q[I,J,iz] = median([qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + - l0 * (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz], qMaxS[iz]]) + @inbounds q[I,J,iz] = medianGPU(qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + + l0, qMaxS[iz]) @inbounds @atomic resp[iz] += (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] * (q[I,J,iz] * RhoColS[I,J,iz] - RhoTrColS[I,J,iz]) end @@ -141,8 +145,8 @@ end @inbounds conv[iz] = false end else - @inbounds qLoc = median([qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + - (l0 + dlFD) * (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz], qMaxS[iz]]) + @inbounds qLoc = medianGPU(qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + + (l0 + dlFD), qMaxS[iz]) @inbounds @atomic resc[iz] += (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] * (qLoc * RhoColS[I,J,iz] - RhoTrColS[I,J,iz]) end @@ -164,8 +168,8 @@ end for iTer = 1 : 5 if Iz <= Nz && conv[iz] ID = I + (J - 1) * N - @inbounds q[I,J,iz] = median([qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + - lc[iz] * (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz], qMaxS[iz]]) + @inbounds q[I,J,iz] = medianGPU(qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + + lc[iz], qMaxS[iz]) @inbounds @atomic resc[iz] += (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] * (q[I,J,iz] * RhoColS[I,J,iz] - RhoTrColS[I,J,iz]) end @@ -318,8 +322,8 @@ end @inbounds RhoTrColS[I,J,iz] = Tr[Iz,ind] + dt * DivRhoTr[I,J,iz] / (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) @inbounds RhoColS[I,J,iz] = U[Iz,ind,1] + dt * DivRho[I,J,iz] / (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) # Finite difference step - @inbounds q[I,J,iz] = median([qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + - l0 * (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz], qMaxS[iz]]) + @inbounds q[I,J,iz] = medianGPU(qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + + l0, qMaxS[iz]) @inbounds @atomic resp[iz] += (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] * (q[I,J,iz] * RhoColS[I,J,iz] - RhoTrColS[I,J,iz]) end @@ -331,8 +335,8 @@ end @inbounds conv[iz] = false end else - @inbounds qLoc = median([qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + - (l0 + dlFD) * (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz], qMaxS[iz]]) + @inbounds qLoc = medianGPU(qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + + (l0 + dlFD), qMaxS[iz]) @inbounds @atomic resc[iz] += (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] * (qLoc * RhoColS[I,J,iz] - RhoTrColS[I,J,iz]) end @@ -354,8 +358,8 @@ end for iTer = 1 : 5 if Iz <= Nz && conv[iz] ID = I + (J - 1) * N - @inbounds q[I,J,iz] = median([qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + - lc[iz] * (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz], qMaxS[iz]]) + @inbounds q[I,J,iz] = medianGPU(qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + + lc[iz], qMaxS[iz]) @inbounds @atomic resc[iz] += (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] * (q[I,J,iz] * RhoColS[I,J,iz] - RhoTrColS[I,J,iz]) end @@ -380,3 +384,24 @@ end (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) / dt / M[Iz,ind] end end + +@inline function medianGPU(a1,a2,a3) + if a1 <= a2 + if a2 <= a3 + m = a2 + elseif a1 <= a3 + m = a3 + else + m = a1 + end + else + if a1 <= a3 + m = a1 + elseif a2 <= a3 + m = a3 + else + m = a2 + end + end +end + diff --git a/src/GPU/HorLimiterKernel.jl_Old b/src/GPU/HorLimiterKernel.jl_Old new file mode 100644 index 0000000..bbcf571 --- /dev/null +++ b/src/GPU/HorLimiterKernel.jl_Old @@ -0,0 +1,417 @@ + +@kernel function LimitKernel!(DoF,qMin,qMax,@Const(Rhoq),@Const(Rho),@Const(Glob)) + + iz = @index(Local, NTuple) + Iz,IF,IT = @index(Global, NTuple) + + Nz = @uniform @ndrange()[1] + NF = @uniform @ndrange()[2] + NT = @uniform @ndrange()[3] + + + @inbounds qMin[Iz,IF,IT] = eltype(Rhoq)(1/0) + @inbounds qMax[Iz,IF,IT] = eltype(Rhoq)(-1/0) + + if Iz <= Nz && IF <= NF && IT <= NT + for ID = 1 : DoF + @inbounds ind = Glob[ID,IF] + @inbounds qMin[Iz,IF,IT] = min(qMin[Iz,IF,IT],Rhoq[Iz,ind,IT] / Rho[Iz,ind]) + @inbounds qMax[Iz,IF,IT] = max(qMax[Iz,IF,IT],Rhoq[Iz,ind,IT] / Rho[Iz,ind]) + end + end +end + +@kernel function DivRhoTrUpwind3LimKernel!(FTr,@Const(Tr),@Const(U),@Const(D),@Const(dXdxI), + @Const(JJ),@Const(M),@Const(Glob),dt,@Const(w),@Const(qMin),@Const(qMax),@Const(Stencil)) + +# gi, gj, gz, gF = @index(Group, NTuple) + I, J, iz = @index(Local, NTuple) + _,_,Iz,IF = @index(Global, NTuple) + + ColumnTilesDim = @uniform @groupsize()[3] + N = @uniform @groupsize()[1] + Nz = @uniform @ndrange()[3] + NF = @uniform @ndrange()[4] + + @uniform l0 = eltype(FTr)(0) + @uniform eta = eltype(FTr)(1.e-12) + @uniform dlFD = eltype(FTr)(1.e-8) + + + cCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+3) + uConCol = @localmem eltype(FTr) (N,N, ColumnTilesDim) + vConCol = @localmem eltype(FTr) (N,N, ColumnTilesDim) + DivRhoTr = @localmem eltype(FTr) (N,N, ColumnTilesDim) + DivRho = @localmem eltype(FTr) (N,N, ColumnTilesDim) + RhoTrColS = @localmem eltype(FTr) (N,N, ColumnTilesDim) + RhoColS = @localmem eltype(FTr) (N,N, ColumnTilesDim) + q = @localmem eltype(FTr) (N,N, ColumnTilesDim) + resp = @localmem eltype(FTr) (ColumnTilesDim) + resc = @localmem eltype(FTr) (ColumnTilesDim) + alpha = @localmem eltype(FTr) (ColumnTilesDim) + lp = @localmem eltype(FTr) (ColumnTilesDim) + lc = @localmem eltype(FTr) (ColumnTilesDim) + sumJ = @localmem eltype(FTr) (ColumnTilesDim) + qMinS = @localmem eltype(FTr) (ColumnTilesDim) + qMaxS = @localmem eltype(FTr) (ColumnTilesDim) + conv = @localmem (Bool) (ColumnTilesDim) + if Iz <= Nz + ID = I + (J - 1) * N + @inbounds ind = Glob[ID,IF] + @inbounds cCol[I,J,iz+1] = Tr[Iz,ind] / U[Iz,ind,1] + @views @inbounds (uCon, vCon) = Contra12(-U[Iz,ind,1],U[Iz,ind,2],U[Iz,ind,3],dXdxI[1:2,1:2,:,ID,Iz,IF]) + @inbounds uConCol[I,J,iz] = uCon + @inbounds vConCol[I,J,iz] = vCon + if ID == 1 + resp[iz] = eltype(FTr)(0) + resc[iz] = eltype(FTr)(0) + sumJ[iz] = eltype(FTr)(0) + conv[iz] = true + qMinS[iz] = qMin[Iz,Stencil[IF,1]] + qMaxS[iz] = qMax[Iz,Stencil[IF,1]] + for iS = 2 : 13 + qMinS[iz] = min(qMin[Iz,Stencil[IF,iS]],qMinS[iz]) + qMaxS[iz] = max(qMax[Iz,Stencil[IF,iS]],qMaxS[iz]) + end + end + end + if iz == 1 + Izm1 = max(Iz - 1,1) + cCol[I,J,iz] = Tr[Izm1,ind] / U[Izm1,ind,1] + end + if iz == ColumnTilesDim || Iz == Nz + Izp1 = min(Iz + 1,Nz) + cCol[I,J,iz+2] = Tr[Izp1,ind] / U[Izp1,ind,1] + Izp2 = min(Iz + 2,Nz) + cCol[I,J,iz+3] = Tr[Izp2,ind] / U[Izp2,ind,1] + end + @synchronize + + if Iz <= Nz + ID = I + (J - 1) * N + @inbounds @atomic sumJ[iz] += JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF] + end + @synchronize + + if Iz < Nz + ID = I + (J - 1) * N + @inbounds ind = Glob[ID,IF] + @inbounds cLL = cCol[I,J,iz] + @inbounds cL = cCol[I,J,iz+1] + @inbounds cR = cCol[I,J,iz+2] + @inbounds cRR = cCol[I,J,iz+3] + + @views @inbounds wCon = Contra3(U[Iz:Iz+1,ind,1],U[Iz:Iz+1,ind,2],U[Iz:Iz+1,ind,3], + U[Iz,ind,4],dXdxI[3,:,:,ID,Iz:Iz+1,IF]) + + Izm1 = max(Iz - 1,1) + Izp2 = min(Iz + 2, Nz) + @inbounds JLL = JJ[ID,1,Izm1,IF] + JJ[ID,2,Izm1,IF] + @inbounds JL = JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF] + @inbounds JR = JJ[ID,1,Iz+1,IF] + JJ[ID,2,Iz+1,IF] + @inbounds JRR = JJ[ID,1,Izp2,IF] + JJ[ID,2,Izp2,IF] + cFL, cFR = RecU4(cLL,cL,cR,cRR,JLL,JL,JR,JRR) + Flux = eltype(FTr)(0.25) * ((abs(wCon) + wCon) * cFL + (-abs(wCon) + wCon) * cFR) + @inbounds @atomic FTr[Iz,ind] += -Flux / M[Iz,ind] + @inbounds @atomic FTr[Iz+1,ind] += Flux / M[Iz+1,ind] + end + + if Iz <= Nz + ID = I + (J - 1) * N + @inbounds DivRhoTr[I,J,iz] = D[I,1] * uConCol[1,J,iz] * cCol[1,J,iz+1] + @inbounds DivRhoTr[I,J,iz] += D[J,1] * vConCol[I,1,iz] * cCol[I,1,iz+1] + @inbounds DivRho[I,J,iz] = D[I,1] * uConCol[1,J,iz] + @inbounds DivRho[I,J,iz] += D[J,1] * vConCol[I,1,iz] + for k = 2 : N + @inbounds DivRhoTr[I,J,iz] += D[I,k] * uConCol[k,J,iz] * cCol[k,J,iz+1] + @inbounds DivRhoTr[I,J,iz] += D[J,k] * vConCol[I,k,iz] * cCol[I,k,iz+1] + @inbounds DivRho[I,J,iz] += D[I,k] * uConCol[k,J,iz] + @inbounds DivRho[I,J,iz] += D[J,k] * vConCol[I,k,iz] + end + @inbounds ind = Glob[ID,IF] + @inbounds RhoTrColS[I,J,iz] = Tr[Iz,ind] + dt * DivRhoTr[I,J,iz] / (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) + @inbounds RhoColS[I,J,iz] = U[Iz,ind,1] + dt * DivRho[I,J,iz] / (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) + # Finite difference step + @inbounds q[I,J,iz] = medianGPU(qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + + l0 * (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz], qMaxS[iz]) + @inbounds @atomic resp[iz] += (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] * + (q[I,J,iz] * RhoColS[I,J,iz] - RhoTrColS[I,J,iz]) + if IF == 6 && I == 5 + @show "S", I,J,q[I,J,iz] + end + end + @synchronize + if Iz <= Nz + ID = I + (J - 1) * N + if abs(resp[iz]) <= eta + if ID == 1 + @inbounds conv[iz] = false + end + else + @inbounds qLoc = medianGPU(qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + + (l0 + dlFD) * (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz], qMaxS[iz]) + @inbounds @atomic resc[iz] += (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] * + (qLoc * RhoColS[I,J,iz] - RhoTrColS[I,J,iz]) + end + end + @synchronize + + if Iz <= Nz && I == 1 && J == 1 && conv[iz] + if abs(resc[iz] - resp[iz]) <= eltype(FTr)(1.e-13) + @inbounds conv[iz] = false + else + @inbounds alpha[iz] = dlFD / (resc[iz] - resp[iz]) + @inbounds lp[iz] = l0 + @inbounds lc[iz] = lp[iz] - alpha[iz] * resp[iz] + @inbounds resp[iz] = eltype(FTr)(0) + @inbounds resc[iz] = eltype(FTr)(0) + end + end + @synchronize + for iTer = 1 : 5 + if Iz <= Nz && conv[iz] + ID = I + (J - 1) * N + @inbounds q[I,J,iz] = medianGPU(qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + + lc[iz] * (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz], qMaxS[iz]) + @inbounds @atomic resc[iz] += (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] * + (q[I,J,iz] * RhoColS[I,J,iz] - RhoTrColS[I,J,iz]) + if IF == 6 && I == 5 + ff = lc[iz] * (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] + @show "Iter", I,J,q[I,J,iz],ff + end + end + @synchronize + if Iz <= Nz && I == 1 && J == 1 && conv[iz] + if abs(resc[iz] - resp[iz]) <= eltype(FTr)(1.e-13) + @inbounds conv[iz] = false + else + @inbounds alpha[iz] = (lp[iz] - lc[iz]) / (resp[iz] - resc[iz]) + @inbounds resp[iz] = resc[iz] + @inbounds lp[iz] = lc[iz] + @inbounds lc[iz] = lc[iz] - alpha[iz] * resc[iz] + @inbounds resc[iz] = eltype(FTr)(0) + end + end + @synchronize + end + if Iz <= Nz + ID = I + (J - 1) * N + @inbounds ind = Glob[ID,IF] + @inbounds @atomic FTr[Iz,ind] += (q[I,J,iz] * RhoColS[I,J,iz] - Tr[Iz,ind]) * + (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) / dt / M[Iz,ind] + if IF == 6 && I == 5 + @show "E", I,J,q[I,J,iz] + end + end +end + +@kernel function DivRhoTrViscUpwind3LimKernel!(FTr,@Const(Tr),@Const(U),@Const(Cache),@Const(D),@Const(DW),@Const(dXdxI), + @Const(JJ),@Const(M),@Const(Glob),Koeff,dt,@Const(w),@Const(qMin),@Const(qMax),@Const(Stencil)) + + I, J, iz = @index(Local, NTuple) + _,_,Iz,IF = @index(Global, NTuple) + + ColumnTilesDim = @uniform @groupsize()[3] + N = @uniform @groupsize()[1] + Nz = @uniform @ndrange()[3] + NF = @uniform @ndrange()[4] + + @uniform l0 = eltype(FTr)(0) + @uniform eta = eltype(FTr)(1.e-12) + @uniform dlFD = eltype(FTr)(1.e-8) + + cCol = @localmem eltype(FTr) (N,N, ColumnTilesDim) + CacheCol = @localmem eltype(FTr) (N,N, ColumnTilesDim) + RhoCol = @localmem eltype(FTr) (N,N, ColumnTilesDim) + uCol = @localmem eltype(FTr) (N,N, ColumnTilesDim) + vCol = @localmem eltype(FTr) (N,N, ColumnTilesDim) + wCol = @localmem eltype(FTr) (N,N, ColumnTilesDim) + DivRhoTr = @localmem eltype(FTr) (N,N, ColumnTilesDim) + DivRho = @localmem eltype(FTr) (N,N, ColumnTilesDim) + RhoTrColS = @localmem eltype(FTr) (N,N, ColumnTilesDim) + RhoColS = @localmem eltype(FTr) (N,N, ColumnTilesDim) + q = @localmem eltype(FTr) (N,N, ColumnTilesDim) + resp = @localmem eltype(FTr) (ColumnTilesDim) + resc = @localmem eltype(FTr) (ColumnTilesDim) + alpha = @localmem eltype(FTr) (ColumnTilesDim) + lp = @localmem eltype(FTr) (ColumnTilesDim) + lc = @localmem eltype(FTr) (ColumnTilesDim) + sumJ = @localmem eltype(FTr) (ColumnTilesDim) + qMinS = @localmem eltype(FTr) (ColumnTilesDim) + qMaxS = @localmem eltype(FTr) (ColumnTilesDim) + conv = @localmem (Bool) (ColumnTilesDim) + if Iz <= Nz + ID = I + (J - 1) * N + @inbounds ind = Glob[ID,IF] + @inbounds CacheCol[I,J,iz] = Cache[Iz,ind] + @inbounds wCol[I,J,iz] = U[Iz,ind,4] + @inbounds RhoCol[I,J,iz] = U[Iz,ind,1] + @inbounds cCol[I,J,iz] = Tr[Iz,ind] / RhoCol[I,J,iz] + @inbounds uCol[I,J,iz] = U[Iz,ind,2] + @inbounds vCol[I,J,iz] = U[Iz,ind,3] + @inbounds DivRho[I,J,iz] = eltype(FTr)(0) + @inbounds DivRhoTr[I,J,iz] = eltype(FTr)(0) + if ID == 1 + resp[iz] = eltype(FTr)(0) + resc[iz] = eltype(FTr)(0) + sumJ[iz] = eltype(FTr)(0) + conv[iz] = true + qMinS[iz] = minimum(qMin[Iz,Stencil[IF,:]]) + qMaxS[iz] = maximum(qMax[Iz,Stencil[IF,:]]) + end + end + @synchronize + if Iz < Nz + ID = I + (J - 1) * N + @inbounds ind = Glob[ID,IF] + @inbounds ind = Glob[ID,IF] + @inbounds cL = cCol[I,J,iz] + @inbounds cR = cCol[I,J,iz+1] + if iz > 1 + @inbounds cLL = cCol[I,J,iz-1] + else + Izm1 = max(Iz - 1,1) + @inbounds cLL = U[Izm1,ind,5] / U[Izm1,ind,1] + end + if iz < ColumnTilesDim - 1 + @inbounds cRR = cCol[I,J,iz+2] + else + Izp2 = min(Iz + 2, Nz) + @inbounds cRR = U[Izp2,ind,5] / U[Izp2,ind,1] + end + + @views @inbounds wCon = Contra3(U[Iz:Iz+1,ind,1],U[Iz:Iz+1,ind,2],U[Iz:Iz+1,ind,3], + U[Iz,ind,4],dXdxI[3,:,:,ID,Iz:Iz+1,IF]) + + Izm1 = max(Iz - 1,1) + Izp2 = min(Iz + 2, Nz) + @inbounds JLL = JJ[ID,1,Izm1,IF] + JJ[ID,2,Izm1,IF] + @inbounds JL = JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF] + @inbounds JR = JJ[ID,1,Iz+1,IF] + JJ[ID,2,Iz+1,IF] + @inbounds JRR = JJ[ID,1,Izp2,IF] + JJ[ID,2,Izp2,IF] + cFL, cFR = RecU4(cLL,cL,cR,cRR,JLL,JL,JR,JRR) + Flux = 0.25 * ((abs(wCon) + wCon) * cFL + (-abs(wCon) + wCon) * cFR) + @inbounds @atomic FTr[Iz,ind] += -Flux / M[Iz,ind] + @inbounds @atomic FTr[Iz+1,ind] += Flux / M[Iz+1,ind] + end + + if Iz <= Nz + ID = I + (J - 1) * N + Dxc = 0 + Dyc = 0 + for k = 1 : N + @inbounds Dxc = Dxc + D[I,k] * CacheCol[k,J,iz] + @inbounds Dyc = Dyc + D[J,k] * CacheCol[I,k,iz] + end + + @views @inbounds (GradDx, GradDy) = Grad12(RhoCol[I,J,iz],Dxc,Dyc,dXdxI[1:2,1:2,:,ID,Iz,IF],JJ[ID,:,Iz,IF]) + @views @inbounds (tempx, tempy) = Contra12(-Koeff,GradDx,GradDy,dXdxI[1:2,1:2,:,ID,Iz,IF]) + for k = 1 : N + @inbounds @atomic DivRhoTr[k,J,iz] += DW[k,I] * tempx + @inbounds @atomic DivRhoTr[I,k,iz] += DW[k,J] * tempy + end + + @views @inbounds (tempxRho, tempyRho) = Contra12(-RhoCol[I,J,iz],uCol[I,J,iz],vCol[I,J,iz],dXdxI[1:2,1:2,:,ID,Iz,IF]) + for k = 1 : N + @inbounds @atomic DivRho[k,J,iz] += D[k,I] * tempxRho + @inbounds @atomic DivRho[I,k,iz] += D[k,J] * tempyRho + end + @inbounds tempxTr = tempxRho * cCol[I,J,iz] + @inbounds tempyTr = tempyRho * cCol[I,J,iz] + for k = 1 : N + @inbounds @atomic DivRhoTr[k,J,iz] += D[k,I] * tempxTr + @inbounds @atomic DivRhoTr[I,k,iz] += D[k,J] * tempyTr + end + @inbounds @atomic sumJ[iz] += JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF] + end + @synchronize + + if Iz <=Nz + ID = I + (J - 1) * N + ind = Glob[ID,IF] + @inbounds RhoTrColS[I,J,iz] = Tr[Iz,ind] + dt * DivRhoTr[I,J,iz] / (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) + @inbounds RhoColS[I,J,iz] = U[Iz,ind,1] + dt * DivRho[I,J,iz] / (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) + # Finite difference step + @inbounds q[I,J,iz] = medianGPU(qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + + l0 * (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz], qMaxS[iz]) + @inbounds @atomic resp[iz] += (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] * + (q[I,J,iz] * RhoColS[I,J,iz] - RhoTrColS[I,J,iz]) + end + @synchronize + if Iz <= Nz + ID = I + (J - 1) * N + if abs(resp[iz]) <= eta + if ID == 1 + @inbounds conv[iz] = false + end + else + @inbounds qLoc = medianGPU(qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + + (l0 + dlFD) * (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz], qMaxS[iz]) + @inbounds @atomic resc[iz] += (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] * + (qLoc * RhoColS[I,J,iz] - RhoTrColS[I,J,iz]) + end + end + @synchronize + + if Iz <= Nz && I == 1 && J == 1 && conv[iz] + if abs(resc[iz] - resp[iz]) <= eltype(FTr)(1.e-13) + @inbounds conv[iz] = false + else + @inbounds alpha[iz] = dlFD / (resc[iz] - resp[iz]) + @inbounds lp[iz] = l0 + @inbounds lc[iz] = lp[iz] - alpha[iz] * resp[iz] + @inbounds resp[iz] = eltype(FTr)(0) + @inbounds resc[iz] = eltype(FTr)(0) + end + end + @synchronize + for iTer = 1 : 5 + if Iz <= Nz && conv[iz] + ID = I + (J - 1) * N + @inbounds q[I,J,iz] = medianGPU(qMinS[iz], RhoTrColS[I,J,iz] / RhoColS[I,J,iz] + + lc[iz] * (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz], qMaxS[iz]) + @inbounds @atomic resc[iz] += (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) * w[I] * w[J] / sumJ[iz] * + (q[I,J,iz] * RhoColS[I,J,iz] - RhoTrColS[I,J,iz]) + end + @synchronize + if Iz <= Nz && I == 1 && J == 1 && conv[iz] + if abs(resc[iz] - resp[iz]) <= eltype(FTr)(1.e-13) + @inbounds conv[iz] = false + else + @inbounds alpha[iz] = (lp[iz] - lc[iz]) / (resp[iz] - resc[iz]) + @inbounds resp[iz] = resc[iz] + @inbounds lp[iz] = lc[iz] + @inbounds lc[iz] = lc[iz] - alpha[iz] * resc[iz] + @inbounds resc[iz] = eltype(FTr)(0) + end + end + @synchronize + end + if Iz <= Nz + ID = I + (J - 1) * N + @inbounds ind = Glob[ID,IF] + @inbounds @atomic FTr[Iz,ind] += (q[I,J,iz] * RhoColS[I,J,iz] - Tr[Iz,ind]) * + (JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]) / dt / M[Iz,ind] + end +end + +@inline function medianGPU(a1,a2,a3) + if a1 <= a2 + if a2 <= a3 + m = a2 + elseif a1 <= a3 + m = a3 + else + m = a1 + end + else + if a1 <= a3 + m = a1 + elseif a2 <= a3 + m = a3 + else + m = a2 + end + end +end + diff --git a/src/Grids/Trans.jl b/src/Grids/Trans.jl index 2d2c476..54cbed7 100644 --- a/src/Grids/Trans.jl +++ b/src/Grids/Trans.jl @@ -126,11 +126,11 @@ function TransCartX!(XP,ksi,eta,zeta,X,CG,Global) OrdPolyZ=CG.OrdPolyZ @. XP = 0 @inbounds for j = 1 : OrdPoly + 1 - Lj = DG.Lagrange(eta,CG.xw,j) + Lj = DG.Lagrange(eta,CG.xwCPU,j) @inbounds for i = 1 : OrdPoly + 1 - Li = DG.Lagrange(ksi,CG.xw,i) * Lj + Li = DG.Lagrange(ksi,CG.xwCPU,i) * Lj @inbounds for k = 1 : OrdPolyZ + 1 - Fac = Li * DG.Lagrange(zeta,CG.xwZ,k) + Fac = Li * DG.Lagrange(zeta,CG.xwZCPU,k) @inbounds for l = 1 : 3 XP[l] += Fac * X[i,j,k,l] end