diff --git a/LIB/MESH/allocate_grid.f90 b/LIB/MESH/allocate_grid.f90 index d68a078e..7c50c26c 100644 --- a/LIB/MESH/allocate_grid.f90 +++ b/LIB/MESH/allocate_grid.f90 @@ -32,7 +32,7 @@ subroutine allocate_tree(params, lgt_block, hvy_block, hvy_neighbor, lgt_active, rank, number_procs, dim integer(kind=ik), dimension(3) :: Bs integer(kind=ik) :: rk_steps - real(kind=rk) :: effective_memory + real(kind=rk) :: memory_this, memory_total integer(kind=ik) :: status, nrhs_slots, nwork, nx, ny, nz, max_neighbors, mpierr integer, allocatable :: blocks_per_mpirank(:) @@ -45,10 +45,12 @@ subroutine allocate_tree(params, lgt_block, hvy_block, hvy_neighbor, lgt_active, ! set parameters for readability rank = params%rank + dim = params%dim Bs = params%Bs g = params%n_ghosts Neqn = params%n_eqn number_procs = params%number_procs + memory_total = 0.0_rk nx = Bs(1)+2*g ny = Bs(2)+2*g nz = Bs(3)+2*g @@ -101,7 +103,7 @@ subroutine allocate_tree(params, lgt_block, hvy_block, hvy_neighbor, lgt_active, write(*,'(80("_"))') endif - !Automatic memory management. If specified --memory=0.3GB in the call line, + ! Automatic memory management. If specified --memory=0.3GB in the call line, if (params%number_blocks < 1) then !--------------------------------------------------------------------------- ! Automatic memory management. If specified --memory=0.3GB in the call line, @@ -128,8 +130,8 @@ subroutine allocate_tree(params, lgt_block, hvy_block, hvy_neighbor, lgt_active, if (params%dim==3) then mem_per_block = real(Neqn) * real((Bs(1)+2*g)*(Bs(2)+2*g)*(Bs(3)+2*g)) & ! hvy_block - + 2.0 * nstages * real(Neqn) * real((Bs(1)+2*g)*(Bs(2)+2*g)*(Bs(3)+2*g) - ((Bs(1))*(Bs(2))*(Bs(3)))) & ! real buffer ghosts - + 2.0 * nstages * real(max_neighbors) * 5 / 2.0 ! int bufer (4byte hence /2) + + 2.0 * nstages * real(N_MAX_COMPONENTS) * real((Bs(1)+2*g)*(Bs(2)+2*g)*(Bs(3)+2*g) - ((Bs(1))*(Bs(2))*(Bs(3)))) & ! real buffer ghosts + + 2.0 * nstages * real(max_neighbors) * 5 / 2.0 ! 
int bufer (4byte hence /2) ! hvy_mask if ( present(hvy_mask) .and. params%N_mask_components>0 ) then @@ -146,10 +148,9 @@ subroutine allocate_tree(params, lgt_block, hvy_block, hvy_neighbor, lgt_active, mem_per_block = mem_per_block + real(Neqn) * real(nrhs_slots) * real((Bs(1)+2*g)*(Bs(2)+2*g)*(Bs(3)+2*g)) endif - else mem_per_block = real(Neqn) * real((Bs(1)+2*g)*(Bs(2)+2*g)) & ! hvy_block - + 2.0 * nstages * real(Neqn) * real((Bs(1)+2*g)*(Bs(2)+2*g) - (Bs(1)*Bs(2))) & ! real buffer ghosts + + 2.0 * nstages * real(N_MAX_COMPONENTS) * real((Bs(1)+2*g)*(Bs(2)+2*g) - (Bs(1)*Bs(2))) & ! real buffer ghosts + 2.0 * nstages * real(max_neighbors) * 5 / 2.0 ! int bufer (4byte hence /2) ! hvy_mask @@ -199,121 +200,99 @@ subroutine allocate_tree(params, lgt_block, hvy_block, hvy_neighbor, lgt_active, !--------------------------------------------------------------------------- - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",5(i9,1x),")")') & - "hvy_block", nx, ny, nz, Neqn, params%number_blocks - endif allocate( hvy_block( nx, ny, nz, Neqn, params%number_blocks ) ) + memory_this = product(real(shape(hvy_block)))*8.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "hvy_block", product(real(shape(hvy_block)))*8.0e-9, shape(hvy_block) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "hvy_block", memory_this, shape(hvy_block) endif !--------------------------------------------------------------------------- ! 
work data (Runge-Kutta substeps and old time level) if (present(hvy_work)) then - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",7(i9,1x),")")') & - "hvy_work", nx, ny, nz, Neqn, params%number_blocks, nrhs_slots - endif allocate( hvy_work( nx, ny, nz, Neqn, params%number_blocks, nrhs_slots ) ) + memory_this = product(real(shape(hvy_work)))*8.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "hvy_work", product(real(shape(hvy_work)))*8.0e-9, shape(hvy_work) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "hvy_work", memory_this, shape(hvy_work) endif end if if ( present(hvy_tmp) ) then - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",5(i9,1x),")")') & - "hvy_tmp", nx, ny, nz, nwork, params%number_blocks - endif allocate( hvy_tmp( nx, ny, nz, nwork, params%number_blocks ) ) + memory_this = product(real(shape(hvy_tmp)))*8.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "hvy_tmp", product(real(shape(hvy_tmp)))*8.0e-9, shape(hvy_tmp) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "hvy_tmp", memory_this, shape(hvy_tmp) endif endif if ( present(hvy_mask) .and. 
params%N_mask_components > 0 ) then - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",5(i9,1x),")")') & - "hvy_mask", nx, ny, nz, params%N_mask_components, params%number_blocks - endif allocate( hvy_mask( nx, ny, nz, params%N_mask_components, params%number_blocks ) ) - + memory_this = product(real(shape(hvy_mask)))*8.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "hvy_mask", product(real(shape(hvy_mask)))*8.0e-9, shape(hvy_mask) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "hvy_mask", memory_this, shape(hvy_mask) endif elseif ( present(hvy_mask) .and. params%N_mask_components <= 0 ) then ! dummy allocation, to prevent IFORT from yelling. - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",5(i9,1x),")")') & - "hvy_mask", 1, 1, 1, 1, 1 - endif allocate( hvy_mask(1, 1, 1, 1, 1) ) - + memory_this = product(real(shape(hvy_mask)))*8.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "hvy_mask", product(real(shape(hvy_mask)))*8.0e-9, shape(hvy_mask) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "hvy_mask", memory_this, shape(hvy_mask) endif endif !--------------------------------------------------------------------------- - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",2(i9,1x),")")') & - "hvy_neighbor", params%number_blocks, max_neighbors - endif allocate( hvy_neighbor( params%number_blocks, max_neighbors ) ) + memory_this = product(real(shape(hvy_neighbor)))*8.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "hvy_neighbor", product(real(shape(hvy_neighbor)))*8.0e-9, shape(hvy_neighbor) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + 
"hvy_neighbor", memory_this, shape(hvy_neighbor) endif !---------------------------------------------------------------------------) - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",2(i9,1x),")")') & - "lgt_block", number_procs*params%number_blocks, params%max_treelevel+EXTRA_LGT_FIELDS - endif allocate( lgt_block( number_procs*params%number_blocks, params%max_treelevel+EXTRA_LGT_FIELDS) ) + memory_this = product(real(shape(lgt_block)))*4.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "lgt_block", product(real(shape(lgt_block)))*4.0e-9, shape(lgt_block) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "lgt_block", memory_this, shape(lgt_block) endif !--------------------------------------------------------------------------- - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",2(i9,1x),")")') & - "lgt_sortednumlist", size(lgt_block,1), 2 - endif allocate( lgt_sortednumlist( size(lgt_block,1), 2) ) + memory_this = product(real(shape(lgt_sortednumlist)))*4.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "lgt_sortednumlist", product(real(shape(lgt_sortednumlist)))*4.0e-9, shape(lgt_sortednumlist) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "lgt_sortednumlist", memory_this, shape(lgt_sortednumlist) endif !--------------------------------------------------------------------------- - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",1(i9,1x),")")') & - "lgt_active", size(lgt_block, 1) - endif allocate( lgt_active( size(lgt_block, 1) ) ) + memory_this = product(real(shape(lgt_active)))*4.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "lgt_active", 
product(real(shape(lgt_active)))*4.0e-9, shape(lgt_active) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "lgt_active", memory_this, shape(lgt_active) endif !--------------------------------------------------------------------------- ! note: 5th dimension in heavy data is block id - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",1(i9,1x),")")') & - "hvy_active", size(hvy_block, 5) - endif allocate( hvy_active( size(hvy_block, 5) ) ) + memory_this = product(real(shape(hvy_active)))*4.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "hvy_active", product(real(shape(hvy_active)))*4.0e-9, shape(hvy_active) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "hvy_active", memory_this, shape(hvy_active) endif ! setting -1 is required to avoid "ERROR: We try to fetch a light free block ID from the list but all blocks are used on this CPU" @@ -324,21 +303,7 @@ subroutine allocate_tree(params, lgt_block, hvy_block, hvy_neighbor, lgt_active, if (rank == 0) then write(*,'("INIT: System is allocating heavy data for ",i7," blocks and ", i3, " fields" )') params%number_blocks, Neqn write(*,'("INIT: System is allocating light data for ",i7," blocks" )') number_procs*params%number_blocks - write(*,'("INIT: System is allocating heavy work data for ",i7," blocks " )') params%number_blocks - - effective_memory = (dble(size(hvy_block)) + & ! real data - dble(size(lgt_block)+size(lgt_sortednumlist)+size(hvy_neighbor)+size(lgt_active)+size(hvy_active))/2.0 & ! integer (hence /2) - )*8.0e-9 ! in GB - - if (present(hvy_tmp)) effective_memory = effective_memory + dble(size(hvy_tmp))*8.0e-9 ! in GB - if (present(hvy_work)) effective_memory = effective_memory + dble(size(hvy_work))*8.0e-9 ! in GB - - ! 
note we currently use 8byte per real and integer by default, so all the same bytes per point - write(*,'("INIT: Measured (true) local (on 1 cpu) memory (hvy_block+hvy_work+lgt_block no ghosts!) is ",g15.3,"GB per mpirank")') & - effective_memory - - write(*,'("INIT-GLOBAL: Measured (true) TOTAL (on all CPU) memory (hvy_block+hvy_work+lgt_block no ghosts!) is ",g15.3,"GB")') & - effective_memory*dble(params%number_procs) + write(*,'("INIT: Measured local (on 1 cpu) memory (hvy_block+hvy_work+lgt_block no ghosts!): ",g15.3," GB per rank")') memory_total end if end subroutine allocate_tree @@ -380,7 +345,7 @@ subroutine allocate_forest(params, lgt_block, hvy_block, hvy_neighbor, lgt_activ rank, number_procs, dim integer(kind=ik), dimension(3) :: Bs integer(kind=ik) :: rk_steps - real(kind=rk) :: effective_memory + real(kind=rk) :: memory_this, memory_total integer :: status, nrhs_slots, nwork, nx, ny, nz, max_neighbors, mpierr integer, allocatable :: blocks_per_mpirank(:) @@ -394,6 +359,7 @@ subroutine allocate_forest(params, lgt_block, hvy_block, hvy_neighbor, lgt_activ g = params%n_ghosts Neqn = params%n_eqn number_procs = params%number_procs + memory_total = 0.0_rk nx = Bs(1)+2*g ny = Bs(2)+2*g nz = Bs(3)+2*g @@ -464,16 +430,16 @@ subroutine allocate_forest(params, lgt_block, hvy_block, hvy_neighbor, lgt_activ ! read memory from command line (in GB) read(memstring(10:len_trim(memstring)-2),* ) maxmem - if (params%rank==0) write(*,'("INIT: total memory: ",f9.4,"GB")') maxmem + if (params%rank==0) write(*,'("INIT: desired total memory: ",f9.4,"GB")') maxmem ! memory per MPIRANK (in GB) maxmem = maxmem / dble(params%number_procs) - if (params%rank==0) write(*,'("INIT: memory-per-rank: ",f9.4,"GB")') maxmem + if (params%rank==0) write(*,'("INIT: desired memory-per-rank: ",f9.4,"GB")') maxmem if (params%dim==3) then mem_per_block = real(Neqn) * real((Bs(1)+2*g)*(Bs(2)+2*g)*(Bs(3)+2*g)) & ! 
hvy_block - + 2.0 * nstages * real(Neqn) * real((Bs(1)+2*g)*(Bs(2)+2*g)*(Bs(3)+2*g) - ((Bs(1))*(Bs(2))*(Bs(3)))) & ! real buffer ghosts + + 2.0 * nstages * real(N_MAX_COMPONENTS) * real((Bs(1)+2*g)*(Bs(2)+2*g)*(Bs(3)+2*g) - ((Bs(1))*(Bs(2))*(Bs(3)))) & ! real buffer ghosts + 2.0 * nstages * real(max_neighbors) * 5 / 2.0 ! int bufer (4byte hence /2) ! hvy_mask @@ -494,7 +460,7 @@ subroutine allocate_forest(params, lgt_block, hvy_block, hvy_neighbor, lgt_activ else mem_per_block = real(Neqn) * real((Bs(1)+2*g)*(Bs(2)+2*g)) & ! hvy_block - + 2.0 * nstages * real(Neqn) * real((Bs(1)+2*g)*(Bs(2)+2*g) - (Bs(1)*Bs(2))) & ! real buffer ghosts + + 2.0 * nstages * real(N_MAX_COMPONENTS) * real((Bs(1)+2*g)*(Bs(2)+2*g) - (Bs(1)*Bs(2))) & ! real buffer ghosts + 2.0 * nstages * real(max_neighbors) * 5 / 2.0 ! int bufer (4byte hence /2) ! hvy_mask @@ -544,140 +510,116 @@ subroutine allocate_forest(params, lgt_block, hvy_block, hvy_neighbor, lgt_activ !--------------------------------------------------------------------------- - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",5(i9,1x),")")') & - "hvy_block", nx, ny, nz, Neqn, params%number_blocks - endif allocate( hvy_block( nx, ny, nz, Neqn, params%number_blocks ) ) + memory_this = product(real(shape(hvy_block)))*8.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "hvy_block", product(real(shape(hvy_block)))*8.0e-9, shape(hvy_block) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "hvy_block", memory_this, shape(hvy_block) endif !--------------------------------------------------------------------------- ! 
work data (Runge-Kutta substeps and old time level) if (present(hvy_work)) then - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",6(i9,1x),")")') & - "hvy_work", nx, ny, nz, Neqn, params%number_blocks, nrhs_slots - endif allocate( hvy_work( nx, ny, nz, Neqn, params%number_blocks, nrhs_slots ) ) + memory_this = product(real(shape(hvy_work)))*8.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "hvy_work", product(real(shape(hvy_work)))*8.0e-9, shape(hvy_work) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "hvy_work", memory_this, shape(hvy_work) endif end if if ( present(hvy_tmp) ) then - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",5(i9,1x),")")') & - "hvy_tmp", nx, ny, nz, nwork, params%number_blocks - endif allocate( hvy_tmp( nx, ny, nz, nwork, params%number_blocks ) ) + memory_this = product(real(shape(hvy_tmp)))*8.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "hvy_tmp", product(real(shape(hvy_tmp)))*8.0e-9, shape(hvy_tmp) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "hvy_tmp", memory_this, shape(hvy_tmp) endif endif if ( present(hvy_mask) .and. 
params%N_mask_components > 0 ) then - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",5(i9,1x),")")') & - "hvy_mask", nx, ny, nz, params%N_mask_components, params%number_blocks - endif allocate( hvy_mask( nx, ny, nz, params%N_mask_components, params%number_blocks ) ) + memory_this = product(real(shape(hvy_mask)))*8.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "hvy_mask", product(real(shape(hvy_mask)))*8.0e-9, shape(hvy_mask) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "hvy_mask", memory_this, shape(hvy_mask) endif elseif ( present(hvy_mask) .and. params%N_mask_components <= 0 ) then ! dummy allocation, to prevent IFORT from yelling. - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",5(i9,1x),")")') & - "hvy_mask", 1, 1, 1, 1, 1 - endif allocate( hvy_mask(1, 1, 1, 1, 1) ) - + memory_this = product(real(shape(hvy_mask)))*8.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "hvy_mask", product(real(shape(hvy_mask)))*8.0e-9, shape(hvy_mask) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "hvy_mask", memory_this, shape(hvy_mask) endif endif !--------------------------------------------------------------------------- - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",2(i9,1x),")")') & - "hvy_neighbor", params%number_blocks, max_neighbors - endif allocate( hvy_neighbor( params%number_blocks, max_neighbors ) ) + memory_this = product(real(shape(hvy_neighbor)))*8.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "hvy_neighbor", product(real(shape(hvy_neighbor)))*8.0e-9, shape(hvy_neighbor) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + 
"hvy_neighbor", memory_this, shape(hvy_neighbor) endif !---------------------------------------------------------------------------) - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",2(i9,1x),")")') & - "lgt_block", number_procs*params%number_blocks, params%max_treelevel+EXTRA_LGT_FIELDS - endif allocate( lgt_block( number_procs*params%number_blocks, params%max_treelevel+EXTRA_LGT_FIELDS) ) + memory_this = product(real(shape(lgt_block)))*4.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "lgt_block", product(real(shape(lgt_block)))*4.0e-9, shape(lgt_block) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "lgt_block", memory_this, shape(lgt_block) endif !--------------------------------------------------------------------------- - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",3(i9,1x),")")') & - "lgt_sortednumlist", size(lgt_block,1), 2, params%forest_size - endif allocate( lgt_sortednumlist( size(lgt_block,1), 2, params%forest_size) ) + memory_this = product(real(shape(lgt_sortednumlist)))*4.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "lgt_sortednumlist", product(real(shape(lgt_sortednumlist)))*4.0e-9, shape(lgt_sortednumlist) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "lgt_sortednumlist", memory_this, shape(lgt_sortednumlist) endif !--------------------------------------------------------------------------- - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",2(i9,1x),")")') & - "lgt_active", size(lgt_block,1), params%forest_size - endif allocate( lgt_active( size(lgt_block,1), params%forest_size ) ) + memory_this = product(real(shape(lgt_active)))*4.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," 
MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "lgt_active", product(real(shape(lgt_active)))*4.0e-9, shape(lgt_active) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "lgt_active", memory_this, shape(lgt_active) endif - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",1(i9,1x),")")') & - "lgt_n", params%forest_size - endif allocate( lgt_n(params%forest_size ) ) + memory_this = product(real(shape(lgt_n)))*4.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "lgt_n", product(real(shape(lgt_n)))*4.0e-9, shape(lgt_n) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "lgt_n", memory_this, shape(lgt_n) endif - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",1(i9,1x),")")') & - "hvy_n", params%forest_size - endif allocate( hvy_n(params%forest_size ) ) + memory_this = product(real(shape(hvy_n)))*4.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "hvy_n", product(real(shape(hvy_n)))*4.0e-9, shape(hvy_n) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "hvy_n", memory_this, shape(hvy_n) endif + !--------------------------------------------------------------------------- ! 
note: 5th dimension in heavy data is block id - if (rank==0) then - write(*,'("INIT: ALLOCATING ",A19,"(",2(i9,1x),")")') & - "hvy_active", size(hvy_block, 5), params%forest_size - endif allocate( hvy_active( size(hvy_block, 5), params%forest_size ) ) + memory_this = product(real(shape(hvy_active)))*4.0e-9 + memory_total = memory_total + memory_this if (rank==0) then - write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4,"GB SHAPE=",7(i9,1x))') & - "hvy_active", product(real(shape(hvy_active)))*4.0e-9, shape(hvy_active) + write(*,'("INIT: ALLOCATED ",A19," MEM=",f8.4," GB per rank, shape=",7(i9,1x))') & + "hvy_active", memory_this, shape(hvy_active) endif lgt_n = 0 @@ -688,21 +630,7 @@ subroutine allocate_forest(params, lgt_block, hvy_block, hvy_neighbor, lgt_activ if (rank == 0) then write(*,'("INIT: System is allocating heavy data for ",i7," blocks and ", i3, " fields" )') params%number_blocks, Neqn write(*,'("INIT: System is allocating light data for ",i7," blocks" )') number_procs*params%number_blocks - write(*,'("INIT: System is allocating heavy work data for ",i7," blocks " )') params%number_blocks - - effective_memory = (dble(size(hvy_block)) + & ! real data - dble(size(lgt_block)+size(lgt_sortednumlist)+size(hvy_neighbor)+size(lgt_active)+size(hvy_active))/2.0 & ! integer (hence /2) - )*8.0e-9 ! in GB - - if (present(hvy_tmp)) effective_memory = effective_memory + dble(size(hvy_tmp))*8.0e-9 ! in GB - if (present(hvy_work)) effective_memory = effective_memory + dble(size(hvy_work))*8.0e-9 ! in GB - - ! note we currently use 8byte per real and integer by default, so all the same bytes per point - write(*,'("INIT: Measured (true) local (on 1 cpu) memory (hvy_block+hvy_work+lgt_block no ghosts!) is ",g15.3,"GB per mpirank")') & - effective_memory - - write(*,'("INIT-GLOBAL: Measured (true) TOTAL (on all CPU) memory (hvy_block+hvy_work+lgt_block no ghosts!) 
is ",g15.3,"GB")') & - effective_memory*dble(params%number_procs) + write(*,'("INIT: Measured local (on 1 cpu) memory (hvy_block+hvy_work+lgt_block no ghosts!): ",g15.3," GB per rank")') memory_total end if end subroutine allocate_forest diff --git a/LIB/MODULE/module_globals.f90 b/LIB/MODULE/module_globals.f90 index 5ee9dbba..7c37c8a4 100644 --- a/LIB/MODULE/module_globals.f90 +++ b/LIB/MODULE/module_globals.f90 @@ -32,7 +32,10 @@ module module_globals ! this parameter is a hack. in most parts of the code, a block has n_eqn component entries. ! universality dictates that we can also use a different number of components, for example ! when syn'ing the mask function (which in many cases has six entries.) - integer, public :: N_MAX_COMPONENTS = 6 + ! New in 06/2021: the hack continues. We now set this parameter at different places + ! to save on memory. That can be params%n_eqn (default in simulations), 6 (if mask is synced). The new default is 3, + ! for postprocessing. + integer, public :: N_MAX_COMPONENTS = 3 !subroutines of this module interface abort diff --git a/LIB/MPI/module_mpi.f90 b/LIB/MPI/module_mpi.f90 index 08b35d2b..8a9c966e 100644 --- a/LIB/MPI/module_mpi.f90 +++ b/LIB/MPI/module_mpi.f90 @@ -198,8 +198,9 @@ subroutine init_ghost_nodes( params ) allocate( tmp_block( Bs(1)+2*g, Bs(2)+2*g, 1, Neqn) ) end if - ! size of ghost nodes buffer. Note this contains only the ghost nodes layer - ! for all my blocks. previous versions allocated one of those per "friend" + ! size of ghost nodes buffer. The worst case is that we have to send ALL my ghost node + ! points to another rank, but this happens only if ALL my blocks have ONLY neighbors + ! on other MPIRANKS. if ( params%dim==3 ) then buffer_N = number_blocks * Neqn * ( (Bs(1)+2*g)*(Bs(2)+2*g)*(Bs(3)+2*g) - (Bs(1)*Bs(2)*Bs(3)) ) else @@ -213,20 +214,14 @@ subroutine init_ghost_nodes( params ) ! 
allocate synch buffer if (rank==0) then write(*,'("GHOSTS-INIT: Attempting to allocate the ghost-sync-buffer.")') - - write(*,'("GHOSTS-INIT: buffer_N_int=",i12," buffer_N=",i12," Nstages=",i1)') & - buffer_N_int, buffer_N, Nstages - - write(*,'("GHOSTS-INIT: On each MPIRANK, Int buffer:", f9.4, "GB")') & - 2.0*dble(buffer_N_int)*dble(Nstages)*8e-9 - - write(*,'("GHOSTS-INIT: On each MPIRANK, Real buffer:", f9.4, "GB")') & - 2.0*dble(buffer_N)*dble(Nstages)*8e-9 + write(*,'("GHOSTS-INIT: buffer_N_int=",i12," buffer_N=",i12," Nstages=",i1)') buffer_N_int, buffer_N, Nstages + write(*,'("GHOSTS-INIT: Int buffer:", f9.4, " GB per rank")') 2.0*dble(buffer_N_int)*dble(Nstages)*8e-9 + write(*,'("GHOSTS-INIT: Real buffer:", f9.4, " GB per rank")') 2.0*dble(buffer_N)*dble(Nstages)*8e-9 write(*,'("---------------- allocating now ----------------")') endif ! wait now so that if allocation fails, we get at least the above info - call MPI_barrier( WABBIT_COMM, status(1)) + call MPI_barrier( WABBIT_COMM, status(1) ) allocate( int_send_buffer( 1:buffer_N_int, 1:Nstages), stat=status(1) ) allocate( int_recv_buffer( 1:buffer_N_int, 1:Nstages), stat=status(2) ) @@ -236,7 +231,6 @@ subroutine init_ghost_nodes( params ) if (maxval(status) /= 0) call abort(999999, "Buffer allocation failed. Not enough memory?") if (rank==0) then - write(*,'("GHOSTS-INIT: on each mpirank, Allocated ",A25," SHAPE=",7(i9,1x))') & "new_send_buffer", shape(new_send_buffer) diff --git a/LIB/MPI/synchronize_ghosts_generic.f90 b/LIB/MPI/synchronize_ghosts_generic.f90 index a31d8964..6ea31dde 100644 --- a/LIB/MPI/synchronize_ghosts_generic.f90 +++ b/LIB/MPI/synchronize_ghosts_generic.f90 @@ -31,6 +31,7 @@ subroutine synchronize_ghosts_generic_sequence( params, lgt_block, hvy_block, hv integer(kind=ik) :: ijk(2,3) integer(kind=ik) :: bounds_type, istage, istage_buffer(1:4), rounds(1:4), inverse + logical, save :: informed = .false. if (.not. 
ghost_nodes_module_ready) then @@ -42,8 +43,23 @@ subroutine synchronize_ghosts_generic_sequence( params, lgt_block, hvy_block, hv ! if this mpirank has no active blocks, it has nothing to do here. if (hvy_n == 0) return - if (size(hvy_block,4)>N_max_components) then - call abort(160720191,"You try to ghost-sync a vector with too many components.") + if (size(hvy_block,4)>N_MAX_COMPONENTS .and. .not. informed) then + if (params%rank ==0) then + write(*,*) "-------------------------------------------------------------------------" + write(*,*) "---warning---warning---warning---warning---warning---warning---warning---" + write(*,*) "---warning---warning---warning---warning---warning---warning---warning---" + write(*,*) "-------------------------------------------------------------------------" + write(*,*) " A warning from the ghost nodes module: we have allocated a buffer with an estimation for" + write(*,*) " neqn=", N_MAX_COMPONENTS, " components of a vector, but you try to sync" + write(*,*) " neqn=", size(hvy_block,4), " This may work just fine: but in some (rare) cases, " + write(*,*) " we will see a buffer overflow. The code will then abort with an error, and you have" + write(*,*) " to restart this simulation with more memory." + write(*,*) "-------------------------------------------------------------------------" + write(*,*) "---warning---warning---warning---warning---warning---warning---warning---" + write(*,*) "---warning---warning---warning---warning---warning---warning---warning---" + write(*,*) "-------------------------------------------------------------------------" + endif + informed = .true. endif Bs = params%Bs @@ -1053,6 +1069,9 @@ subroutine AppendLineToBuffer( int_send_buffer, new_send_buffer, buffer_size, ne ! real data if (buffer_size>0) then + if (i0+buffer_size-1 >= size(new_send_buffer,1)) then + call abort(202106049, "Internal bug: we ran out of space for the ghost nodes. 
Restart simulation with more memory.") + endif new_send_buffer( i0:i0+buffer_size-1, istage ) = line_buffer(1:buffer_size) endif diff --git a/LIB/PARAMS/ini_file_to_params.f90 b/LIB/PARAMS/ini_file_to_params.f90 index 98411b91..8824b955 100644 --- a/LIB/PARAMS/ini_file_to_params.f90 +++ b/LIB/PARAMS/ini_file_to_params.f90 @@ -104,6 +104,15 @@ subroutine ini_file_to_params( params, filename ) call read_param_mpi(FILE, 'VPM', 'mask_time_independent_part', params%mask_time_independent_part, .true.) call read_param_mpi(FILE, 'VPM', 'dont_use_pruned_tree_mask', params%dont_use_pruned_tree_mask, .false.) + if (params%physics_type == "ACM-new") then + if (params%penalization) then + if ((.not.params%dont_use_pruned_tree_mask).and.(params%mask_time_independent_part)) then + ! we sync the mask array in this case, which has 6 components + N_MAX_COMPONENTS = max(6, params%n_eqn) + endif + endif + endif + ! decide if we use hartens point value multiresolution transform, which uses a coarsening operator ! that just takes every 2nd grid point or biorthogonal wavlets, which apply a smoothing filter (lowpass) ! prior to downsampling. @@ -270,6 +279,10 @@ subroutine ini_blocks(params, FILE ) call abort(170619,"Error: Max treelevel cannot be larger 18 (64bit long integer problem) ") end if + ! the default case is that we synchronize (ghosts) with n_eqn components in the vector. + ! It may be overwritten (with six) if the pruned tree mask is used. NOTE(review): that override only survives if it runs after this assignment - confirm the call order. + N_MAX_COMPONENTS = params%n_eqn + ! read switch to turn on|off mesh refinement call read_param_mpi(FILE, 'Blocks', 'adapt_mesh', params%adapt_mesh, .true. 
) call read_param_mpi(FILE, 'Blocks', 'adapt_inicond', params%adapt_inicond, params%adapt_mesh ) diff --git a/LIB/POSTPROCESSING/module_MOR.f90 b/LIB/POSTPROCESSING/module_MOR.f90 index 4e65c31a..b587e13d 100644 --- a/LIB/POSTPROCESSING/module_MOR.f90 +++ b/LIB/POSTPROCESSING/module_MOR.f90 @@ -460,6 +460,8 @@ subroutine post_POD(params) call get_cmd_arg( "--start_from_eigenbasis", eigenbasis_files) call get_cmd_arg( "--components", n_components, default=1_ik) + N_MAX_COMPONENTS = n_components ! used for ghost node sync'ing (buffer allocation) + !------------------------------- ! Set some wabbit specific params !------------------------------- @@ -751,6 +753,7 @@ subroutine post_PODerror(params) call get_cmd_arg( "--components", n_components, default=1_ik) call get_cmd_arg( "--iteration", iteration, default=1_ik) + N_MAX_COMPONENTS = n_components ! used for ghost node sync'ing (buffer allocation) if ( iteration>0 ) then if ( params%rank == 0 ) write(*,*) "Iteration reconstructed: " ,iteration @@ -1320,6 +1323,7 @@ subroutine post_reconstruct(params) call get_cmd_arg( "--iteration", iteration, default=-1_ik) call get_cmd_arg( "--nmodes", N_modes_used, default=1_ik) + N_MAX_COMPONENTS = n_components ! used for ghost node sync'ing (buffer allocation) if ( iteration>0 ) then save_all = .False. @@ -1729,6 +1733,8 @@ subroutine post_timecoef_POD(params) call get_cmd_arg( "--components", n_components, default=1_ik) call get_cmd_arg( "--iteration", iteration, default=1) + N_MAX_COMPONENTS = n_components ! 
used for ghost node sync'ing (buffer allocation) + if ( iteration>0 ) then if ( params%rank == 0 ) write(*,*) "Iteration reconstructed: " ,iteration endif diff --git a/LIB/POSTPROCESSING/post_generate_forest.f90 b/LIB/POSTPROCESSING/post_generate_forest.f90 index b6389e19..02ca1e0f 100644 --- a/LIB/POSTPROCESSING/post_generate_forest.f90 +++ b/LIB/POSTPROCESSING/post_generate_forest.f90 @@ -62,6 +62,8 @@ subroutine post_generate_forest(params) params%block_distribution = "sfc_hilbert" params%time_step_method = 'none' + N_MAX_COMPONENTS = params%n_eqn ! used for ghost node sync'ing (buffer allocation) + ! we have to allocate grid if this routine is called for the first time call allocate_forest(params, lgt_block, hvy_block, hvy_neighbor, lgt_active, & diff --git a/TESTING/runtests.sh b/TESTING/runtests.sh index 6eb03aa0..e36289e4 100755 --- a/TESTING/runtests.sh +++ b/TESTING/runtests.sh @@ -95,7 +95,12 @@ do echo "Writing output to: " ${logfile} # run the actual test + T2="$(date +%s)" ./${ts} > $logfile + # keep the exit status of the test itself: the timing commands below overwrite $? + test_status=$? + T2="$(($(date +%s)-T2))" + echo "Time used in test: ${T2} seconds" - if [ $? == 0 ]; then + if [ ${test_status} == 0 ]; then printf "%s \n" "${pass_color} pass ${end_color}"