From 450a84731f1d9244d7e2f3f7699f406189955b13 Mon Sep 17 00:00:00 2001 From: Hyungjun Lee Date: Tue, 27 Mar 2018 21:27:10 +0200 Subject: [PATCH 1/3] into src --- src/disentangle.F90 | 159 ++++++++++++++++++++++++++++++-------------- src/overlap.F90 | 86 ++++++++++++++++++++---- src/parameters.F90 | 15 +++-- src/wannierise.F90 | 75 ++++++++++++--------- 4 files changed, 235 insertions(+), 100 deletions(-) diff --git a/src/disentangle.F90 b/src/disentangle.F90 index 88b57db9a..18dd85716 100644 --- a/src/disentangle.F90 +++ b/src/disentangle.F90 @@ -25,7 +25,8 @@ module w90_disentangle dis_win_max,dis_froz_min,dis_froz_max,dis_spheres_num, & dis_spheres_first_wann,num_kpts,nnlist,ndimwin,wb,gamma_only, & eigval,length_unit,dis_spheres,m_matrix,dis_conv_tol,frozen_states, & - optimisation,recip_lattice,kpt_latt + optimisation,recip_lattice,kpt_latt,& + m_matrix_orig_local,m_matrix_local use w90_comms, only : on_root, my_node_id, num_nodes,& comms_bcast, comms_array_split,& @@ -75,10 +76,16 @@ subroutine dis_main() ! internal variables integer :: nkp,nkp2,nn,j,ierr,page_unit + integer :: nkp_global complex(kind=dp), allocatable :: cwb(:,:),cww(:,:) + ! Needed to split an array on different nodes + integer, dimension(0:num_nodes-1) :: counts + integer, dimension(0:num_nodes-1) :: displs if (timing_level>0) call io_stopwatch('dis: main',1) + call comms_array_split(num_kpts,counts,displs) + if (on_root) write(stdout,'(/1x,a)') & '*------------------------------- DISENTANGLE --------------------------------*' @@ -136,16 +143,17 @@ subroutine dis_main() ! Find the num_wann x num_wann overlap matrices between ! 
the basis states of the optimal subspaces - do nkp = 1, num_kpts + do nkp = 1, counts(my_node_id) + nkp_global=nkp+displs(my_node_id) do nn = 1, nntot - nkp2 = nnlist(nkp,nn) - call zgemm('C','N',num_wann,ndimwin(nkp2),ndimwin(nkp),cmplx_1,& - u_matrix_opt(:,:,nkp),num_bands,m_matrix_orig(:,:,nn,nkp),num_bands,& + nkp2 = nnlist(nkp_global,nn) + call zgemm('C','N',num_wann,ndimwin(nkp2),ndimwin(nkp_global),cmplx_1,& + u_matrix_opt(:,:,nkp_global),num_bands,m_matrix_orig_local(:,:,nn,nkp),num_bands,& cmplx_0,cwb,num_wann) call zgemm('N','N',num_wann,num_wann,ndimwin(nkp2),cmplx_1,& cwb,num_wann,u_matrix_opt(:,:,nkp2),num_bands,& cmplx_0,cww,num_wann) - m_matrix_orig(1:num_wann,1:num_wann,nn,nkp) = cww(:,:) + m_matrix_orig_local(1:num_wann,1:num_wann,nn,nkp) = cww(:,:) enddo enddo @@ -163,11 +171,12 @@ subroutine dis_main() page_unit=io_file_unit() open(unit=page_unit,form='unformatted',status='scratch') ! Update the m_matrix accordingly - do nkp = 1, num_kpts + do nkp = 1, counts(my_node_id) + nkp_global=nkp+displs(my_node_id) do nn = 1, nntot - nkp2 = nnlist(nkp,nn) + nkp2 = nnlist(nkp_global,nn) call zgemm('C','N',num_wann,num_wann,num_wann,cmplx_1,& - u_matrix(:,:,nkp),num_wann,m_matrix_orig(:,:,nn,nkp),num_bands,& + u_matrix(:,:,nkp_global),num_wann,m_matrix_orig_local(:,:,nn,nkp),num_bands,& cmplx_0,cwb,num_wann) call zgemm('N','N',num_wann,num_wann,num_wann,cmplx_1,& cwb,num_wann,u_matrix(:,:,nkp2),num_wann,& @@ -176,37 +185,50 @@ subroutine dis_main() enddo enddo rewind(page_unit) - deallocate( m_matrix_orig, stat=ierr ) - if (ierr/=0) call io_error('Error deallocating m_matrix_orig in dis_main') + deallocate( m_matrix_orig_local, stat=ierr ) + if (ierr/=0) call io_error('Error deallocating m_matrix_orig_local in dis_main') + if (on_root) then allocate ( m_matrix( num_wann,num_wann,nntot,num_kpts),stat=ierr) if (ierr/=0) call io_error('Error in allocating m_matrix in dis_main') - do nkp = 1, num_kpts + endif + allocate ( m_matrix_local( 
num_wann,num_wann,nntot,counts(my_node_id)),stat=ierr) + if (ierr/=0) call io_error('Error in allocating m_matrix_local in dis_main') + do nkp = 1, counts(my_node_id) do nn = 1, nntot - read(page_unit) m_matrix(:,:,nn,nkp) + read(page_unit) m_matrix_local(:,:,nn,nkp) end do end do + call comms_gatherv(m_matrix_local,num_wann*num_wann*nntot*counts(my_node_id),& + m_matrix,num_wann*num_wann*nntot*counts,num_wann*num_wann*nntot*displs) close(page_unit) else + if (on_root) then allocate ( m_matrix( num_wann,num_wann,nntot,num_kpts),stat=ierr) if (ierr/=0) call io_error('Error in allocating m_matrix in dis_main') + endif + allocate ( m_matrix_local( num_wann,num_wann,nntot,counts(my_node_id)),stat=ierr) + if (ierr/=0) call io_error('Error in allocating m_matrix_local in dis_main') ! Update the m_matrix accordingly - do nkp = 1, num_kpts + do nkp = 1, counts(my_node_id) + nkp_global=nkp+displs(my_node_id) do nn = 1, nntot - nkp2 = nnlist(nkp,nn) + nkp2 = nnlist(nkp_global,nn) call zgemm('C','N',num_wann,num_wann,num_wann,cmplx_1,& - u_matrix(:,:,nkp),num_wann,m_matrix_orig(:,:,nn,nkp),num_bands,& + u_matrix(:,:,nkp_global),num_wann,m_matrix_orig_local(:,:,nn,nkp),num_bands,& cmplx_0,cwb,num_wann) call zgemm('N','N',num_wann,num_wann,num_wann,cmplx_1,& cwb,num_wann,u_matrix(:,:,nkp2),num_wann,& cmplx_0,cww,num_wann) - m_matrix(:,:,nn,nkp) = cww(:,:) + m_matrix_local(:,:,nn,nkp) = cww(:,:) enddo enddo - deallocate( m_matrix_orig, stat=ierr ) - if (ierr/=0) call io_error('Error deallocating m_matrix_orig in dis_main') + call comms_gatherv(m_matrix_local,num_wann*num_wann*nntot*counts(my_node_id),& + m_matrix,num_wann*num_wann*nntot*counts,num_wann*num_wann*nntot*displs) + deallocate( m_matrix_orig_local, stat=ierr ) + if (ierr/=0) call io_error('Error deallocating m_matrix_orig_local in dis_main') endif @@ -332,27 +354,34 @@ subroutine internal_slim_m() implicit none integer :: nkp,nkp2,nn,i,j,m,n,ierr + integer :: nkp_global complex(kind=dp), allocatable :: cmtmp(:,:) + ! 
Needed to split an array on different nodes + integer, dimension(0:num_nodes-1) :: counts + integer, dimension(0:num_nodes-1) :: displs if (timing_level>1 .and. on_root) call io_stopwatch('dis: main: slim_m',1) + call comms_array_split(num_kpts,counts,displs) + allocate(cmtmp(num_bands,num_bands),stat=ierr) if (ierr/=0) call io_error('Error in allocating cmtmp in dis_main') - do nkp = 1, num_kpts + do nkp = 1, counts(my_node_id) + nkp_global=nkp+displs(my_node_id) do nn = 1, nntot - nkp2 = nnlist(nkp,nn) + nkp2 = nnlist(nkp_global,nn) do j = 1, ndimwin(nkp2) n = nfirstwin(nkp2) + j - 1 - do i = 1, ndimwin(nkp) - m = nfirstwin(nkp) + i - 1 - cmtmp(i,j) = m_matrix_orig(m,n,nn,nkp) + do i = 1, ndimwin(nkp_global) + m = nfirstwin(nkp_global) + i - 1 + cmtmp(i,j) = m_matrix_orig_local(m,n,nn,nkp) enddo enddo - m_matrix_orig(:,:,nn,nkp) = cmplx_0 + m_matrix_orig_local(:,:,nn,nkp) = cmplx_0 do j = 1, ndimwin(nkp2) - do i = 1, ndimwin(nkp) - m_matrix_orig(i,j,nn,nkp) = cmtmp(i,j) + do i = 1, ndimwin(nkp_global) + m_matrix_orig_local(i,j,nn,nkp) = cmtmp(i,j) enddo enddo enddo @@ -404,6 +433,8 @@ subroutine internal_find_u() if (timing_level>1.and.on_root) call io_stopwatch('dis: main: find_u',1) + ! Currently, this part is not parallelized; thus, we perform the task only on root and then broadcast the result. + if (on_root) then ! Allocate arrays needed for ZGESVD allocate(svals(num_wann),stat=ierr) if (ierr/=0) call io_error('Error in allocating svals in dis_main') @@ -441,8 +472,11 @@ subroutine internal_find_u() call zgemm('N','N',num_wann,num_wann,num_wann,cmplx_1,& cz,num_wann,cv,num_wann,cmplx_0,u_matrix(:,:,nkp),num_wann) enddo - if (lsitesymmetry) call sitesym_symmetrize_u_matrix(num_wann,u_matrix) !RS: + endif + call comms_bcast(u_matrix(1,1,1),num_wann*num_wann*num_kpts) +! if (lsitesymmetry) call sitesym_symmetrize_u_matrix(num_wann,u_matrix) !RS: + if (on_root) then ! 
Deallocate arrays for ZGESVD deallocate(caa,stat=ierr) if (ierr/=0) call io_error('Error deallocating caa in dis_main') @@ -456,6 +490,9 @@ subroutine internal_find_u() if (ierr/=0) call io_error('Error deallocating rwork in dis_main') deallocate(svals,stat=ierr) if (ierr/=0) call io_error('Error deallocating svals in dis_main') + endif + + if (lsitesymmetry) call sitesym_symmetrize_u_matrix(num_wann,u_matrix) !RS: if (timing_level>1) call io_stopwatch('dis: main: find_u',2) @@ -957,7 +994,7 @@ subroutine dis_project() complex(kind=dp), allocatable :: cwork(:) complex(kind=dp), allocatable :: cz(:,:) complex(kind=dp), allocatable :: cvdag(:,:) - complex(kind=dp), allocatable :: catmpmat(:,:,:) +! complex(kind=dp), allocatable :: catmpmat(:,:,:) if (timing_level>1) call io_stopwatch('dis: project',1) @@ -968,8 +1005,8 @@ subroutine dis_project() if (on_root) write(stdout,'(3x,a)') 'A_mn = --> S = A.A^+ --> U = S^-1/2.A' if (on_root) write(stdout,'(3x,a)',advance='no') 'In dis_project...' - allocate(catmpmat(num_bands,num_bands,num_kpts),stat=ierr) - if (ierr/=0) call io_error('Error in allocating catmpmat in dis_project') +! allocate(catmpmat(num_bands,num_bands,num_kpts),stat=ierr) +! if (ierr/=0) call io_error('Error in allocating catmpmat in dis_project') allocate(svals(num_bands),stat=ierr) if (ierr/=0) call io_error('Error in allocating svals in dis_project') allocate(rwork(5*num_bands),stat=ierr) @@ -984,18 +1021,30 @@ subroutine dis_project() ! here we slim down the ca matrix ! up to here num_bands(=num_bands) X num_wann(=num_wann) +! do nkp = 1, num_kpts +! do j = 1, num_wann +! do i = 1, ndimwin(nkp) +! catmpmat(i,j,nkp) = a_matrix(nfirstwin(nkp)+i-1,j,nkp) +! enddo +! enddo +! do j = 1, num_wann +! a_matrix(1:ndimwin(nkp),j,nkp) = catmpmat(1:ndimwin(nkp),j,nkp) +! enddo +! do j = 1, num_wann +! a_matrix(ndimwin(nkp)+1:num_bands,j,nkp) = cmplx_0 +! enddo +! enddo + ! in order to reduce the memory usage, we don't use catmpmat. 
do nkp = 1, num_kpts - do j = 1, num_wann - do i = 1, ndimwin(nkp) - catmpmat(i,j,nkp) = a_matrix(nfirstwin(nkp)+i-1,j,nkp) + if (ndimwin(nkp).ne.num_bands) then + do j = 1, num_wann + do i = 1, ndimwin(nkp) + ctmp2 = a_matrix(nfirstwin(nkp)+i-1,j,nkp) + a_matrix(i,j,nkp) = ctmp2 + enddo + a_matrix(ndimwin(nkp)+1:num_bands,j,nkp) = cmplx_0 enddo - enddo - do j = 1, num_wann - a_matrix(1:ndimwin(nkp),j,nkp) = catmpmat(1:ndimwin(nkp),j,nkp) - enddo - do j = 1, num_wann - a_matrix(ndimwin(nkp)+1:num_bands,j,nkp) = cmplx_0 - enddo + endif enddo do nkp = 1, num_kpts @@ -1089,8 +1138,8 @@ subroutine dis_project() if (ierr/=0) call io_error('Error in deallocating rwork in dis_project') deallocate(svals,stat=ierr) if (ierr/=0) call io_error('Error in deallocating svals in dis_project') - deallocate(catmpmat,stat=ierr) - if (ierr/=0) call io_error('Error in deallocating catmpmat in dis_project') +! deallocate(catmpmat,stat=ierr) +! if (ierr/=0) call io_error('Error in deallocating catmpmat in dis_project') if (on_root) write(stdout,'(a)') ' done' @@ -1689,7 +1738,7 @@ subroutine dis_extract() ! 
Initialize Z matrix at k points w/ non-frozen states do nkp_loc = 1, counts(my_node_id) nkp = nkp_loc + displs(my_node_id) - if (num_wann.gt.ndimfroz(nkp)) call internal_zmatrix(nkp,czmat_in_loc(:,:,nkp_loc)) + if (num_wann.gt.ndimfroz(nkp)) call internal_zmatrix(nkp,nkp_loc,czmat_in_loc(:,:,nkp_loc)) enddo if (lsitesymmetry) call sitesym_symmetrize_zmatrix(czmat_in_loc,lwindow) !RS: @@ -1740,7 +1789,7 @@ subroutine dis_extract() do nn=1,nntot nkp2=nnlist(nkp,nn) call zgemm('C','N',ndimfroz(nkp),ndimwin(nkp2),ndimwin(nkp),cmplx_1,& - u_matrix_opt(:,:,nkp),num_bands,m_matrix_orig(:,:,nn,nkp),num_bands,cmplx_0,& + u_matrix_opt(:,:,nkp),num_bands,m_matrix_orig_local(:,:,nn,nkp_loc),num_bands,cmplx_0,& cwb,num_wann) call zgemm('N','N',ndimfroz(nkp),num_wann,ndimwin(nkp2),cmplx_1,& cwb,num_wann,u_matrix_opt(:,:,nkp2),num_bands,cmplx_0,cww,num_wann) @@ -1931,7 +1980,7 @@ subroutine dis_extract() do nn=1,nntot nkp2=nnlist(nkp,nn) call zgemm('C','N',num_wann,ndimwin(nkp2),ndimwin(nkp),cmplx_1,& - u_matrix_opt(:,:,nkp),num_bands,m_matrix_orig(:,:,nn,nkp),num_bands,cmplx_0,& + u_matrix_opt(:,:,nkp),num_bands,m_matrix_orig_local(:,:,nn,nkp_loc),num_bands,cmplx_0,& cwb,num_wann) call zgemm('N','N',num_wann,num_wann,ndimwin(nkp2),cmplx_1,& cwb,num_wann,u_matrix_opt(:,:,nkp2),num_bands,cmplx_0,cww,num_wann) @@ -1964,7 +2013,7 @@ subroutine dis_extract() ! 
Construct the updated Z matrix, CZMAT_OUT, at k points w/ non-frozen s do nkp_loc = 1, counts(my_node_id) nkp = nkp_loc + displs(my_node_id) - if (num_wann.gt.ndimfroz(nkp)) call internal_zmatrix(nkp,czmat_out_loc(:,:,nkp_loc)) + if (num_wann.gt.ndimfroz(nkp)) call internal_zmatrix(nkp,nkp_loc,czmat_out_loc(:,:,nkp_loc)) enddo if (lsitesymmetry) call sitesym_symmetrize_zmatrix(czmat_out_loc,lwindow) !RS: @@ -1987,10 +2036,12 @@ subroutine dis_extract() deallocate(czmat_in_loc,stat=ierr) if (ierr/=0) call io_error('Error deallocating czmat_in_loc in dis_extract') + if (on_root) then allocate(ceamp(num_bands,num_bands,num_kpts),stat=ierr) if (ierr/=0) call io_error('Error allocating ceamp in dis_extract') allocate(cham(num_bands,num_bands,num_kpts),stat=ierr) if (ierr/=0) call io_error('Error allocating cham in dis_extract') + endif if (.not.dis_converged) then if (on_root) write(stdout,'(/5x,a)') & @@ -2034,6 +2085,8 @@ subroutine dis_extract() ! Set public variable omega_invariant omega_invariant=womegai + ! Currently, this part is not parallelized; thus, we perform the task only on root and then broadcast the result. + if (on_root) then ! DIAGONALIZE THE HAMILTONIAN WITHIN THE OPTIMIZED SUBSPACES do nkp = 1, num_kpts @@ -2107,6 +2160,9 @@ subroutine dis_extract() !write(stdout,"(a)") & !YN: RS: ! 'Note(symmetry-adapted mode): u_matrix_opt are no longer the eigenstates of the subspace Hamiltonian.' 
!RS: endif !YN: + endif + call comms_bcast(eigval_opt(1,1),num_bands*num_kpts) + call comms_bcast(u_matrix_opt(1,1,1),num_bands*num_wann*num_kpts) if(index(devel_flag,'compspace')>0) then @@ -2173,8 +2229,10 @@ subroutine dis_extract() deallocate(history,stat=ierr) if (ierr/=0) call io_error('Error deallocating history in dis_extract') + if (on_root) then deallocate(cham,stat=ierr) if (ierr/=0) call io_error('Error deallocating cham in dis_extract') + endif if(allocated(camp)) then deallocate(camp,stat=ierr) if (ierr/=0) call io_error('Error deallocating camp in dis_extract') @@ -2183,8 +2241,10 @@ subroutine dis_extract() deallocate(camp_loc,stat=ierr) if (ierr/=0) call io_error('Error deallocating camp_loc in dis_extract') endif + if (on_root) then deallocate(ceamp,stat=ierr) if (ierr/=0) call io_error('Error deallocating ceamp in dis_extract') + endif deallocate(u_matrix_opt_loc,stat=ierr) if (ierr/=0) call io_error('Error deallocating u_matrix_opt_loc in dis_extract') deallocate(wkomegai1_loc,stat=ierr) @@ -2257,7 +2317,7 @@ end subroutine internal_test_convergence !==================================================================! - subroutine internal_zmatrix(nkp,cmtrx) + subroutine internal_zmatrix(nkp,nkp_loc,cmtrx) !==================================================================! !! Compute the Z-matrix ! ! @@ -2268,6 +2328,7 @@ subroutine internal_zmatrix(nkp,cmtrx) implicit none integer, intent(in) :: nkp + integer, intent(in) :: nkp_loc !! Which kpoint complex(kind=dp), intent(out) :: cmtrx(num_bands,num_bands) !! 
(M,N)-TH ENTRY IN THE (NDIMWIN(NKP)-NDIMFROZ(NKP)) x (NDIMWIN(NKP)-NDIMFRO @@ -2284,7 +2345,7 @@ subroutine internal_zmatrix(nkp,cmtrx) do nn=1,nntot nkp2=nnlist(nkp,nn) call zgemm('N','N',num_bands,num_wann,ndimwin(nkp2),cmplx_1,& - m_matrix_orig(:,:,nn,nkp),num_bands,u_matrix_opt(:,:,nkp2),num_bands,& + m_matrix_orig_local(:,:,nn,nkp_loc),num_bands,u_matrix_opt(:,:,nkp2),num_bands,& cmplx_0,cbw,num_bands) do n=1,ndimk q=indxnfroz(n,nkp) diff --git a/src/overlap.F90 b/src/overlap.F90 index ea7e43c9d..7685c3e0f 100644 --- a/src/overlap.F90 +++ b/src/overlap.F90 @@ -44,8 +44,11 @@ subroutine overlap_read( ) use w90_parameters, only : num_bands, num_wann, num_kpts, nntot, nncell, nnlist,& devel_flag, u_matrix, m_matrix, a_matrix, timing_level, & - m_matrix_orig, u_matrix_opt, cp_pp, use_bloch_phases, gamma_only ![ysl] + m_matrix_orig, u_matrix_opt, cp_pp, use_bloch_phases, gamma_only,& ![ysl] + m_matrix_local, m_matrix_orig_local use w90_io, only : io_file_unit, io_error, seedname, io_stopwatch + use w90_comms, only : my_node_id, num_nodes,& + comms_array_split, comms_scatterv implicit none @@ -57,29 +60,46 @@ subroutine overlap_read( ) complex(kind=dp), allocatable :: mmn_tmp(:,:) character(len=50) :: dummy logical :: nn_found + ! 
Needed to split an array on different nodes + integer, dimension(0:num_nodes-1) :: counts + integer, dimension(0:num_nodes-1) :: displs if (timing_level>0) call io_stopwatch('overlap: read',1) + call comms_array_split(num_kpts,counts,displs) + allocate ( u_matrix( num_wann,num_wann,num_kpts),stat=ierr) if (ierr/=0) call io_error('Error in allocating u_matrix in overlap_read') u_matrix = cmplx_0 if (disentanglement) then + if (on_root) then allocate(m_matrix_orig(num_bands,num_bands,nntot,num_kpts),stat=ierr) if (ierr/=0) call io_error('Error in allocating m_matrix_orig in overlap_read') + endif + allocate(m_matrix_orig_local(num_bands,num_bands,nntot,counts(my_node_id)),stat=ierr) + if (ierr/=0) call io_error('Error in allocating m_matrix_orig_local in overlap_read') allocate(a_matrix(num_bands,num_wann,num_kpts),stat=ierr) if (ierr/=0) call io_error('Error in allocating a_matrix in overlap_read') allocate(u_matrix_opt(num_bands,num_wann,num_kpts),stat=ierr) if (ierr/=0) call io_error('Error in allocating u_matrix_opt in overlap_read') else + if (on_root) then allocate ( m_matrix( num_wann,num_wann,nntot,num_kpts),stat=ierr) if (ierr/=0) call io_error('Error in allocating m_matrix in overlap_read') m_matrix = cmplx_0 + endif + allocate ( m_matrix_local( num_wann,num_wann,nntot,counts(my_node_id)),stat=ierr) + if (ierr/=0) call io_error('Error in allocating m_matrix_local in overlap_read') + m_matrix_local = cmplx_0 endif if (disentanglement) then + if (on_root) then m_matrix_orig = cmplx_0 + endif + m_matrix_orig_local = cmplx_0 a_matrix = cmplx_0 u_matrix_opt = cmplx_0 endif @@ -155,9 +175,13 @@ subroutine overlap_read( ) endif if(disentanglement) then - call comms_bcast(m_matrix_orig(1,1,1,1),num_bands*num_bands*nntot*num_kpts) +! 
call comms_bcast(m_matrix_orig(1,1,1,1),num_bands*num_bands*nntot*num_kpts) + call comms_scatterv(m_matrix_orig_local,num_bands*num_bands*nntot*counts(my_node_id),& + m_matrix_orig,num_bands*num_bands*nntot*counts,num_bands*num_bands*nntot*displs) else - call comms_bcast(m_matrix(1,1,1,1),num_wann*num_wann*nntot*num_kpts) +! call comms_bcast(m_matrix(1,1,1,1),num_wann*num_wann*nntot*num_kpts) + call comms_scatterv(m_matrix_local,num_wann*num_wann*nntot*counts(my_node_id),& + m_matrix,num_wann*num_wann*nntot*counts,num_wann*num_wann*nntot*displs) endif if(.not. use_bloch_phases) then @@ -500,7 +524,8 @@ subroutine overlap_dealloc( ) !! Dellocate memory use w90_parameters, only : u_matrix,m_matrix,m_matrix_orig,& - a_matrix,u_matrix_opt + a_matrix,u_matrix_opt,& + m_matrix_local,m_matrix_orig_local use w90_io, only : io_error implicit none @@ -515,10 +540,16 @@ subroutine overlap_dealloc( ) deallocate( a_matrix, stat=ierr ) if (ierr/=0) call io_error('Error deallocating a_matrix in overlap_dealloc') end if + if (on_root) then if (allocated( m_matrix_orig)) then deallocate( m_matrix_orig, stat=ierr ) if (ierr/=0) call io_error('Error deallocating m_matrix_orig in overlap_dealloc') endif + endif + if (allocated( m_matrix_orig_local)) then + deallocate( m_matrix_orig_local, stat=ierr ) + if (ierr/=0) call io_error('Error deallocating m_matrix_orig_local in overlap_dealloc') + endif !~![ysl-b] !~ if (allocated( ph_g)) then !~ deallocate( ph_g, stat=ierr ) @@ -527,10 +558,29 @@ subroutine overlap_dealloc( ) !~![ysl-e] - deallocate ( m_matrix, stat=ierr ) - if (ierr/=0) call io_error('Error deallocating m_matrix in overlap_dealloc') - deallocate ( u_matrix, stat=ierr ) - if (ierr/=0) call io_error('Error deallocating u_matrix in overlap_dealloc') +! if (on_root) then +! deallocate ( m_matrix, stat=ierr ) +! if (ierr/=0) call io_error('Error deallocating m_matrix in overlap_dealloc') +! endif +! deallocate ( m_matrix_local, stat=ierr ) +! 
if (ierr/=0) call io_error('Error deallocating m_matrix_local in overlap_dealloc') +! deallocate ( u_matrix, stat=ierr ) +! if (ierr/=0) call io_error('Error deallocating u_matrix in overlap_dealloc') + if (on_root) then + if (allocated( m_matrix)) then + deallocate ( m_matrix, stat=ierr ) + if (ierr/=0) call io_error('Error deallocating m_matrix in overlap_dealloc') + endif + endif + if (allocated( m_matrix_local)) then + deallocate ( m_matrix_local, stat=ierr ) + if (ierr/=0) call io_error('Error deallocating m_matrix_local in overlap_dealloc') + endif + if (allocated( u_matrix)) then + deallocate ( u_matrix, stat=ierr ) + if (ierr/=0) call io_error('Error deallocating u_matrix in overlap_dealloc') + endif + return @@ -551,10 +601,13 @@ subroutine overlap_project() use w90_constants use w90_io, only : io_error,io_stopwatch use w90_parameters, only : num_bands,num_wann,num_kpts,timing_level,& - u_matrix,m_matrix,nntot,nnlist + u_matrix,m_matrix,nntot,nnlist,& + m_matrix_local use w90_utility, only : utility_zgemm use w90_parameters, only : lsitesymmetry !RS: use w90_sitesym, only : sitesym_symmetrize_u_matrix !RS: + use w90_comms, only : my_node_id, num_nodes,& + comms_array_split, comms_scatterv, comms_gatherv implicit none @@ -567,9 +620,14 @@ subroutine overlap_project() complex(kind=dp), allocatable :: cwork(:) complex(kind=dp), allocatable :: cz(:,:) complex(kind=dp), allocatable :: cvdag(:,:) + ! Needed to split an array on different nodes + integer, dimension(0:num_nodes-1) :: counts + integer, dimension(0:num_nodes-1) :: displs if (timing_level>1) call io_stopwatch('overlap: project',1) + call comms_array_split(num_kpts,counts,displs) + allocate(svals(num_bands),stat=ierr) if (ierr/=0) call io_error('Error in allocating svals in overlap_project') allocate(cz(num_bands,num_bands),stat=ierr) @@ -638,16 +696,18 @@ subroutine overlap_project() ! so now we have the U's that rotate the wavefunctions at each k-point. ! 
the matrix elements M_ij have also to be updated - do nkp=1, num_kpts + do nkp=1, counts(my_node_id) do nn=1,nntot - nkp2=nnlist(nkp,nn) + nkp2=nnlist(nkp+displs(my_node_id),nn) ! cvdag = U^{dagger} . M (use as workspace) - call utility_zgemm(cvdag,u_matrix(:,:,nkp),'C',m_matrix(:,:,nn,nkp),'N',num_wann) + call utility_zgemm(cvdag,u_matrix(:,:,nkp+displs(my_node_id)),'C',m_matrix_local(:,:,nn,nkp),'N',num_wann) ! cz = cvdag . U call utility_zgemm(cz,cvdag,'N',u_matrix(:,:,nkp2),'N',num_wann) - m_matrix(:,:,nn,nkp) = cz(:,:) + m_matrix_local(:,:,nn,nkp) = cz(:,:) end do end do + call comms_gatherv(m_matrix_local,num_wann*num_wann*nntot*counts(my_node_id),& + m_matrix,num_wann*num_wann*nntot*counts,num_wann*num_wann*nntot*displs) deallocate(cwork,stat=ierr) if (ierr/=0) call io_error('Error in deallocating cwork in overlap_project') diff --git a/src/parameters.F90 b/src/parameters.F90 index ba0090cd2..72e888973 100644 --- a/src/parameters.F90 +++ b/src/parameters.F90 @@ -409,6 +409,7 @@ module w90_parameters complex(kind=dp), allocatable, save, public :: a_matrix(:,:,:) complex(kind=dp), allocatable, save, public :: m_matrix_orig(:,:,:,:) + complex(kind=dp), allocatable, save, public :: m_matrix_orig_local(:,:,:,:) real(kind=dp), allocatable, save, public :: eigval(:,:) logical, save, public :: eig_found @@ -429,6 +430,7 @@ module w90_parameters complex(kind=dp), allocatable, save, public :: u_matrix(:,:,:) complex(kind=dp), allocatable, save, public :: m_matrix(:,:,:,:) + complex(kind=dp), allocatable, save, public :: m_matrix_local(:,:,:,:) ! RS: symmetry-adapted Wannier functions logical, public, save :: lsitesymmetry=.false. @@ -3710,12 +3712,12 @@ subroutine param_chkpt_dist endif call comms_bcast(u_matrix(1,1,1),num_wann*num_wann*num_kpts) - if (.not.on_root .and. 
.not.allocated(m_matrix)) then - allocate(m_matrix(num_wann,num_wann,nntot,num_kpts),stat=ierr) - if (ierr/=0)& - call io_error('Error allocating m_matrix in param_chkpt_dist') - endif - call comms_bcast(m_matrix(1,1,1,1),num_wann*num_wann*nntot*num_kpts) +! if (.not.on_root .and. .not.allocated(m_matrix)) then +! allocate(m_matrix(num_wann,num_wann,nntot,num_kpts),stat=ierr) +! if (ierr/=0)& +! call io_error('Error allocating m_matrix in param_chkpt_dist') +! endif +! call comms_bcast(m_matrix(1,1,1,1),num_wann*num_wann*nntot*num_kpts) call comms_bcast(have_disentangled,1) @@ -5719,6 +5721,7 @@ subroutine param_dist call comms_bcast(num_cg_steps,1) call comms_bcast(conv_tol,1) call comms_bcast(conv_window,1) + call comms_bcast(guiding_centres,1) call comms_bcast(wannier_plot,1) call comms_bcast(num_wannier_plot,1) if(num_wannier_plot>0) then diff --git a/src/wannierise.F90 b/src/wannierise.F90 index da6666527..80374de75 100644 --- a/src/wannierise.F90 +++ b/src/wannierise.F90 @@ -18,7 +18,8 @@ module w90_wannierise use w90_constants use w90_comms, only : on_root, my_node_id, num_nodes,& comms_bcast, comms_array_split,& - comms_gatherv, comms_allreduce + comms_gatherv, comms_allreduce,& + comms_scatterv implicit none @@ -176,8 +177,8 @@ subroutine wann_main ! sub vars passed into other subs allocate( csheet (num_wann, nntot, num_kpts), stat=ierr ) if (ierr/=0) call io_error('Error in allocating csheet in wann_main') - allocate( cdodq (num_wann, num_wann, num_kpts),stat=ierr ) - if (ierr/=0) call io_error('Error in allocating cdodq in wann_main') +! allocate( cdodq (num_wann, num_wann, num_kpts),stat=ierr ) +! if (ierr/=0) call io_error('Error in allocating cdodq in wann_main') allocate( sheet (num_wann, nntot, num_kpts), stat=ierr ) if (ierr/=0) call io_error('Error in allocating sheet in wann_main') allocate( rave (3, num_wann), stat=ierr ) @@ -210,7 +211,8 @@ subroutine wann_main end if end if - csheet=cmplx_1;cdodq=cmplx_0 +! 
csheet=cmplx_1;cdodq=cmplx_0 + csheet=cmplx_1 sheet=0.0_dp;rave=0.0_dp;r2ave=0.0_dp;rave2=0.0_dp;rguide=0.0_dp ! sub vars not passed into other subs @@ -233,10 +235,10 @@ subroutine wann_main if (ierr/=0) call io_error('Error in allocating u_matrix_loc in wann_main') allocate( m_matrix_loc (num_wann, num_wann, nntot, max(1,counts(my_node_id))),stat=ierr ) if (ierr/=0) call io_error('Error in allocating m_matrix_loc in wann_main') - allocate( m_matrix_1b (num_wann, num_wann, num_kpts),stat=ierr ) - if (ierr/=0) call io_error('Error in allocating m_matrix_1b in wann_main') - allocate( m_matrix_1b_loc (num_wann, num_wann, max(1,counts(my_node_id))),stat=ierr ) - if (ierr/=0) call io_error('Error in allocating m_matrix_1b_loc in wann_main') +! allocate( m_matrix_1b (num_wann, num_wann, num_kpts),stat=ierr ) +! if (ierr/=0) call io_error('Error in allocating m_matrix_1b in wann_main') +! allocate( m_matrix_1b_loc (num_wann, num_wann, max(1,counts(my_node_id))),stat=ierr ) +! if (ierr/=0) call io_error('Error in allocating m_matrix_1b_loc in wann_main') if(precond) then allocate(cdodq_precond_loc(num_wann,num_wann,max(1,counts(my_node_id))),stat=ierr) if (ierr/=0) call io_error('Error in allocating cdodq_precond_loc in wann_main') @@ -244,11 +246,13 @@ subroutine wann_main ! initialize local u and m matrices with global ones do nkp_loc = 1, counts(my_node_id) nkp = nkp_loc + displs(my_node_id) - m_matrix_loc (:,:,:, nkp_loc) = & - m_matrix (:,:,:, nkp) +! m_matrix_loc (:,:,:, nkp_loc) = & +! 
m_matrix (:,:,:, nkp) u_matrix_loc (:,:, nkp_loc) = & u_matrix (:,:, nkp) end do + call comms_scatterv(m_matrix_loc,num_wann*num_wann*nntot*counts(my_node_id),& + m_matrix,num_wann*num_wann*nntot*counts,num_wann*num_wann*nntot*displs) allocate( cdq_loc (num_wann, num_wann, max(1,counts(my_node_id))),stat=ierr ) if (ierr/=0) call io_error('Error in allocating cdq_loc in wann_main') @@ -410,7 +414,8 @@ subroutine wann_main u0_loc=u_matrix_loc if(optimisation<=0) then - write(page_unit) m_matrix +! write(page_unit) m_matrix + write(page_unit) m_matrix_loc rewind(page_unit) else m0_loc=m_matrix_loc @@ -457,7 +462,8 @@ subroutine wann_main if (.not.lfixstep) then u_matrix_loc=u0_loc if(optimisation<=0) then - read(page_unit) m_matrix +! read(page_unit) m_matrix + read(page_unit) m_matrix_loc rewind(page_unit) else m_matrix_loc=m0_loc @@ -529,14 +535,17 @@ subroutine wann_main ! end of the minimization loop ! the m matrix is sent by piece to avoid huge arrays - do nn = 1, nntot - m_matrix_1b_loc=m_matrix_loc(:,:,nn,:) - call comms_gatherv(m_matrix_1b_loc,num_wann*num_wann*counts(my_node_id),& - m_matrix_1b,num_wann*num_wann*counts,num_wann*num_wann*displs) - call comms_bcast(m_matrix_1b(1,1,1),num_wann*num_wann*num_kpts) - m_matrix(:,:,nn,:)=m_matrix_1b(:,:,:) - end do!nn - + ! But, I want to reduce the memory usage as much as possible. +! do nn = 1, nntot +! m_matrix_1b_loc=m_matrix_loc(:,:,nn,:) +! call comms_gatherv(m_matrix_1b_loc,num_wann*num_wann*counts(my_node_id),& +! m_matrix_1b,num_wann*num_wann*counts,num_wann*num_wann*displs) +! call comms_bcast(m_matrix_1b(1,1,1),num_wann*num_wann*num_kpts) +! m_matrix(:,:,nn,:)=m_matrix_1b(:,:,:) +! end do!nn + call comms_gatherv(m_matrix_loc,num_wann*num_wann*nntot*counts(my_node_id),& + m_matrix,num_wann*num_wann*nntot*counts,num_wann*num_wann*nntot*displs) + ! 
send u matrix call comms_gatherv(u_matrix_loc,num_wann*num_wann*counts(my_node_id),& u_matrix,num_wann*num_wann*counts,num_wann*num_wann*displs) @@ -587,11 +596,13 @@ subroutine wann_main ! write extra info regarding omega_invariant !~ if (iprint>2) call internal_svd_omega_i() - if (iprint>2) call wann_svd_omega_i() +! if (iprint>2) call wann_svd_omega_i() + if (iprint>2.and.on_root) call wann_svd_omega_i() ! write matrix elements to file !~ if (write_r2mn) call internal_write_r2mn() - if (write_r2mn) call wann_write_r2mn() +! if (write_r2mn) call wann_write_r2mn() + if (write_r2mn.and.on_root) call wann_write_r2mn() ! calculate and write projection of WFs on original bands in outer window if (have_disentangled .and. write_proj) call wann_calc_projection() @@ -624,10 +635,10 @@ subroutine wann_main if (ierr/=0) call io_error('Error in deallocating u_matrix_loc in wann_main') deallocate(m_matrix_loc,stat=ierr) if (ierr/=0) call io_error('Error in deallocating m_matrix_loc in wann_main') - deallocate(m_matrix_1b,stat=ierr) - if (ierr/=0) call io_error('Error in deallocating m_matrix_1b in wann_main') - deallocate(m_matrix_1b_loc,stat=ierr) - if (ierr/=0) call io_error('Error in deallocating m_matrix_1b_loc in wann_main') +! deallocate(m_matrix_1b,stat=ierr) +! if (ierr/=0) call io_error('Error in deallocating m_matrix_1b in wann_main') +! deallocate(m_matrix_1b_loc,stat=ierr) +! if (ierr/=0) call io_error('Error in deallocating m_matrix_1b_loc in wann_main') deallocate(cdq_loc,stat=ierr) if (ierr/=0) call io_error('Error in deallocating cdq_loc in wann_main') deallocate(cdodq_loc,stat=ierr) @@ -661,8 +672,8 @@ subroutine wann_main if (ierr/=0) call io_error('Error in deallocating rave in wann_main') deallocate(sheet,stat=ierr) if (ierr/=0) call io_error('Error in deallocating sheet in wann_main') - deallocate(cdodq,stat=ierr) - if (ierr/=0) call io_error('Error in deallocating cdodq in wann_main') +! deallocate(cdodq,stat=ierr) +! 
if (ierr/=0) call io_error('Error in deallocating cdodq in wann_main') deallocate(csheet,stat=ierr) if (ierr/=0) call io_error('Error in deallocating csheet in wann_main') @@ -1372,7 +1383,7 @@ subroutine wann_phases (csheet, sheet, rguide, irguide, m_w) ! ! !=================================================================== use w90_constants, only : eps6 - use w90_parameters, only : num_wann,m_matrix,nntot,neigh, & + use w90_parameters, only : num_wann,nntot,neigh, & nnh,bk,bka,num_kpts,timing_level use w90_io, only : io_stopwatch use w90_utility, only : utility_inv3 @@ -1580,7 +1591,7 @@ subroutine wann_omega(csheet,sheet,rave,r2ave,rave2,wann_spread) !! Calculate the Wannier Function spread ! ! !=================================================================== - use w90_parameters, only : num_wann,m_matrix,nntot,wb,bk,num_kpts,& + use w90_parameters, only : num_wann,nntot,wb,bk,num_kpts,& omega_invariant,timing_level use w90_io, only : io_stopwatch @@ -1789,7 +1800,7 @@ subroutine wann_domega(csheet,sheet,rave,cdodq) ! Calculate the Gradient of the Wannier Function spread ! ! ! !=================================================================== - use w90_parameters, only : num_wann,wb,bk,nntot,m_matrix,num_kpts,timing_level + use w90_parameters, only : num_wann,wb,bk,nntot,num_kpts,timing_level use w90_io, only : io_stopwatch,io_error use w90_parameters, only : lsitesymmetry !RS: use w90_sitesym, only : sitesym_symmetrize_gradient !RS: @@ -2519,7 +2530,7 @@ subroutine wann_main_gamma allocate( counts(0:0), displs(0:0), stat=ierr ) if (ierr/=0) call io_error('Error in allocating counts and displs in wann_main_gamma') counts(0)=0;displs(0)=0 - + ! store original U before rotating !~ ! phase factor ph_g is applied to u_matrix !~ ! 
NB: ph_g is applied to u_matrix_opt if (have_disentangled) From 2309fed0ffa24820aa149e8e996a298a18ddaa37 Mon Sep 17 00:00:00 2001 From: Hyungjun Lee Date: Tue, 27 Mar 2018 21:27:42 +0200 Subject: [PATCH 2/3] into postw90 --- src/postw90/comms.F90 | 62 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/src/postw90/comms.F90 b/src/postw90/comms.F90 index b5a337bb8..ac8b708e8 100644 --- a/src/postw90/comms.F90 +++ b/src/postw90/comms.F90 @@ -106,6 +106,7 @@ module w90_comms module procedure comms_gatherv_cmplx_2 module procedure comms_gatherv_cmplx_3 module procedure comms_gatherv_cmplx_3_4 + module procedure comms_gatherv_cmplx_4 end interface comms_gatherv interface comms_scatterv @@ -116,6 +117,7 @@ module w90_comms module procedure comms_scatterv_real_2 module procedure comms_scatterv_real_3 ! module procedure comms_scatterv_cmplx + module procedure comms_scatterv_cmplx_4 end interface comms_scatterv contains @@ -1078,6 +1080,34 @@ subroutine comms_gatherv_cmplx_3_4(array,localcount,rootglobalarray,counts,displ end subroutine comms_gatherv_cmplx_3_4 + subroutine comms_gatherv_cmplx_4(array,localcount,rootglobalarray,counts,displs) + !! 
Gather complex data to root node (for arrays of rank 4) + implicit none + + complex(kind=dp), dimension(:,:,:,:), intent(inout) :: array !! this node's data (send buffer) + integer, intent(in) :: localcount !! number of elements sent from array + complex(kind=dp), dimension(:,:,:,:), intent(inout) :: rootglobalarray !! gathered data (receive buffer) + integer, dimension(num_nodes), intent(in) :: counts !! recvcounts passed to MPI_gatherv + integer, dimension(num_nodes), intent(in) :: displs !! displs passed to MPI_gatherv + +#ifdef MPI + integer :: error + + call MPI_gatherv(array,localcount,MPI_double_complex,rootglobalarray,counts,& + displs,MPI_double_complex,root_id,mpi_comm_world,error) + + if(error.ne.MPI_success) then + call io_error('Error in comms_gatherv_cmplx_4') + end if + +#else + call zcopy(localcount,array,1,rootglobalarray,1) ! no MPI: plain copy of localcount elements +#endif + + return + + end subroutine comms_gatherv_cmplx_4 + subroutine comms_gatherv_logical(array,localcount,rootglobalarray,counts,displs) !! Gather real data to root node @@ -1205,6 +1235,38 @@ subroutine comms_scatterv_real_3(array,localcount,rootglobalarray,counts,displs) end subroutine comms_scatterv_real_3 +subroutine comms_scatterv_cmplx_4(array,localcount,rootglobalarray,counts,displs) + !! Scatter complex data from root node (array of rank 4) + implicit none + + complex(kind=dp), dimension(:,:,:,:), intent(inout) :: array + !! local array for getting data + integer, intent(in) :: localcount + !! localcount elements will be fetched from the root node + complex(kind=dp), dimension(:,:,:,:), intent(inout) :: rootglobalarray + !! array on the root node from which data will be sent + integer, dimension(num_nodes), intent(in) :: counts + !!
how data should be partitioned, see MPI documentation or function comms_array_split + integer, dimension(num_nodes), intent(in) :: displs !! displs passed to MPI_scatterv + +#ifdef MPI + integer :: error + + call MPI_scatterv(rootglobalarray,counts,displs,MPI_double_complex,& + array,localcount,MPI_double_complex,root_id,mpi_comm_world,error) + + if(error.ne.MPI_success) then + call io_error('Error in comms_scatterv_cmplx_4') + end if + +#else + call zcopy(localcount,rootglobalarray,1,array,1) ! no MPI: plain copy of localcount elements +#endif + + return + + end subroutine comms_scatterv_cmplx_4 + subroutine comms_scatterv_int_1(array,localcount,rootglobalarray,counts,displs) !! Scatter integer data from root node (array of rank 1) implicit none From efac5d9c2874e9e5e8e73aff540847be5a3da5de Mon Sep 17 00:00:00 2001 From: Hyungjun Lee Date: Wed, 28 Mar 2018 13:57:07 +0200 Subject: [PATCH 3/3] Add files via upload fix a bug when precond=.true. --- src/wannierise.F90 | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/wannierise.F90 b/src/wannierise.F90 index 80374de75..6da67767f 100644 --- a/src/wannierise.F90 +++ b/src/wannierise.F90 @@ -177,8 +177,8 @@ subroutine wann_main ! sub vars passed into other subs allocate( csheet (num_wann, nntot, num_kpts), stat=ierr ) if (ierr/=0) call io_error('Error in allocating csheet in wann_main') -! allocate( cdodq (num_wann, num_wann, num_kpts),stat=ierr ) -! if (ierr/=0) call io_error('Error in allocating cdodq in wann_main') + allocate( cdodq (num_wann, num_wann, num_kpts),stat=ierr ) + if (ierr/=0) call io_error('Error in allocating cdodq in wann_main') allocate( sheet (num_wann, nntot, num_kpts), stat=ierr ) if (ierr/=0) call io_error('Error in allocating sheet in wann_main') allocate( rave (3, num_wann), stat=ierr ) @@ -211,8 +211,7 @@ subroutine wann_main end if end if -! csheet=cmplx_1;cdodq=cmplx_0 - csheet=cmplx_1 + csheet=cmplx_1;cdodq=cmplx_0 sheet=0.0_dp;rave=0.0_dp;r2ave=0.0_dp;rave2=0.0_dp;rguide=0.0_dp !
sub vars not passed into other subs @@ -672,8 +671,8 @@ subroutine wann_main if (ierr/=0) call io_error('Error in deallocating rave in wann_main') deallocate(sheet,stat=ierr) if (ierr/=0) call io_error('Error in deallocating sheet in wann_main') -! deallocate(cdodq,stat=ierr) -! if (ierr/=0) call io_error('Error in deallocating cdodq in wann_main') + deallocate(cdodq,stat=ierr) + if (ierr/=0) call io_error('Error in deallocating cdodq in wann_main') deallocate(csheet,stat=ierr) if (ierr/=0) call io_error('Error in deallocating csheet in wann_main')