diff --git a/.travis.yml b/.travis.yml index f0c5249aa..94e2fb1ae 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,17 +7,22 @@ addons: - gfortran - libblas-dev - liblapack-dev + - openmpi-bin + - libopenmpi-dev env: ## Uncomment the following line if you want to run also the interface tests # - W90TESTSWITHINTERFACE=true - - W90TESTSWITHINTERFACE=false + - W90TESTSWITHINTERFACE=false W90BINARYPARALLEL=false + - W90TESTSWITHINTERFACE=false W90BINARYPARALLEL=true install: # Possibly install QE and other interface code - ./test-suite/external-codes/get-external-codes.sh # Install Wannier - - cp config/make.inc.gfort.travisci make.inc +#- cp config/make.inc.gfort.travisci make.inc + - ./test-suite/travis_copy_make.inc.sh - make -j default script: - ./test-suite/run_tests_travis.sh git: depth: 3 + diff --git a/CHANGE.log b/CHANGE.log index 8febc1894..28fd63f84 100644 --- a/CHANGE.log +++ b/CHANGE.log @@ -4,14 +4,20 @@ The Maximally-Localised Generalised Wannier Functions Code -v2.x.y (25th January 2017) +v2.x.y (DATE) * G0W0 interface implemented (A. Marrazzo (EPFL, CH) and S. Tsirkin (DIPC, Spain)). - Added two utilities (gw2wannier90.py and k_mapper.py) and example 23 on G0W0 bands interpolation with Yambo. + Added two utilities (gw2wannier90.py and k_mapper.py) and example 23 on + G0W0 bands interpolation with Yambo. +* Parallelized the disentanglement and wannierise subroutines + with respect to k-points. Other parts of the code have been + modified so that only the root process writes in the .wout file. + wannier90.x can now be run in parallel (see user guide) + (contributed by Guillaume Geranton, FZ Julich) v2.1.0 (13th January 2017) * Implementation of the symmetry-adapted Wannier functions (see R. Sakuma, Phys. Rev. B 87, 235109 (2013), courtesy of R. Sakuma (Lund University, Sweden), T. Koretsune (Riken, JP), - Y. Nomura (U. Tokyo, JP), Y. Nohara (Atomic-Scale Material + Y. Nomura (U. Tokyo, JP), Y. Nohara (Atomic-Scale Material Simulations, Co., Ltd.), R. 
Arita (Riken, JP)) * Streamlined the interface between wannier90 and tight-binding codes such as pythtb (new input variable: write_tb). Also, diff --git a/config/make.inc.gfort.traviscimpi b/config/make.inc.gfort.traviscimpi new file mode 100644 index 000000000..24c50f00d --- /dev/null +++ b/config/make.inc.gfort.traviscimpi @@ -0,0 +1,21 @@ +#=================================== +# gfortran for the travis-ci service +# This file for MPI builds +#=================================== +F90 = gfortran + +COMMS = mpi +MPIF90 = mpif90 + +# Options for debugging When we move to 4.9 add -fsanitize=leak +FCOPTS = -fstrict-aliasing -fno-omit-frame-pointer -fno-realloc-lhs -fcheck=bounds,do,recursion,pointer -ffree-form -Wall -Waliasing -Wsurprising -Wline-truncation -Wno-tabs -Wno-uninitialized -Wno-unused-dummy-argument -Wno-unused -Wno-character-truncation -O1 -g -fbacktrace +LDOPTS = -fstrict-aliasing -fno-omit-frame-pointer -fno-realloc-lhs -fcheck=bounds,do,recursion,pointer -ffree-form -Wall -Waliasing -Wsurprising -Wline-truncation -Wno-tabs -Wno-uninitialized -Wno-unused-dummy-argument -Wno-unused -Wno-character-truncation -O1 -g -fbacktrace + +#======================= +# System LAPACK and BLAS +# e.g. use +# sudo apt-get install libblas-dev liblapack-dev +# on Ubuntu +#======================= +LIBS = -llapack -lblas + diff --git a/doc/tutorial/tutorial.tex b/doc/tutorial/tutorial.tex index 73a040d57..a59821dfa 100644 --- a/doc/tutorial/tutorial.tex +++ b/doc/tutorial/tutorial.tex @@ -121,10 +121,8 @@ \sectiontitle{Parallel execution} \label{sec:parallel} -Presently, {\tt wannier90.x} is a serial-only executable, so it -\emph{cannot} be run in parallel using MPI libraries. On the contrary, -{\tt postw90.x} can be run in parallel to speed up the calculations, -using the MPI libraries. +{\tt postw90.x} and {\tt wannier90.x} can be run in parallel to speed up +the calculations, using the MPI libraries. 
To enable the parallel version to be built, you must specify some flags in the {\tt make.inc} file of {\tt wannier90} and {\tt postw90}; diff --git a/doc/user_guide/parameters.tex b/doc/user_guide/parameters.tex index 19e4b7898..7ad545d63 100644 --- a/doc/user_guide/parameters.tex +++ b/doc/user_guide/parameters.tex @@ -1,10 +1,11 @@ \chapter{Parameters}\label{chap:parameters} \section{Usage} -{\tt -\begin{quote} -wannier90.x [-pp] [seedname] -\end{quote} } +{\tt wannier90.x} can be run in parallel using MPI libraries to +reduce the computation time. + +For serial execution use: {\tt wannier90.x [-pp] [seedname]} + \begin{itemize} \item{ {\tt seedname}: If a seedname string is given the code will read its input from a file {\tt seedname.win}. The default value is {\tt wannier}. One can also equivalently provide the string @@ -14,6 +15,25 @@ \section{Usage} This information is written to the file {\tt seedname.nnkp}.} \end{itemize} +For parallel execution use: {\tt mpirun -np NUMPROCS wannier90.x [-pp] [seedname]} + +\begin{itemize} \item +{\tt NUMPROCS}: substitute with the number of processors that you want +to use. +\end{itemize} + +Note that the {\tt mpirun} command and command-line flags may be +different in your MPI implementation: read your MPI manual or ask your +computer administrator. + +Note also that this requires that the {\tt wannier90.x} executable has been +compiled in its parallel version (follow the instructions in the file +{\tt README.install} in the main directory of the wannier90 +distribution) and +that the MPI libraries and binaries are installed and correctly +configured on your machine. + + \section{{\tt seedname.win} File\label{sec:seednamefile}} The \wannier\ input file {\tt seedname.win} has a flexible free-form structure. 
diff --git a/doc/user_guide/postw90params.tex b/doc/user_guide/postw90params.tex index 250654588..a57c1f04b 100644 --- a/doc/user_guide/postw90params.tex +++ b/doc/user_guide/postw90params.tex @@ -3,9 +3,9 @@ \chapter{Parameters} \section{Introduction} The \texttt{wannier90.x} code described in Part~\ref{part:w90} -calculates the maximally-localized Wannier functions. The \texttt{wannier90.x} code is a -serial executable (i.e., it cannot be executed in parallel on different -CPUs). +calculates the maximally-localized Wannier functions. %The \texttt{wannier90.x} code is a +%serial executable (i.e., it cannot be executed in parallel on different +%CPUs). The \texttt{postw90.x} executable contains instead a series of modules that take the Wannier functions calculated by \texttt{wannier90.x} and diff --git a/src/Makefile.2 b/src/Makefile.2 index f1473a032..f0795f829 100644 --- a/src/Makefile.2 +++ b/src/Makefile.2 @@ -5,7 +5,8 @@ include ../../make.inc OBJS = constants.o io.o utility.o parameters.o hamiltonian.o overlap.o \ - kmesh.o disentangle.o ws_distance.o wannierise.o plot.o transport.o sitesym.o + kmesh.o disentangle.o ws_distance.o wannierise.o plot.o transport.o sitesym.o comms.o + OBJS2 = wannier_lib.o @@ -31,8 +32,8 @@ TEMP2 = $(F90) endif -wannier libs w90chk2chk serialobjs: POSTOPTS = -wannier libs w90chk2chk serialobjs: COMPILER = $(F90) +wannier libs w90chk2chk serialobjs: POSTOPTS = $(TEMP1) +wannier libs w90chk2chk serialobjs: COMPILER = $(TEMP2) wannier: ../../wannier90.x w90chk2chk: ../../w90chk2chk.x @@ -42,7 +43,7 @@ serialobjs: $(OBJS) $(COMPILER) ../w90chk2chk.F90 $(LDOPTS) $(OBJS) $(LIBS) -o ../../w90chk2chk.x ../../wannier90.x: $(OBJS) ../wannier_prog.F90 - $(COMPILER) ../wannier_prog.F90 $(LDOPTS) $(OBJS) $(LIBS) -o ../../wannier90.x + $(COMPILER) ../wannier_prog.F90 $(POSTOPTS) $(LDOPTS) $(OBJS) $(LIBS) -o ../../wannier90.x post: POSTOPTS = $(TEMP1) post: COMPILER = $(TEMP2) @@ -88,7 +89,7 @@ io.o: ../io.F90 constants.o utility.o: 
../utility.F90 constants.o io.o $(COMPILER) $(POSTOPTS) $(FCOPTS) -c ../utility.F90 -parameters.o: ../parameters.F90 constants.o io.o utility.o +parameters.o: ../parameters.F90 constants.o io.o utility.o comms.o $(COMPILER) $(POSTOPTS) $(FCOPTS) -c ../parameters.F90 hamiltonian.o: ../hamiltonian.F90 ws_distance.o constants.o io.o utility.o parameters.o @@ -100,10 +101,10 @@ overlap.o: ../overlap.F90 constants.o io.o utility.o parameters.o sitesym.o kmesh.o: ../kmesh.F90 constants.o io.o utility.o parameters.o $(COMPILER) $(POSTOPTS) $(FCOPTS) -c ../kmesh.F90 -disentangle.o: ../disentangle.F90 constants.o io.o parameters.o sitesym.o +disentangle.o: ../disentangle.F90 constants.o io.o parameters.o sitesym.o comms.o $(COMPILER) $(POSTOPTS) $(FCOPTS) -c ../disentangle.F90 -wannierise.o: ../wannierise.F90 hamiltonian.o constants.o io.o utility.o parameters.o sitesym.o +wannierise.o: ../wannierise.F90 hamiltonian.o constants.o io.o utility.o parameters.o sitesym.o comms.o $(COMPILER) $(POSTOPTS) $(FCOPTS) -c ../wannierise.F90 plot.o: ../plot.F90 constants.o io.o utility.o parameters.o hamiltonian.o ws_distance.o diff --git a/src/disentangle.F90 b/src/disentangle.F90 index 281d7d520..4f3ce26ce 100644 --- a/src/disentangle.F90 +++ b/src/disentangle.F90 @@ -18,7 +18,19 @@ module w90_disentangle use w90_constants, only: dp,cmplx_0,cmplx_1 use w90_io, only: io_error,stdout,io_stopwatch - use w90_parameters + use w90_parameters, only : num_bands,num_wann,a_matrix,u_matrix_opt,& + u_matrix,m_matrix_orig,lwindow,dis_conv_window,devel_flag, & + nntot,timing_level,omega_invariant,u_matrix,lsitesymmetry, & + lenconfac,iprint,wbtot,dis_num_iter,dis_mix_ratio,dis_win_min, & + dis_win_max,dis_froz_min,dis_froz_max,dis_spheres_num, & + dis_spheres_first_wann,num_kpts,nnlist,ndimwin,wb,gamma_only, & + eigval,length_unit,dis_spheres,m_matrix,dis_conv_tol,frozen_states, & + optimisation,recip_lattice,kpt_latt + + use w90_comms, only : on_root, my_node_id, num_nodes,& + comms_bcast, 
comms_array_split,& + comms_gatherv, comms_allreduce + use w90_sitesym, only: sitesym_slim_d_matrix_band, & sitesym_replace_d_matrix_band,sitesym_symmetrize_u_matrix,& sitesym_symmetrize_zmatrix,sitesym_dis_extract_symmetry !RS: @@ -67,7 +79,7 @@ subroutine dis_main() if (timing_level>0) call io_stopwatch('dis: main',1) - write(stdout,'(/1x,a)') & + if (on_root) write(stdout,'(/1x,a)') & '*------------------------------- DISENTANGLE --------------------------------*' ! Allocate arrays @@ -86,10 +98,10 @@ subroutine dis_main() ! (Sec. III.G SMV) if (linner) then if (lsitesymmetry) call io_error('in symmetry-adapted mode, frozen window not implemented yet') !YN: RS: - write(stdout,'(3x,a)') 'Using an inner window (linner = T)' + if (on_root) write(stdout,'(3x,a)') 'Using an inner window (linner = T)' call dis_proj_froz() else - write(stdout,'(3x,a)') 'No inner window (linner = F)' + if (on_root) write(stdout,'(3x,a)') 'No inner window (linner = F)' endif ! Debug @@ -238,7 +250,7 @@ subroutine dis_main() ! Deallocate module arrays call internal_dealloc() - if (timing_level>0) call io_stopwatch('dis: main',2) + if (timing_level>0.and.on_root) call io_stopwatch('dis: main',2) return @@ -279,8 +291,8 @@ subroutine internal_check_orthonorm() enddo if (l.eq.m) then if (abs(ctmp - cmplx_1).gt.eps8) then - write(stdout,'(3i6,2f16.12)') nkp,l,m,ctmp - write(stdout,'(1x,a)') 'The trial orbitals for disentanglement are not orthonormal' + if (on_root) write(stdout,'(3i6,2f16.12)') nkp,l,m,ctmp + if (on_root) write(stdout,'(1x,a)') 'The trial orbitals for disentanglement are not orthonormal' ! write(stdout,'(1x,a)') 'Try re-running the calculation with the input keyword' ! write(stdout,'(1x,a)') ' devel_flag=orth-fix' ! 
write(stdout,'(1x,a)') 'Please report the sucess or failure of this to the Wannier90 developers' @@ -288,8 +300,8 @@ subroutine internal_check_orthonorm() endif else if (abs(ctmp).gt.eps8) then - write(stdout,'(3i6,2f16.12)') nkp,l,m,ctmp - write(stdout,'(1x,a)') 'The trial orbitals for disentanglement are not orthonormal' + if (on_root) write(stdout,'(3i6,2f16.12)') nkp,l,m,ctmp + if (on_root) write(stdout,'(1x,a)') 'The trial orbitals for disentanglement are not orthonormal' ! write(stdout,'(1x,a)') 'Try re-running the calculation with the input keyword' ! write(stdout,'(1x,a)') ' devel_flag=orth-fix' ! write(stdout,'(1x,a)') 'Please report the sucess or failure of this to the Wannier90 developers' @@ -300,7 +312,7 @@ subroutine internal_check_orthonorm() enddo enddo - if (timing_level>1) call io_stopwatch('dis: main: check_orthonorm',2) + if (timing_level>1 .and. on_root) call io_stopwatch('dis: main: check_orthonorm',2) return @@ -322,7 +334,7 @@ subroutine internal_slim_m() integer :: nkp,nkp2,nn,i,j,m,n,ierr complex(kind=dp), allocatable :: cmtmp(:,:) - if (timing_level>1) call io_stopwatch('dis: main: slim_m',1) + if (timing_level>1 .and. on_root) call io_stopwatch('dis: main: slim_m',1) allocate(cmtmp(num_bands,num_bands),stat=ierr) if (ierr/=0) call io_error('Error in allocating cmtmp in dis_main') @@ -349,7 +361,7 @@ subroutine internal_slim_m() deallocate(cmtmp,stat=ierr) if (ierr/=0) call io_error('Error deallocating cmtmp in dis_main') - if (timing_level>1) call io_stopwatch('dis: main: slim_m',2) + if (timing_level>1 .and. on_root) call io_stopwatch('dis: main: slim_m',2) return @@ -390,7 +402,7 @@ subroutine internal_find_u() complex(kind=dp), allocatable :: cz(:,:) complex(kind=dp), allocatable :: cwork(:) - if (timing_level>1) call io_stopwatch('dis: main: find_u',1) + if (timing_level>1.and.on_root) call io_stopwatch('dis: main: find_u',1) ! 
Allocate arrays needed for ZGESVD allocate(svals(num_wann),stat=ierr) @@ -417,10 +429,10 @@ subroutine internal_find_u() call ZGESVD ('A', 'A', num_wann, num_wann, caa(:,:,nkp), num_wann, & svals, cz, num_wann, cv, num_wann, cwork, 4*num_wann, rwork, info) if (info.ne.0) then - write(stdout,*) ' ERROR: IN ZGESVD IN dis_main' - write(stdout,*) 'K-POINT NKP=', nkp, ' INFO=', info + if (on_root) write(stdout,*) ' ERROR: IN ZGESVD IN dis_main' + if (on_root) write(stdout,*) 'K-POINT NKP=', nkp, ' INFO=', info if (info.lt.0) then - write(stdout,*) 'THE ', -info, '-TH ARGUMENT HAD ILLEGAL VALUE' + if (on_root) write(stdout,*) 'THE ', -info, '-TH ARGUMENT HAD ILLEGAL VALUE' endif call io_error('dis_main: problem in ZGESVD 1') endif @@ -625,7 +637,7 @@ subroutine dis_windows() ! it is slimmed down to contain only those inside the ! energy window, stored in nb=1,...,ndimwin(nkp) - if (timing_level>1) call io_stopwatch('dis: windows',1) + if (timing_level>1 .and. on_root) call io_stopwatch('dis: windows',1) ! Allocate module arrays allocate(nfirstwin(num_kpts),stat=ierr) @@ -641,33 +653,33 @@ subroutine dis_windows() linner = .false. 
- write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & '+----------------------------------------------------------------------------+' - write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & '| Energy Windows |' - write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & '| --------------- |' - write(stdout,'(1x,a,f10.5,a,f10.5,a)') & + if (on_root) write(stdout,'(1x,a,f10.5,a,f10.5,a)') & '| Outer: ',dis_win_min,' to ',dis_win_max,& ' (eV) |' if (frozen_states) then - write(stdout,'(1x,a,f10.5,a,f10.5,a)') & + if (on_root) write(stdout,'(1x,a,f10.5,a,f10.5,a)') & '| Inner: ',dis_froz_min,' to ',dis_froz_max,& ' (eV) |' else - write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & '| No frozen states were specified |' endif - write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & '+----------------------------------------------------------------------------+' do nkp = 1, num_kpts ! Check which eigenvalues fall within the outer window if ( (eigval_opt(1,nkp).gt.dis_win_max).or.& (eigval_opt(num_bands,nkp).lt.dis_win_min) ) then - write(stdout,*) ' ERROR AT K-POINT: ', nkp - write(stdout,*) ' ENERGY WINDOW (eV): [',dis_win_min, ',', dis_win_max, ']' - write(stdout,*) ' EIGENVALUE RANGE (eV): [',& + if (on_root) write(stdout,*) ' ERROR AT K-POINT: ', nkp + if (on_root) write(stdout,*) ' ENERGY WINDOW (eV): [',dis_win_min, ',', dis_win_max, ']' + if (on_root) write(stdout,*) ' EIGENVALUE RANGE (eV): [',& eigval_opt(1,nkp),',',eigval_opt(num_bands,nkp),']' call io_error('dis_windows: The outer energy window contains no eigenvalues') endif @@ -713,7 +725,7 @@ subroutine dis_windows() !~~ GS-end if (ndimwin(nkp).lt.num_wann) then - write(stdout,483) 'Error at k-point ',nkp,& + if (on_root) write(stdout,483) 'Error at k-point ',nkp,& ' ndimwin=',ndimwin(nkp),' num_wann=',num_wann 483 format(1x,a17,i4,a8,i3,a9,i3) call io_error('dis_windows: Energy window contains fewer states than number of target WFs') @@ -750,11 +762,11 
@@ subroutine dis_windows() ndimfroz(nkp) = kifroz_max - kifroz_min + 1 if (ndimfroz(nkp).gt.num_wann) then - write(stdout,401) nkp, ndimfroz(nkp),num_wann + if (on_root) write(stdout,401) nkp, ndimfroz(nkp),num_wann 401 format(' ERROR AT K-POINT ',i4,' THERE ARE ',i2, & ' BANDS INSIDE THE INNER WINDOW AND ONLY',i2, & ' TARGET BANDS') - write(stdout,402) (eigval_opt(i,nkp),i = imin, imax) + if (on_root) write(stdout,402) (eigval_opt(i,nkp),i = imin, imax) 402 format('BANDS: (eV)',10(F10.5,1X)) call io_error('dis_windows: More states in the frozen window than target WFs') endif @@ -772,11 +784,11 @@ subroutine dis_windows() lfrozen(indxfroz(i,nkp),nkp) = .true. enddo if (indxfroz(ndimfroz(nkp),nkp).ne.kifroz_max) then - write(stdout,*) ' Error at k-point ', nkp, ' frozen band #', i - write(stdout,*) ' ndimfroz=', ndimfroz(nkp) - write(stdout,*) ' kifroz_min=', kifroz_min - write(stdout,*) ' kifroz_max=', kifroz_max - write(stdout,*) ' indxfroz(i,nkp)=', indxfroz(i,nkp) + if (on_root) write(stdout,*) ' Error at k-point ', nkp, ' frozen band #', i + if (on_root) write(stdout,*) ' ndimfroz=', ndimfroz(nkp) + if (on_root) write(stdout,*) ' kifroz_min=', kifroz_min + if (on_root) write(stdout,*) ' kifroz_max=', kifroz_max + if (on_root) write(stdout,*) ' indxfroz(i,nkp)=', indxfroz(i,nkp) call io_error('dis_windows: Something fishy...') endif endif @@ -792,8 +804,8 @@ subroutine dis_windows() enddo if ( i.ne.ndimwin(nkp) - ndimfroz(nkp) ) then - write(stdout,*) ' Error at k-point: ',nkp - write(stdout,'(3(a,i5))') ' i: ',i,' ndimwin: ',ndimwin(nkp),& + if (on_root) write(stdout,*) ' Error at k-point: ',nkp + if (on_root) write(stdout,'(3(a,i5))') ' i: ',i,' ndimwin: ',ndimwin(nkp),& ' ndimfroz: ',ndimfroz(nkp) call io_error('dis_windows: i .ne. 
(ndimwin-ndimfroz) at k-point') endif @@ -830,51 +842,51 @@ subroutine dis_windows() !~![ysl-e] if (iprint>1) then - write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & '| K-points with Frozen States |' - write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & '| --------------------------- |' i=0 do nkp=1,num_kpts if (ndimfroz(nkp).gt.0) then i=i+1 if (i.eq.1) then - write(stdout,'(1x,a,i6)',advance='no') '|',nkp + if (on_root) write(stdout,'(1x,a,i6)',advance='no') '|',nkp else if ((i.gt.1) .and. (i.lt.12)) then - write(stdout,'(i6)',advance='no') nkp + if (on_root) write(stdout,'(i6)',advance='no') nkp else if (i.eq.12) then - write(stdout,'(i6,a)') nkp,' |' + if (on_root) write(stdout,'(i6,a)') nkp,' |' i=0 endif endif enddo if (i.ne.0) then do j=1,12-i - write(stdout,'(6x)',advance='no') + if (on_root) write(stdout,'(6x)',advance='no') enddo - write(stdout,'(a)') ' |' + if (on_root) write(stdout,'(a)') ' |' endif - write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & '+----------------------------------------------------------------------------+' endif - write(stdout,'(3x,a,i4)') 'Number of target bands to extract: ',num_wann + if (on_root) write(stdout,'(3x,a,i4)') 'Number of target bands to extract: ',num_wann if (iprint>1) then - write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & '+----------------------------------------------------------------------------+' - write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & '| Windows |' - write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & '| ------- |' - write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & '| K-point Ndimwin Ndimfroz Nfirstwin |' - write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & '| ---------------------------------------------- |' do nkp=1,num_kpts - write(stdout,403) nkp,ndimwin(nkp),ndimfroz(nkp),nfirstwin(nkp) + if (on_root) write(stdout,403) nkp,ndimwin(nkp),ndimfroz(nkp),nfirstwin(nkp) enddo 
403 format(1x,'|',14x,i6,7x,i6,7x,i6,6x,i6,18x,'|') - write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & '+----------------------------------------------------------------------------+' endif @@ -949,12 +961,12 @@ subroutine dis_project() if (timing_level>1) call io_stopwatch('dis: project',1) - write(stdout,'(/1x,a)') & + if (on_root) write(stdout,'(/1x,a)') & ' Unitarised projection of Wannier functions ' - write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & ' ------------------------------------------ ' - write(stdout,'(3x,a)') 'A_mn = --> S = A.A^+ --> U = S^-1/2.A' - write(stdout,'(3x,a)',advance='no') 'In dis_project...' + if (on_root) write(stdout,'(3x,a)') 'A_mn = --> S = A.A^+ --> U = S^-1/2.A' + if (on_root) write(stdout,'(3x,a)',advance='no') 'In dis_project...' allocate(catmpmat(num_bands,num_bands,num_kpts),stat=ierr) if (ierr/=0) call io_error('Error in allocating catmpmat in dis_project') @@ -992,10 +1004,10 @@ subroutine dis_project() num_bands, svals, cz, num_bands, cvdag, num_bands, cwork, & 4*num_bands, rwork, info) if (info.ne.0) then - write(stdout,*) ' ERROR: IN ZGESVD IN dis_project' - write(stdout,*) ' K-POINT NKP=', nkp, ' INFO=', info + if (on_root) write(stdout,*) ' ERROR: IN ZGESVD IN dis_project' + if (on_root) write(stdout,*) ' K-POINT NKP=', nkp, ' INFO=', info if (info.lt.0) then - write(stdout,*) ' THE ', -info, '-TH ARGUMENT HAD ILLEGAL VALUE' + if (on_root) write(stdout,*) ' THE ', -info, '-TH ARGUMENT HAD ILLEGAL VALUE' endif call io_error('dis_project: problem in ZGESVD 1') endif @@ -1045,19 +1057,19 @@ subroutine dis_project() ctmp2 = ctmp2 + u_matrix_opt(m,j,nkp) * conjg(u_matrix_opt(m,i,nkp)) enddo if ( (i.eq.j).and.(abs(ctmp2-cmplx_1).gt.eps5) ) then - write(stdout,*) ' ERROR: unitarity of initial U' - write(stdout,'(1x,a,i2)') 'nkp= ', nkp - write(stdout,'(1x,a,i2,2x,a,i2)') 'i= ', i, 'j= ', j - write(stdout,'(1x,a,f12.6,1x,f12.6)') & + if (on_root) write(stdout,*) ' ERROR: unitarity of initial U' 
+ if (on_root) write(stdout,'(1x,a,i2)') 'nkp= ', nkp + if (on_root) write(stdout,'(1x,a,i2,2x,a,i2)') 'i= ', i, 'j= ', j + if (on_root) write(stdout,'(1x,a,f12.6,1x,f12.6)') & '[u_matrix_opt.transpose(u_matrix_opt)]_ij= ',& real(ctmp2,dp),aimag(ctmp2) call io_error('dis_project: Error in unitarity of initial U in dis_project') endif if ( (i.ne.j) .and. (abs(ctmp2).gt.eps5) ) then - write(stdout,*) ' ERROR: unitarity of initial U' - write(stdout,'(1x,a,i2)') 'nkp= ', nkp - write(stdout,'(1x,a,i2,2x,a,i2)') 'i= ', i, 'j= ', j - write(stdout,'(1x,a,f12.6,1x,f12.6)') & + if (on_root) write(stdout,*) ' ERROR: unitarity of initial U' + if (on_root) write(stdout,'(1x,a,i2)') 'nkp= ', nkp + if (on_root) write(stdout,'(1x,a,i2,2x,a,i2)') 'i= ', i, 'j= ', j + if (on_root) write(stdout,'(1x,a,f12.6,1x,f12.6)') & '[u_matrix_opt.transpose(u_matrix_opt)]_ij= ', & real(ctmp2,dp),aimag(ctmp2) call io_error('dis_project: Error in unitarity of initial U in dis_project') @@ -1080,7 +1092,7 @@ subroutine dis_project() deallocate(catmpmat,stat=ierr) if (ierr/=0) call io_error('Error in deallocating catmpmat in dis_project') - write(stdout,'(a)') ' done' + if (on_root) write(stdout,'(a)') ' done' if (timing_level>1) call io_stopwatch('dis: project',2) @@ -1156,7 +1168,7 @@ subroutine dis_proj_froz() if (timing_level>1) call io_stopwatch('dis: proj_froz',1) - write(stdout,'(3x,a)',advance='no') 'In dis_proj_froz...' + if (on_root) write(stdout,'(3x,a)',advance='no') 'In dis_proj_froz...' 
allocate(iwork(5*num_bands),stat=ierr) if (ierr/=0) call io_error('Error allocating iwork in dis_proj_froz') @@ -1231,8 +1243,8 @@ subroutine dis_proj_froz() do n = 1, ndimwin(nkp) do m = 1, n if (abs(cqpq(m,n) - conjg(cqpq(n,m))).gt.eps8) then - write(stdout,*) ' matrix CQPQ is not hermitian' - write(stdout,*) ' k-point ', nkp + if (on_root) write(stdout,*) ' matrix CQPQ is not hermitian' + if (on_root) write(stdout,*) ' k-point ', nkp call io_error('dis_proj_froz: error') endif enddo @@ -1261,20 +1273,20 @@ subroutine dis_proj_froz() ! DEBUG if (info.lt.0) then - write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING CQPQ MATRIX' - write(stdout,*) ' THE ', -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE' + if (on_root) write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING CQPQ MATRIX' + if (on_root) write(stdout,*) ' THE ', -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE' call io_error('dis_proj_frozen: error') elseif (info.gt.0) then - write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING CQPQ MATRIX' - write(stdout,*) info, 'EIGENVECTORS FAILED TO CONVERGE' + if (on_root) write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING CQPQ MATRIX' + if (on_root) write(stdout,*) info, 'EIGENVECTORS FAILED TO CONVERGE' call io_error('dis_proj_frozen: error') endif ! ENDDEBUG ! DEBUG if (m.ne.ndimwin(nkp)) then - write(stdout,*) ' *** ERROR *** in dis_proj_froz' - write(stdout,*) ' Number of eigenvalues/vectors obtained is', & + if (on_root) write(stdout,*) ' *** ERROR *** in dis_proj_froz' + if (on_root) write(stdout,*) ' Number of eigenvalues/vectors obtained is', & m, ' not equal to the number asked,', ndimwin(nkp) call io_error('dis_proj_frozen: error') endif @@ -1283,12 +1295,12 @@ subroutine dis_proj_froz() ! DEBUG ! 
check that the eigenvalues are between 0 and 1 if (iprint>2) then - write(stdout,'(/a,i3,a,i3,a,i3,a)') ' K-point ', nkp, ' ndimwin: ', & + if (on_root) write(stdout,'(/a,i3,a,i3,a,i3,a)') ' K-point ', nkp, ' ndimwin: ', & ndimwin(nkp),' we want the',num_wann - ndimfroz(nkp),& ' leading eigenvector(s) of QPQ' endif do j = 1, ndimwin(nkp) - if (iprint>2) write(stdout,'(a,i3,a,f16.12)') ' lambda(', j, ')=', w(j) + if (iprint>2.and.on_root) write(stdout,'(a,i3,a,f16.12)') ' lambda(', j, ')=', w(j) !~[aam] if ( (w(j).lt.eps8).or.(w(j).gt.1.0_dp + eps8) ) then if ( (w(j).lt.-eps8).or.(w(j).gt.1.0_dp + eps8) ) then call io_error('dis_proj_frozen: error - Eigenvalues not between 0 and 1') @@ -1321,7 +1333,7 @@ subroutine dis_proj_froz() end if end do if(nzero>0) then - if(iprint>2) then + if(iprint>2.and.on_root) then write(stdout,*) ' ' write(stdout,'(1x,a,i0,a)') 'An eigenvalue of QPQ is close to zero at kpoint '& ,nkp,'. Using safety check.' @@ -1336,7 +1348,7 @@ subroutine dis_proj_froz() counter=counter+1 end do - if(iprint>2) then + if(iprint>2.and.on_root) then do loop_f=1,ndimwin(nkp) write(stdout,'(1x,a,i4,a,es13.6)') 'Eigenvector number',loop_f,' Eigenvalue: ',w(loop_f) do loop_v=1,ndimwin(nkp) @@ -1369,7 +1381,7 @@ subroutine dis_proj_froz() end do end do - if(iprint>2) then + if(iprint>2.and.on_root) then write(rep,'(i4)') num_wann - ndimfroz(nkp) write(stdout,'(1x,a,'//trim(rep)//'(i0,1x))') 'We use the following eigenvectors: ' & ,vmap(1:(num_wann - ndimfroz(nkp))) @@ -1404,7 +1416,7 @@ subroutine dis_proj_froz() ! PICK THE num_wann-nDIMFROZ(NKP) LEADING EIGENVECTORS AS TRIAL STATES ! 
and PUT THEM RIGHT AFTER THE FROZEN STATES IN u_matrix_opt do l = ndimfroz(nkp) + 1, num_wann - write(stdout,*) 'il=',il + if (on_root) write(stdout,*) 'il=',il u_matrix_opt(1:ndimwin(nkp),l,nkp) = cz(1:ndimwin(nkp),il) il = il + 1 enddo @@ -1459,7 +1471,7 @@ subroutine dis_proj_froz() deallocate(iwork,stat=ierr) if (ierr/=0) call io_error('Error deallocating iwork in dis_proj_froz') - write(stdout,'(a)') ' done' + if (on_root) write(stdout,'(a)') ' done' if (timing_level>1) call io_stopwatch('dis: proj_froz',2) @@ -1478,7 +1490,8 @@ subroutine dis_extract() ! ! !==================================================================! - use w90_io, only: io_time + + use w90_io, only: io_wallclocktime use w90_sitesym, only: ir2ik,ik2ir,nkptirr,nsymmetry,kptsym !YN: RS: implicit none @@ -1523,10 +1536,19 @@ subroutine dis_extract() integer :: icompflag,iter,ndiff real(kind=dp) :: womegai,wkomegai,womegai1,rsum,delta_womegai real(kind=dp), allocatable :: wkomegai1(:) + + ! for MPI + real(kind=dp), allocatable :: wkomegai1_loc(:) + complex(kind=dp), allocatable :: camp_loc(:,:,:) + complex(kind=dp), allocatable :: u_matrix_opt_loc(:,:,:) + complex(kind=dp), allocatable :: ceamp(:,:,:) complex(kind=dp), allocatable :: camp(:,:,:) - complex(kind=dp), allocatable :: czmat_in(:,:,:) - complex(kind=dp), allocatable :: czmat_out(:,:,:) + ! complex(kind=dp), allocatable :: czmat_in(:,:,:) + ! complex(kind=dp), allocatable :: czmat_out(:,:,:) + ! the z-matrices are now stored in local arrays + complex(kind=dp), allocatable :: czmat_in_loc(:,:,:) + complex(kind=dp), allocatable :: czmat_out_loc(:,:,:) complex(kind=dp), allocatable :: cham(:,:,:) integer, allocatable :: iwork(:) @@ -1543,11 +1565,16 @@ subroutine dis_extract() logical :: dis_converged complex(kind=dp) :: lambda(num_wann,num_wann) !RS: - if (timing_level>1) call io_stopwatch('dis: extract',1) + ! 
Needed to split an array on different nodes + integer, dimension(0:num_nodes-1) :: counts + integer, dimension(0:num_nodes-1) :: displs + integer :: nkp_loc - write(stdout,'(/1x,a)') & + if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract',1) + + if (on_root) write(stdout,'(/1x,a)') & ' Extraction of optimally-connected subspace ' - write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & ' ------------------------------------------ ' allocate(cwb(num_wann,num_bands),stat=ierr) @@ -1573,12 +1600,24 @@ subroutine dis_extract() allocate(cz(num_bands,num_bands),stat=ierr) if (ierr/=0) call io_error('Error allocating cz in dis_extract') + ! for MPI + call comms_array_split(num_kpts,counts,displs) + allocate(u_matrix_opt_loc(num_bands,num_wann,counts(my_node_id)),stat=ierr) + if (ierr/=0) call io_error('Error allocating u_matrix_opt_loc in dis_extract') + ! Copy matrix elements from global U matrix to local U matrix + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) + u_matrix_opt_loc(:,:,nkp_loc) = u_matrix_opt(:,:,nkp) + enddo + allocate(wkomegai1_loc(counts(my_node_id)),stat=ierr) + if (ierr/=0) call io_error('Error allocating wkomegai1_loc in dis_extract') + allocate(czmat_in_loc(num_bands,num_bands,counts(my_node_id)),stat=ierr) + if (ierr/=0) call io_error('Error allocating czmat_in_loc in dis_extract') + allocate(czmat_out_loc(num_bands,num_bands,counts(my_node_id)),stat=ierr) + if (ierr/=0) call io_error('Error allocating czmat_out_loc in dis_extract') + allocate(wkomegai1(num_kpts),stat=ierr) if (ierr/=0) call io_error('Error allocating wkomegai1 in dis_extract') - allocate(czmat_in(num_bands,num_bands,num_kpts),stat=ierr) - if (ierr/=0) call io_error('Error allocating czmat_in in dis_extract') - allocate(czmat_out(num_bands,num_bands,num_kpts),stat=ierr) - if (ierr/=0) call io_error('Error allocating czmat_out in dis_extract') allocate(history(dis_conv_window),stat=ierr) if (ierr/=0) call io_error('Error allocating 
history in dis_extract') @@ -1619,9 +1658,9 @@ subroutine dis_extract() ! DEBUG if (iprint>2) then - write(stdout,'(a,/)') ' Original eigenvalues inside outer window:' + if (on_root) write(stdout,'(a,/)') ' Original eigenvalues inside outer window:' do nkp = 1, num_kpts - write(stdout,'(a,i3,3x,20(f9.5,1x))') ' K-point ', nkp,& + if (on_root) write(stdout,'(a,i3,3x,20(f9.5,1x))') ' K-point ', nkp,& ( eigval_opt(i, nkp), i = 1, ndimwin (nkp) ) enddo endif @@ -1630,11 +1669,11 @@ subroutine dis_extract() ! TO DO: Check if this is the best place to initialize icompflag icompflag = 0 - write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & '+---------------------------------------------------------------------+<-- DIS' - write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & '| Iter Omega_I(i-1) Omega_I(i) Delta (frac.) Time |<-- DIS' - write(stdout,'(1x,a)') & + if (on_root) write(stdout,'(1x,a)') & '+---------------------------------------------------------------------+<-- DIS' dis_converged = .false. @@ -1644,18 +1683,22 @@ subroutine dis_extract() ! ------------------ do iter = 1, dis_num_iter - if (timing_level>1) call io_stopwatch('dis: extract_1',1) + if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract_1',1) if (iter.eq.1) then ! Initialize Z matrix at k points w/ non-frozen states - do nkp = 1, num_kpts - if (num_wann.gt.ndimfroz(nkp)) call internal_zmatrix(nkp,czmat_in(:,:,nkp)) + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) + if (num_wann.gt.ndimfroz(nkp)) call internal_zmatrix(nkp,czmat_in_loc(:,:,nkp_loc)) enddo - if (lsitesymmetry) call sitesym_symmetrize_zmatrix(czmat_in,lwindow) !RS: + + if (lsitesymmetry) call sitesym_symmetrize_zmatrix(czmat_in_loc,lwindow) !RS: + else ! [iter.ne.1] ! 
Update Z matrix at k points with non-frozen states, using a mixing sch - do nkp = 1, num_kpts + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) if (lsitesymmetry) then !YN: RS: if (ir2ik(ik2ir(nkp)).ne.nkp) cycle !YN: RS: endif !YN: RS: @@ -1663,20 +1706,20 @@ subroutine dis_extract() ndimk = ndimwin(nkp) - ndimfroz(nkp) do i=1,ndimk do j=1,i - czmat_in(j,i,nkp) = & - cmplx(dis_mix_ratio,0.0_dp,dp) * czmat_out(j,i,nkp) & - + cmplx(1.0_dp-dis_mix_ratio,0.0_dp,dp) * czmat_in(j,i,nkp) + czmat_in_loc(j,i,nkp_loc) = & + cmplx(dis_mix_ratio,0.0_dp,dp) * czmat_out_loc(j,i,nkp_loc) & + + cmplx(1.0_dp-dis_mix_ratio,0.0_dp,dp) * czmat_in_loc(j,i,nkp_loc) ! hermiticity - czmat_in(i,j,nkp) = conjg(czmat_in(j,i,nkp)) + czmat_in_loc(i,j,nkp_loc) = conjg(czmat_in_loc(j,i,nkp_loc)) enddo enddo endif enddo endif ! [if iter=1] - if (timing_level>1) call io_stopwatch('dis: extract_1',2) + if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract_1',2) - if (timing_level>1) call io_stopwatch('dis: extract_2',1) + if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract_2',1) womegai1 = 0.0_dp ! wkomegai1 is defined by Eq. (18) of SMV. @@ -1684,13 +1727,14 @@ subroutine dis_extract() ! every k (before updating any k), so that for iter>1 overlaps are with ! 
non-frozen neighboring states from the previous iteration - wkomegai1 = real(num_wann,dp) * wbtot + wkomegai1_loc = real(num_wann,dp) * wbtot if (lsitesymmetry) then !RS: do nkp=1,nkptirr !RS: - wkomegai1(ir2ik(nkp))=wkomegai1(ir2ik(nkp))*nsymmetry/count(kptsym(:,nkp).eq.ir2ik(nkp)) !RS: + wkomegai1_loc(ir2ik(nkp))=wkomegai1_loc(ir2ik(nkp))*nsymmetry/count(kptsym(:,nkp).eq.ir2ik(nkp)) !RS: enddo !RS: endif !RS: - do nkp = 1, num_kpts + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) if ( ndimfroz(nkp).gt.0 ) then if (lsitesymmetry) call io_error('not implemented in symmetry-adapted mode') !YN: RS: do nn=1,nntot @@ -1706,43 +1750,51 @@ subroutine dis_extract() rsum = rsum + real(cww(m,n),dp)**2 + aimag(cww(m,n))**2 enddo enddo - wkomegai1(nkp) = wkomegai1(nkp) - wb(nn)*rsum + wkomegai1_loc(nkp_loc) = wkomegai1_loc(nkp_loc) - wb(nn)*rsum enddo endif enddo - if (timing_level>1) call io_stopwatch('dis: extract_2',2) + if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract_2',2) - if (timing_level>1) call io_stopwatch('dis: extract_3',1) + if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract_3',1) + + ! send chunks of wkomegai1 to root node + call comms_gatherv(wkomegai1_loc(1),counts(my_node_id),wkomegai1(1),counts,displs) + ! send back the whole wkomegai1 array to other nodes + call comms_bcast(wkomegai1(1),num_kpts) ! 
Refine optimal subspace at k points w/ non-frozen states - do nkp = 1, num_kpts + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) if (lsitesymmetry) then !RS: if (ir2ik(ik2ir(nkp)).ne.nkp) cycle !RS: end if !RS: if (lsitesymmetry) then !RS: - call sitesym_dis_extract_symmetry(nkp,ndimwin(nkp),czmat_in,lambda,u_matrix_opt) !RS: + + call sitesym_dis_extract_symmetry(nkp,ndimwin(nkp),czmat_in_loc,lambda,u_matrix_opt) !RS: + do j=1,num_wann !RS: - wkomegai1(nkp)=wkomegai1(nkp)-real(lambda(j,j),kind=dp) !RS: + wkomegai1_loc(nkp_loc)=wkomegai1_loc(nkp_loc)-real(lambda(j,j),kind=dp) !RS: enddo !RS: else !RS: if ( num_wann.gt.ndimfroz(nkp) ) then ! Diagonalize Z matrix do j = 1, ndimwin(nkp) - ndimfroz(nkp) do i = 1, j - cap(i + ( (j - 1) * j) / 2) = czmat_in(i,j,nkp) + cap(i + ( (j - 1) * j) / 2) = czmat_in_loc(i,j,nkp_loc) enddo enddo ndiff = ndimwin(nkp) - ndimfroz(nkp) call ZHPEVX ('V', 'A', 'U', ndiff, cap, 0.0_dp, 0.0_dp, 0, 0, & -1.0_dp, m, w, cz, num_bands, cwork, rwork, iwork, ifail, info) if (info.lt.0) then - write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING Z MATRIX' - write(stdout,*) ' THE ', -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE' + if (on_root) write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING Z MATRIX' + if (on_root) write(stdout,*) ' THE ', -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE' call io_error(' dis_extract: error') endif if (info.gt.0) then - write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING Z MATRIX' - write(stdout,*) info, ' EIGENVECTORS FAILED TO CONVERGE' + if (on_root) write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING Z MATRIX' + if (on_root) write(stdout,*) info, ' EIGENVECTORS FAILED TO CONVERGE' call io_error(' dis_extract: error') endif @@ -1752,12 +1804,12 @@ subroutine dis_extract() m = ndimfroz(nkp) do j = ndimwin(nkp) - num_wann + 1, ndimwin(nkp) - ndimfroz(nkp) m = m + 1 - wkomegai1(nkp) = wkomegai1(nkp) - w(j) - u_matrix_opt(1:ndimwin(nkp),m,nkp) = cmplx_0 +
wkomegai1_loc(nkp_loc) = wkomegai1_loc(nkp_loc) - w(j) + u_matrix_opt_loc(1:ndimwin(nkp),m,nkp_loc) = cmplx_0 ndimk=ndimwin(nkp)-ndimfroz(nkp) do i=1,ndimk p=indxnfroz(i,nkp) - u_matrix_opt(p,m,nkp) = cz(i,j) + u_matrix_opt_loc(p,m,nkp_loc) = cz(i,j) enddo enddo endif @@ -1766,7 +1818,7 @@ subroutine dis_extract() ! Now that we have contribs. from both frozen and non-frozen states to ! wkomegai1(nkp), add it to womegai1 - womegai1 = womegai1 + wkomegai1(nkp) + womegai1 = womegai1 + wkomegai1_loc(nkp_loc) if(index(devel_flag,'compspace')>0) then @@ -1777,18 +1829,20 @@ subroutine dis_extract() if (iter.eq.dis_num_iter) then allocate(camp(num_bands,num_bands,num_kpts),stat=ierr) if (ierr/=0) call io_error('Error allocating camp in dis_extract') + allocate(camp_loc(num_bands,num_bands,counts(my_node_id)),stat=ierr) + if (ierr/=0) call io_error('Error allocating ucamp_loc in dis_extract') if (ndimwin(nkp).gt.num_wann) then do j = 1, ndimwin(nkp) - num_wann if ( num_wann.gt.ndimfroz(nkp) ) then ! USE THE NON-LEADING EIGENVECTORS OF THE Z-MATRIX - camp(1:ndimwin(nkp),j,nkp)=cz(1:ndimwin(nkp),j) + camp_loc(1:ndimwin(nkp),j,nkp_loc)=cz(1:ndimwin(nkp),j) else ! Then num_wann=NDIMFROZ(NKP) ! USE THE ORIGINAL NON-FROZEN BLOCH EIGENSTATES do i = 1,ndimwin(nkp) - camp(i,j,nkp) = cmplx_0 - if (i.eq.indxnfroz(j,nkp)) camp(i,j,nkp) = cmplx_1 + camp_loc(i,j,nkp_loc) = cmplx_0 + if (i.eq.indxnfroz(j,nkp)) camp_loc(i,j,nkp_loc) = cmplx_1 enddo endif enddo @@ -1801,10 +1855,34 @@ subroutine dis_extract() enddo ! [Loop over k points (nkp)] + if (lsitesymmetry) call sitesym_symmetrize_u_matrix(num_bands,u_matrix_opt,lwindow) !RS: - if (timing_level>1) call io_stopwatch('dis: extract_3',2) + ! send chunks of wkomegai1 to root node + call comms_gatherv(wkomegai1_loc(1),counts(my_node_id),wkomegai1(1),counts,displs) + ! 
send back the whole wkomegai1 array to other nodes + call comms_bcast(wkomegai1(1),num_kpts) + + call comms_allreduce(womegai1,1,'SUM') + + if ( any(num_wann.gt.ndimfroz) ) then ! rank-independent test: collectives below need every rank + call comms_gatherv(u_matrix_opt_loc(1,1,1),num_bands*num_wann*counts(my_node_id),& + u_matrix_opt(1,1,1),num_bands*num_wann*counts,num_bands*num_wann*displs) + call comms_bcast(u_matrix_opt(1,1,1),num_bands*num_wann*num_kpts) + endif + + if(index(devel_flag,'compspace')>0) then + if (iter.eq.dis_num_iter) then + call comms_gatherv(camp_loc(1,1,1),num_bands*num_bands*counts(my_node_id),& + camp(1,1,1),num_bands*num_bands*counts,num_bands*num_bands*displs) + + call comms_bcast(camp(1,1,1),num_bands*num_bands*num_kpts) + endif + endif + + if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract_3',2) + womegai1 = womegai1 / real(num_kpts,dp) ! DEBUG @@ -1844,10 +1922,11 @@ subroutine dis_extract() ! Compute womegai using the updated subspaces at all k, i.e., ! replacing (i-1) by (i) in Eq. (12) SMV - if (timing_level>1) call io_stopwatch('dis: extract_4',1) + if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract_4',1) womegai = 0.0_dp - do nkp = 1, num_kpts + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) wkomegai=0.0_dp do nn=1,nntot nkp2=nnlist(nkp,nn) @@ -1867,41 +1946,46 @@ subroutine dis_extract() wkomegai = real(num_wann,dp) * wbtot - wkomegai womegai = womegai + wkomegai enddo + + call comms_allreduce(womegai,1,'SUM') + + womegai = womegai / real(num_kpts,dp) ! [Loop over k (nkp)] - if (timing_level>1) call io_stopwatch('dis: extract_4',2) + if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract_4',2) delta_womegai = womegai1/womegai - 1.0_dp - write(stdout,124) iter,womegai1*lenconfac**2,womegai*lenconfac**2,& - delta_womegai,io_time() + if (on_root) write(stdout,124) iter,womegai1*lenconfac**2,womegai*lenconfac**2,& + delta_womegai,io_wallclocktime() 124 format(2x,i6,3x,f14.8,3x,f14.8,6x,es10.3,2x,f8.2,4x,'<-- DIS') !
Construct the updated Z matrix, CZMAT_OUT, at k points w/ non-frozen s - do nkp = 1, num_kpts - if (num_wann.gt.ndimfroz(nkp)) call internal_zmatrix(nkp,czmat_out(:,:,nkp)) + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) + if (num_wann.gt.ndimfroz(nkp)) call internal_zmatrix(nkp,czmat_out_loc(:,:,nkp_loc)) enddo - if (lsitesymmetry) call sitesym_symmetrize_zmatrix(czmat_out,lwindow) !RS: + + if (lsitesymmetry) call sitesym_symmetrize_zmatrix(czmat_out_loc,lwindow) !RS: call internal_test_convergence() if (dis_converged) then - write(stdout,'(/13x,a,es10.3,a,i2,a)') & + if (on_root) write(stdout,'(/13x,a,es10.3,a,i2,a)') & '<<< Delta <',dis_conv_tol,& ' over ',dis_conv_window,' iterations >>>' - write(stdout,'(13x,a)') '<<< Disentanglement convergence criteria satisfied >>>' + if (on_root) write(stdout,'(13x,a)') '<<< Disentanglement convergence criteria satisfied >>>' exit endif enddo ! [BIG ITERATION LOOP (iter)] - deallocate(czmat_out,stat=ierr) - if (ierr/=0) call io_error('Error deallocating czmat_out in dis_extract') - deallocate(czmat_in,stat=ierr) - if (ierr/=0) call io_error('Error deallocating czmat_in in dis_extract') + deallocate(czmat_out_loc,stat=ierr) + if (ierr/=0) call io_error('Error deallocating czmat_out_loc in dis_extract') + deallocate(czmat_in_loc,stat=ierr) + if (ierr/=0) call io_error('Error deallocating czmat_in_loc in dis_extract') allocate(ceamp(num_bands,num_bands,num_kpts),stat=ierr) if (ierr/=0) call io_error('Error allocating ceamp in dis_extract') @@ -1909,28 +1993,28 @@ subroutine dis_extract() if (ierr/=0) call io_error('Error allocating cham in dis_extract') if (.not.dis_converged) then - write(stdout,'(/5x,a)') & + if (on_root) write(stdout,'(/5x,a)') & '<<< Warning: Maximum number of disentanglement iterations reached >>>' - write(stdout,'(10x,a)') '<<< Disentanglement convergence criteria not satisfied >>>' + if (on_root) write(stdout,'(10x,a)') '<<< Disentanglement convergence criteria not satisfied 
>>>' endif if(index(devel_flag,'compspace')>0) then if (icompflag.eq.1) then if (iprint>2) then - write(stdout,('(/4x,a)')) & + if (on_root) write(stdout,('(/4x,a)')) & 'WARNING: Complement subspace has zero dimensions at the following k-points:' i=0 - write(stdout,'(4x)',advance='no') + if (on_root) write(stdout,'(4x)',advance='no') do nkp=1,num_kpts if (ndimwin(nkp).eq.num_wann) then i=i+1 if (i.le.12) then - write(stdout,'(i6)',advance='no') nkp + if (on_root) write(stdout,'(i6)',advance='no') nkp else i=1 - write(stdout,'(/4x)',advance='no') - write(stdout,'(i6)',advance='no') nkp + if (on_root) write(stdout,'(/4x)',advance='no') + if (on_root) write(stdout,'(i6)',advance='no') nkp endif endif enddo @@ -1944,7 +2028,7 @@ subroutine dis_extract() ! Write the final womegai. This should remain unchanged during the ! subsequent minimization of Omega_tilde in wannierise.f90 ! We store it in the checkpoint file as a sanity check - write(stdout,'(/8x,a,f14.8,a/)') 'Final Omega_I ',& + if (on_root) write(stdout,'(/8x,a,f14.8,a/)') 'Final Omega_I ',& womegai*lenconfac**2,' ('//trim(length_unit)//'^2)' ! 
Set public variable omega_invariant @@ -1973,13 +2057,13 @@ subroutine dis_extract() m, w, cz, num_bands, cwork, rwork, iwork, ifail, info) if (info.lt.0) then - write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN' - write(stdout,*) ' THE ', -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE' + if (on_root) write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN' + if (on_root) write(stdout,*) ' THE ', -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE' call io_error(' dis_extract: error') endif if (info.gt.0) then - write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN' - write(stdout,*) info, 'EIGENVECTORS FAILED TO CONVERGE' + if (on_root) write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN' + if (on_root) write(stdout,*) info, 'EIGENVECTORS FAILED TO CONVERGE' call io_error(' dis_extract: error') endif @@ -2001,9 +2085,9 @@ subroutine dis_extract() ! DEBUG if (iprint>2) then - write(stdout,'(/,a,/)') ' Eigenvalues inside optimal subspace:' + if (on_root) write(stdout,'(/,a,/)') ' Eigenvalues inside optimal subspace:' do nkp = 1, num_kpts - write(stdout,'(a,i3,2x,20(f9.5,1x))') ' K-point ', & + if (on_root) write(stdout,'(a,i3,2x,20(f9.5,1x))') ' K-point ', & nkp, (eigval_opt(i,nkp), i = 1, num_wann) enddo endif @@ -2028,8 +2112,8 @@ subroutine dis_extract() if (icompflag.eq.1) then if (iprint>2) then - write(stdout,*) 'AT SOME K-POINT(S) COMPLEMENT SUBSPACE HAS ZERO DIMENSIONALITY' - write(stdout,*) '=> DID NOT CREATE FILE COMPSPACE.DAT' + if (on_root) write(stdout,*) 'AT SOME K-POINT(S) COMPLEMENT SUBSPACE HAS ZERO DIMENSIONALITY' + if (on_root) write(stdout,*) '=> DID NOT CREATE FILE COMPSPACE.DAT' endif else ! 
DIAGONALIZE THE HAMILTONIAN IN THE COMPLEMENT SUBSPACE, WRITE THE @@ -2053,13 +2137,13 @@ subroutine dis_extract() call ZHPEVX ('V', 'A', 'U', ndiff, cap, 0.0_dp, 0.0_dp, 0, 0, & -1.0_dp, m, w, cz, num_bands, cwork, rwork, iwork, ifail, info) if (info.lt.0) then - write(stdout,*) '*** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN' - write(stdout,*) 'THE ', -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE' + if (on_root) write(stdout,*) '*** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN' + if (on_root) write(stdout,*) 'THE ', -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE' call io_error(' dis_extract: error') endif if (info.gt.0) then - write(stdout,*) '*** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN' - write(stdout,*) info, 'EIGENVECTORS FAILED TO CONVERGE' + if (on_root) write(stdout,*) '*** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN' + if (on_root) write(stdout,*) info, 'EIGENVECTORS FAILED TO CONVERGE' call io_error(' dis_extract: error') endif ! CALCULATE AMPLITUDES OF THE ENERGY EIGENVECTORS IN THE COMPLEMENT SUBS @@ -2095,8 +2179,16 @@ subroutine dis_extract() deallocate(camp,stat=ierr) if (ierr/=0) call io_error('Error deallocating camp in dis_extract') end if + if(allocated(camp_loc)) then + deallocate(camp_loc,stat=ierr) + if (ierr/=0) call io_error('Error deallocating camp_loc in dis_extract') + endif deallocate(ceamp,stat=ierr) if (ierr/=0) call io_error('Error deallocating ceamp in dis_extract') + deallocate(u_matrix_opt_loc,stat=ierr) + if (ierr/=0) call io_error('Error deallocating u_matrix_opt_loc in dis_extract') + deallocate(wkomegai1_loc,stat=ierr) + if (ierr/=0) call io_error('Error deallocating wkomegai1_loc in dis_extract') deallocate(wkomegai1,stat=ierr) if (ierr/=0) call io_error('Error deallocating wkomegai1 in dis_extract') @@ -2122,10 +2214,10 @@ subroutine dis_extract() deallocate(cwb,stat=ierr) if (ierr/=0) call io_error('Error deallocating cwb in dis_extract') - write(stdout,'(1x,a/)') & + if (on_root) 
write(stdout,'(1x,a/)') & '+----------------------------------------------------------------------------+' - if (timing_level>1) call io_stopwatch('dis: extract',2) + if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract',2) return @@ -2185,7 +2277,7 @@ subroutine internal_zmatrix(nkp,cmtrx) integer :: l,m,n,p,q,nn,nkp2,ndimk complex(kind=dp) :: csum - if (timing_level>1) call io_stopwatch('dis: extract: zmatrix',1) + if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract: zmatrix',1) cmtrx=cmplx_0 ndimk=ndimwin(nkp)-ndimfroz(nkp) @@ -2208,7 +2300,7 @@ subroutine internal_zmatrix(nkp,cmtrx) enddo enddo - if (timing_level>1) call io_stopwatch('dis: extract: zmatrix',2) + if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract: zmatrix',2) return diff --git a/src/hamiltonian.F90 b/src/hamiltonian.F90 index acfb57461..8a74d7ee3 100644 --- a/src/hamiltonian.F90 +++ b/src/hamiltonian.F90 @@ -17,6 +17,7 @@ module w90_hamiltonian !! This is a simplified routine, more sophisticated properties !! are found in postw90 (e.g. w90_get_oper) use w90_constants, only : dp + use w90_comms, only : on_root implicit none @@ -411,14 +412,15 @@ subroutine internal_translate_centres() ! 
NEVER overwrite wannier_centres !wannier_centres = r_home - write(stdout,'(1x,a)') 'Translated centres' - write(stdout,'(4x,a,3f10.6)') 'translation centre in fractional coordinate:',translation_centre_frac(:) - do iw=1,num_wann - write(stdout,888) iw,(r_home(ind,iw)*lenconfac,ind=1,3) - end do - write(stdout,'(1x,a78)') repeat('-',78) - write(stdout,*) - + if (on_root) then + write(stdout,'(1x,a)') 'Translated centres' + write(stdout,'(4x,a,3f10.6)') 'translation centre in fractional coordinate:',translation_centre_frac(:) + do iw=1,num_wann + write(stdout,888) iw,(r_home(ind,iw)*lenconfac,ind=1,3) + end do + write(stdout,'(1x,a78)') repeat('-',78) + write(stdout,*) + endif wannier_centres_translated = r_home deallocate(r_frac,stat=ierr) @@ -583,7 +585,7 @@ subroutine hamiltonian_wigner_seitz(count_pts) if(count_pts) return - if(iprint>=3) then + if(iprint>=3.and.on_root) then write(stdout,'(1x,i4,a,/)') nrpts, ' lattice points in Wigner-Seitz supercell:' do i=1,nrpts write(stdout,'(4x,a,3(i3,1x),a,i2)') ' vector ', irvec(1,i),irvec(2,i),& diff --git a/src/io.F90 b/src/io.F90 index d159a283f..6382bd0bd 100644 --- a/src/io.F90 +++ b/src/io.F90 @@ -17,7 +17,6 @@ module w90_io use w90_constants, only : dp - implicit none private @@ -59,6 +58,7 @@ module w90_io public :: io_print_timings public :: io_get_seedname public :: io_time + public :: io_wallclocktime public :: io_date public :: io_error public :: io_file_unit @@ -211,23 +211,31 @@ subroutine io_error ( error_msg ) #ifdef MPI character(len=50) :: filename - integer :: stderr,ierr,whoami + integer :: stderr,ierr,whoami,num_nodes call mpi_comm_rank(mpi_comm_world, whoami, ierr) - if(whoami>99999) then - write(filename,'(a,a,I0,a)')trim(seedname),'.node_',whoami,'.werr' - else - write(filename,'(a,a,I5.5,a)')trim(seedname),'.node_',whoami,'.werr' - endif - stderr=io_file_unit() - open(unit=stderr,file=trim(filename),form='formatted',err=105) - write(stderr, '(1x,a)') trim(error_msg) - close(stderr) + call 
mpi_comm_size(mpi_comm_world, num_nodes, ierr) + if(num_nodes>1) then + if(whoami>99999) then + write(filename,'(a,a,I0,a)')trim(seedname),'.node_',whoami,'.werr' + else + write(filename,'(a,a,I5.5,a)')trim(seedname),'.node_',whoami,'.werr' + endif + stderr=io_file_unit() + open(unit=stderr,file=trim(filename),form='formatted',err=105) + write(stderr, '(1x,a)') trim(error_msg) + close(stderr) + end if 105 write(*,'(1x,a)') trim(error_msg) 106 write(*,'(1x,a,I0,a)') "Error on node ", & whoami, ": examine the output/error files for details" + if(whoami==0) then + write(stdout,*) 'Exiting.......' + write(stdout, '(1x,a)') trim(error_msg) + end if + call MPI_abort(MPI_comm_world,1,ierr) #else @@ -308,6 +316,36 @@ function io_time() return end function io_time + + !==================================================================! + function io_wallclocktime() + !==================================================================! + ! ! + ! Returns elapsed wall clock time in seconds since its first call ! + ! ! + !=================================================================== + use w90_constants, only : dp + implicit none + + real(kind=dp) :: io_wallclocktime + + integer :: c0,c1 + integer :: rate + logical :: first=.true. + save first, rate, c0 + + if (first) then + + call system_clock(c0, rate) + io_wallclocktime = 0.0_dp + first = .false.
+ else + call system_clock(c1) + io_wallclocktime = real(c1 - c0,dp)/real(rate,dp) ! convert in dp: default real loses resolution on long runs + endif + return + end function io_wallclocktime + !=========================================== function io_file_unit() !=========================================== diff --git a/src/kmesh.F90 b/src/kmesh.F90 index d9bd993bf..d06a71ad9 100644 --- a/src/kmesh.F90 +++ b/src/kmesh.F90 @@ -24,6 +24,7 @@ module w90_kmesh use w90_constants, only : dp use w90_parameters + use w90_comms, only : on_root implicit none @@ -91,7 +92,7 @@ subroutine kmesh_get() if (timing_level>0) call io_stopwatch('kmesh: get',1) - write(stdout,'(/1x,a)') & + if (on_root) write(stdout,'(/1x,a)') & '*---------------------------------- K-MESH ----------------------------------*' ! Sort the cell neighbours so we loop in order of distance from the home shell @@ -131,28 +132,31 @@ subroutine kmesh_get() dnn1 = eta enddo - write(stdout,'(1x,a)') '+----------------------------------------------------------------------------+' - write(stdout,'(1x,a)') '| Distance to Nearest-Neighbour Shells |' - write(stdout,'(1x,a)') '| ------------------------------------ |' - if (lenconfac.eq.1.0_dp) then - write(stdout,'(1x,a)') '| Shell Distance (Ang^-1) Multiplicity |' - write(stdout,'(1x,a)') '| ----- ----------------- ------------ |' - else - write(stdout,'(1x,a)') '| Shell Distance (Bohr^-1) Multiplicity |' - write(stdout,'(1x,a)') '| ----- ------------------ ------------ |' + if (on_root) then + write(stdout,'(1x,a)') '+----------------------------------------------------------------------------+' + write(stdout,'(1x,a)') '| Distance to Nearest-Neighbour Shells |' + write(stdout,'(1x,a)') '| ------------------------------------ |' + if (lenconfac.eq.1.0_dp) then + write(stdout,'(1x,a)') '| Shell Distance (Ang^-1) Multiplicity |' + write(stdout,'(1x,a)') '| ----- ----------------- ------------ |' + else + write(stdout,'(1x,a)') '| Shell Distance (Bohr^-1) Multiplicity |' + write(stdout,'(1x,a)') '| ----- ------------------
------------ |' + endif + do ndnn = 1, ndnntot + write(stdout,'(1x,a,11x,i3,17x,f10.6,19x,i4,12x,a)') '|',ndnn,dnn(ndnn)/lenconfac,multi(ndnn),'|' + enddo + write(stdout,'(1x,a)') '+----------------------------------------------------------------------------+' endif - do ndnn = 1, ndnntot - write(stdout,'(1x,a,11x,i3,17x,f10.6,19x,i4,12x,a)') '|',ndnn,dnn(ndnn)/lenconfac,multi(ndnn),'|' - enddo - write(stdout,'(1x,a)') '+----------------------------------------------------------------------------+' - if(iprint>=4) then ! Write out all the bvectors - write(stdout,'(1x,"|",76(" "),"|")') - write(stdout,'(1x,a)') '| Complete list of b-vectors and their lengths |' - write(stdout,'(1x,"|",76(" "),"|")') - write(stdout,'(1x,"+",76("-"),"+")') + if (on_root) then + write(stdout,'(1x,"|",76(" "),"|")') + write(stdout,'(1x,a)') '| Complete list of b-vectors and their lengths |' + write(stdout,'(1x,"|",76(" "),"|")') + write(stdout,'(1x,"+",76("-"),"+")') + endif allocate( bvec_tmp(3,maxval(multi)),stat=ierr) if (ierr/=0) call io_error('Error allocating bvec_tmp in kmesh_get') @@ -162,14 +166,14 @@ subroutine kmesh_get() call kmesh_get_bvectors(multi(shell),1,dnn(shell),bvec_tmp(:,1:multi(shell))) do loop=1,multi(shell) counter=counter+1 - write(stdout,'(a,I4,1x,a,2x,3f12.6,2x,a,2x,f12.6,a)') ' | b-vector ',counter,': (', & + if (on_root)write(stdout,'(a,I4,1x,a,2x,3f12.6,2x,a,2x,f12.6,a)') ' | b-vector ',counter,': (', & bvec_tmp(:,loop)/lenconfac,')',dnn(shell)/lenconfac,' |' end do end do deallocate( bvec_tmp) if (ierr/=0) call io_error('Error deallocating bvec_tmp in kmesh_get') - write(stdout,'(1x,"|",76(" "),"|")') - write(stdout,'(1x,"+",76("-"),"+")') + if (on_root)write(stdout,'(1x,"|",76(" "),"|")') + if (on_root)write(stdout,'(1x,"+",76("-"),"+")') end if @@ -183,19 +187,20 @@ subroutine kmesh_get() call kmesh_shell_fixed(multi,dnn,bweight) end if - write(stdout,'(1x,a)',advance='no') '| The following shells are used: ' - do ndnn=1,num_shells - if 
(ndnn.eq.num_shells) then - write(stdout,'(i3,1x)',advance='no') shell_list(ndnn) - else - write(stdout,'(i3,",")',advance='no') shell_list(ndnn) - endif - enddo - do l=1,11-num_shells - write(stdout,'(4x)',advance='no') - enddo - write(stdout,'("|")') - + if (on_root) then + write(stdout,'(1x,a)',advance='no') '| The following shells are used: ' + do ndnn=1,num_shells + if (ndnn.eq.num_shells) then + write(stdout,'(i3,1x)',advance='no') shell_list(ndnn) + else + write(stdout,'(i3,",")',advance='no') shell_list(ndnn) + endif + enddo + do l=1,11-num_shells + write(stdout,'(4x)',advance='no') + enddo + write(stdout,'("|")') + endif end if nntot=0 @@ -204,26 +209,28 @@ subroutine kmesh_get() end do if(nntot>num_nnmax) then - write(stdout,'(a,i2,a)') ' **WARNING: kmesh has found >',num_nnmax,' nearest neighbours**' - write(stdout,'(a)') ' ' - write(stdout,'(a)') ' This is probably caused by an error in your unit cell specification' - write(stdout,'(a)') ' ' - write(stdout,'(a)') ' If you think this is not the problem; please send your *.win file to the ' - write(stdout,'(a)') ' wannier90 developers' - write(stdout,'(a)') ' ' - write(stdout,'(a)') ' The problem may be caused by having accidentally degenerate shells of ' - write(stdout,'(a)') ' kpoints. The solution is then to rerun wannier90 specifying the b-vectors ' - write(stdout,'(a)') ' in each shell. 
Give devel_flag=kmesh_degen in the *.win file' - write(stdout,'(a)') ' and create a *.kshell file:' - write(stdout,'(a)') ' ' - write(stdout,'(a)') ' $> cat hexagonal.kshell' - write(stdout,'(a)') ' $> 1 2' - write(stdout,'(a)') ' $> 5 6 7 8' - write(stdout,'(a)') ' ' - write(stdout,'(a)') ' Where each line is a new shell (so num_shells in total)' - write(stdout,'(a)') ' The elements are the bvectors labelled according to the following ' - write(stdout,'(a)') ' list (last column is distance)' - write(stdout,'(a)') ' ' + if (on_root) then + write(stdout,'(a,i2,a)') ' **WARNING: kmesh has found >',num_nnmax,' nearest neighbours**' + write(stdout,'(a)') ' ' + write(stdout,'(a)') ' This is probably caused by an error in your unit cell specification' + write(stdout,'(a)') ' ' + write(stdout,'(a)') ' If you think this is not the problem; please send your *.win file to the ' + write(stdout,'(a)') ' wannier90 developers' + write(stdout,'(a)') ' ' + write(stdout,'(a)') ' The problem may be caused by having accidentally degenerate shells of ' + write(stdout,'(a)') ' kpoints. The solution is then to rerun wannier90 specifying the b-vectors ' + write(stdout,'(a)') ' in each shell. 
Give devel_flag=kmesh_degen in the *.win file' + write(stdout,'(a)') ' and create a *.kshell file:' + write(stdout,'(a)') ' ' + write(stdout,'(a)') ' $> cat hexagonal.kshell' + write(stdout,'(a)') ' $> 1 2' + write(stdout,'(a)') ' $> 5 6 7 8' + write(stdout,'(a)') ' ' + write(stdout,'(a)') ' Where each line is a new shell (so num_shells in total)' + write(stdout,'(a)') ' The elements are the bvectors labelled according to the following ' + write(stdout,'(a)') ' list (last column is distance)' + write(stdout,'(a)') ' ' + endif allocate( bvec_tmp(3,maxval(multi)),stat=ierr) if (ierr/=0) call io_error('Error allocating bvec_tmp in kmesh_get') @@ -233,11 +240,11 @@ subroutine kmesh_get() call kmesh_get_bvectors(multi(shell),1,dnn(shell),bvec_tmp(:,1:multi(shell))) do loop=1,multi(shell) counter=counter+1 - write(stdout,'(a,I4,1x,a,2x,3f12.6,2x,a,2x,f12.6,a)') ' | b-vector ',counter,': (', & + if (on_root) write(stdout,'(a,I4,1x,a,2x,3f12.6,2x,a,2x,f12.6,a)') ' | b-vector ',counter,': (', & bvec_tmp(:,loop)/lenconfac,')',dnn(shell)/lenconfac,' |' end do end do - write(stdout,'(a)') ' ' + if (on_root) write(stdout,'(a)') ' ' deallocate( bvec_tmp) if (ierr/=0) call io_error('Error deallocating bvec_tmp in kmesh_get') @@ -275,9 +282,11 @@ subroutine kmesh_get() ! Comment: Now we have bk(3,nntot,num_kps) 09/04/2006 - write(stdout,'(1x,a)') '+----------------------------------------------------------------------------+' - write(stdout,'(1x,a)') '| Shell # Nearest-Neighbours |' - write(stdout,'(1x,a)') '| ----- -------------------- |' + if (on_root) then + write(stdout,'(1x,a)') '+----------------------------------------------------------------------------+' + write(stdout,'(1x,a)') '| Shell # Nearest-Neighbours |' + write(stdout,'(1x,a)') '| ----- -------------------- |' + endif if(index(devel_flag,'kmesh_degen')==0) then ! ! 
Standard routine @@ -354,9 +363,9 @@ subroutine kmesh_get() do ndnnx=1, num_shells ndnn = shell_list(ndnnx) - write(stdout,'(1x,a,24x,i3,13x,i3,33x,a)') '|',ndnn,nnshell(1,ndnn),'|' + if (on_root) write(stdout,'(1x,a,24x,i3,13x,i3,33x,a)') '|',ndnn,nnshell(1,ndnn),'|' end do - write(stdout,'(1x,"+",76("-"),"+")') + if (on_root) write(stdout,'(1x,"+",76("-"),"+")') do nkp = 1, num_kpts @@ -372,7 +381,7 @@ subroutine kmesh_get() bbn = bbn + bk_local(i,nnx,nkp) * bk_local(i,nnx,nkp) enddo if (abs(sqrt(bb1)-sqrt(bbn)).gt.kmesh_tol) then - write(stdout,'(1x,2f10.6)') bb1,bbn + if (on_root) write(stdout,'(1x,2f10.6)') bb1,bbn call io_error('Non-symmetric k-point neighbours!') endif enddo @@ -398,11 +407,11 @@ subroutine kmesh_get() enddo enddo if ( (i.eq.j) .and. (abs(ddelta-1.0_dp).gt.kmesh_tol) ) then - write(stdout,'(1x,2i3,f12.8)') i,j,ddelta + if (on_root) write(stdout,'(1x,2i3,f12.8)') i,j,ddelta call io_error('Eq. (B1) not satisfied in kmesh_get (1)') endif if ( (i.ne.j) .and. (abs(ddelta).gt.kmesh_tol) ) then - write(stdout,'(1x,2i3,f12.8)') i,j,ddelta + if (on_root) write(stdout,'(1x,2i3,f12.8)') i,j,ddelta call io_error('Eq. (B1) not satisfied in kmesh_get (2)') endif enddo @@ -410,8 +419,8 @@ subroutine kmesh_get() enddo end if - write(stdout,'(1x,a)') '| Completeness relation is fully satisfied [Eq. (B1), PRB 56, 12847 (1997)] |' - write(stdout,'(1x,"+",76("-"),"+")') + if (on_root) write(stdout,'(1x,a)') '| Completeness relation is fully satisfied [Eq. (B1), PRB 56, 12847 (1997)] |' + if (on_root) write(stdout,'(1x,"+",76("-"),"+")') ! 
wbtot = 0.0_dp @@ -447,35 +456,36 @@ subroutine kmesh_get() enddo if (na.ne.nnh) call io_error('Did not find right number of bk directions') - - if (lenconfac.eq.1.0_dp) then - write(stdout,'(1x,a)') '| b_k Vectors (Ang^-1) and Weights (Ang^2) |' - write(stdout,'(1x,a)') '| ---------------------------------------- |' - else - write(stdout,'(1x,a)') '| b_k Vectors (Bohr^-1) and Weights (Bohr^2) |' - write(stdout,'(1x,a)') '| ------------------------------------------ |' - endif - write(stdout,'(1x,a)') '| No. b_k(x) b_k(y) b_k(z) w_b |' - write(stdout,'(1x,a)') '| --- -------------------------------- -------- |' - do i = 1, nntot - write (stdout,'(1x,"|",11x,i3,5x,3f12.6,3x,f10.6,8x,"|")') & - i,(bk_local(j,i,1)/lenconfac,j=1,3),wb_local(i)*lenconfac**2 - enddo - write(stdout,'(1x,"+",76("-"),"+")') - if (lenconfac.eq.1.0_dp) then - write(stdout,'(1x,a)') '| b_k Directions (Ang^-1) |' - write(stdout,'(1x,a)') '| ----------------------- |' - else - write(stdout,'(1x,a)') '| b_k Directions (Bohr^-1) |' - write(stdout,'(1x,a)') '| ------------------------ |' + if (on_root) then + if (lenconfac.eq.1.0_dp) then + write(stdout,'(1x,a)') '| b_k Vectors (Ang^-1) and Weights (Ang^2) |' + write(stdout,'(1x,a)') '| ---------------------------------------- |' + else + write(stdout,'(1x,a)') '| b_k Vectors (Bohr^-1) and Weights (Bohr^2) |' + write(stdout,'(1x,a)') '| ------------------------------------------ |' + endif + write(stdout,'(1x,a)') '| No. 
b_k(x) b_k(y) b_k(z) w_b |' + write(stdout,'(1x,a)') '| --- -------------------------------- -------- |' + do i = 1, nntot + write (stdout,'(1x,"|",11x,i3,5x,3f12.6,3x,f10.6,8x,"|")') & + i,(bk_local(j,i,1)/lenconfac,j=1,3),wb_local(i)*lenconfac**2 + enddo + write(stdout,'(1x,"+",76("-"),"+")') + if (lenconfac.eq.1.0_dp) then + write(stdout,'(1x,a)') '| b_k Directions (Ang^-1) |' + write(stdout,'(1x,a)') '| ----------------------- |' + else + write(stdout,'(1x,a)') '| b_k Directions (Bohr^-1) |' + write(stdout,'(1x,a)') '| ------------------------ |' + endif + write(stdout,'(1x,a)') '| No. x y z |' + write(stdout,'(1x,a)') '| --- -------------------------------- |' + do i = 1, nnh + write(stdout,'(1x,"|",11x,i3,5x,3f12.6,21x,"|")') i,(bka(j,i)/lenconfac,j=1,3) + enddo + write(stdout,'(1x,"+",76("-"),"+")') + write(stdout,*) ' ' endif - write(stdout,'(1x,a)') '| No. x y z |' - write(stdout,'(1x,a)') '| --- -------------------------------- |' - do i = 1, nnh - write(stdout,'(1x,"|",11x,i3,5x,3f12.6,21x,"|")') i,(bka(j,i)/lenconfac,j=1,3) - enddo - write(stdout,'(1x,"+",76("-"),"+")') - write(stdout,*) ' ' ! find index array @@ -490,7 +500,7 @@ subroutine kmesh_get() enddo ! 
check found if (neigh(nkp,na).eq.0) then - write(stdout,*) ' nkp,na=',nkp,na + if (on_root) write(stdout,*) ' nkp,na=',nkp,na call io_error('kmesh_get: failed to find neighbours for this kpoint') endif enddo @@ -573,24 +583,26 @@ subroutine kmesh_get() if (na.ne.nnh) call io_error('Did not find right number of b-vectors in gamma_only option') - write(stdout,'(1x,"+",76("-"),"+")') - write(stdout,'(1x,a)') '| Gamma-point: number of the b-vectors is reduced by half |' - write(stdout,'(1x,"+",76("-"),"+")') - if (lenconfac.eq.1.0_dp) then - write(stdout,'(1x,a)') '| b_k Vectors (Ang^-1) and Weights (Ang^2) |' - write(stdout,'(1x,a)') '| ---------------------------------------- |' - else - write(stdout,'(1x,a)') '| b_k Vectors (Bohr^-1) and Weights (Bohr^2) |' - write(stdout,'(1x,a)') '| ------------------------------------------ |' + if (on_root) then + write(stdout,'(1x,"+",76("-"),"+")') + write(stdout,'(1x,a)') '| Gamma-point: number of the b-vectors is reduced by half |' + write(stdout,'(1x,"+",76("-"),"+")') + if (lenconfac.eq.1.0_dp) then + write(stdout,'(1x,a)') '| b_k Vectors (Ang^-1) and Weights (Ang^2) |' + write(stdout,'(1x,a)') '| ---------------------------------------- |' + else + write(stdout,'(1x,a)') '| b_k Vectors (Bohr^-1) and Weights (Bohr^2) |' + write(stdout,'(1x,a)') '| ------------------------------------------ |' + endif + write(stdout,'(1x,a)') '| No. b_k(x) b_k(y) b_k(z) w_b |' + write(stdout,'(1x,a)') '| --- -------------------------------- -------- |' + do i = 1, nntot + write (stdout,'(1x,"|",11x,i3,5x,3f12.6,3x,f10.6,8x,"|")') & + i,(bk(j,i,1)/lenconfac,j=1,3),wb(i)*lenconfac**2 + enddo + write(stdout,'(1x,"+",76("-"),"+")') + write(stdout,*) ' ' endif - write(stdout,'(1x,a)') '| No. 
b_k(x) b_k(y) b_k(z) w_b |' - write(stdout,'(1x,a)') '| --- -------------------------------- -------- |' - do i = 1, nntot - write (stdout,'(1x,"|",11x,i3,5x,3f12.6,3x,f10.6,8x,"|")') & - i,(bk(j,i,1)/lenconfac,j=1,3),wb(i)*lenconfac**2 - enddo - write(stdout,'(1x,"+",76("-"),"+")') - write(stdout,*) ' ' deallocate(nnlist_tmp, stat=ierr ) if (ierr/=0) call io_error('Error in deallocating nnlist_tmp in kmesh_get') @@ -760,31 +772,41 @@ subroutine kmesh_dealloc() !======================================== ! !! Release memory from the kmesh module - ! + ! This routine now check to see if arrays + ! are allocated, as there are some code + ! paths that will not allocate on all nodes !======================================== use w90_io, only : io_error implicit none integer :: ierr ! Deallocate real arrays that are public - if (.not. explicit_nnkpts) then - deallocate(bk, stat=ierr ) - if (ierr/=0) call io_error('Error in deallocating bk in kmesh_dealloc') - deallocate(bka, stat=ierr ) - if (ierr/=0) call io_error('Error in deallocating bka in kmesh_dealloc') - deallocate(wb, stat=ierr ) - if (ierr/=0) call io_error('Error in deallocating wb in kmesh_dealloc') + if(allocated(bk))then + deallocate(bk, stat=ierr ) + if (ierr/=0) call io_error('Error in deallocating bk in kmesh_dealloc') + endif + if(allocated(bka))then + deallocate(bka, stat=ierr ) + if (ierr/=0) call io_error('Error in deallocating bka in kmesh_dealloc') + endif + if(allocated(wb))then + deallocate(wb, stat=ierr ) + if (ierr/=0) call io_error('Error in deallocating wb in kmesh_dealloc') end if ! Deallocate integer arrays that are public - if (.not. 
explicit_nnkpts) then - deallocate(neigh, stat=ierr ) - if (ierr/=0) call io_error('Error in deallocating neigh in kmesh_dealloc') + if(allocated(neigh))then + deallocate(neigh, stat=ierr ) + if (ierr/=0) call io_error('Error in deallocating neigh in kmesh_dealloc') end if - deallocate(nncell, stat=ierr ) - if (ierr/=0) call io_error('Error in deallocating nncell in kmesh_dealloc') - deallocate(nnlist, stat=ierr ) - if (ierr/=0) call io_error('Error in deallocating nnlist in kmesh_dealloc') + if(allocated(nncell))then + deallocate(nncell, stat=ierr ) + if (ierr/=0) call io_error('Error in deallocating nncell in kmesh_dealloc') + endif + if(allocated(nnlist))then + deallocate(nnlist, stat=ierr ) + if (ierr/=0) call io_error('Error in deallocating nnlist in kmesh_dealloc') + endif return @@ -930,7 +952,7 @@ subroutine kmesh_shell_automatic(multi,dnn,bweight) if (ierr/=0) call io_error('Error allocating bvector in kmesh_shell_automatic') bvector=0.0_dp;bweight=0.0_dp - write(stdout,'(1x,a)') '| The b-vectors are chosen automatically |' + if (on_root) write(stdout,'(1x,a)') '| The b-vectors are chosen automatically |' b1sat=.false. do shell=1,search_shells @@ -939,7 +961,7 @@ subroutine kmesh_shell_automatic(multi,dnn,bweight) ! 
get the b vectors for the new shell call kmesh_get_bvectors(multi(shell),1,dnn(shell),bvector(:,1:multi(shell),cur_shell)) - if(iprint>=3) then + if(iprint>=3.and.on_root) then write(stdout,'(1x,a8,1x,I2,a14,1x,I2,49x,a)') '| Shell:',shell,' Multiplicity:',multi(shell), '|' do loop=1,multi(shell) write(stdout,'(1x,a10,I2,1x,a1,4x,3f12.6,5x,a9,9x,a)') '| b-vector ',loop,':', & @@ -963,7 +985,7 @@ subroutine kmesh_shell_automatic(multi,dnn,bweight) end if if(lpar) then - if(iprint>=3) then + if(iprint>=3.and.on_root) then write(stdout,'(1x,a)') '| This shell is linearly dependent on existing shells: Trying next shell |' end if cycle @@ -1007,7 +1029,7 @@ subroutine kmesh_shell_automatic(multi,dnn,bweight) info=0 call dgesvd('A','A',max_shells,num_shells,amat,max_shells,singv,umat,max_shells,vmat,num_shells,work,lwork,info) if(info<0) then - write(stdout,'(1x,a,1x,I1,1x,a)') 'kmesh_shell_automatic: Argument',abs(info),'of dgesvd is incorrect' + if (on_root) write(stdout,'(1x,a,1x,I1,1x,a)') 'kmesh_shell_automatic: Argument',abs(info),'of dgesvd is incorrect' call io_error('kmesh_shell_automatic: Problem with Singular Value Decomposition') else if (info>0) then call io_error('kmesh_shell_automatic: Singular Value Decomposition did not converge') @@ -1017,7 +1039,7 @@ subroutine kmesh_shell_automatic(multi,dnn,bweight) if(num_shells==1) then call io_error('kmesh_shell_automatic: Singular Value Decomposition has found a very small singular value') else - write(stdout,'(1x,a)') '| SVD found small singular value, Rejecting this shell and trying the next |' + if (on_root) write(stdout,'(1x,a)') '| SVD found small singular value, Rejecting this shell and trying the next |' b1sat=.false. 
num_shells=num_shells-1 goto 200 @@ -1038,7 +1060,7 @@ subroutine kmesh_shell_automatic(multi,dnn,bweight) tmp3 = matmul(transpose(vmat),tmp2) bweight(1:num_shells) = tmp3 - if(iprint>=2) then + if(iprint>=2.and.on_root) then do loop_s=1,num_shells write(stdout,'(1x,a,I2,a,f12.7,5x,a8,36x,a)') '| Shell: ',loop_s,& ' w_b ', bweight(loop_s)*lenconfac**2,'('//trim(length_unit)//'^2)','|' @@ -1066,13 +1088,13 @@ subroutine kmesh_shell_automatic(multi,dnn,bweight) if(.not.b1sat) then if(shell=3) then - write(stdout,'(1x,a,24x,a1)') '| B1 condition is not satisfied: Adding another shell','|' + if (on_root) write(stdout,'(1x,a,24x,a1)') '| B1 condition is not satisfied: Adding another shell','|' elseif(shell==search_shells) then - write(stdout,*) ' ' - write(stdout,'(1x,a,i3,a)') 'Unable to satisfy B1 with any of the first ',search_shells,' shells' - write(stdout,'(1x,a)') 'Your cell might be very long, or you may have an irregular MP grid' - write(stdout,'(1x,a)') 'Try increasing the parameter search_shells in the win file (default=12)' - write(stdout,*) ' ' + if (on_root) write(stdout,*) ' ' + if (on_root) write(stdout,'(1x,a,i3,a)') 'Unable to satisfy B1 with any of the first ',search_shells,' shells' + if (on_root) write(stdout,'(1x,a)') 'Your cell might be very long, or you may have an irregular MP grid' + if (on_root) write(stdout,'(1x,a)') 'Try increasing the parameter search_shells in the win file (default=12)' + if (on_root) write(stdout,*) ' ' call io_error('kmesh_get_automatic') end if end if @@ -1102,11 +1124,11 @@ subroutine kmesh_shell_automatic(multi,dnn,bweight) end do if(.not. 
b1sat) then - write(stdout,*) ' ' - write(stdout,'(1x,a,i3,a)') 'Unable to satisfy B1 with any of the first ',search_shells,' shells' - write(stdout,'(1x,a)') 'Your cell might be very long, or you may have an irregular MP grid' - write(stdout,'(1x,a)') 'Try increasing the parameter search_shells in the win file (default=12)' - write(stdout,*) ' ' + if (on_root) write(stdout,*) ' ' + if (on_root) write(stdout,'(1x,a,i3,a)') 'Unable to satisfy B1 with any of the first ',search_shells,' shells' + if (on_root) write(stdout,'(1x,a)') 'Your cell might be very long, or you may have an irregular MP grid' + if (on_root) write(stdout,'(1x,a)') 'Try increasing the parameter search_shells in the win file (default=12)' + if (on_root) write(stdout,*) ' ' call io_error('kmesh_get_automatic') end if @@ -1156,7 +1178,7 @@ subroutine kmesh_shell_fixed(multi,dnn,bweight) bvector=0.0_dp;bweight=0.0_dp amat=0.0_dp;umat=0.0_dp;vmat=0.0_dp;smat=0.0_dp;singv=0.0_dp - write(stdout,'(1x,a)') '| The b-vectors are set in the win file |' + if (on_root) write(stdout,'(1x,a)') '| The b-vectors are set in the win file |' do shell=1,num_shells @@ -1165,7 +1187,7 @@ subroutine kmesh_shell_fixed(multi,dnn,bweight) bvector(:,1:multi(shell_list(shell)),shell)) end do - if(iprint>=3) then + if(iprint>=3.and.on_root) then do shell=1,num_shells write(stdout,'(1x,a8,1x,I2,a14,1x,I2,49x,a)') '| Shell:',shell,' Multiplicity:',multi(shell_list(shell)), '|' do loop=1,multi(shell_list(shell)) @@ -1190,7 +1212,7 @@ subroutine kmesh_shell_fixed(multi,dnn,bweight) info=0 call dgesvd('A','A',max_shells,num_shells,amat,max_shells,singv,umat,max_shells,vmat,num_shells,work,lwork,info) if(info<0) then - write(stdout,'(1x,a,1x,I1,1x,a)') 'kmesh_shell_fixed: Argument',abs(info),'of dgesvd is incorrect' + if (on_root) write(stdout,'(1x,a,1x,I1,1x,a)') 'kmesh_shell_fixed: Argument',abs(info),'of dgesvd is incorrect' call io_error('kmesh_shell_fixed: Problem with Singular Value Decomposition') else if (info>0) then call 
io_error('kmesh_shell_fixed: Singular Value Decomposition did not converge') @@ -1205,7 +1227,7 @@ subroutine kmesh_shell_fixed(multi,dnn,bweight) end do bweight(1:num_shells)=matmul(transpose(vmat),matmul(smat,matmul(transpose(umat),target))) - if(iprint>=2) then + if(iprint>=2.and.on_root) then do loop_s=1,num_shells ! write(stdout,'(1x,a,I2,a,f12.7,49x,a)') '| Shell: ',loop_s,' w_b ', bweight(loop_s),'|' write(stdout,'(1x,a,I2,a,f12.7,5x,a8,36x,a)') '| Shell: ',loop_s,& @@ -1284,7 +1306,7 @@ subroutine kmesh_shell_from_file(multi,dnn,bweight) if (ierr/=0) call io_error('Error allocating bvector in kmesh_shell_fixed') bvector=0.0_dp;bweight=0.0_dp - write(stdout,'(1x,a)') '| The b-vectors are defined in the kshell file |' + if (on_root) write(stdout,'(1x,a)') '| The b-vectors are defined in the kshell file |' counter=1 do shell=1,search_shells @@ -1352,7 +1374,7 @@ subroutine kmesh_shell_from_file(multi,dnn,bweight) - if(iprint>=3) then + if(iprint>=3.and.on_root) then do shell=1,num_shells write(stdout,'(1x,a8,1x,I2,a14,1x,I2,49x,a)') '| Shell:',shell,' Multiplicity:',multi(shell), '|' do loop=1,multi(shell) @@ -1390,7 +1412,7 @@ subroutine kmesh_shell_from_file(multi,dnn,bweight) info=0 call dgesvd('A','A',max_shells,num_shells,amat,max_shells,singv,umat,max_shells,vmat,num_shells,work,lwork,info) if(info<0) then - write(stdout,'(1x,a,1x,I1,1x,a)') 'kmesh_shell_fixed: Argument',abs(info),'of dgesvd is incorrect' + if (on_root) write(stdout,'(1x,a,1x,I1,1x,a)') 'kmesh_shell_fixed: Argument',abs(info),'of dgesvd is incorrect' call io_error('kmesh_shell_fixed: Problem with Singular Value Decomposition') else if (info>0) then call io_error('kmesh_shell_fixed: Singular Value Decomposition did not converge') @@ -1405,7 +1427,7 @@ subroutine kmesh_shell_from_file(multi,dnn,bweight) end do bweight(1:num_shells)=matmul(transpose(vmat),matmul(smat,matmul(transpose(umat),target))) - if(iprint>=2) then + if(iprint>=2.and.on_root) then do loop_s=1,num_shells 
write(stdout,'(1x,a,I2,a,f12.7,5x,a8,36x,a)') '| Shell: ',loop_s,& ' w_b ', bweight(loop_s)*lenconfac**2,'('//trim(length_unit)//'^2)','|' diff --git a/src/overlap.F90 b/src/overlap.F90 index 8e0e0eed8..ea7e43c9d 100644 --- a/src/overlap.F90 +++ b/src/overlap.F90 @@ -12,6 +12,7 @@ ! https://github.com/wannier-developers/wannier90 ! !------------------------------------------------------------! + module w90_overlap !! This module reads in the overlap (Mmn) and Projections (Amn) !! and performs simple operations on them. @@ -19,6 +20,7 @@ module w90_overlap use w90_constants, only : dp,cmplx_0,cmplx_1 use w90_parameters, only : disentanglement use w90_io, only : stdout + use w90_comms, only : on_root,comms_bcast implicit none @@ -83,35 +85,18 @@ subroutine overlap_read( ) endif - if (index(devel_flag,'f77input')>0) then - ! This block left for the short term as a means - ! to quickly benchmark against the old f77 code - ! Read U_matrix and M_matrix from file - open(20,file='wannier0.dat',form='formatted',status='unknown') - do i=1,num_wann - do j=1,num_wann - do nkp=1,num_kpts - read(20,*) u_matrix(i,j,nkp) - do nn=1,nntot - read(20,*) m_matrix(i,j,nn,nkp) - end do - end do - end do - end do - close(20) - - else + if(on_root) then ! Read M_matrix_orig from file mmn_in=io_file_unit() open(unit=mmn_in,file=trim(seedname)//'.mmn',& form='formatted',status='old',action='read',err=101) - write(stdout,'(/a)',advance='no') ' Reading overlaps from '//trim(seedname)//'.mmn : ' + if(on_root) write(stdout,'(/a)',advance='no') ' Reading overlaps from '//trim(seedname)//'.mmn : ' ! Read the comment line read(mmn_in,'(a)',err=103,end=103) dummy - write(stdout,'(a)') trim(dummy) + if(on_root) write(stdout,'(a)') trim(dummy) ! 
Read the number of bands, k-points and nearest neighbours read(mmn_in,*,err=103,end=103) nb_tmp,nkp_tmp,nntot_tmp @@ -153,7 +138,7 @@ subroutine overlap_read( ) endif end do if (nn.eq.0) then - write(stdout,'(/a,i8,2i5,i4,2x,3i3)') & + if(on_root) write(stdout,'(/a,i8,2i5,i4,2x,3i3)') & ' Error reading '//trim(seedname)//'.mmn:',ncount,nkp,nkp2,nn,nnl,nnm,nnn call io_error('Neighbour not found') end if @@ -166,21 +151,27 @@ subroutine overlap_read( ) end do deallocate(mmn_tmp,stat=ierr) if (ierr/=0) call io_error('Error in deallocating mmn_tmp in overlap_read') - close(mmn_in) + endif + + if(disentanglement) then + call comms_bcast(m_matrix_orig(1,1,1,1),num_bands*num_bands*nntot*num_kpts) + else + call comms_bcast(m_matrix(1,1,1,1),num_wann*num_wann*nntot*num_kpts) + endif - - if(.not. use_bloch_phases) then + if(.not. use_bloch_phases) then + if(on_root) then ! Read A_matrix from file wannier.amn amn_in=io_file_unit() open(unit=amn_in,file=trim(seedname)//'.amn',form='formatted',status='old',err=102) - write(stdout,'(/a)',advance='no') ' Reading projections from '//trim(seedname)//'.amn : ' + if(on_root) write(stdout,'(/a)',advance='no') ' Reading projections from '//trim(seedname)//'.amn : ' ! Read the comment line read(amn_in,'(a)',err=104,end=104) dummy - write(stdout,'(a)') trim(dummy) + if(on_root) write(stdout,'(a)') trim(dummy) ! Read the number of bands, k-points and wannier functions read(amn_in,*,err=104,end=104) nb_tmp, nkp_tmp, nw_tmp @@ -206,18 +197,24 @@ subroutine overlap_read( ) u_matrix(m,n,nkp) = cmplx(a_real,a_imag,kind=dp) end do end if - close(amn_in) - + endif + + if(disentanglement) then + call comms_bcast(a_matrix(1,1,1),num_bands*num_wann*num_kpts) else - - do n=1,num_kpts - do m=1,num_wann - u_matrix(m,m,n)=cmplx_1 - end do - end do + call comms_bcast(u_matrix(1,1,1),num_wann*num_wann*num_kpts) + endif - end if + else + + do n=1,num_kpts + do m=1,num_wann + u_matrix(m,m,n)=cmplx_1 + end do + end do + + end if ! 
If post-processing a Car-Parinello calculation (gamma only) ! then rotate M and A to the basis of Kohn-Sham eigenstates @@ -261,7 +258,6 @@ subroutine overlap_read( ) !~ end if ![ysl-e] - endif if (timing_level>0) call io_stopwatch('overlap: read',2) diff --git a/src/parameters.F90 b/src/parameters.F90 index 69453efce..23ab9e188 100644 --- a/src/parameters.F90 +++ b/src/parameters.F90 @@ -12,12 +12,14 @@ ! https://github.com/wannier-developers/wannier90 ! !------------------------------------------------------------! + module w90_parameters !! This module contains parameters to control the actions of wannier90. !! Also routines to read the parameters and write them out again. use w90_constants, only : dp use w90_io, only : stdout,maxlen + use w90_comms, only : on_root,num_nodes implicit none @@ -450,6 +452,8 @@ module w90_parameters public :: param_lib_set_atoms public :: param_memory_estimate public :: param_get_smearing_type + public :: param_dist + public :: param_chkpt_dist contains @@ -487,6 +491,10 @@ subroutine param_read ( ) ! default value is symmetrize_eps=0.001 call param_get_keyword('symmetrize_eps',found,r_value=symmetrize_eps)!YN: +!jry if (lsitesymmetry.and.num_nodes>1) then +!jry call io_error('Error: site symmetry can not be used in parallel mode') +!jry end if + !%%%%%%%%%%%%%%%% ! Transport @@ -571,7 +579,7 @@ subroutine param_read ( ) ! AAM_2016-09-16: some changes to logic to patch a problem with uninitialised num_bands in library mode ! num_bands = -1 call param_get_keyword('num_bands',found,i_value=i_temp) - if(found.and.library) write(stdout,'(/a)') ' Ignoring in input file' + if(found.and.library.and.on_root) write(stdout,'(/a)') ' Ignoring in input file' if (.not. library .and. .not.effective_model) then if(found) num_bands=i_temp if(.not.found) num_bands=num_wann @@ -596,7 +604,7 @@ subroutine param_read ( ) ! 
mp_grid=-99 call param_get_keyword_vector('mp_grid',found,3,i_value=iv_temp) - if(found.and.library) write(stdout,'(a)') ' Ignoring in input file' + if(found.and.library.and.on_root) write(stdout,'(a)') ' Ignoring in input file' if(.not.library .and. .not.effective_model) then if(found) mp_grid=iv_temp if (.not. found) then @@ -615,7 +623,7 @@ subroutine param_read ( ) if ( gamma_only .and. (num_kpts.ne.1) ) & call io_error('Error: gamma_only is true, but num_kpts > 1') else - if (found) write(stdout,'(a)') ' Ignoring in input file' + if (found.and.on_root) write(stdout,'(a)') ' Ignoring in input file' endif ![ysl-e] @@ -662,7 +670,7 @@ subroutine param_read ( ) if (.not.library) then spinors=ltmp else - if (found) write(stdout,'(a)') ' Ignoring in input file' + if (found.and.on_root) write(stdout,'(a)') ' Ignoring in input file' endif ! if(spinors .and. (2*(num_wann/2))/=num_wann) & ! call io_error('Error: For spinor WF num_wann must be even') @@ -1383,7 +1391,7 @@ subroutine param_read ( ) do k=1,num_kpts do n=1,num_bands read(eig_unit,*,err=106,end=106) i,j,eigval(n,k) - if ((i.ne.n).or.(j.ne.k)) then + if ((((i.ne.n).or.(j.ne.k))).and.on_root) then write(stdout,'(a)') 'Found a mismatch in '//trim(seedname)//'.eig' write(stdout,'(a,i0,a,i0)') 'Wanted band : ',n,' found band : ',i write(stdout,'(a,i0,a,i0)') 'Wanted kpoint: ',k,' found kpoint: ',j @@ -1765,7 +1773,7 @@ subroutine param_read ( ) call param_get_keyword('skip_b1_tests', found, l_value=skip_B1_tests) call param_get_keyword_block('unit_cell_cart',found,3,3,r_value=real_lattice_tmp) - if(found.and.library) write(stdout,'(a)') ' Ignoring in input file' + if(found.and.library.and.on_root) write(stdout,'(a)') ' Ignoring in input file' if (.not. library) then real_lattice=transpose(real_lattice_tmp) if(.not. 
found) call io_error('Error: Did not find the cell information in the input file') @@ -1783,7 +1791,7 @@ subroutine param_read ( ) end if call param_get_keyword_block('kpoints',found,num_kpts,3,r_value=kpt_cart) - if(found.and.library) write(stdout,'(a)') ' Ignoring in input file' + if(found.and.library.and.on_root) write(stdout,'(a)') ' Ignoring in input file' if (.not. library .and. .not.effective_model) then kpt_latt=kpt_cart if(.not. found) call io_error('Error: Did not find the kpoint information in the input file') @@ -1898,9 +1906,9 @@ subroutine param_read ( ) ! Atoms if (.not.library) num_atoms=0 call param_get_block_length('atoms_frac',found,i_temp) - if (found.and.library) write(stdout,'(a)') ' Ignoring in input file' + if (found.and.library.and.on_root) write(stdout,'(a)') ' Ignoring in input file' call param_get_block_length('atoms_cart',found2,i_temp2,lunits) - if (found2.and.library) write(stdout,'(a)') ' Ignoring in input file' + if (found2.and.library.and.on_root) write(stdout,'(a)') ' Ignoring in input file' if (.not.library) then if (found.and.found2) call io_error('Error: Cannot specify both atoms_frac and atoms_cart') if (found .and. i_temp>0) then @@ -1925,7 +1933,7 @@ subroutine param_read ( ) 302 continue - if ( any(len_trim(in_data(:))>0 )) then + if ( any(len_trim(in_data(:))>0 ).and.on_root) then write(stdout,'(1x,a)') 'The following section of file '//trim(seedname)//'.win contained unrecognised keywords' write(stdout,*) do loop=1,num_lines @@ -3308,14 +3316,14 @@ subroutine param_read_chkpt() real(kind=dp) :: tmp_latt(3,3), tmp_kpt_latt(3,num_kpts) integer :: tmp_excl_bands(1:num_exclude_bands),tmp_mp_grid(1:3) - write(stdout,'(1x,3a)') 'Reading restart information from file ',trim(seedname),'.chk :' + if (on_root) write(stdout,'(1x,3a)') 'Reading restart information from file ',trim(seedname),'.chk :' chk_unit=io_file_unit() open(unit=chk_unit,file=trim(seedname)//'.chk',status='old',form='unformatted',err=121) ! 
Read comment line read(chk_unit) header - write(stdout,'(1x,a)',advance='no') trim(header) + if (on_root) write(stdout,'(1x,a)',advance='no') trim(header) ! Consistency checks read(chk_unit) ntmp ! Number of bands @@ -3419,7 +3427,7 @@ subroutine param_read_chkpt() close(chk_unit) - write(stdout,'(a/)') ' ... done' + if (on_root) write(stdout,'(a/)') ' ... done' return @@ -3439,6 +3447,74 @@ subroutine param_read_chkpt() end subroutine param_read_chkpt + !===========================================================! + subroutine param_chkpt_dist + !===========================================================! + ! ! + !! Distribute the chk files + ! ! + !===========================================================! + + use w90_constants, only : dp,cmplx_0,cmplx_i,twopi + use w90_io, only : io_error,io_file_unit,& + io_date,io_time,io_stopwatch + use w90_comms, only : on_root,comms_bcast + + implicit none + + integer :: ierr,loop_kpt,m,i,j + + call comms_bcast(checkpoint,len(checkpoint)) + + if (.not.on_root .and. .not.allocated(u_matrix)) then + allocate(u_matrix(num_wann,num_wann,num_kpts),stat=ierr) + if (ierr/=0)& + call io_error('Error allocating u_matrix in param_chkpt_dist') + endif + call comms_bcast(u_matrix(1,1,1),num_wann*num_wann*num_kpts) + + if (.not.on_root .and. 
.not.allocated(m_matrix)) then + allocate(m_matrix(num_wann,num_wann,nntot,num_kpts),stat=ierr) + if (ierr/=0)& + call io_error('Error allocating m_matrix in param_chkpt_dist') + endif + call comms_bcast(m_matrix(1,1,1,1),num_wann*num_wann*nntot*num_kpts) + + call comms_bcast(have_disentangled,1) + + if (have_disentangled) then + if(.not.on_root) then + + if (.not.allocated(u_matrix_opt)) then + allocate(u_matrix_opt(num_bands,num_wann,num_kpts),stat=ierr) + if (ierr/=0)& + call io_error('Error allocating u_matrix_opt in param_chkpt_dist') + endif + + if (.not.allocated(lwindow)) then + allocate(lwindow(num_bands,num_kpts),stat=ierr) + if (ierr/=0)& + call io_error('Error allocating lwindow in param_chkpt_dist') + endif + + if (.not.allocated(ndimwin)) then + allocate(ndimwin(num_kpts),stat=ierr) + if (ierr/=0)& + call io_error('Error allocating ndimwin in param_chkpt_dist') + endif + + end if + + call comms_bcast(u_matrix_opt(1,1,1),num_bands*num_wann*num_kpts) + call comms_bcast(lwindow(1,1),num_bands*num_kpts) + call comms_bcast(ndimwin(1),num_kpts) + call comms_bcast(omega_invariant,1) + end if + call comms_bcast(wannier_centres(1,1),3*num_wann) + call comms_bcast(wannier_spreads(1),num_wann) + + end subroutine param_chkpt_dist + !=======================================! subroutine param_in_file !=======================================! 
@@ -5280,37 +5356,37 @@ subroutine param_memory_estimate if(disentanglement) & mem_wan= mem_wan+ num_wann*num_wann*nntot*num_kpts*size_cmplx !m_matrix - write(stdout,'(1x,a)') '*============================================================================*' - write(stdout,'(1x,a)') '| MEMORY ESTIMATE |' - write(stdout,'(1x,a)') '| Maximum RAM allocated during each phase of the calculation |' - write(stdout,'(1x,a)') '*============================================================================*' - if(disentanglement) & - write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Disentanglement:',(mem_param+mem_dis)/(1024**2),' Mb' - write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Wannierise:',(mem_param+mem_wan)/(1024**2),' Mb' - if(optimisation>0 .and. iprint>1 ) then - write(stdout,'(1x,a)') '| |' - write(stdout,'(1x,a)') '| N.B. by setting optimisation=0 memory usage will be reduced to: |' - if (disentanglement) & - write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Disentanglement:',(mem_param+mem_dis- & - max(mem_dis1,mem_dis2)+mem_dis1)/(1024**2),' Mb' - if(gamma_only) then - write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Wannierise:',(mem_param+mem_wan)/(1024**2),' Mb' - else - write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Wannierise:',(mem_param+mem_wan-mem_wan1)/(1024**2),' Mb' - end if - write(stdout,'(1x,a)') '| However, this will result in more i/o and slow down the calculation |' - endif - - write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'plot_wannier:',(mem_param+mem_wan)/(1024**2),' Mb' - - if (ispostw90) then - if (boltzwann) & - write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'BoltzWann:',(mem_param+mem_bw)/(1024**2),' Mb' - end if + if (on_root) then + write(stdout,'(1x,a)') '*============================================================================*' + write(stdout,'(1x,a)') '| MEMORY ESTIMATE |' + write(stdout,'(1x,a)') '| Maximum RAM allocated during each phase of the calculation |' + write(stdout,'(1x,a)') 
'*============================================================================*' + if(disentanglement) & + write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Disentanglement:',(mem_param+mem_dis)/(1024**2),' Mb' + write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Wannierise:',(mem_param+mem_wan)/(1024**2),' Mb' + if(optimisation>0 .and. iprint>1 ) then + write(stdout,'(1x,a)') '| |' + write(stdout,'(1x,a)') '| N.B. by setting optimisation=0 memory usage will be reduced to: |' + if (disentanglement) & + write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Disentanglement:',(mem_param+mem_dis- & + max(mem_dis1,mem_dis2)+mem_dis1)/(1024**2),' Mb' + if(gamma_only) then + write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Wannierise:',(mem_param+mem_wan)/(1024**2),' Mb' + else + write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Wannierise:',(mem_param+mem_wan-mem_wan1)/(1024**2),' Mb' + end if + write(stdout,'(1x,a)') '| However, this will result in more i/o and slow down the calculation |' + endif - write(stdout,'(1x,a)') '*----------------------------------------------------------------------------*' - write(stdout,*) ' ' + if (ispostw90) then + if (boltzwann) & + write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'BoltzWann:',(mem_param+mem_bw)/(1024**2),' Mb' + end if + write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'plot_wannier:',(mem_param+mem_wan)/(1024**2),' Mb' + write(stdout,'(1x,a)') '*----------------------------------------------------------------------------*' + write(stdout,*) ' ' + endif ! if(disentanglement) then ! write(*,'(a12,f12.4,a)') 'Disentangle',(mem_param+mem_dis)/(1024**2),' Mb' @@ -5322,4 +5398,369 @@ subroutine param_memory_estimate end subroutine param_memory_estimate + !===========================================================! + subroutine param_dist + !===========================================================! + ! ! + !! distribute the parameters across processors ! + ! ! + !===========================================================! 
+ + use w90_constants, only : dp,cmplx_0,cmplx_i,twopi + use w90_io, only : io_error,io_file_unit,io_date,io_time,& + io_stopwatch + use w90_comms, only : comms_bcast + + integer :: ierr + + call comms_bcast(effective_model,1) + call comms_bcast(eig_found,1) + call comms_bcast(postproc_setup,1) + if(.not.effective_model) then + call comms_bcast(mp_grid(1),3) + call comms_bcast(num_kpts,1) + call comms_bcast(num_bands,1) + endif + call comms_bcast(num_wann,1) + call comms_bcast(timing_level,1) + call comms_bcast(iprint,1) + call comms_bcast(energy_unit,1) + call comms_bcast(length_unit,1) + call comms_bcast(wvfn_formatted,1) + call comms_bcast(spn_formatted,1) + call comms_bcast(berry_uHu_formatted,1) + call comms_bcast(spin,1) + call comms_bcast(num_dump_cycles,1) + call comms_bcast(num_print_cycles,1) + call comms_bcast(num_atoms,1) ! Ivo: not used in postw90, right? + call comms_bcast(num_species,1) ! Ivo: not used in postw90, right? + call comms_bcast(real_lattice(1,1),9) + call comms_bcast(recip_lattice(1,1),9) + call comms_bcast(real_metric(1,1),9) + call comms_bcast(recip_metric(1,1),9) + call comms_bcast(cell_volume,1) + call comms_bcast(dos_energy_step,1) + call comms_bcast(dos_adpt_smr,1) + call comms_bcast(dos_smr_index,1) + call comms_bcast(dos_kmesh_spacing,1) + call comms_bcast(dos_kmesh(1),3) + call comms_bcast(dos_adpt_smr_max,1) + call comms_bcast(dos_smr_fixed_en_width,1) + call comms_bcast(dos_adpt_smr_fac,1) + call comms_bcast(num_dos_project,1) + call comms_bcast(num_exclude_bands,1) + if(num_exclude_bands>0) then + if(.not.on_root) then + allocate(exclude_bands(num_exclude_bands), stat=ierr ) + if (ierr/=0) & + call io_error('Error in allocating exclude_bands in param_dist') + endif + call comms_bcast(exclude_bands(1),num_exclude_bands) + end if + + call comms_bcast(gamma_only,1) + call comms_bcast(dis_win_min,1) + call comms_bcast(dis_win_max,1) + call comms_bcast(dis_froz_min,1) + call comms_bcast(dis_froz_max,1) + call 
comms_bcast(dis_num_iter,1) + call comms_bcast(dis_mix_ratio,1) + call comms_bcast(dis_conv_tol,1) + call comms_bcast(dis_conv_window,1) + call comms_bcast(dis_spheres_first_wann,1) + call comms_bcast(dis_spheres_num,1) + if(dis_spheres_num>0) then + if(.not.on_root) then + allocate(dis_spheres(4,dis_spheres_num), stat=ierr ) + if (ierr/=0) & + call io_error('Error in allocating dis_spheres in param_dist') + endif + call comms_bcast(dis_spheres(1,1),4*dis_spheres_num) + end if + call comms_bcast(num_iter,1) + call comms_bcast(num_cg_steps,1) + call comms_bcast(conv_tol,1) + call comms_bcast(conv_window,1) + call comms_bcast(wannier_plot,1) + call comms_bcast(num_wannier_plot,1) + if(num_wannier_plot>0) then + if(.not.on_root) then + allocate(wannier_plot_list(num_wannier_plot), stat=ierr ) + if (ierr/=0) & + call io_error('Error in allocating wannier_plot_list in param_dist') + endif + call comms_bcast(wannier_plot_list(1),num_wannier_plot) + end if + call comms_bcast(wannier_plot_supercell(1),3) + call comms_bcast(wannier_plot_format,len(wannier_plot_format)) + call comms_bcast(wannier_plot_mode,len(wannier_plot_mode)) + call comms_bcast(write_u_matrices,1) + call comms_bcast(bands_plot,1) + call comms_bcast(bands_num_points,1) + call comms_bcast(bands_plot_format,len(bands_plot_format)) + call comms_bcast(bands_plot_mode,len(bands_plot_mode)) + call comms_bcast(num_bands_project,1) + + if(num_bands_project>0) then + if(.not.on_root) then + allocate(bands_plot_project(num_bands_project), stat=ierr ) + if (ierr/=0) & + call io_error('Error in allocating bands_plot_project in param_dist') + endif + call comms_bcast(bands_plot_project(1),num_bands_project) + end if + call comms_bcast(bands_plot_dim,1) + call comms_bcast(write_hr,1) + call comms_bcast(write_rmn,1) + call comms_bcast(write_tb,1) + call comms_bcast(hr_cutoff,1) + call comms_bcast(dist_cutoff,1) + call comms_bcast(dist_cutoff_mode,len(dist_cutoff_mode)) + call comms_bcast(dist_cutoff_hc,1) + call 
comms_bcast(one_dim_axis,len(one_dim_axis)) + call comms_bcast(use_ws_distance,1) +! call comms_bcast(ws_distance_tol,1) + call comms_bcast(fermi_surface_plot,1) + call comms_bcast(fermi_surface_num_points,1) + call comms_bcast(fermi_surface_plot_format,len(fermi_surface_plot_format)) + call comms_bcast(fermi_energy,1) !! used? + call comms_bcast(berry,1) + call comms_bcast(berry_task,len(berry_task)) + call comms_bcast(berry_kmesh_spacing,1) + call comms_bcast(berry_kmesh(1),3) + call comms_bcast(berry_curv_adpt_kmesh,1) + call comms_bcast(berry_curv_adpt_kmesh_thresh,1) + call comms_bcast(berry_curv_unit,len(berry_curv_unit)) + call comms_bcast(kubo_adpt_smr,1) + call comms_bcast(kubo_adpt_smr_fac,1) + call comms_bcast(kubo_adpt_smr_max,1) + call comms_bcast(kubo_smr_fixed_en_width,1) + call comms_bcast(kubo_smr_index,1) + call comms_bcast(kubo_eigval_max,1) + call comms_bcast(kubo_nfreq,1) + call comms_bcast(nfermi,1) + call comms_bcast(dos_energy_min,1) + call comms_bcast(dos_energy_max,1) + call comms_bcast(spin_kmesh_spacing,1) + call comms_bcast(spin_kmesh(1),3) + call comms_bcast(wanint_kpoint_file,1) + + call comms_bcast(devel_flag,len(devel_flag)) + call comms_bcast(spin_moment,1) + call comms_bcast(spin_axis_polar,1) + call comms_bcast(spin_axis_azimuth,1) + call comms_bcast(spin_decomp,1) + call comms_bcast(use_degen_pert,1) + call comms_bcast(degen_thr,1) + call comms_bcast(num_valence_bands,1) + call comms_bcast(dos,1) + call comms_bcast(dos_task,len(dos_task)) + call comms_bcast(kpath,1) + call comms_bcast(kpath_task,len(kpath_task)) + call comms_bcast(kpath_bands_colour,len(kpath_bands_colour)) + call comms_bcast(kslice,1) + call comms_bcast(kslice_task,len(kslice_task)) + call comms_bcast(transl_inv,1) + call comms_bcast(num_elec_per_state,1) + call comms_bcast(scissors_shift,1) + ! + +! 
---------------------------------------------- + call comms_bcast(geninterp,1) + call comms_bcast(geninterp_alsofirstder,1) + call comms_bcast(geninterp_single_file,1) + ! [gp-begin, Apr 12, 2012] + ! BoltzWann variables + call comms_bcast(boltzwann,1) + call comms_bcast(boltz_calc_also_dos,1) + call comms_bcast(boltz_2d_dir_num,1) + call comms_bcast(boltz_dos_energy_step,1) + call comms_bcast(boltz_dos_energy_min,1) + call comms_bcast(boltz_dos_energy_max,1) + call comms_bcast(boltz_dos_adpt_smr,1) + call comms_bcast(boltz_dos_smr_fixed_en_width,1) + call comms_bcast(boltz_dos_adpt_smr_fac,1) + call comms_bcast(boltz_dos_adpt_smr_max,1) + call comms_bcast(boltz_mu_min,1) + call comms_bcast(boltz_mu_max,1) + call comms_bcast(boltz_mu_step,1) + call comms_bcast(boltz_temp_min,1) + call comms_bcast(boltz_temp_max,1) + call comms_bcast(boltz_temp_step,1) + call comms_bcast(boltz_kmesh_spacing,1) + call comms_bcast(boltz_kmesh(1),3) + call comms_bcast(boltz_tdf_energy_step,1) + call comms_bcast(boltz_relax_time,1) + call comms_bcast(boltz_TDF_smr_fixed_en_width,1) + call comms_bcast(boltz_TDF_smr_index,1) + call comms_bcast(boltz_dos_smr_index,1) + call comms_bcast(boltz_bandshift,1) + call comms_bcast(boltz_bandshift_firstband,1) + call comms_bcast(boltz_bandshift_energyshift,1) + ! 
[gp-end] + call comms_bcast(use_ws_distance,1) + call comms_bcast(disentanglement,1) + + + call comms_bcast(transport,1) + call comms_bcast(tran_easy_fix,1) + call comms_bcast(transport_mode,len(transport_mode)) + call comms_bcast(tran_win_min,1) + call comms_bcast(tran_win_max,1) + call comms_bcast(tran_energy_step,1) + call comms_bcast(tran_num_bb,1) + call comms_bcast(tran_num_ll,1) + call comms_bcast(tran_num_rr,1) + call comms_bcast(tran_num_cc,1) + call comms_bcast(tran_num_lc,1) + call comms_bcast(tran_num_cr,1) + call comms_bcast(tran_num_bandc,1) + call comms_bcast(tran_write_ht,1) + call comms_bcast(tran_read_ht ,1) + call comms_bcast(tran_use_same_lead,1) + call comms_bcast(tran_num_cell_ll,1) + call comms_bcast(tran_num_cell_rr,1) + call comms_bcast(tran_group_threshold,1) + call comms_bcast(translation_centre_frac(1),3) + call comms_bcast(num_shells,1) + call comms_bcast(skip_B1_tests,1) + call comms_bcast(explicit_nnkpts,1) + + + call comms_bcast(calc_only_A,1) + call comms_bcast(use_bloch_phases,1) + call comms_bcast(restart,len(restart)) + call comms_bcast(write_r2mn,1) + call comms_bcast(num_guide_cycles,1) + call comms_bcast(num_no_guide_iter,1) + call comms_bcast(fixed_step,1) + call comms_bcast(trial_step,1) + call comms_bcast(precond,1) + call comms_bcast(write_proj,1) + call comms_bcast(timing_level,1) + call comms_bcast(spinors,1) + call comms_bcast(num_elec_per_state,1) + call comms_bcast(translate_home_cell,1) + call comms_bcast(write_xyz,1) + call comms_bcast(write_hr_diag,1) + call comms_bcast(conv_noise_amp,1) + call comms_bcast(conv_noise_num,1) + call comms_bcast(wannier_plot_radius,1) + call comms_bcast(kmesh_tol,1) + call comms_bcast(optimisation,1) + call comms_bcast(write_vdw_data,1) + call comms_bcast(lenconfac,1) + call comms_bcast(lfixstep,1) + call comms_bcast(lsitesymmetry,1) + call comms_bcast(frozen_states,1) + + call comms_bcast(num_proj,1) + if(num_proj>0) then + if(.not.on_root) then + allocate( 
proj_site(3,num_proj),stat=ierr) + if (ierr/=0) call io_error('Error allocating proj_site in param_dist') + endif + call comms_bcast(proj_site(1,1),3*num_proj) + endif + + + ! These variables are different from the ones above in that they are + ! allocatable, and in param_read they were allocated on the root node only + ! + if(.not.on_root) then + allocate(fermi_energy_list(nfermi),stat=ierr) + if (ierr/=0) call io_error(& + 'Error allocating fermi_energy_read in postw90_param_dist') + allocate(kubo_freq_list(kubo_nfreq),stat=ierr) + if (ierr/=0) call io_error(& + 'Error allocating kubo_freq_list in postw90_param_dist') + allocate(dos_project(num_dos_project),stat=ierr) + if (ierr/=0)& + call io_error('Error allocating dos_project in postw90_param_dist') + if(.not.effective_model) then + if (eig_found) then + allocate(eigval(num_bands,num_kpts),stat=ierr) + if (ierr/=0)& + call io_error('Error allocating eigval in postw90_param_dist') + end if + allocate(kpt_latt(3,num_kpts),stat=ierr) + if (ierr/=0)& + call io_error('Error allocating kpt_latt in postw90_param_dist') + endif + end if + if(nfermi>0) call comms_bcast(fermi_energy_list(1),nfermi) + if(kubo_nfreq>0) call comms_bcast(kubo_freq_list(1),kubo_nfreq) + if(num_dos_project>0) call comms_bcast(dos_project(1),num_dos_project) + if(.not.effective_model) then + if (eig_found) then + call comms_bcast(eigval(1,1),num_bands*num_kpts) + end if + call comms_bcast(kpt_latt(1,1),3*num_kpts) + endif + + + + if(.not.effective_model.and..not.explicit_nnkpts) then + + call comms_bcast(nnh,1) + call comms_bcast(nntot,1) + call comms_bcast(wbtot,1) + + if(.not. 
on_root) then + allocate(nnlist(num_kpts,nntot), stat=ierr ) + if (ierr/=0)& + call io_error('Error in allocating nnlist in param_dist') + allocate(neigh(num_kpts,nntot/2), stat=ierr ) + if (ierr/=0)& + call io_error('Error in allocating neigh in param_dist') + allocate(nncell(3,num_kpts,nntot), stat=ierr ) + if (ierr/=0)& + call io_error('Error in allocating nncell in param_dist') + allocate(wb(nntot), stat=ierr ) + if (ierr/=0)& + call io_error('Error in allocating wb in param_dist') + allocate(bka(3,nntot/2), stat=ierr ) + if (ierr/=0)& + call io_error('Error in allocating bka in param_dist') + allocate(bk(3,nntot,num_kpts), stat=ierr ) + if (ierr/=0)& + call io_error('Error in allocating bk in param_dist') + end if + + call comms_bcast(nnlist(1,1),num_kpts*nntot) + call comms_bcast(neigh(1,1),num_kpts*nntot/2) + call comms_bcast(nncell(1,1,1),3*num_kpts*nntot) + call comms_bcast(wb(1),nntot) + call comms_bcast(bka(1,1),3*nntot/2) + call comms_bcast(bk(1,1,1),3*nntot*num_kpts) + + endif + + call comms_bcast(omega_total,1) + call comms_bcast(omega_tilde,1) + call comms_bcast(omega_invariant,1) + call comms_bcast(have_disentangled,1) + + if(.not.on_root) then + allocate(wannier_centres(3,num_wann),stat=ierr) + if (ierr/=0) call io_error('Error allocating wannier_centres in param_dist') + wannier_centres=0.0_dp + allocate(wannier_spreads(num_wann),stat=ierr) + if (ierr/=0) call io_error('Error in allocating wannier_spreads in param_dist') + wannier_spreads=0.0_dp + if (disentanglement) then + allocate(ndimwin(num_kpts),stat=ierr) + if (ierr/=0) call io_error('Error allocating ndimwin in param_dist') + allocate(lwindow(num_bands,num_kpts),stat=ierr) + if (ierr/=0) call io_error('Error allocating lwindow in param_dist') + endif + endif + + + + + end subroutine param_dist + + + end module w90_parameters diff --git a/src/postw90/comms.F90 b/src/postw90/comms.F90 index a9cd4975f..513fcf42b 100644 --- a/src/postw90/comms.F90 +++ b/src/postw90/comms.F90 @@ -18,7 +18,6 @@ 
! ! !------------------------------------------------------------! - module w90_comms !! This module handles all of the communications @@ -99,7 +98,7 @@ module w90_comms interface comms_gatherv ! module procedure comms_gatherv_int ! to be done module procedure comms_gatherv_real -! module procedure comms_gatherv_cmplx + module procedure comms_gatherv_cmplx end interface comms_gatherv interface comms_scatterv @@ -846,6 +845,47 @@ subroutine comms_gatherv_real(array,localcount,rootglobalarray,counts,displs) end subroutine comms_gatherv_real + + ! Array: local array for sending data; localcount elements will be sent + ! to the root node + ! rootglobalarray: array on the root node to which data will be sent + ! counts, displs : how data should be partitioned, see MPI documentation or + ! function comms_array_split + subroutine comms_gatherv_cmplx(array,localcount,rootglobalarray,counts,displs) + !! Gather complex data to root node (plain zcopy of localcount elements in non-MPI builds) + implicit none + + complex(kind=dp), intent(inout) :: array + integer, intent(in) :: localcount + complex(kind=dp), intent(inout) :: rootglobalarray + integer, dimension(num_nodes), intent(in) :: counts + integer, dimension(num_nodes), intent(in) :: displs + +#ifdef MPI + integer :: error + + call MPI_gatherv(array,localcount,MPI_double_complex,rootglobalarray,counts,& + displs,MPI_double_complex,root_id,mpi_comm_world,error) + + if(error.ne.MPI_success) then + call io_error('Error in comms_gatherv_cmplx') + end if + +#else + call zcopy(localcount,array,1,rootglobalarray,1) +#endif + + return + + end subroutine comms_gatherv_cmplx + + + ! Array: local array for getting data; localcount elements will be fetched + ! from the root node + ! rootglobalarray: array on the root node from which data will be sent + ! counts, displs : how data should be partitioned, see MPI documentation or + ! function comms_array_split + subroutine comms_scatterv_real(array,localcount,rootglobalarray,counts,displs) !! 
Scatter data from root node implicit none diff --git a/src/postw90/postw90_common.F90 b/src/postw90/postw90_common.F90 index 21557c6d5..720f7c8b9 100644 --- a/src/postw90/postw90_common.F90 +++ b/src/postw90/postw90_common.F90 @@ -372,9 +372,7 @@ subroutine pw90common_wanint_param_dist call io_error('Error allocating kpt_latt in postw90_param_dist') endif end if - if (nfermi /= 0) then - call comms_bcast(fermi_energy_list(1),nfermi) - end if + if(nfermi>0) call comms_bcast(fermi_energy_list(1),nfermi) call comms_bcast(kubo_freq_list(1),kubo_nfreq) call comms_bcast(dos_project(1),num_dos_project) if(.not.effective_model) then diff --git a/src/wannier_prog.F90 b/src/wannier_prog.F90 index 4bcd948ba..4885ad034 100644 --- a/src/wannier_prog.F90 +++ b/src/wannier_prog.F90 @@ -15,6 +15,7 @@ ! functions", ! ! Computer Physics Communications 185, 2309 (2014), ! ! http://dx.doi.org/10.1016/j.cpc.2014.05.003 ! + ! ! ! in any publications arising from the use of this code. ! ! ! @@ -64,103 +65,138 @@ program wannier use w90_wannierise use w90_plot use w90_transport + use w90_comms, only : on_root,num_nodes, comms_setup, comms_end, comms_bcast, my_node_id use w90_sitesym !YN: + implicit none real(kind=dp) time0,time1,time2 character(len=9) :: stat,pos,cdate,ctime logical :: wout_found + integer :: len_seedname - time0=io_time() + call comms_setup library = .false. - call io_get_seedname() + time0=io_time() + + if (on_root) then + call io_get_seedname() + len_seedname = len(seedname) + end if + call comms_bcast(len_seedname,1) + call comms_bcast(seedname,len_seedname) - stdout=io_file_unit() - open(unit=stdout,file=trim(seedname)//'.werr') - call io_date(cdate,ctime) - write(stdout,*) 'Wannier90: Execution started on ',cdate,' at ',ctime - call param_read() - close(stdout,status='delete') - if (restart.eq.' 
') then - stat='replace' - pos ='rewind' - else - inquire(file=trim(seedname)//'.wout',exist=wout_found) - if (wout_found) then - stat='old' - else + + if(on_root) then + stdout=io_file_unit() + open(unit=stdout,file=trim(seedname)//'.werr') + call io_date(cdate,ctime) + write(stdout,*) 'Wannier90: Execution started on ',cdate,' at ',ctime + + call param_read + close(stdout,status='delete') + + if (restart.eq.' ') then stat='replace' + pos ='rewind' + else + inquire(file=trim(seedname)//'.wout',exist=wout_found) + if (wout_found) then + stat='old' + else + stat='replace' + endif + pos='append' endif - pos='append' - endif - - stdout=io_file_unit() - open(unit=stdout,file=trim(seedname)//'.wout',status=trim(stat),position=trim(pos)) - call param_write_header() - call param_write() - time1=io_time() - write(stdout,'(1x,a25,f11.3,a)') 'Time to read parameters ',time1-time0,' (sec)' + stdout=io_file_unit() + open(unit=stdout,file=trim(seedname)//'.wout',status=trim(stat),position=trim(pos)) + call param_write_header() + if(num_nodes==1) then +#ifdef MPI + write(stdout,'(/,1x,a)') 'Running in serial (with parallel executable)' +#else + write(stdout,'(/,1x,a)') 'Running in serial (with serial executable)' +#endif + else + write(stdout,'(/,1x,a,i3,a/)')& + 'Running in parallel on ',num_nodes,' CPUs' + endif + call param_write() + + time1=io_time() + write(stdout,'(1x,a25,f11.3,a)') 'Time to read parameters ',time1-time0,' (sec)' + + + if (.not. explicit_nnkpts) call kmesh_get + time2=io_time() + write(stdout,'(1x,a25,f11.3,a)')& + 'Time to get kmesh ',time2-time1,' (sec)' + + call param_memory_estimate + end if + + ! We now distribute the parameters to the other nodes + call param_dist + if(gamma_only.and.num_nodes>1) & + call io_error('Gamma point branch is serial only at the moment') if (transport .and. tran_read_ht) goto 3003 - if (.not. explicit_nnkpts) call kmesh_get() - call param_memory_estimate() - ! Sort out restarts if (restart.eq.' ') then ! 
start a fresh calculation - write(stdout,'(1x,a/)') 'Starting a new Wannier90 calculation ...' + if (on_root) write(stdout,'(1x,a/)') 'Starting a new Wannier90 calculation ...' else ! restart a previous calculation - call param_read_chkpt() -!~ call param_read_um + if(on_root) call param_read_chkpt() + call param_chkpt_dist + select case (restart) case ('default') ! continue from where last checkpoint was written - write(stdout,'(/1x,a)',advance='no') 'Resuming a previous Wannier90 calculation ' + if (on_root) write(stdout,'(/1x,a)',advance='no') 'Resuming a previous Wannier90 calculation ' if (checkpoint.eq.'postdis') then - write(stdout,'(a/)') 'from wannierisation ...' + if (on_root) write(stdout,'(a/)') 'from wannierisation ...' goto 1001 ! go to wann_main elseif (checkpoint.eq.'postwann') then - write(stdout,'(a/)') 'from plotting ...' + if (on_root) write(stdout,'(a/)') 'from plotting ...' goto 2002 ! go to plot_main else - write(stdout,'(/a/)') + if (on_root) write(stdout,'(/a/)') call io_error('Value of checkpoint not recognised in wann_prog') endif case ('wannierise') ! continue from wann_main irrespective of value of last checkpoint - write(stdout,'(1x,a/)') 'Restarting Wannier90 from wannierisation ...' + if (on_root) write(stdout,'(1x,a/)') 'Restarting Wannier90 from wannierisation ...' goto 1001 case ('plot') ! continue from plot_main irrespective of value of last checkpoint - write(stdout,'(1x,a/)') 'Restarting Wannier90 from plotting routines ...' + if (on_root) write(stdout,'(1x,a/)') 'Restarting Wannier90 from plotting routines ...' goto 2002 case ('transport') ! continue from tran_main irrespective of value of last checkpoint - write(stdout,'(1x,a/)') 'Restarting Wannier90 from transport routines ...' + if (on_root) write(stdout,'(1x,a/)') 'Restarting Wannier90 from transport routines ...' goto 3003 case default ! for completeness... 
(it is already trapped in param_read) call io_error('Value of restart not recognised in wann_prog') end select endif + if (postproc_setup) then - call kmesh_write() + if(on_root) call kmesh_write() call kmesh_dealloc() call param_dealloc() - write(stdout,'(1x,a25,f11.3,a)') 'Time to write kmesh ',io_time(),' (sec)' - write(stdout,'(/a)') ' Exiting... '//trim(seedname)//'.nnkp written.' + if (on_root) write(stdout,'(1x,a25,f11.3,a)') 'Time to write kmesh ',io_time(),' (sec)' + if (on_root) write(stdout,'(/a)') ' Exiting... '//trim(seedname)//'.nnkp written.' + call comms_end stop endif - time2=io_time() - write(stdout,'(1x,a25,f11.3,a)') 'Time to get kmesh ',time2-time1,' (sec)' - - if (lsitesymmetry) call sitesym_read() !YN: + if (lsitesymmetry) call sitesym_read() ! update this to read on root and bcast - JRY call overlap_read() time1=io_time() - write(stdout,'(/1x,a25,f11.3,a)') 'Time to read overlaps ',time1-time2,' (sec)' + if (on_root) write(stdout,'(/1x,a25,f11.3,a)') 'Time to read overlaps ',time1-time2,' (sec)' have_disentangled = .false. @@ -168,10 +204,10 @@ program wannier call dis_main() have_disentangled=.true. time2=io_time() - write(stdout,'(1x,a25,f11.3,a)') 'Time to disentangle bands',time2-time1,' (sec)' + if(on_root) write(stdout,'(1x,a25,f11.3,a)') 'Time to disentangle bands',time2-time1,' (sec)' endif - call param_write_chkpt('postdis') + if (on_root) call param_write_chkpt('postdis') !~ call param_write_um 1001 time2=io_time() @@ -183,26 +219,30 @@ program wannier end if time1=io_time() - write(stdout,'(1x,a25,f11.3,a)') 'Time for wannierise ',time1-time2,' (sec)' - - call param_write_chkpt('postwann') - -2002 time2=io_time() - - if (wannier_plot .or. bands_plot .or. fermi_surface_plot .or. 
write_hr) then - call plot_main() - time1=io_time() - write(stdout,'(1x,a25,f11.3,a)') 'Time for plotting ',time1-time2,' (sec)' - end if - -3003 time2=io_time() + if (on_root) write(stdout,'(1x,a25,f11.3,a)') 'Time for wannierise ',time1-time2,' (sec)' + + if (on_root) call param_write_chkpt('postwann') + +2002 continue + if (on_root) then + time2=io_time() + if (wannier_plot .or. bands_plot .or. fermi_surface_plot .or. write_hr) then + call plot_main() + time1=io_time() + write(stdout,'(1x,a25,f11.3,a)') 'Time for plotting ',time1-time2,' (sec)' + end if + endif - if (transport) then - call tran_main() - time1=io_time() - write(stdout,'(1x,a25,f11.3,a)') 'Time for transport ',time1-time2,' (sec)' - if (tran_read_ht) goto 4004 - end if +3003 continue + if (on_root) then + time2=io_time() + if (transport) then + call tran_main() + time1=io_time() + write(stdout,'(1x,a25,f11.3,a)') 'Time for transport ',time1-time2,' (sec)' + if (tran_read_ht) goto 4004 + end if + endif call tran_dealloc() call hamiltonian_dealloc() @@ -213,15 +253,18 @@ program wannier 4004 continue - write(stdout,'(1x,a25,f11.3,a)') 'Total Execution Time ',io_time(),' (sec)' + if (on_root) then + write(stdout,'(1x,a25,f11.3,a)') 'Total Execution Time ',io_time(),' (sec)' - if (timing_level>0) call io_print_timings() + if (timing_level>0) call io_print_timings() - write(stdout,*) - write(stdout,'(1x,a)') 'All done: wannier90 exiting' + write(stdout,*) + write(stdout,'(1x,a)') 'All done: wannier90 exiting' - close(stdout) + close(stdout) + endif + call comms_end end program wannier diff --git a/src/wannierise.F90 b/src/wannierise.F90 index 91eb46c92..95c32acda 100644 --- a/src/wannierise.F90 +++ b/src/wannierise.F90 @@ -16,6 +16,9 @@ module w90_wannierise !! 
Main routines for the minimisation of the spread use w90_constants + use w90_comms, only : on_root, my_node_id, num_nodes,& + comms_bcast, comms_array_split,& + comms_gatherv, comms_allreduce implicit none @@ -26,12 +29,30 @@ module w90_wannierise ! Data to avoid large allocation within iteration loop real(kind=dp), allocatable :: rnkb (:,:,:) + real(kind=dp), allocatable :: rnkb_loc (:,:,:) real(kind=dp), allocatable :: ln_tmp(:,:,:) + real(kind=dp), allocatable :: ln_tmp_loc(:,:,:) + + ! for MPI + complex(kind=dp), allocatable :: u_matrix_loc(:,:,:) + complex(kind=dp), allocatable :: m_matrix_loc(:,:,:,:) + complex(kind=dp), allocatable :: m_matrix_1b(:,:,:) + complex(kind=dp), allocatable :: m_matrix_1b_loc(:,:,:) + complex(kind=dp), allocatable :: cdq_loc(:,:,:) ! the only large array sent + ! from process to process + ! in the main loop + complex(kind=dp), allocatable :: cdodq_loc(:,:,:) + integer, allocatable :: counts(:) + integer, allocatable :: displs(:) + logical :: first_pass !! Used to trigger the calculation of the invarient spread !! we only need to do this on entering wann_main (_gamma) +#ifdef MPI + include 'mpif.h' +#endif type localisation_vars !! Contributions to the spread @@ -59,7 +80,7 @@ subroutine wann_main ! ! 
!=================================================================== use w90_constants, only : dp,cmplx_1,cmplx_0 - use w90_io, only : stdout,io_error,io_time,io_stopwatch & + use w90_io, only : stdout,io_error,io_wallclocktime,io_stopwatch & ,io_file_unit use w90_parameters, only : num_wann,num_cg_steps,num_iter,nnlist, & nntot,wbtot,u_matrix,m_matrix,num_kpts,iprint,num_print_cycles, & @@ -96,6 +117,7 @@ subroutine wann_main complex(kind=dp), allocatable :: cdodq_r(:,:,:) complex(kind=dp), allocatable :: k_to_r(:,:) complex(kind=dp), allocatable :: cdodq_precond(:,:,:) + complex(kind=dp), allocatable :: cdodq_precond_loc(:,:,:) real(kind=dp), allocatable :: sheet (:,:,:) real(kind=dp), allocatable :: rave(:,:),r2ave(:),rave2(:) real(kind=dp), dimension(3) :: rvec_cart @@ -103,10 +125,14 @@ subroutine wann_main !local arrays not passed into subroutines complex(kind=dp), allocatable :: cwschur1 (:), cwschur2 (:) complex(kind=dp), allocatable :: cwschur3 (:), cwschur4 (:) - complex(kind=dp), allocatable :: cdq(:,:,:),cdqkeep(:,:,:) + complex(kind=dp), allocatable :: cdq(:,:,:)!,cdqkeep(:,:,:) + ! cdqkeep is replaced by cdqkeep_loc + complex(kind=dp), allocatable :: cdqkeep_loc(:,:,:) complex(kind=dp), allocatable :: cz (:,:) complex(kind=dp), allocatable :: cmtmp(:,:),tmp_cdq(:,:) - complex(kind=dp), allocatable :: m0(:,:,:,:),u0(:,:,:) + ! complex(kind=dp), allocatable :: m0(:,:,:,:),u0(:,:,:) + ! 
m0 and u0 are replaced by m0_loc and u0_loc + complex(kind=dp), allocatable :: m0_loc(:,:,:,:),u0_loc(:,:,:) complex(kind=dp), allocatable :: cwork(:) real(kind=dp), allocatable :: evals(:) real(kind=dp), allocatable :: rwork(:) @@ -114,7 +140,7 @@ subroutine wann_main real(kind=dp) :: doda0 real(kind=dp) :: falphamin,alphamin real(kind=dp) :: gcfac,gcnorm1,gcnorm0 - integer :: i,n,iter,ind,ierr,iw,ncg,info + integer :: i,n,iter,ind,ierr,iw,ncg,info,nkp,nkp_loc,nn logical :: lprint,ldump,lquad real(kind=dp), allocatable :: history(:) real(kind=dp) :: save_spread @@ -124,7 +150,7 @@ subroutine wann_main real(kind=dp) :: alpha_precond integer :: irpt,loop_kpt - if (timing_level>0) call io_stopwatch('wann: main',1) + if (timing_level>0.and.on_root) call io_stopwatch('wann: main',1) first_pass=.true. @@ -134,12 +160,12 @@ subroutine wann_main if (ierr/=0) call io_error('Error allocating history in wann_main') ! module data - if(optimisation>0) then - allocate( m0 (num_wann, num_wann, nntot, num_kpts),stat=ierr) - end if - if (ierr/=0) call io_error('Error in allocating m0 in wann_main') - allocate( u0 (num_wann, num_wann, num_kpts),stat=ierr) - if (ierr/=0) call io_error('Error in allocating u0 in wann_main') +! if(optimisation>0) then +! allocate( m0 (num_wann, num_wann, nntot, num_kpts),stat=ierr) +! end if +! if (ierr/=0) call io_error('Error in allocating m0 in wann_main') +! allocate( u0 (num_wann, num_wann, num_kpts),stat=ierr) +! if (ierr/=0) call io_error('Error in allocating u0 in wann_main') allocate( rnkb (num_wann, nntot, num_kpts),stat=ierr ) if (ierr/=0) call io_error('Error in allocating rnkb in wann_main') allocate( ln_tmp (num_wann, nntot, num_kpts), stat=ierr ) @@ -173,7 +199,7 @@ subroutine wann_main ! 
this method of computing the preconditioning is much more efficient, but requires more RAM if(optimisation >= 3) then allocate(k_to_r(num_kpts,nrpts),stat=ierr) - if (ierr/=0) call io_error('Error in allocating cdodq_precond in wann_main') + if (ierr/=0) call io_error('Error in allocating k_to_r in wann_main') do irpt=1,nrpts do loop_kpt=1,num_kpts @@ -194,12 +220,53 @@ subroutine wann_main if (ierr/=0) call io_error('Error in allocating cwshur3 in wann_main') allocate( cdq (num_wann, num_wann, num_kpts),stat=ierr ) if (ierr/=0) call io_error('Error in allocating cdq in wann_main') + + ! for MPI + allocate( counts(0:num_nodes-1), displs(0:num_nodes-1), stat=ierr ) + if (ierr/=0) call io_error('Error in allocating counts and displs in wann_main') + call comms_array_split(num_kpts,counts,displs) + allocate( rnkb_loc (num_wann, nntot, counts(my_node_id)),stat=ierr ) + if (ierr/=0) call io_error('Error in allocating rnkb_loc in wann_main') + allocate( ln_tmp_loc (num_wann, nntot, counts(my_node_id)), stat=ierr ) + if (ierr/=0) call io_error('Error in allocating ln_tmp_loc in wann_main') + allocate( u_matrix_loc (num_wann, num_wann, counts(my_node_id)),stat=ierr ) + if (ierr/=0) call io_error('Error in allocating u_matrix_loc in wann_main') + allocate( m_matrix_loc (num_wann, num_wann, nntot, counts(my_node_id)),stat=ierr ) + if (ierr/=0) call io_error('Error in allocating m_matrix_loc in wann_main') + allocate( m_matrix_1b (num_wann, num_wann, num_kpts),stat=ierr ) + if (ierr/=0) call io_error('Error in allocating m_matrix_1b in wann_main') + allocate( m_matrix_1b_loc (num_wann, num_wann, counts(my_node_id)),stat=ierr ) + if (ierr/=0) call io_error('Error in allocating m_matrix_1b_loc in wann_main') + if(precond) then + allocate(cdodq_precond_loc(num_wann,num_wann,counts(my_node_id)),stat=ierr) + if (ierr/=0) call io_error('Error in allocating cdodq_precond_loc in wann_main') + end if + ! 
initialize local u and m matrices with global ones + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) + m_matrix_loc (:,:,:, nkp_loc) = & + m_matrix (:,:,:, nkp) + u_matrix_loc (:,:, nkp_loc) = & + u_matrix (:,:, nkp) + end do + + allocate( cdq_loc (num_wann, num_wann, counts(my_node_id)),stat=ierr ) + if (ierr/=0) call io_error('Error in allocating cdq_loc in wann_main') + allocate( cdodq_loc (num_wann, num_wann, counts(my_node_id)),stat=ierr ) + if (ierr/=0) call io_error('Error in allocating cdodq_loc in wann_main') + allocate( cdqkeep_loc (num_wann, num_wann, counts(my_node_id)),stat=ierr ) + if (ierr/=0) call io_error('Error in allocating cdqkeep_loc in wann_main') + if(optimisation>0) then + allocate( m0_loc (num_wann, num_wann, nntot, counts(my_node_id)),stat=ierr) + end if + if (ierr/=0) call io_error('Error in allocating m0_loc in wann_main') + allocate( u0_loc (num_wann, num_wann, counts(my_node_id)),stat=ierr) + if (ierr/=0) call io_error('Error in allocating u0_loc in wann_main') + allocate( cz (num_wann, num_wann),stat=ierr ) if (ierr/=0) call io_error('Error in allocating cz in wann_main') allocate( cmtmp (num_wann, num_wann),stat=ierr ) if (ierr/=0) call io_error('Error in allocating cmtmp in wann_main') - allocate( cdqkeep (num_wann, num_wann, num_kpts),stat=ierr ) - if (ierr/=0) call io_error('Error in allocating cdqkeep in wann_main') allocate(tmp_cdq(num_wann,num_wann),stat=ierr) if (ierr/=0) call io_error('Error in allocating tmp_cdq in wann_main') allocate( evals (num_wann),stat=ierr) @@ -211,7 +278,7 @@ subroutine wann_main cwschur1=cmplx_0; cwschur2=cmplx_0; cwschur3=cmplx_0; cwschur4=cmplx_0 - cdq=cmplx_0; cz=cmplx_0; cmtmp=cmplx_0; cdqkeep=cmplx_0 + cdq=cmplx_0; cz=cmplx_0; cmtmp=cmplx_0; cdqkeep_loc=cmplx_0; cdq_loc=cmplx_0;! buff=cmplx_0; gcnorm1=0.0_dp; gcnorm0=0.0_dp @@ -227,17 +294,18 @@ subroutine wann_main ! 
end if end if - write(stdout,*) - write(stdout,'(1x,a)') '*------------------------------- WANNIERISE ---------------------------------*' - write(stdout,'(1x,a)') '+--------------------------------------------------------------------+<-- CONV' - if (lenconfac.eq.1.0_dp) then - write(stdout,'(1x,a)') '| Iter Delta Spread RMS Gradient Spread (Ang^2) Time |<-- CONV' - else - write(stdout,'(1x,a)') '| Iter Delta Spread RMS Gradient Spread (Bohr^2) Time |<-- CONV' + if (on_root) then + write(stdout,*) + write(stdout,'(1x,a)') '*------------------------------- WANNIERISE ---------------------------------*' + write(stdout,'(1x,a)') '+--------------------------------------------------------------------+<-- CONV' + if (lenconfac.eq.1.0_dp) then + write(stdout,'(1x,a)') '| Iter Delta Spread RMS Gradient Spread (Ang^2) Time |<-- CONV' + else + write(stdout,'(1x,a)') '| Iter Delta Spread RMS Gradient Spread (Bohr^2) Time |<-- CONV' + endif + write(stdout,'(1x,a)') '+--------------------------------------------------------------------+<-- CONV' + write(stdout,*) endif - write(stdout,'(1x,a)') '+--------------------------------------------------------------------+<-- CONV' - write(stdout,*) - irguide=0 if (guiding_centres.and.(num_no_guide_iter.le.0)) then @@ -263,21 +331,23 @@ subroutine wann_main old_spread%om_tot = 0.0_dp ! 
print initial state - write(stdout,'(1x,a78)') repeat('-',78) - write(stdout,'(1x,a)') 'Initial State' - do iw=1,num_wann - write(stdout,1000) iw,(rave(ind,iw)*lenconfac,ind=1,3),& - (r2ave(iw) - rave2(iw))*lenconfac**2 - end do - write(stdout,1001) (sum(rave(ind,:))*lenconfac,ind=1,3), (sum(r2ave)-sum(rave2))*lenconfac**2 - write(stdout,*) - write(stdout,'(1x,i6,2x,E12.3,2x,F15.10,2x,F18.10,3x,F8.2,2x,a)') & - iter,(wann_spread%om_tot-old_spread%om_tot)*lenconfac**2,sqrt(abs(gcnorm1))*lenconfac,& - wann_spread%om_tot*lenconfac**2,io_time(),'<-- CONV' - write(stdout,'(8x,a,F15.7,a,F15.7,a,F15.7,a)') & - 'O_D=',wann_spread%om_d*lenconfac**2,' O_OD=',wann_spread%om_od*lenconfac**2,& - ' O_TOT=',wann_spread%om_tot*lenconfac**2,' <-- SPRD' - write(stdout,'(1x,a78)') repeat('-',78) + if (on_root) then + write(stdout,'(1x,a78)') repeat('-',78) + write(stdout,'(1x,a)') 'Initial State' + do iw=1,num_wann + write(stdout,1000) iw,(rave(ind,iw)*lenconfac,ind=1,3),& + (r2ave(iw) - rave2(iw))*lenconfac**2 + end do + write(stdout,1001) (sum(rave(ind,:))*lenconfac,ind=1,3), (sum(r2ave)-sum(rave2))*lenconfac**2 + write(stdout,*) + write(stdout,'(1x,i6,2x,E12.3,2x,F15.10,2x,F18.10,3x,F8.2,2x,a)') & + iter,(wann_spread%om_tot-old_spread%om_tot)*lenconfac**2,sqrt(abs(gcnorm1))*lenconfac,& + wann_spread%om_tot*lenconfac**2,io_wallclocktime(),'<-- CONV' + write(stdout,'(8x,a,F15.7,a,F15.7,a,F15.7,a)') & + 'O_D=',wann_spread%om_d*lenconfac**2,' O_OD=',wann_spread%om_od*lenconfac**2,& + ' O_TOT=',wann_spread%om_tot*lenconfac**2,' <-- SPRD' + write(stdout,'(1x,a78)') repeat('-',78) + endif lconverged=.false. ; lfirst=.true. ; lrandom=.false. conv_count=0 ; noise_count=0 @@ -287,6 +357,7 @@ subroutine wann_main open(unit=page_unit,status='scratch',form='unformatted') endif + ! main iteration loop do iter=1,num_iter @@ -297,7 +368,7 @@ subroutine wann_main ldump=.false. if ( (num_dump_cycles.gt.0) .and. (mod(iter,num_dump_cycles).eq.0) ) ldump=.true. 
- if(lprint) write(stdout,'(1x,a,i6)') 'Cycle: ',iter + if(lprint.and.on_root) write(stdout,'(1x,a,i6)') 'Cycle: ',iter if ( guiding_centres.and.(iter.gt.num_no_guide_iter) & .and.(mod(iter,num_guide_cycles).eq.0) ) then @@ -306,9 +377,15 @@ subroutine wann_main endif ! calculate gradient of omega - call wann_domega(csheet,sheet,rave,cdodq) + + if (lsitesymmetry.or.precond) then + call wann_domega(csheet,sheet,rave,cdodq) + else + call wann_domega(csheet,sheet,rave)!,cdodq) fills only cdodq_loc + endif - if ( lprint .and. iprint>2 ) & + + if ( lprint .and. iprint>2 .and. on_root) & write(stdout,*) ' LINE --> Iteration :',iter ! calculate search direction (cdq) @@ -316,7 +393,7 @@ subroutine wann_main if (lsitesymmetry) call sitesym_symmetrize_gradient(2,cdq) !RS: ! save search direction - cdqkeep(:,:,:) = cdq(:,:,:) + cdqkeep_loc(:,:,:) = cdq_loc(:,:,:) ! check whether we're doing fixed step lengths if (lfixstep) then @@ -327,16 +404,16 @@ subroutine wann_main else ! take trial step - cdq(:,:,:)=cdqkeep(:,:,:)*( trial_step / (4.0_dp*wbtot) ) + cdq_loc(:,:,:)=cdqkeep_loc(:,:,:)*( trial_step / (4.0_dp*wbtot) ) ! store original U and M before rotating - u0=u_matrix + u0_loc=u_matrix_loc if(optimisation<=0) then write(page_unit) m_matrix rewind(page_unit) else - m0=m_matrix + m0_loc=m_matrix_loc endif ! update U and M @@ -351,7 +428,7 @@ subroutine wann_main endif ! print line search information - if ( lprint .and. iprint>2 ) then + if ( lprint .and. iprint>2 .and. on_root) then write(stdout,*) ' LINE --> Spread at initial point :',wann_spread%om_tot*lenconfac**2 if (.not.lfixstep) & write(stdout,*) ' LINE --> Spread at trial step :',trial_spread%om_tot*lenconfac**2 @@ -374,22 +451,22 @@ subroutine wann_main if (lfixstep.or.lquad) then ! take optimal step - cdq(:,:,:) = cdqkeep(:,:,:) * ( alphamin / (4.0_dp*wbtot) ) + cdq_loc(:,:,:) = cdqkeep_loc(:,:,:) * ( alphamin / (4.0_dp*wbtot) ) ! 
if doing a line search then restore original U and M before rotating if (.not.lfixstep) then - u_matrix=u0 + u_matrix_loc=u0_loc if(optimisation<=0) then read(page_unit) m_matrix rewind(page_unit) else - m_matrix=m0 + m_matrix_loc=m0_loc endif endif ! update U and M call internal_new_u_and_m() - + call wann_spread_copy(wann_spread,old_spread) ! calculate the new centers and spread @@ -405,7 +482,7 @@ subroutine wann_main ! print the new centers and spreads - if(lprint) then + if(lprint .and. on_root) then do iw=1,num_wann write(stdout,1000) iw,(rave(ind,iw)*lenconfac,ind=1,3),& (r2ave(iw) - rave2(iw))*lenconfac**2 @@ -416,7 +493,7 @@ subroutine wann_main write(stdout,'(1x,i6,2x,E12.3,2x,F15.10,2x,F18.10,3x,F8.2,2x,a)') & iter,(wann_spread%om_tot-old_spread%om_tot)*lenconfac**2,& sqrt(abs(gcnorm1))*lenconfac,& - wann_spread%om_tot*lenconfac**2,io_time(),'<-- CONV' + wann_spread%om_tot*lenconfac**2,io_wallclocktime(),'<-- CONV' write(stdout,'(8x,a,F15.7,a,F15.7,a,F15.7,a)') & 'O_D=',wann_spread%om_d*lenconfac**2,& ' O_OD=',wann_spread%om_od*lenconfac**2,& @@ -436,7 +513,7 @@ subroutine wann_main omega_total = wann_spread%om_tot omega_tilde = wann_spread%om_d + wann_spread%om_od - if (ldump) call param_write_chkpt('postdis') + if (ldump.and.on_root) call param_write_chkpt('postdis') if (conv_window.gt.1) call internal_test_convergence() @@ -451,38 +528,55 @@ subroutine wann_main enddo ! 
end of the minimization loop - - write(stdout,'(1x,a)') 'Final State' - do iw=1,num_wann - write(stdout,1000) iw,(rave(ind,iw)*lenconfac,ind=1,3),& - (r2ave(iw) - rave2(iw))*lenconfac**2 - end do - write(stdout,1001) (sum(rave(ind,:))*lenconfac,ind=1,3),& - (sum(r2ave)-sum(rave2))*lenconfac**2 - write(stdout,*) - write(stdout,'(3x,a21,a,f15.9)') ' Spreads ('//trim(length_unit)//'^2)',& - ' Omega I = ',wann_spread%om_i*lenconfac**2 - write(stdout,'(3x,a,f15.9)') ' ================ Omega D = ',& - wann_spread%om_d*lenconfac**2 - write(stdout,'(3x,a,f15.9)') ' Omega OD = ',& - wann_spread%om_od*lenconfac**2 - write(stdout,'(3x,a21,a,f15.9)') 'Final Spread ('//trim(length_unit)//'^2)',& - ' Omega Total = ',wann_spread%om_tot*lenconfac**2 - write(stdout,'(1x,a78)') repeat('-',78) + ! the m matrix is sent by piece to avoid huge arrays + do nn = 1, nntot + m_matrix_1b_loc=m_matrix_loc(:,:,nn,:) + call comms_gatherv(m_matrix_1b_loc(1,1,1),num_wann*num_wann*counts(my_node_id),& + m_matrix_1b(1,1,1),num_wann*num_wann*counts,num_wann*num_wann*displs) + call comms_bcast(m_matrix_1b(1,1,1),num_wann*num_wann*num_kpts) + m_matrix(:,:,nn,:)=m_matrix_1b(:,:,:) + end do!nn + + !
send u matrix + call comms_gatherv(u_matrix_loc(1,1,1),num_wann*num_wann*counts(my_node_id),& + u_matrix(1,1,1),num_wann*num_wann*counts,num_wann*num_wann*displs) + call comms_bcast(u_matrix(1,1,1),num_wann*num_wann*num_kpts) + + if (on_root) then + write(stdout,'(1x,a)') 'Final State' + do iw=1,num_wann + write(stdout,1000) iw,(rave(ind,iw)*lenconfac,ind=1,3),& + (r2ave(iw) - rave2(iw))*lenconfac**2 + end do + write(stdout,1001) (sum(rave(ind,:))*lenconfac,ind=1,3),& + (sum(r2ave)-sum(rave2))*lenconfac**2 + write(stdout,*) + write(stdout,'(3x,a21,a,f15.9)') ' Spreads ('//trim(length_unit)//'^2)',& + ' Omega I = ',wann_spread%om_i*lenconfac**2 + write(stdout,'(3x,a,f15.9)') ' ================ Omega D = ',& + wann_spread%om_d*lenconfac**2 + write(stdout,'(3x,a,f15.9)') ' Omega OD = ',& + wann_spread%om_od*lenconfac**2 + write(stdout,'(3x,a21,a,f15.9)') 'Final Spread ('//trim(length_unit)//'^2)',& + ' Omega Total = ',wann_spread%om_tot*lenconfac**2 + write(stdout,'(1x,a78)') repeat('-',78) + endif if (write_xyz) call wann_write_xyz() if(write_hr_diag) then call hamiltonian_setup() call hamiltonian_get_hr() - write(stdout,*) - write(stdout,'(1x,a)') 'On-site Hamiltonian matrix elements' - write(stdout,'(3x,a)') ' n <0n|H|0n> (eV)' - write(stdout,'(3x,a)') '-------------------------' - do i=1,num_wann - write(stdout,'(3x,i3,5x,f12.6)') i,real(ham_r(i,i,rpt_origin),kind=dp) - enddo - write(stdout,*) + if (on_root) then + write(stdout,*) + write(stdout,'(1x,a)') 'On-site Hamiltonian matrix elements' + write(stdout,'(3x,a)') ' n <0n|H|0n> (eV)' + write(stdout,'(3x,a)') '-------------------------' + do i=1,num_wann + write(stdout,'(3x,i3,5x,f12.6)') i,real(ham_r(i,i,rpt_origin),kind=dp) + enddo + write(stdout,*) + endif endif if (guiding_centres) call wann_phases(csheet,sheet,rguide,irguide) @@ -514,14 +608,33 @@ subroutine wann_main if (ierr/=0) call io_error('Error in deallocating evals in wann_main') deallocate(tmp_cdq,stat=ierr) if (ierr/=0) call io_error('Error in 
deallocating tmp_cdq in wann_main') - deallocate(cdqkeep,stat=ierr) - if (ierr/=0) call io_error('Error in deallocating cdqkeep in wann_main') deallocate(cmtmp,stat=ierr) if (ierr/=0) call io_error('Error in deallocating cmtmp in wann_main') deallocate(cz,stat=ierr) if (ierr/=0) call io_error('Error in deallocating cz in wann_main') deallocate(cdq,stat=ierr) if (ierr/=0) call io_error('Error in deallocating cdq in wann_main') + + ! for MPI + deallocate( ln_tmp_loc , stat=ierr ) + if (ierr/=0) call io_error('Error in deallocating ln_tmp_loc in wann_main') + deallocate( rnkb_loc,stat=ierr ) + if (ierr/=0) call io_error('Error in deallocating rnkb_loc in wann_main') + deallocate(u_matrix_loc,stat=ierr) + if (ierr/=0) call io_error('Error in deallocating u_matrix_loc in wann_main') + deallocate(m_matrix_loc,stat=ierr) + if (ierr/=0) call io_error('Error in deallocating m_matrix_loc in wann_main') + deallocate(m_matrix_1b,stat=ierr) + if (ierr/=0) call io_error('Error in deallocating m_matrix_1b in wann_main') + deallocate(m_matrix_1b_loc,stat=ierr) + if (ierr/=0) call io_error('Error in deallocating m_matrix_1b_loc in wann_main') + deallocate(cdq_loc,stat=ierr) + if (ierr/=0) call io_error('Error in deallocating cdq_loc in wann_main') + deallocate(cdodq_loc,stat=ierr) + if (ierr/=0) call io_error('Error in deallocating cdodq_loc in wann_main') + deallocate(cdqkeep_loc,stat=ierr) + if (ierr/=0) call io_error('Error in deallocating cdqkeep_loc in wann_main') + deallocate(cwschur3,stat=ierr) if (ierr/=0) call io_error('Error in deallocating cwschur3 in wann_main') deallocate(cwschur1,stat=ierr) @@ -535,6 +648,8 @@ subroutine wann_main if (ierr/=0) call io_error('Error in deallocating cdodq_r in wann_main') deallocate(cdodq_precond,stat=ierr) if (ierr/=0) call io_error('Error in deallocating cdodq_precond in wann_main') + deallocate(cdodq_precond_loc,stat=ierr) + if (ierr/=0) call io_error('Error in deallocating cdodq_precond_loc in wann_main') end if ! 
deallocate sub vars passed into other subs @@ -557,17 +672,17 @@ subroutine wann_main deallocate( rnkb,stat=ierr ) if (ierr/=0) call io_error('Error in deallocating rnkb in wann_main') - deallocate(u0, stat=ierr) - if (ierr/=0) call io_error('Error in deallocating u0 in wann_main') + deallocate(u0_loc, stat=ierr) + if (ierr/=0) call io_error('Error in deallocating u0_loc in wann_main') if(optimisation>0) then - deallocate(m0, stat=ierr) - if (ierr/=0) call io_error('Error in deallocating m0 in wann_main') + deallocate(m0_loc, stat=ierr) + if (ierr/=0) call io_error('Error in deallocating m0_loc in wann_main') end if deallocate(history,stat=ierr) if (ierr/=0) call io_error('Error deallocating history in wann_main') - if (timing_level>0) call io_stopwatch('wann: main',2) + if (timing_level>0.and.on_root) call io_stopwatch('wann: main',2) return @@ -679,7 +794,7 @@ subroutine internal_random_noise() ! cdq is a num_wann x num_wann x num_kpts anti-hermitian array ! to which we add a random anti-hermitian matrix - do ikp=1,num_kpts + do ikp=1,counts(my_node_id) do iw=1,num_wann call random_seed() call random_number(noise_real(:,iw)) @@ -697,7 +812,7 @@ subroutine internal_random_noise() enddo enddo ! Add noise to search direction - cdq(:,:,ikp) = cdq(:,:,ikp) + conv_noise_amp * cnoise(:,:) + cdq_loc(:,:,ikp) = cdq_loc(:,:,ikp) + conv_noise_amp * cnoise(:,:) enddo ! Deallocate @@ -729,13 +844,18 @@ subroutine internal_search_direction() complex(kind=dp) :: zdotc - if (timing_level>1) call io_stopwatch('wann: main: search_direction',1) + if (timing_level>1.and.on_root) call io_stopwatch('wann: main: search_direction',1) + + ! gcnorm1 = Tr[gradient . gradient] -- NB gradient is anti-Hermitian + ! gcnorm1 = real(zdotc(num_kpts*num_wann*num_wann,cdodq,1,cdodq,1),dp) if (precond) then ! compute cdodq_precond cdodq_r(:,:,:) = 0 ! intermediary gradient in R space cdodq_precond(:,:,:) = 0 + cdodq_precond_loc(:,:,:) = 0 +! cdodq_precond(:,:,:) = complx_0 ! 
convert to real space in cdodq_r ! Two algorithms: either double loop or GEMM. GEMM is much more efficient but requires more RAM @@ -786,14 +906,18 @@ subroutine internal_search_direction() enddo enddo end if + cdodq_precond_loc(:,:,1:counts(my_node_id))=cdodq_precond(:,:,1+displs(my_node_id):displs(my_node_id)+counts(my_node_id)) + end if ! gcnorm1 = Tr[gradient . gradient] -- NB gradient is anti-Hermitian if(precond) then - gcnorm1 = real(zdotc(num_kpts*num_wann*num_wann,cdodq_precond,1,cdodq,1),dp) +! gcnorm1 = real(zdotc(num_kpts*num_wann*num_wann,cdodq_precond,1,cdodq,1),dp) + gcnorm1 = real(zdotc(counts(my_node_id)*num_wann*num_wann,cdodq_precond_loc,1,cdodq_loc,1),dp) else - gcnorm1 = real(zdotc(num_kpts*num_wann*num_wann,cdodq,1,cdodq,1),dp) + gcnorm1 = real(zdotc(counts(my_node_id)*num_wann*num_wann,cdodq_loc,1,cdodq_loc,1),dp) end if + call comms_allreduce(gcnorm1,1,'SUM') ! calculate cg_coefficient if ( (iter.eq.1) .or. (ncg.ge.num_cg_steps) ) then @@ -804,7 +928,7 @@ subroutine internal_search_direction() gcfac = gcnorm1/gcnorm0 ! Fletcher-Reeves CG coefficient ! prevent CG coefficient from getting too large if (gcfac.gt.3.0_dp) then - if ( lprint .and. iprint>2 ) & + if ( lprint .and. iprint>2 .and. on_root) & write(stdout,*) ' LINE --> CG coeff too large. Resetting :',gcfac gcfac = 0.0_dp ncg = 0 @@ -821,48 +945,56 @@ subroutine internal_search_direction() gcnorm0 = gcnorm1 ! calculate search direction + if(precond) then - cdq(:,:,:) = cdodq_precond(:,:,:) + cdqkeep(:,:,:) * gcfac + cdq_loc(:,:,:) = cdodq_precond_loc(:,:,:) + cdqkeep_loc(:,:,:) * gcfac !! JRY not MPI else - cdq(:,:,:) = cdodq(:,:,:) + cdqkeep(:,:,:) * gcfac + cdq_loc(:,:,:) = cdodq_loc(:,:,:) + cdqkeep_loc(:,:,:) * gcfac end if + ! add some random noise to search direction, if required if (lrandom) then - write(stdout,'(a,i3,a,i3,a)') & + if (on_root) write(stdout,'(a,i3,a,i3,a)') & ' [ Adding random noise to search direction. 
Time ',noise_count,' / ',conv_noise_num,' ]' call internal_random_noise() endif ! calculate gradient along search direction - Tr[gradient . search direction] ! NB gradient is anti-hermitian - doda0 = -real(zdotc(num_kpts*num_wann*num_wann,cdodq,1,cdq,1),dp) + doda0 = -real(zdotc(counts(my_node_id)*num_wann*num_wann,cdodq_loc,1,cdq_loc,1),dp) + + call comms_allreduce(doda0,1,'SUM') + doda0 = doda0 / (4.0_dp*wbtot) ! check search direction is not uphill if (doda0.gt.0.0_dp) then ! if doing a CG step then reset CG if (ncg.gt.0) then - if ( lprint .and. iprint>2 ) & + if ( lprint .and. iprint>2 .and. on_root) & write(stdout,*) ' LINE --> Search direction uphill: resetting CG' - cdq(:,:,:) = cdodq(:,:,:) + cdq_loc(:,:,:) = cdodq_loc(:,:,:) if (lrandom) call internal_random_noise() ncg = 0 gcfac = 0.0_dp ! re-calculate gradient along search direction - doda0 = -real(zdotc(num_kpts*num_wann*num_wann,cdodq,1,cdq,1),dp) + doda0 = -real(zdotc(counts(my_node_id)*num_wann*num_wann,cdodq_loc,1,cdq_loc,1),dp) + + call comms_allreduce(doda0,1,'SUM') + doda0 = doda0 / (4.0_dp*wbtot) ! if search direction still uphill then reverse search direction if (doda0.gt.0.0_dp) then - if ( lprint .and. iprint>2 ) & + if ( lprint .and. iprint>2 .and. on_root) & write(stdout,*) ' LINE --> Search direction still uphill: reversing' - cdq(:,:,:) = -cdq(:,:,:) + cdq_loc(:,:,:) = -cdq_loc(:,:,:) doda0 = -doda0 endif ! if doing a SD step then reverse search direction else - if ( lprint .and. iprint>2 ) & + if ( lprint .and. iprint>2 .and.on_root ) & write(stdout,*) ' LINE --> Search direction uphill: reversing' - cdq(:,:,:) = -cdq(:,:,:) + cdq_loc(:,:,:) = -cdq_loc(:,:,:) doda0 = -doda0 endif endif @@ -870,7 +1002,7 @@ subroutine internal_search_direction() !~ ! 
calculate search direction !~ cdq(:,:,:) = cdodq(:,:,:) + cdqkeep(:,:,:) * gcfac - if (timing_level>1) call io_stopwatch('wann: main: search_direction',2) + if (timing_level>1.and.on_root) call io_stopwatch('wann: main: search_direction',2) lrandom=.false. @@ -892,7 +1024,7 @@ subroutine internal_optimal_step() real(kind=dp) :: fac,shift,eqa,eqb - if (timing_level>1) call io_stopwatch('wann: main: optimal_step',1) + if (timing_level>1.and.on_root) call io_stopwatch('wann: main: optimal_step',1) fac = trial_spread%om_tot - wann_spread%om_tot if ( abs(fac) .gt. tiny(1.0_dp) ) then @@ -910,7 +1042,7 @@ subroutine internal_optimal_step() falphamin = wann_spread%om_tot & - 0.25_dp * eqb * eqb / (fac * eqa) * (trial_step**2) else - if ( lprint .and. iprint>2 ) write(stdout,*) & + if ( lprint .and. iprint>2 .and. on_root ) write(stdout,*) & ' LINE --> Parabolic line search unstable: using trial step' lquad=.false. alphamin = trial_step @@ -918,14 +1050,14 @@ subroutine internal_optimal_step() endif if (doda0*alphamin.gt.0.0_dp) then - if ( lprint .and. iprint>2 ) write(stdout,*) & + if ( lprint .and. iprint>2 .and. on_root ) write(stdout,*) & ' LINE --> Line search unstable : using trial step' lquad=.false. alphamin=trial_step falphamin=trial_spread%om_tot endif - if (timing_level>1) call io_stopwatch('wann: main: optimal_step',2) + if (timing_level>1.and.on_root) call io_stopwatch('wann: main: optimal_step',2) return @@ -944,30 +1076,31 @@ subroutine internal_new_u_and_m() implicit none - integer :: nkp,nn,nkp2,nsdim + integer :: nkp,nn,nkp2,nsdim,nkp_loc logical :: ltmp - if (timing_level>1) call io_stopwatch('wann: main: u_and_m',1) + if (timing_level>1.and.on_root) call io_stopwatch('wann: main: u_and_m',1) - do nkp=1,num_kpts + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) if (lsitesymmetry) then !YN: RS: if (ir2ik(ik2ir(nkp)).ne.nkp) cycle !YN: RS: end if !YN: RS: ! 
cdq(nkp) is anti-Hermitian; tmp_cdq = i*cdq is Hermitian - tmp_cdq(:,:) = cmplx_i * cdq(:,:,nkp) + tmp_cdq(:,:) = cmplx_i * cdq_loc(:,:,nkp_loc) ! Hermitian matrix eigen-solver call zheev('V','U',num_wann,tmp_cdq,num_wann,evals,cwork,4*num_wann,rwork,info) if (info.ne.0) then - write(stdout,*) & + if (on_root) write(stdout,*) & 'wann_main: ZHEEV in internal_new_u_and_m failed, info= ',info - write(stdout,*) ' trying Schur decomposition instead' -!~ call io_error('wann_main: problem in ZHEEV in internal_new_u_and_m') - tmp_cdq(:,:) = cdq(:,:,nkp) + if (on_root) write(stdout,*) ' trying Schur decomposition instead' +!!$ call io_error('wann_main: problem in ZHEEV in internal_new_u_and_m') + tmp_cdq(:,:) = cdq_loc(:,:,nkp_loc) call zgees ('V', 'N', ltmp, num_wann, tmp_cdq, num_wann, nsdim, & cwschur1, cz, num_wann, cwschur2, 10 * num_wann, cwschur3, & cwschur4, info) if (info.ne.0) then - write(stdout,*) 'wann_main: SCHUR failed, info= ', info + if (on_root) write(stdout,*) 'wann_main: SCHUR failed, info= ', info call io_error('wann_main: problem computing schur form 1') endif do i=1,num_wann @@ -975,50 +1108,63 @@ subroutine internal_new_u_and_m() enddo ! cmtmp = tmp_cdq . cz^{dagger} call utility_zgemm(cmtmp,tmp_cdq,'N',cz,'C',num_wann) - cdq(:,:,nkp)=cmtmp(:,:) + cdq_loc(:,:,nkp_loc)=cmtmp(:,:) else do i=1,num_wann cmtmp(:,i) = tmp_cdq(:,i) * exp(-cmplx_i * evals(i)) enddo ! cdq(nkp) = cmtmp . 
tmp_cdq^{dagger} - call utility_zgemm(cdq(:,:,nkp),cmtmp,'N',tmp_cdq,'C',num_wann) + call utility_zgemm(cdq_loc(:,:,nkp_loc),cmtmp,'N',tmp_cdq,'C',num_wann) endif enddo -!~ do nkp = 1, num_kpts -!~ tmp_cdq(:,:) = cdq(:,:,nkp) -!~ call zgees ('V', 'N', ltmp, num_wann, tmp_cdq, num_wann, nsdim, & -!~ cwschur1, cz, num_wann, cwschur2, 10 * num_wann, cwschur3, & -!~ cwschur4, info) -!~ if (info.ne.0) then -!~ write(stdout,*) 'SCHUR: ', info -!~ call io_error('wann_main: problem computing schur form 1') -!~ endif -!~ do i=1,num_wann -!~ tmp_cdq(:,i) = cz(:,i) * exp(cwschur1(i)) -!~ enddo -!~ ! cmtmp = tmp_cdq . cz^{dagger} -!~ call utility_zgemm(cmtmp,tmp_cdq,'N',cz,'C',num_wann) -!~ cdq(:,:,nkp)=cmtmp(:,:) -!~ enddo + ! each process communicates its result to other processes + ! it would be enough to copy only next neighbors + call comms_gatherv(cdq_loc(1,1,1),num_wann*num_wann*counts(my_node_id),& + cdq(1,1,1),num_wann*num_wann*counts,num_wann*num_wann*displs) + call comms_bcast(cdq(1,1,1),num_wann*num_wann*num_kpts) + + +!!$ do nkp = 1, num_kpts +!!$ tmp_cdq(:,:) = cdq(:,:,nkp) +!!$ call zgees ('V', 'N', ltmp, num_wann, tmp_cdq, num_wann, nsdim, & +!!$ cwschur1, cz, num_wann, cwschur2, 10 * num_wann, cwschur3, & +!!$ cwschur4, info) +!!$ if (info.ne.0) then +!!$ write(stdout,*) 'SCHUR: ', info +!!$ call io_error('wann_main: problem computing schur form 1') +!!$ endif +!!$ do i=1,num_wann +!!$ tmp_cdq(:,i) = cz(:,i) * exp(cwschur1(i)) +!!$ enddo +!!$ ! cmtmp = tmp_cdq . cz^{dagger} +!!$ call utility_zgemm(cmtmp,tmp_cdq,'N',cz,'C',num_wann) +!!$ cdq(:,:,nkp)=cmtmp(:,:) +!!$ enddo + + if (lsitesymmetry) then + call sitesym_symmetrize_rotation(cdq) !RS: calculate cdq(Rk) from k + cdq_loc(:,:,1:counts(my_node_id))=cdq(:,:,1+displs(my_node_id):displs(my_node_id)+counts(my_node_id)) + endif - if (lsitesymmetry) call sitesym_symmetrize_rotation(cdq) !RS: calculate cdq(Rk) from k ! 
the orbitals are rotated - do nkp=1,num_kpts + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) ! cmtmp = U(k) . cdq(k) - call utility_zgemm(cmtmp,u_matrix(:,:,nkp),'N',cdq(:,:,nkp),'N',num_wann) - u_matrix(:,:,nkp)=cmtmp(:,:) + call utility_zgemm(cmtmp,u_matrix_loc(:,:,nkp_loc),'N',cdq_loc(:,:,nkp_loc),'N',num_wann) + u_matrix_loc(:,:,nkp_loc)=cmtmp(:,:) enddo ! and the M_ij are updated - do nkp = 1, num_kpts + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) do nn = 1, nntot nkp2 = nnlist (nkp, nn) ! tmp_cdq = cdq^{dagger} . M - call utility_zgemm(tmp_cdq,cdq(:,:,nkp),'C',m_matrix(:,:,nn,nkp),'N',num_wann) + call utility_zgemm(tmp_cdq,cdq(:,:,nkp),'C',m_matrix_loc(:,:,nn,nkp_loc),'N',num_wann) ! cmtmp = tmp_cdq . cdq call utility_zgemm(cmtmp,tmp_cdq,'N',cdq(:,:,nkp2),'N',num_wann) - m_matrix(:,:,nn,nkp) = cmtmp(:,:) + m_matrix_loc(:,:,nn,nkp_loc) = cmtmp(:,:) enddo enddo @@ -1250,10 +1396,10 @@ subroutine wann_phases (csheet, sheet, rguide, irguide, m_w) real(kind=dp) :: smat(3,3),svec(3),sinv(3,3) real(kind=dp) :: xx0,det,brn complex(kind=dp) :: csumt - integer :: loop_wann,na,nkp,i,j,nn,ind,m + integer :: loop_wann,na,nkp,i,j,nn,ind,m,nkp_loc - if (timing_level>1) call io_stopwatch('wann: phases',1) + if (timing_level>1.and.on_root) call io_stopwatch('wann: phases',1) csum=cmplx_0; xx=0.0_dp @@ -1268,9 +1414,10 @@ subroutine wann_phases (csheet, sheet, rguide, irguide, m_w) ! 
get average phase for each unique bk direction do na = 1, nnh csum (na) = cmplx_0 - do nkp = 1, num_kpts + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) nn = neigh (nkp, na) - csum (na) = csum (na) + m_matrix (loop_wann, loop_wann, nn, nkp) + csum (na) = csum (na) + m_matrix_loc (loop_wann, loop_wann, nn, nkp_loc) enddo enddo @@ -1278,7 +1425,8 @@ subroutine wann_phases (csheet, sheet, rguide, irguide, m_w) do na = 1, nnh csum (na) = cmplx_0 - do nkp = 1, num_kpts + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) nn = neigh (nkp, na) csum (na) = csum (na) & + cmplx(m_w(loop_wann,loop_wann,2*nn-1),m_w(loop_wann,loop_wann,2*nn),dp) @@ -1286,6 +1434,8 @@ subroutine wann_phases (csheet, sheet, rguide, irguide, m_w) enddo end if + + call comms_allreduce(csum(1),nnh,'SUM') ! now analyze that information to get good guess at ! wannier center @@ -1415,7 +1565,7 @@ subroutine wann_phases (csheet, sheet, rguide, irguide, m_w) ! enddo ! enddo - if (timing_level>1) call io_stopwatch('wann: phases',2) + if (timing_level>1.and.on_root) call io_stopwatch('wann: phases',2) return @@ -1446,17 +1596,18 @@ subroutine wann_omega(csheet,sheet,rave,r2ave,rave2,wann_spread) !local variables real(kind=dp) :: summ,mnn2 real(kind=dp) :: brn - integer :: ind,nkp,nn,m,n,iw + integer :: ind,nkp,nn,m,n,iw,nkp_loc - if (timing_level>1) call io_stopwatch('wann: omega',1) + if (timing_level>1.and.on_root) call io_stopwatch('wann: omega',1) - do nkp = 1, num_kpts + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) do nn = 1, nntot do n = 1, num_wann ! 
Note that this ln_tmp is defined differently wrt the one in wann_domega - ln_tmp(n,nn,nkp)=( aimag(log(csheet(n,nn,nkp) & - * m_matrix(n,n,nn,nkp))) - sheet(n,nn,nkp) ) + ln_tmp_loc(n,nn,nkp_loc)=( aimag(log(csheet(n,nn,nkp) & + * m_matrix_loc(n,n,nn,nkp_loc))) - sheet(n,nn,nkp) ) end do end do end do @@ -1465,14 +1616,18 @@ subroutine wann_omega(csheet,sheet,rave,r2ave,rave2,wann_spread) rave = 0.0_dp do iw = 1, num_wann do ind = 1, 3 - do nkp = 1, num_kpts + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) do nn = 1, nntot rave(ind,iw) = rave(ind,iw) + wb(nn) * bk(ind,nn,nkp) & - *ln_tmp(iw,nn,nkp) + *ln_tmp_loc(iw,nn,nkp_loc) enddo enddo enddo enddo + + call comms_allreduce(rave(1,1),num_wann*3,'SUM') + rave = -rave/real(num_kpts,dp) rave2 = 0.0_dp @@ -1490,13 +1645,17 @@ subroutine wann_omega(csheet,sheet,rave,r2ave,rave2,wann_spread) r2ave = 0.0_dp do iw = 1, num_wann - do nkp = 1, num_kpts + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) do nn = 1, nntot - mnn2 = real(m_matrix(iw,iw,nn,nkp)*conjg(m_matrix(iw,iw,nn,nkp)),kind=dp) - r2ave(iw) = r2ave(iw) + wb(nn) * ( 1.0_dp - mnn2 + ln_tmp(iw,nn,nkp)**2 ) + mnn2 = real(m_matrix_loc(iw,iw,nn,nkp_loc)*conjg(m_matrix_loc(iw,iw,nn,nkp_loc)),kind=dp) + r2ave(iw) = r2ave(iw) + wb(nn) * ( 1.0_dp - mnn2 + ln_tmp_loc(iw,nn,nkp_loc)**2 ) enddo enddo enddo + + call comms_allreduce(r2ave(1),num_wann,'SUM') + r2ave = r2ave/real(num_kpts,dp) !~ wann_spread%om_1 = 0.0_dp @@ -1555,19 +1714,23 @@ subroutine wann_omega(csheet,sheet,rave,r2ave,rave2,wann_spread) ! 
on subsequent passes it may be set to omega_invariant if (first_pass) then wann_spread%om_i = 0.0_dp - do nkp = 1, num_kpts + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) do nn = 1, nntot summ = 0.0_dp do m = 1, num_wann do n = 1, num_wann summ = summ & - + real(m_matrix(n,m,nn,nkp)*conjg(m_matrix(n,m,nn,nkp)),kind=dp) + + real(m_matrix_loc(n,m,nn,nkp_loc)*conjg(m_matrix_loc(n,m,nn,nkp_loc)),kind=dp) enddo enddo wann_spread%om_i = wann_spread%om_i & + wb(nn) * (real(num_wann,dp) - summ) enddo enddo + + call comms_allreduce(wann_spread%om_i,1,'SUM') + wann_spread%om_i = wann_spread%om_i / real(num_kpts,dp) first_pass=.false. else @@ -1575,35 +1738,43 @@ subroutine wann_omega(csheet,sheet,rave,r2ave,rave2,wann_spread) endif wann_spread%om_od = 0.0_dp - do nkp = 1, num_kpts + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) do nn = 1, nntot do m = 1, num_wann do n = 1, num_wann if (m.ne.n) wann_spread%om_od = wann_spread%om_od & - + wb(nn) * real( m_matrix(n,m,nn,nkp) & - * conjg(m_matrix(n,m,nn,nkp)), kind=dp ) + + wb(nn) * real( m_matrix_loc(n,m,nn,nkp_loc) & + * conjg(m_matrix_loc(n,m,nn,nkp_loc)), kind=dp ) enddo enddo enddo enddo + + call comms_allreduce(wann_spread%om_od,1,'SUM') + wann_spread%om_od = wann_spread%om_od / real(num_kpts,dp) wann_spread%om_d = 0.0_dp - do nkp = 1, num_kpts + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) do nn = 1, nntot do n = 1, num_wann brn = sum(bk(:,nn,nkp)*rave(:,n)) wann_spread%om_d = wann_spread%om_d + wb(nn) & - * ( ln_tmp(n,nn,nkp) + brn)**2 + * ( ln_tmp_loc(n,nn,nkp_loc) + brn)**2 enddo enddo enddo + + call comms_allreduce(wann_spread%om_d,1,'SUM') + wann_spread%om_d = wann_spread%om_d / real(num_kpts,dp) wann_spread%om_tot = wann_spread%om_i + wann_spread%om_d + wann_spread%om_od - if (timing_level>1) call io_stopwatch('wann: omega',2) + if (timing_level>1.and.on_root) call io_stopwatch('wann: omega',2) return @@ -1628,17 +1799,19 @@ subroutine
wann_domega(csheet,sheet,rave,cdodq) complex(kind=dp), intent(in) :: csheet (:,:,:) real(kind=dp), intent(in) :: sheet (:,:,:) real(kind=dp), intent(out) :: rave (:,:) - complex(kind=dp), intent(out) :: cdodq (:,:,:) + ! as we work on the local cdodq, returning the full cdodq array is now + ! made optional + complex(kind=dp), intent(out), optional :: cdodq (:,:,:) !local complex(kind=dp), allocatable :: cr (:,:) complex(kind=dp), allocatable :: crt (:,:) ! local - integer :: iw,ind,nkp,nn,m,n,ierr + integer :: iw,ind,nkp,nn,m,n,ierr,nkp_loc complex(kind=dp) :: mnn - if (timing_level>1) call io_stopwatch('wann: domega',1) + if (timing_level>1.and.on_root) call io_stopwatch('wann: domega',1) allocate( cr (num_wann, num_wann),stat=ierr ) if (ierr/=0) call io_error('Error in allocating cr in wann_domega') @@ -1646,73 +1819,99 @@ subroutine wann_domega(csheet,sheet,rave,cdodq) if (ierr/=0) call io_error('Error in allocating crt in wann_domega') - do nkp = 1, num_kpts + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) do nn = 1, nntot do n = 1, num_wann ! Note that this ln_tmp is defined differently wrt the one in wann_omega - ln_tmp(n,nn,nkp)=wb(nn)*( aimag(log(csheet(n,nn,nkp) & - * m_matrix(n,n,nn,nkp))) - sheet(n,nn,nkp) ) + ln_tmp_loc(n,nn,nkp_loc)=wb(nn)*( aimag(log(csheet(n,nn,nkp) & + * m_matrix_loc(n,n,nn,nkp_loc))) - sheet(n,nn,nkp) ) end do end do end do + + ! recalculate rave rave = 0.0_dp do iw = 1, num_wann do ind = 1, 3 - do nkp = 1, num_kpts + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) do nn = 1, nntot rave(ind,iw) = rave(ind,iw) + bk(ind,nn,nkp) & - * ln_tmp(iw,nn,nkp) + * ln_tmp_loc(iw,nn,nkp_loc) enddo enddo enddo enddo rave = -rave/real(num_kpts,dp) + call comms_allreduce(rave(1,1),num_wann*3,'SUM') + ! 
R_mn=M_mn/M_nn and q_m^{k,b} = Im phi_m^{k,b} + b.r_n are calculated rnkb = 0.0_dp - do nkp=1,num_kpts + rnkb_loc = 0.0_dp + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) do nn=1,nntot do n=1,num_wann - rnkb(n,nn,nkp) = sum(bk(:,nn,nkp)*rave(:,n)) + rnkb_loc(n,nn,nkp_loc) = sum(bk(:,nn,nkp)*rave(:,n)) enddo enddo enddo ! cd0dq(m,n,nkp) is calculated - cdodq=cmplx_0 - do nkp = 1, num_kpts + cdodq_loc=cmplx_0 + do nkp_loc = 1, counts(my_node_id) + nkp = nkp_loc + displs(my_node_id) do nn = 1, nntot do n=1,num_wann - mnn = m_matrix(n,n,nn,nkp) - crt(:,n) = m_matrix(:,n,nn,nkp) / mnn - cr(:,n) = m_matrix(:,n,nn,nkp) * conjg(mnn) + mnn = m_matrix_loc(n,n,nn,nkp_loc) + crt(:,n) = m_matrix_loc(:,n,nn,nkp_loc) / mnn + cr(:,n) = m_matrix_loc(:,n,nn,nkp_loc) * conjg(mnn) enddo do n = 1, num_wann do m = 1, num_wann ! A[R^{k,b}]=(R-Rdag)/2 - cdodq(m,n,nkp) = cdodq(m,n,nkp) & + cdodq_loc(m,n,nkp_loc) = cdodq_loc(m,n,nkp_loc) & + wb(nn) * 0.5_dp & *( cr(m,n) - conjg(cr(n,m)) ) ! -S[T^{k,b}]=-(T+Tdag)/2i ; T_mn = Rt_mn q_n - cdodq(m,n,nkp) = cdodq(m,n,nkp) - & - ( crt(m,n) * ln_tmp(n,nn,nkp) & - + conjg( crt(n,m) * ln_tmp(m,nn,nkp) ) ) & + cdodq_loc(m,n,nkp_loc) = cdodq_loc(m,n,nkp_loc) - & + ( crt(m,n) * ln_tmp_loc(n,nn,nkp_loc) & + + conjg( crt(n,m) * ln_tmp_loc(m,nn,nkp_loc) ) ) & * cmplx(0.0_dp,-0.5_dp,kind=dp) - cdodq(m,n,nkp) = cdodq(m,n,nkp) - wb(nn) & - * ( crt(m,n) * rnkb(n,nn,nkp) + conjg(crt(n,m) & - * rnkb(m,nn,nkp)) ) * cmplx(0.0_dp,-0.5_dp,kind=dp) + cdodq_loc(m,n,nkp_loc) = cdodq_loc(m,n,nkp_loc) - wb(nn) & + * ( crt(m,n) * rnkb_loc(n,nn,nkp_loc) + conjg(crt(n,m) & + * rnkb_loc(m,nn,nkp_loc)) ) * cmplx(0.0_dp,-0.5_dp,kind=dp) enddo enddo enddo enddo - cdodq = cdodq / real(num_kpts,dp) * 4.0_dp + cdodq_loc = cdodq_loc / real(num_kpts,dp) * 4.0_dp + + if(present(cdodq)) then + ! 
each process communicates its result to other processes + call comms_gatherv(cdodq_loc(1,1,1),num_wann*num_wann*counts(my_node_id),& + cdodq(1,1,1),num_wann*num_wann*counts,num_wann*num_wann*displs) + call comms_bcast(cdodq(1,1,1),num_wann*num_wann*num_kpts) + if (lsitesymmetry) then + call sitesym_symmetrize_gradient(1,cdodq) !RS: + cdodq_loc(:,:,1:counts(my_node_id))=cdodq(:,:,displs(my_node_id)+1:displs(my_node_id)+counts(my_node_id)) + endif + end if + + + deallocate( cr, stat=ierr ) + if (ierr/=0) call io_error('Error in deallocating cr in wann_domega') + deallocate( crt, stat=ierr ) + if (ierr/=0) call io_error('Error in deallocating crt in wann_domega') - if (lsitesymmetry) call sitesym_symmetrize_gradient(1,cdodq) !RS: - if (timing_level>1) call io_stopwatch('wann: domega',2) + + if (timing_level>1.and.on_root) call io_stopwatch('wann: domega',2) return @@ -1762,14 +1961,16 @@ subroutine wann_calc_projection() integer :: nw,nb,nkp,counter real(kind=dp) :: summ - if (timing_level>1) call io_stopwatch('wann: calc_projection',1) + if (timing_level>1.and.on_root) call io_stopwatch('wann: calc_projection',1) - write(stdout,'(/1x,a78)') repeat('-',78) - write(stdout,'(1x,9x,a)') & - 'Projection of Bands in Outer Window on all Wannier Functions' - write(stdout,'(1x,8x,62a)') repeat('-',62) - write(stdout,'(1x,16x,a)') ' Kpt Band Eigval |Projection|^2' - write(stdout,'(1x,16x,a47)') repeat('-',47) + if (on_root) then + write(stdout,'(/1x,a78)') repeat('-',78) + write(stdout,'(1x,9x,a)') & + 'Projection of Bands in Outer Window on all Wannier Functions' + write(stdout,'(1x,8x,62a)') repeat('-',62) + write(stdout,'(1x,16x,a)') ' Kpt Band Eigval |Projection|^2' + write(stdout,'(1x,16x,a47)') repeat('-',47) + endif do nkp=1,num_kpts counter=0 @@ -1780,14 +1981,14 @@ subroutine wann_calc_projection() do nw=1,num_wann summ=summ+abs(u_matrix_opt(counter,nw,nkp))**2 enddo - write(stdout,'(1x,16x,i5,1x,i5,1x,f14.6,2x,f14.8)') & + if (on_root) 
write(stdout,'(1x,16x,i5,1x,i5,1x,f14.6,2x,f14.8)') & nkp,nb,eigval(nb,nkp),summ endif enddo enddo - write(stdout,'(1x,a78/)') repeat('-',78) + if (on_root) write(stdout,'(1x,a78/)') repeat('-',78) - if (timing_level>1) call io_stopwatch('wann: calc_projection',2) + if (timing_level>1.and.on_root) call io_stopwatch('wann: calc_projection',2) return @@ -2022,7 +2223,7 @@ subroutine wann_check_unitarity() integer :: nkp,i,j,m complex(kind=dp) :: ctmp1,ctmp2 - if (timing_level>1) call io_stopwatch('wann: check_unitarity',1) + if (timing_level>1.and.on_root) call io_stopwatch('wann: check_unitarity',1) do nkp = 1, num_kpts do i = 1, num_wann @@ -2035,23 +2236,23 @@ subroutine wann_check_unitarity() enddo if ( (i.eq.j) .and. (abs (ctmp1 - cmplx_1 ) .gt. eps5) ) & then - write ( stdout , * ) ' ERROR: unitariety of final U', nkp, i, j, & + if (on_root) write ( stdout , * ) ' ERROR: unitariety of final U', nkp, i, j, & ctmp1 call io_error('wann_check_unitarity: error 1') endif if ( (i.eq.j) .and. (abs (ctmp2 - cmplx_1 ) .gt. eps5) ) & then - write ( stdout , * ) ' ERROR: unitariety of final U', nkp, i, j, & + if (on_root) write ( stdout , * ) ' ERROR: unitariety of final U', nkp, i, j, & ctmp2 call io_error('wann_check_unitarity: error 2') endif if ( (i.ne.j) .and. (abs (ctmp1) .gt. eps5) ) then - write ( stdout , * ) ' ERROR: unitariety of final U', nkp, i, j, & + if (on_root) write ( stdout , * ) ' ERROR: unitariety of final U', nkp, i, j, & ctmp1 call io_error('wann_check_unitarity: error 3') endif if ( (i.ne.j) .and. (abs (ctmp2) .gt. 
eps5) ) then - write ( stdout , * ) ' ERROR: unitariety of final U', nkp, i, j, & + if (on_root) write ( stdout , * ) ' ERROR: unitariety of final U', nkp, i, j, & ctmp2 call io_error('wann_check_unitarity: error 4') endif @@ -2059,7 +2260,7 @@ subroutine wann_check_unitarity() enddo enddo - if (timing_level>1) call io_stopwatch('wann: check_unitarity',2) + if (timing_level>1.and.on_root) call io_stopwatch('wann: check_unitarity',2) return @@ -2136,7 +2337,7 @@ subroutine wann_svd_omega_i() integer :: nkp,nn,nb,na,ind real(kind=dp) :: omt1,omt2,omt3 - if (timing_level>1) call io_stopwatch('wann: svd_omega_i',1) + if (timing_level>1.and.on_root) call io_stopwatch('wann: svd_omega_i',1) allocate( cw1 (10 * num_wann),stat=ierr ) if (ierr/=0) call io_error('Error in allocating cw1 in wann_svd_omega_i') @@ -2181,13 +2382,15 @@ subroutine wann_svd_omega_i() omt1 = omt1 / real(num_kpts,dp) omt2 = omt2 / real(num_kpts,dp) omt3 = omt3 / real(num_kpts,dp) - write ( stdout , * ) ' ' - write(stdout,'(2x,a,f15.9,1x,a)') 'Omega Invariant: 1-s^2 = ',& - omt1*lenconfac**2,'('//trim(length_unit)//'^2)' - write(stdout,'(2x,a,f15.9,1x,a)') ' -2log s = ',& - omt2*lenconfac**2,'('//trim(length_unit)//'^2)' - write(stdout,'(2x,a,f15.9,1x,a)') ' acos^2 = ',& - omt3*lenconfac**2,'('//trim(length_unit)//'^2)' + if (on_root) then + write ( stdout , * ) ' ' + write(stdout,'(2x,a,f15.9,1x,a)') 'Omega Invariant: 1-s^2 = ',& + omt1*lenconfac**2,'('//trim(length_unit)//'^2)' + write(stdout,'(2x,a,f15.9,1x,a)') ' -2log s = ',& + omt2*lenconfac**2,'('//trim(length_unit)//'^2)' + write(stdout,'(2x,a,f15.9,1x,a)') ' acos^2 = ',& + omt3*lenconfac**2,'('//trim(length_unit)//'^2)' + endif deallocate(cpad1,stat=ierr) if (ierr/=0) call io_error('Error in deallocating cpad1 in wann_svd_omega_i') @@ -2202,7 +2405,7 @@ subroutine wann_svd_omega_i() deallocate(cw1,stat=ierr) if (ierr/=0) call io_error('Error in deallocating cw1 in wann_svd_omega_i') - if (timing_level>1) call io_stopwatch('wann: 
svd_omega_i',2) + if (timing_level>1.and.on_root) call io_stopwatch('wann: svd_omega_i',2) return @@ -2419,7 +2622,7 @@ subroutine wann_main_gamma ldump=.false. if ( (num_dump_cycles.gt.0) .and. (mod(iter,num_dump_cycles).eq.0) ) ldump=.true. - if(lprint) write(stdout,'(1x,a,i6)') 'Cycle: ',iter + if(lprint.and.on_root) write(stdout,'(1x,a,i6)') 'Cycle: ',iter !~ ! initialize rguide as rave for use_bloch_phases !~ if ( (iter.gt.num_no_guide_iter) .and. lguide ) then diff --git a/test-suite/Makefile b/test-suite/Makefile index 05a476fe6..2a519b14b 100644 --- a/test-suite/Makefile +++ b/test-suite/Makefile @@ -50,7 +50,7 @@ run-tests-wannier-serial : clean prolog env QE_USE_MPI=0 ${TESTCODE_DIR}/bin/testcode.py --verbose --category=wannier_all run-tests-wannier-parallel : clean prolog - env QE_USE_MPI=1 ${TESTCODE_DIR}/bin/testcode.py --verbose --category=wannier_all + env QE_USE_MPI=1 ${TESTCODE_DIR}/bin/testcode.py --verbose --category=wannier_par run-custom-test-serial : clean prolog @if test -d $(testdir); then \ diff --git a/test-suite/config/extract/extract-wannier.x b/test-suite/config/extract/extract-wannier.x index bff4d9c5f..756d29aab 100755 --- a/test-suite/config/extract/extract-wannier.x +++ b/test-suite/config/extract/extract-wannier.x @@ -34,9 +34,9 @@ omegaT=`grep " Omega Total " $fname | awk '{print $7}'` # Wannier -pp nearn=`sed -n '/ Distance (Ang^-1)/{n;n;p;n;p;n;p;n;p;n;p;n;p;n;p;n;p;n;p;n;p;n;p;n;p;}'\ $fname | awk '{print $2}; {print $3}; {print $4}'` -compl=`sed -n '/ b_k(x) /{n;n;p;n;p;n;p;n;p;n;p;n;p;}' \ +compl=`sed -n '/ Completeness relation is fully satisfied /{n;n;n;n;n;n;p;n;p;n;p;n;p;n;p;n;p;}'\ $fname | awk '{print $2; print $3; print $4; print $5; print $6}'` - +#compl=`sed -n '/ b_k(x) /{n;n;p;n;p;n;p;n;p;n;p;n;p;}' \ proji=`sed -n '/ Projections/{n;p;n;p;n;p;n;p;}'\ diff --git a/test-suite/jobconfig b/test-suite/jobconfig index 49d4d5ecd..311b32058 100644 --- a/test-suite/jobconfig +++ b/test-suite/jobconfig @@ -118,6 +118,7 @@ 
inputs_args = ('gaas1.win', '1'),('gaas2.win', '1') [categories] wannier_all = example*?? test_* wan_* +wannier_par = example*?? wan_l* test_b* test_n* test_p* interface_all = pw_example*?? test_all = example*?? pw_example*?? test_* wan_* _default_ = example*?? test* wan_* diff --git a/test-suite/run_tests_travis.sh b/test-suite/run_tests_travis.sh index bfb594c4a..bbc893482 100755 --- a/test-suite/run_tests_travis.sh +++ b/test-suite/run_tests_travis.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -e + ## Set here, if needed, the location of the executables THISDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" export ESPRESSO_ROOT="${THISDIR}/external-codes/espresso/" @@ -16,6 +18,10 @@ elif [ "$W90TESTSWITHINTERFACE" == "false" ] then # Only wannier tests make run-tests + if [ "$W90BINARYPARALLEL" == "true" ] + then + make run-tests-parallel + fi else # By default: run both make run-tests-all diff --git a/test-suite/travis_copy_make.inc.sh b/test-suite/travis_copy_make.inc.sh new file mode 100755 index 000000000..723e02385 --- /dev/null +++ b/test-suite/travis_copy_make.inc.sh @@ -0,0 +1,10 @@ +#!/bin/bash +#Stop the full script if one line crashes +set -e + +if [ "$W90BINARYPARALLEL" == "true" ] +then + cp config/make.inc.gfort.traviscimpi make.inc +else + cp config/make.inc.gfort.travisci make.inc +fi