diff --git a/.travis.yml b/.travis.yml
index f0c5249aa..94e2fb1ae 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,17 +7,22 @@ addons:
         - gfortran 
         - libblas-dev 
         - liblapack-dev 
+        - openmpi-bin
+        - libopenmpi-dev
 env:
 ## Uncomment the following line if you want to run also the interface tests
 #   - W90TESTSWITHINTERFACE=true
-   - W90TESTSWITHINTERFACE=false
+   - W90TESTSWITHINTERFACE=false W90BINARYPARALLEL=false
+   - W90TESTSWITHINTERFACE=false W90BINARYPARALLEL=true
 install:
 # Possibly install QE and other interface code
     - ./test-suite/external-codes/get-external-codes.sh
 # Install Wannier
-    - cp config/make.inc.gfort.travisci make.inc
+#- cp config/make.inc.gfort.travisci make.inc
+    - ./test-suite/travis_copy_make.inc.sh
     - make -j default
 script:
     - ./test-suite/run_tests_travis.sh
 git:
   depth: 3
+
diff --git a/CHANGE.log b/CHANGE.log
index 8febc1894..28fd63f84 100644
--- a/CHANGE.log
+++ b/CHANGE.log
@@ -4,14 +4,20 @@
 
 	The Maximally-Localised Generalised Wannier Functions Code
 
-v2.x.y (25th January 2017)
+v2.x.y (DATE)
 * G0W0 interface implemented (A. Marrazzo (EPFL, CH) and S. Tsirkin (DIPC, Spain)).
-  Added two utilities (gw2wannier90.py and k_mapper.py) and example 23 on G0W0 bands interpolation with Yambo. 
+  Added two utilities (gw2wannier90.py and k_mapper.py) and example 23 on
+  G0W0 bands interpolation with Yambo.
+* Parallelized the disentanglement and wannierise subroutines
+  with respect to k-points. Other parts of the code have been
+  modified so that only the root process writes to the .wout file.
+  wannier90.x can now be run in parallel (see the user guide).
+  (contributed by Guillaume Geranton, FZ Julich)
 v2.1.0 (13th January 2017)
 * Implementation of the symmetry-adapted Wannier functions
   (see R. Sakuma, Phys. Rev. B 87, 235109 (2013), courtesy
    of R. Sakuma (Lund University, Sweden), T. Koretsune (Riken, JP),
-   Y. Nomura (U. Tokyo, JP), Y. Nohara (Atomic-Scale Material 
+   Y. Nomura (U. Tokyo, JP), Y. Nohara (Atomic-Scale Material
    Simulations, Co., Ltd.), R. Arita (Riken, JP))
 * Streamlined the interface between wannier90 and tight-binding
   codes such as pythtb (new input variable: write_tb). Also,
diff --git a/config/make.inc.gfort.traviscimpi b/config/make.inc.gfort.traviscimpi
new file mode 100644
index 000000000..24c50f00d
--- /dev/null
+++ b/config/make.inc.gfort.traviscimpi
@@ -0,0 +1,21 @@
+#===================================
+# gfortran for the travis-ci service
+# This file for MPI builds
+#===================================
+F90 = gfortran
+
+COMMS  = mpi 
+MPIF90 = mpif90 
+
+# Options for debugging. When we move to gfortran 4.9, add -fsanitize=leak
+FCOPTS =  -fstrict-aliasing  -fno-omit-frame-pointer -fno-realloc-lhs -fcheck=bounds,do,recursion,pointer -ffree-form -Wall -Waliasing -Wsurprising -Wline-truncation -Wno-tabs -Wno-uninitialized -Wno-unused-dummy-argument -Wno-unused -Wno-character-truncation -O1 -g -fbacktrace 
+LDOPTS =  -fstrict-aliasing  -fno-omit-frame-pointer -fno-realloc-lhs -fcheck=bounds,do,recursion,pointer -ffree-form -Wall -Waliasing -Wsurprising -Wline-truncation -Wno-tabs -Wno-uninitialized -Wno-unused-dummy-argument -Wno-unused -Wno-character-truncation -O1 -g -fbacktrace 
+
+#=======================
+# System LAPACK and BLAS
+# e.g. use
+#   sudo apt-get install libblas-dev liblapack-dev
+# on Ubuntu
+#=======================
+LIBS = -llapack -lblas
+
diff --git a/doc/tutorial/tutorial.tex b/doc/tutorial/tutorial.tex
index 73a040d57..a59821dfa 100644
--- a/doc/tutorial/tutorial.tex
+++ b/doc/tutorial/tutorial.tex
@@ -121,10 +121,8 @@
 
 \sectiontitle{Parallel execution}
 \label{sec:parallel}
-Presently, {\tt wannier90.x} is a serial-only executable, so it
-\emph{cannot} be run in parallel using MPI libraries. On the contrary,
-{\tt postw90.x} can be run in parallel to speed up the calculations,
-using the MPI libraries.
+{\tt postw90.x} and {\tt wannier90.x} can be run in parallel to speed up
+the calculations, using the MPI libraries.
 
 To enable the parallel version to be built, you must specify some
 flags in the {\tt make.inc} file of {\tt wannier90} and {\tt postw90};
diff --git a/doc/user_guide/parameters.tex b/doc/user_guide/parameters.tex
index 19e4b7898..7ad545d63 100644
--- a/doc/user_guide/parameters.tex
+++ b/doc/user_guide/parameters.tex
@@ -1,10 +1,11 @@
 \chapter{Parameters}\label{chap:parameters}
 
 \section{Usage}
-{\tt
-\begin{quote}
-wannier90.x [-pp] [seedname]
-\end{quote} }
+{\tt wannier90.x} can be run in parallel using MPI libraries to
+reduce the computation time.
+
+For serial execution use: {\tt wannier90.x [-pp] [seedname]}
+
 \begin{itemize}
 \item{ {\tt seedname}: If a seedname string is given the code will read its input
 from a file {\tt seedname.win}. The default value is {\tt wannier}. One can also equivalently provide the string
@@ -14,6 +15,25 @@ \section{Usage}
 This information is written to the file {\tt seedname.nnkp}.}
 \end{itemize}
 
+For parallel execution use: {\tt mpirun -np NUMPROCS wannier90.x [-pp] [seedname]}
+
+\begin{itemize} \item 
+{\tt NUMPROCS}: substitute with the number of processors that you want
+to use.
+\end{itemize}
+
+Note that the {\tt mpirun} command and command-line flags may be
+different in your MPI implementation: read your MPI manual or ask your
+computer administrator.
+
+Note also that this requires that the {\tt wannier90.x} executable has been
+compiled in its parallel version (follow the instructions in the file
+{\tt README.install} in the main directory of the wannier90
+distribution) and
+that the MPI libraries and binaries are installed and correctly
+configured on your machine.
+
+
 \section{{\tt seedname.win} File\label{sec:seednamefile}}
 The \wannier\ input file {\tt seedname.win} has a flexible free-form
 structure. 
diff --git a/doc/user_guide/postw90params.tex b/doc/user_guide/postw90params.tex
index 250654588..a57c1f04b 100644
--- a/doc/user_guide/postw90params.tex
+++ b/doc/user_guide/postw90params.tex
@@ -3,9 +3,9 @@ \chapter{Parameters}
 \section{Introduction}
 
 The \texttt{wannier90.x} code described in Part~\ref{part:w90}
-calculates the maximally-localized Wannier functions. The \texttt{wannier90.x} code is a
-serial executable (i.e., it cannot be executed in parallel on different
-CPUs).
+calculates the maximally-localized Wannier functions. %The \texttt{wannier90.x} code is a
+%serial executable (i.e., it cannot be executed in parallel on different
+%CPUs).
 
 The \texttt{postw90.x} executable contains instead a series of modules
 that take the Wannier functions calculated by \texttt{wannier90.x} and
diff --git a/src/Makefile.2 b/src/Makefile.2
index f1473a032..f0795f829 100644
--- a/src/Makefile.2
+++ b/src/Makefile.2
@@ -5,7 +5,8 @@
 include ../../make.inc
 
 OBJS  =  constants.o io.o utility.o parameters.o hamiltonian.o overlap.o \
-	 kmesh.o disentangle.o ws_distance.o wannierise.o plot.o transport.o sitesym.o
+	 kmesh.o disentangle.o ws_distance.o wannierise.o plot.o transport.o sitesym.o comms.o
+
 
 OBJS2  =  wannier_lib.o
 
@@ -31,8 +32,8 @@ TEMP2 = $(F90)
 endif
 
 
-wannier libs w90chk2chk serialobjs: POSTOPTS = 
-wannier libs w90chk2chk serialobjs: COMPILER = $(F90)
+wannier libs w90chk2chk serialobjs: POSTOPTS = $(TEMP1) 
+wannier libs w90chk2chk serialobjs: COMPILER = $(TEMP2)
 wannier: ../../wannier90.x 
 w90chk2chk: ../../w90chk2chk.x
 
@@ -42,7 +43,7 @@ serialobjs: $(OBJS)
 	$(COMPILER) ../w90chk2chk.F90 $(LDOPTS) $(OBJS) $(LIBS) -o ../../w90chk2chk.x
 
 ../../wannier90.x: $(OBJS) ../wannier_prog.F90
-	$(COMPILER) ../wannier_prog.F90 $(LDOPTS) $(OBJS) $(LIBS) -o ../../wannier90.x
+	$(COMPILER) ../wannier_prog.F90 $(POSTOPTS) $(LDOPTS) $(OBJS) $(LIBS) -o ../../wannier90.x
 
 post: POSTOPTS = $(TEMP1)
 post: COMPILER = $(TEMP2)
@@ -88,7 +89,7 @@ io.o: ../io.F90 constants.o
 utility.o: ../utility.F90  constants.o io.o
 	 $(COMPILER) $(POSTOPTS) $(FCOPTS) -c ../utility.F90
 
-parameters.o: ../parameters.F90 constants.o io.o utility.o
+parameters.o: ../parameters.F90 constants.o io.o utility.o comms.o
 	 $(COMPILER) $(POSTOPTS) $(FCOPTS) -c ../parameters.F90
 
 hamiltonian.o: ../hamiltonian.F90 ws_distance.o constants.o io.o utility.o parameters.o
@@ -100,10 +101,10 @@ overlap.o: ../overlap.F90 constants.o io.o utility.o parameters.o sitesym.o
 kmesh.o: ../kmesh.F90 constants.o io.o utility.o parameters.o 
 	 $(COMPILER) $(POSTOPTS) $(FCOPTS) -c ../kmesh.F90
 
-disentangle.o: ../disentangle.F90 constants.o io.o parameters.o sitesym.o
+disentangle.o: ../disentangle.F90 constants.o io.o parameters.o sitesym.o comms.o
 	 $(COMPILER) $(POSTOPTS) $(FCOPTS) -c ../disentangle.F90
 
-wannierise.o: ../wannierise.F90 hamiltonian.o constants.o io.o utility.o parameters.o sitesym.o
+wannierise.o: ../wannierise.F90 hamiltonian.o constants.o io.o utility.o parameters.o sitesym.o comms.o
 	 $(COMPILER) $(POSTOPTS) $(FCOPTS) -c ../wannierise.F90
 
 plot.o: ../plot.F90 constants.o io.o utility.o parameters.o hamiltonian.o ws_distance.o
diff --git a/src/disentangle.F90 b/src/disentangle.F90
index 281d7d520..4f3ce26ce 100644
--- a/src/disentangle.F90
+++ b/src/disentangle.F90
@@ -18,7 +18,19 @@ module w90_disentangle
 
   use w90_constants, only: dp,cmplx_0,cmplx_1
   use w90_io, only: io_error,stdout,io_stopwatch
-  use w90_parameters
+  use w90_parameters, only : num_bands,num_wann,a_matrix,u_matrix_opt,&
+       u_matrix,m_matrix_orig,lwindow,dis_conv_window,devel_flag, &
+       nntot,timing_level,omega_invariant,u_matrix,lsitesymmetry, &
+       lenconfac,iprint,wbtot,dis_num_iter,dis_mix_ratio,dis_win_min, &
+       dis_win_max,dis_froz_min,dis_froz_max,dis_spheres_num, &
+       dis_spheres_first_wann,num_kpts,nnlist,ndimwin,wb,gamma_only, &
+       eigval,length_unit,dis_spheres,m_matrix,dis_conv_tol,frozen_states, &
+       optimisation,recip_lattice,kpt_latt
+
+  use w90_comms, only : on_root, my_node_id, num_nodes,&
+                        comms_bcast, comms_array_split,&
+                        comms_gatherv, comms_allreduce
+
   use w90_sitesym, only: sitesym_slim_d_matrix_band, &
        sitesym_replace_d_matrix_band,sitesym_symmetrize_u_matrix,&
        sitesym_symmetrize_zmatrix,sitesym_dis_extract_symmetry !RS:
@@ -67,7 +79,7 @@ subroutine dis_main()
 
     if (timing_level>0) call io_stopwatch('dis: main',1)
 
-    write(stdout,'(/1x,a)') &
+    if (on_root) write(stdout,'(/1x,a)') &
          '*------------------------------- DISENTANGLE --------------------------------*'
 
     ! Allocate arrays
@@ -86,10 +98,10 @@ subroutine dis_main()
     ! (Sec. III.G SMV)
     if (linner) then
        if (lsitesymmetry) call io_error('in symmetry-adapted mode, frozen window not implemented yet') !YN: RS: 
-       write(stdout,'(3x,a)') 'Using an inner window (linner = T)'  
+       if (on_root) write(stdout,'(3x,a)') 'Using an inner window (linner = T)'  
        call dis_proj_froz()
     else
-       write(stdout,'(3x,a)') 'No inner window (linner = F)'         
+       if (on_root) write(stdout,'(3x,a)') 'No inner window (linner = F)'         
     endif
 
     ! Debug
@@ -238,7 +250,7 @@ subroutine dis_main()
     ! Deallocate module arrays
     call internal_dealloc()
 
-    if (timing_level>0) call io_stopwatch('dis: main',2)
+    if (timing_level>0.and.on_root) call io_stopwatch('dis: main',2)
 
     return
 
@@ -279,8 +291,8 @@ subroutine internal_check_orthonorm()
                enddo
                if (l.eq.m) then  
                   if (abs(ctmp - cmplx_1).gt.eps8) then  
-                     write(stdout,'(3i6,2f16.12)') nkp,l,m,ctmp  
-                     write(stdout,'(1x,a)') 'The trial orbitals for disentanglement are not orthonormal'
+                     if (on_root) write(stdout,'(3i6,2f16.12)') nkp,l,m,ctmp  
+                     if (on_root) write(stdout,'(1x,a)') 'The trial orbitals for disentanglement are not orthonormal'
 !                     write(stdout,'(1x,a)') 'Try re-running the calculation with the input keyword'
 !                     write(stdout,'(1x,a)') '  devel_flag=orth-fix'
 !                     write(stdout,'(1x,a)') 'Please report the sucess or failure of this to the Wannier90 developers'
@@ -288,8 +300,8 @@ subroutine internal_check_orthonorm()
                   endif
                else  
                   if (abs(ctmp).gt.eps8) then  
-                     write(stdout,'(3i6,2f16.12)') nkp,l,m,ctmp  
-                     write(stdout,'(1x,a)') 'The trial orbitals for disentanglement are not orthonormal'
+                     if (on_root) write(stdout,'(3i6,2f16.12)') nkp,l,m,ctmp  
+                     if (on_root) write(stdout,'(1x,a)') 'The trial orbitals for disentanglement are not orthonormal'
 !                     write(stdout,'(1x,a)') 'Try re-running the calculation with the input keyword'
 !                     write(stdout,'(1x,a)') '  devel_flag=orth-fix'
 !                     write(stdout,'(1x,a)') 'Please report the sucess or failure of this to the Wannier90 developers'
@@ -300,7 +312,7 @@ subroutine internal_check_orthonorm()
          enddo
       enddo
 
-      if (timing_level>1) call io_stopwatch('dis: main: check_orthonorm',2)
+      if (timing_level>1 .and. on_root) call io_stopwatch('dis: main: check_orthonorm',2)
 
       return
 
@@ -322,7 +334,7 @@ subroutine internal_slim_m()
       integer                       :: nkp,nkp2,nn,i,j,m,n,ierr
       complex(kind=dp), allocatable :: cmtmp(:,:)
       
-      if (timing_level>1) call io_stopwatch('dis: main: slim_m',1)
+      if (timing_level>1 .and. on_root) call io_stopwatch('dis: main: slim_m',1)
 
       allocate(cmtmp(num_bands,num_bands),stat=ierr)
       if (ierr/=0) call io_error('Error in allocating cmtmp in dis_main')
@@ -349,7 +361,7 @@ subroutine internal_slim_m()
       deallocate(cmtmp,stat=ierr)
       if (ierr/=0) call io_error('Error deallocating cmtmp in dis_main')
  
-      if (timing_level>1) call io_stopwatch('dis: main: slim_m',2)
+      if (timing_level>1 .and. on_root) call io_stopwatch('dis: main: slim_m',2)
      
       return
       
@@ -390,7 +402,7 @@ subroutine internal_find_u()
       complex(kind=dp), allocatable :: cz(:,:)
       complex(kind=dp), allocatable :: cwork(:)
 
-      if (timing_level>1) call io_stopwatch('dis: main: find_u',1)
+      if (timing_level>1.and.on_root) call io_stopwatch('dis: main: find_u',1)
 
       ! Allocate arrays needed for ZGESVD
       allocate(svals(num_wann),stat=ierr)
@@ -417,10 +429,10 @@ subroutine internal_find_u()
          call ZGESVD ('A', 'A', num_wann, num_wann, caa(:,:,nkp), num_wann, &
               svals, cz, num_wann, cv, num_wann, cwork, 4*num_wann, rwork, info)
          if (info.ne.0) then  
-            write(stdout,*) ' ERROR: IN ZGESVD IN dis_main'  
-            write(stdout,*) 'K-POINT NKP=', nkp, ' INFO=', info  
+            if (on_root) write(stdout,*) ' ERROR: IN ZGESVD IN dis_main'  
+            if (on_root) write(stdout,*) 'K-POINT NKP=', nkp, ' INFO=', info  
             if (info.lt.0) then  
-               write(stdout,*) 'THE ',  -info, '-TH ARGUMENT HAD ILLEGAL VALUE'  
+               if (on_root) write(stdout,*) 'THE ',  -info, '-TH ARGUMENT HAD ILLEGAL VALUE'  
             endif
             call io_error('dis_main: problem in ZGESVD 1')  
          endif
@@ -625,7 +637,7 @@ subroutine dis_windows()
     !                    it is slimmed down to contain only those inside the
     !                    energy window, stored in nb=1,...,ndimwin(nkp)
 
-    if (timing_level>1) call io_stopwatch('dis: windows',1)
+    if (timing_level>1 .and. on_root) call io_stopwatch('dis: windows',1)
 
     ! Allocate module arrays
     allocate(nfirstwin(num_kpts),stat=ierr)
@@ -641,33 +653,33 @@ subroutine dis_windows()
 
     linner = .false.  
 
-    write(stdout,'(1x,a)') &
+    if (on_root) write(stdout,'(1x,a)') &
          '+----------------------------------------------------------------------------+'
-    write(stdout,'(1x,a)') &
+    if (on_root) write(stdout,'(1x,a)') &
          '|                              Energy  Windows                               |'
-    write(stdout,'(1x,a)') &
+    if (on_root) write(stdout,'(1x,a)') &
          '|                              ---------------                               |'
-    write(stdout,'(1x,a,f10.5,a,f10.5,a)') &
+    if (on_root) write(stdout,'(1x,a,f10.5,a,f10.5,a)') &
          '|                   Outer: ',dis_win_min,'  to ',dis_win_max,&
          '  (eV)                   |'
     if (frozen_states) then
-       write(stdout,'(1x,a,f10.5,a,f10.5,a)') &
+       if (on_root) write(stdout,'(1x,a,f10.5,a,f10.5,a)') &
             '|                   Inner: ',dis_froz_min,'  to ',dis_froz_max,&
             '  (eV)                   |'
     else
-       write(stdout,'(1x,a)') &
+       if (on_root) write(stdout,'(1x,a)') &
             '|                   No frozen states were specified                          |'
     endif
-    write(stdout,'(1x,a)') &
+    if (on_root) write(stdout,'(1x,a)') &
          '+----------------------------------------------------------------------------+'
 
     do nkp = 1, num_kpts  
        ! Check which eigenvalues fall within the outer window
        if ( (eigval_opt(1,nkp).gt.dis_win_max).or.&
             (eigval_opt(num_bands,nkp).lt.dis_win_min) ) then
-          write(stdout,*) ' ERROR AT K-POINT: ', nkp  
-          write(stdout,*) ' ENERGY WINDOW (eV):    [',dis_win_min,  ',', dis_win_max,     ']'
-          write(stdout,*) ' EIGENVALUE RANGE (eV): [',&
+          if (on_root) write(stdout,*) ' ERROR AT K-POINT: ', nkp  
+          if (on_root) write(stdout,*) ' ENERGY WINDOW (eV):    [',dis_win_min,  ',', dis_win_max,     ']'
+          if (on_root) write(stdout,*) ' EIGENVALUE RANGE (eV): [',&
                eigval_opt(1,nkp),',',eigval_opt(num_bands,nkp),']'
           call io_error('dis_windows: The outer energy window contains no eigenvalues')
        endif
@@ -713,7 +725,7 @@ subroutine dis_windows()
        !~~ GS-end
 
        if (ndimwin(nkp).lt.num_wann) then  
-          write(stdout,483) 'Error at k-point ',nkp,&
+          if (on_root) write(stdout,483) 'Error at k-point ',nkp,&
                ' ndimwin=',ndimwin(nkp),' num_wann=',num_wann
 483       format(1x,a17,i4,a8,i3,a9,i3)  
           call io_error('dis_windows: Energy window contains fewer states than number of target WFs') 
@@ -750,11 +762,11 @@ subroutine dis_windows()
        ndimfroz(nkp) = kifroz_max - kifroz_min + 1  
 
        if (ndimfroz(nkp).gt.num_wann) then  
-          write(stdout,401) nkp, ndimfroz(nkp),num_wann  
+          if (on_root) write(stdout,401) nkp, ndimfroz(nkp),num_wann  
 401       format(' ERROR AT K-POINT ',i4,' THERE ARE ',i2, &
                ' BANDS INSIDE THE INNER WINDOW AND ONLY',i2, &
                ' TARGET BANDS')
-          write(stdout,402) (eigval_opt(i,nkp),i = imin, imax)  
+          if (on_root) write(stdout,402) (eigval_opt(i,nkp),i = imin, imax)  
 402       format('BANDS: (eV)',10(F10.5,1X))  
           call io_error('dis_windows: More states in the frozen window than target WFs')
        endif
@@ -772,11 +784,11 @@ subroutine dis_windows()
              lfrozen(indxfroz(i,nkp),nkp) = .true.  
           enddo
           if (indxfroz(ndimfroz(nkp),nkp).ne.kifroz_max) then  
-             write(stdout,*) ' Error at k-point ', nkp, ' frozen band #', i  
-             write(stdout,*) ' ndimfroz=', ndimfroz(nkp)  
-             write(stdout,*) ' kifroz_min=', kifroz_min  
-             write(stdout,*) ' kifroz_max=', kifroz_max  
-             write(stdout,*) ' indxfroz(i,nkp)=', indxfroz(i,nkp)  
+             if (on_root) write(stdout,*) ' Error at k-point ', nkp, ' frozen band #', i  
+             if (on_root) write(stdout,*) ' ndimfroz=', ndimfroz(nkp)  
+             if (on_root) write(stdout,*) ' kifroz_min=', kifroz_min  
+             if (on_root) write(stdout,*) ' kifroz_max=', kifroz_max  
+             if (on_root) write(stdout,*) ' indxfroz(i,nkp)=', indxfroz(i,nkp)  
              call io_error('dis_windows: Something fishy...')
           endif
        endif
@@ -792,8 +804,8 @@ subroutine dis_windows()
        enddo
 
        if ( i.ne.ndimwin(nkp) - ndimfroz(nkp) ) then  
-          write(stdout,*) ' Error at k-point: ',nkp
-          write(stdout,'(3(a,i5))') ' i: ',i,' ndimwin: ',ndimwin(nkp),&
+          if (on_root) write(stdout,*) ' Error at k-point: ',nkp
+          if (on_root) write(stdout,'(3(a,i5))') ' i: ',i,' ndimwin: ',ndimwin(nkp),&
                ' ndimfroz: ',ndimfroz(nkp)
           call io_error('dis_windows: i .ne. (ndimwin-ndimfroz) at k-point')
        endif
@@ -830,51 +842,51 @@ subroutine dis_windows()
 !~![ysl-e]
 
     if (iprint>1) then
-       write(stdout,'(1x,a)') &
+       if (on_root) write(stdout,'(1x,a)') &
             '|                        K-points with Frozen States                         |'
-       write(stdout,'(1x,a)') &
+       if (on_root) write(stdout,'(1x,a)') &
             '|                        ---------------------------                         |'
        i=0
        do nkp=1,num_kpts
           if (ndimfroz(nkp).gt.0) then
              i=i+1
              if (i.eq.1) then
-                write(stdout,'(1x,a,i6)',advance='no') '|',nkp
+                if (on_root) write(stdout,'(1x,a,i6)',advance='no') '|',nkp
              else if ((i.gt.1) .and. (i.lt.12)) then
-                write(stdout,'(i6)',advance='no') nkp
+                if (on_root) write(stdout,'(i6)',advance='no') nkp
              else if (i.eq.12) then 
-                write(stdout,'(i6,a)') nkp,'    |'
+                if (on_root) write(stdout,'(i6,a)') nkp,'    |'
                 i=0
              endif
           endif
        enddo
        if (i.ne.0) then
           do j=1,12-i
-             write(stdout,'(6x)',advance='no')
+             if (on_root) write(stdout,'(6x)',advance='no')
           enddo
-          write(stdout,'(a)') '    |'
+          if (on_root) write(stdout,'(a)') '    |'
        endif
-       write(stdout,'(1x,a)') &
+       if (on_root) write(stdout,'(1x,a)') &
             '+----------------------------------------------------------------------------+'
     endif
 
-    write(stdout,'(3x,a,i4)') 'Number of target bands to extract: ',num_wann
+    if (on_root) write(stdout,'(3x,a,i4)') 'Number of target bands to extract: ',num_wann
     if (iprint>1) then
-       write(stdout,'(1x,a)') &
+       if (on_root) write(stdout,'(1x,a)') &
             '+----------------------------------------------------------------------------+'
-       write(stdout,'(1x,a)') &
+       if (on_root) write(stdout,'(1x,a)') &
             '|                                  Windows                                   |'
-       write(stdout,'(1x,a)') &
+       if (on_root) write(stdout,'(1x,a)') &
             '|                                  -------                                   |'
-       write(stdout,'(1x,a)') &
+       if (on_root) write(stdout,'(1x,a)') &
             '|               K-point      Ndimwin     Ndimfroz    Nfirstwin               |'
-       write(stdout,'(1x,a)') &
+       if (on_root) write(stdout,'(1x,a)') &
             '|               ----------------------------------------------               |'
        do nkp=1,num_kpts
-          write(stdout,403) nkp,ndimwin(nkp),ndimfroz(nkp),nfirstwin(nkp)
+          if (on_root) write(stdout,403) nkp,ndimwin(nkp),ndimfroz(nkp),nfirstwin(nkp)
        enddo
 403    format(1x,'|',14x,i6,7x,i6,7x,i6,6x,i6,18x,'|')
-       write(stdout,'(1x,a)') &
+       if (on_root) write(stdout,'(1x,a)') &
             '+----------------------------------------------------------------------------+'
     endif
 
@@ -949,12 +961,12 @@ subroutine dis_project()
 
     if (timing_level>1) call io_stopwatch('dis: project',1)
 
-    write(stdout,'(/1x,a)') &
+    if (on_root) write(stdout,'(/1x,a)') &
          '                  Unitarised projection of Wannier functions                  '
-    write(stdout,'(1x,a)') &
+    if (on_root) write(stdout,'(1x,a)') &
          '                  ------------------------------------------                  '
-    write(stdout,'(3x,a)') 'A_mn = <psi_m|g_n> --> S = A.A^+ --> U = S^-1/2.A'
-    write(stdout,'(3x,a)',advance='no') 'In dis_project...' 
+    if (on_root) write(stdout,'(3x,a)') 'A_mn = <psi_m|g_n> --> S = A.A^+ --> U = S^-1/2.A'
+    if (on_root) write(stdout,'(3x,a)',advance='no') 'In dis_project...' 
 
     allocate(catmpmat(num_bands,num_bands,num_kpts),stat=ierr)
     if (ierr/=0) call io_error('Error in allocating catmpmat in dis_project')
@@ -992,10 +1004,10 @@ subroutine dis_project()
             num_bands, svals, cz, num_bands, cvdag, num_bands, cwork, &
             4*num_bands, rwork, info)
        if (info.ne.0) then  
-          write(stdout,*) ' ERROR: IN ZGESVD IN dis_project'  
-          write(stdout,*) ' K-POINT NKP=', nkp, ' INFO=', info  
+          if (on_root) write(stdout,*) ' ERROR: IN ZGESVD IN dis_project'  
+          if (on_root) write(stdout,*) ' K-POINT NKP=', nkp, ' INFO=', info  
           if (info.lt.0) then  
-             write(stdout,*) ' THE ',  -info, '-TH ARGUMENT HAD ILLEGAL VALUE'  
+             if (on_root) write(stdout,*) ' THE ',  -info, '-TH ARGUMENT HAD ILLEGAL VALUE'  
           endif
           call io_error('dis_project: problem in ZGESVD 1')   
        endif
@@ -1045,19 +1057,19 @@ subroutine dis_project()
                 ctmp2 = ctmp2 + u_matrix_opt(m,j,nkp) * conjg(u_matrix_opt(m,i,nkp))  
              enddo
              if ( (i.eq.j).and.(abs(ctmp2-cmplx_1).gt.eps5) ) then
-                write(stdout,*) ' ERROR: unitarity of initial U'  
-                write(stdout,'(1x,a,i2)') 'nkp= ', nkp  
-                write(stdout,'(1x,a,i2,2x,a,i2)') 'i= ', i, 'j= ', j  
-                write(stdout,'(1x,a,f12.6,1x,f12.6)') &
+                if (on_root) write(stdout,*) ' ERROR: unitarity of initial U'  
+                if (on_root) write(stdout,'(1x,a,i2)') 'nkp= ', nkp  
+                if (on_root) write(stdout,'(1x,a,i2,2x,a,i2)') 'i= ', i, 'j= ', j  
+                if (on_root) write(stdout,'(1x,a,f12.6,1x,f12.6)') &
                      '[u_matrix_opt.transpose(u_matrix_opt)]_ij= ',&
                      real(ctmp2,dp),aimag(ctmp2)
                 call io_error('dis_project: Error in unitarity of initial U in dis_project')
              endif
              if ( (i.ne.j) .and. (abs(ctmp2).gt.eps5) ) then  
-                write(stdout,*) ' ERROR: unitarity of initial U'  
-                write(stdout,'(1x,a,i2)') 'nkp= ', nkp  
-                write(stdout,'(1x,a,i2,2x,a,i2)') 'i= ', i, 'j= ', j  
-                write(stdout,'(1x,a,f12.6,1x,f12.6)') &
+                if (on_root) write(stdout,*) ' ERROR: unitarity of initial U'  
+                if (on_root) write(stdout,'(1x,a,i2)') 'nkp= ', nkp  
+                if (on_root) write(stdout,'(1x,a,i2,2x,a,i2)') 'i= ', i, 'j= ', j  
+                if (on_root) write(stdout,'(1x,a,f12.6,1x,f12.6)') &
                      '[u_matrix_opt.transpose(u_matrix_opt)]_ij= ', &
                      real(ctmp2,dp),aimag(ctmp2)
                 call io_error('dis_project: Error in unitarity of initial U in dis_project')
@@ -1080,7 +1092,7 @@ subroutine dis_project()
     deallocate(catmpmat,stat=ierr)
     if (ierr/=0) call io_error('Error in deallocating catmpmat in dis_project')
 
-    write(stdout,'(a)') ' done'
+    if (on_root) write(stdout,'(a)') ' done'
 
     if (timing_level>1) call io_stopwatch('dis: project',2)
 
@@ -1156,7 +1168,7 @@ subroutine dis_proj_froz()
 
       if (timing_level>1) call io_stopwatch('dis: proj_froz',1)
 
-      write(stdout,'(3x,a)',advance='no') 'In dis_proj_froz...' 
+      if (on_root) write(stdout,'(3x,a)',advance='no') 'In dis_proj_froz...' 
 
       allocate(iwork(5*num_bands),stat=ierr)
       if (ierr/=0) call io_error('Error allocating iwork in dis_proj_froz')
@@ -1231,8 +1243,8 @@ subroutine dis_proj_froz()
             do n = 1, ndimwin(nkp)  
                do m = 1, n  
                   if (abs(cqpq(m,n) - conjg(cqpq(n,m))).gt.eps8) then
-                     write(stdout,*) ' matrix CQPQ is not hermitian'  
-                     write(stdout,*) ' k-point ', nkp  
+                     if (on_root) write(stdout,*) ' matrix CQPQ is not hermitian'  
+                     if (on_root) write(stdout,*) ' k-point ', nkp  
                      call io_error('dis_proj_froz: error')  
                   endif
                enddo
@@ -1261,20 +1273,20 @@ subroutine dis_proj_froz()
 
             ! DEBUG
             if (info.lt.0) then  
-               write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING CQPQ MATRIX'
-               write(stdout,*) ' THE ',  -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE'
+               if (on_root) write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING CQPQ MATRIX'
+               if (on_root) write(stdout,*) ' THE ',  -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE'
                call io_error('dis_proj_frozen: error')  
             elseif (info.gt.0) then  
-               write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING CQPQ MATRIX'
-               write(stdout,*) info, 'EIGENVECTORS FAILED TO CONVERGE'  
+               if (on_root) write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING CQPQ MATRIX'
+               if (on_root) write(stdout,*) info, 'EIGENVECTORS FAILED TO CONVERGE'  
                call io_error('dis_proj_frozen: error') 
             endif
             ! ENDDEBUG
 
             ! DEBUG
             if (m.ne.ndimwin(nkp)) then  
-               write(stdout,*) ' *** ERROR *** in dis_proj_froz'  
-               write(stdout,*) ' Number of eigenvalues/vectors obtained is', &
+               if (on_root) write(stdout,*) ' *** ERROR *** in dis_proj_froz'  
+               if (on_root) write(stdout,*) ' Number of eigenvalues/vectors obtained is', &
                     m, ' not equal to the number asked,', ndimwin(nkp)
                call io_error('dis_proj_frozen: error')  
             endif
@@ -1283,12 +1295,12 @@ subroutine dis_proj_froz()
             ! DEBUG
             ! check that the eigenvalues are between 0 and 1
             if (iprint>2) then
-               write(stdout,'(/a,i3,a,i3,a,i3,a)') ' K-point ', nkp, ' ndimwin: ', &
+               if (on_root) write(stdout,'(/a,i3,a,i3,a,i3,a)') ' K-point ', nkp, ' ndimwin: ', &
                     ndimwin(nkp),' we want the',num_wann - ndimfroz(nkp),&
                     ' leading eigenvector(s) of QPQ'
             endif
             do j = 1, ndimwin(nkp)  
-               if (iprint>2) write(stdout,'(a,i3,a,f16.12)') '  lambda(', j, ')=', w(j)  
+               if (iprint>2.and.on_root) write(stdout,'(a,i3,a,f16.12)') '  lambda(', j, ')=', w(j)  
 !~[aam]        if ( (w(j).lt.eps8).or.(w(j).gt.1.0_dp + eps8) ) then
                if ( (w(j).lt.-eps8).or.(w(j).gt.1.0_dp + eps8) ) then
                   call io_error('dis_proj_frozen: error - Eigenvalues not between 0 and 1') 
@@ -1321,7 +1333,7 @@ subroutine dis_proj_froz()
                   end if
                end do
                if(nzero>0) then
-                  if(iprint>2) then
+                  if(iprint>2.and.on_root) then
                      write(stdout,*) ' '
                      write(stdout,'(1x,a,i0,a)') 'An eigenvalue of QPQ is close to zero at kpoint '&
                         ,nkp,'. Using safety check.'
@@ -1336,7 +1348,7 @@ subroutine dis_proj_froz()
                      counter=counter+1
                   end do
                   
-                  if(iprint>2) then
+                  if(iprint>2.and.on_root) then
                      do loop_f=1,ndimwin(nkp)
                         write(stdout,'(1x,a,i4,a,es13.6)') 'Eigenvector number',loop_f,'    Eigenvalue: ',w(loop_f)
                         do loop_v=1,ndimwin(nkp)
@@ -1369,7 +1381,7 @@ subroutine dis_proj_froz()
                      end do
                   end do
                   
-                  if(iprint>2)  then
+                  if(iprint>2.and.on_root)  then
                      write(rep,'(i4)') num_wann - ndimfroz(nkp)
                      write(stdout,'(1x,a,'//trim(rep)//'(i0,1x))') 'We use the following eigenvectors: ' &
                           ,vmap(1:(num_wann - ndimfroz(nkp)))
@@ -1404,7 +1416,7 @@ subroutine dis_proj_froz()
                ! PICK THE num_wann-nDIMFROZ(NKP) LEADING EIGENVECTORS AS TRIAL STATES
                ! and PUT THEM RIGHT AFTER THE FROZEN STATES IN u_matrix_opt
                do l = ndimfroz(nkp) + 1, num_wann  
-                  write(stdout,*) 'il=',il
+                  if (on_root) write(stdout,*) 'il=',il
                   u_matrix_opt(1:ndimwin(nkp),l,nkp) = cz(1:ndimwin(nkp),il) 
                   il = il + 1  
                enddo
@@ -1459,7 +1471,7 @@ subroutine dis_proj_froz()
       deallocate(iwork,stat=ierr)
       if (ierr/=0) call io_error('Error deallocating iwork in dis_proj_froz')
 
-      write(stdout,'(a)') ' done'
+      if (on_root) write(stdout,'(a)') ' done'
 
       if (timing_level>1) call io_stopwatch('dis: proj_froz',2)
 
@@ -1478,7 +1490,8 @@ subroutine dis_extract()
     !                                                                  !
     !==================================================================!  
       
-      use w90_io, only: io_time
+
+      use w90_io, only: io_wallclocktime
       use w90_sitesym, only: ir2ik,ik2ir,nkptirr,nsymmetry,kptsym !YN: RS:
 
       implicit none
@@ -1523,10 +1536,19 @@ subroutine dis_extract()
       integer :: icompflag,iter,ndiff
       real(kind=dp) :: womegai,wkomegai,womegai1,rsum,delta_womegai
       real(kind=dp), allocatable :: wkomegai1(:)
+
+      ! for MPI
+      real(kind=dp), allocatable :: wkomegai1_loc(:)
+      complex(kind=dp), allocatable :: camp_loc(:,:,:)
+      complex(kind=dp), allocatable :: u_matrix_opt_loc(:,:,:)
+
       complex(kind=dp), allocatable :: ceamp(:,:,:)
       complex(kind=dp), allocatable :: camp(:,:,:)
-      complex(kind=dp), allocatable :: czmat_in(:,:,:)
-      complex(kind=dp), allocatable :: czmat_out(:,:,:)
+      ! complex(kind=dp), allocatable :: czmat_in(:,:,:)
+      ! complex(kind=dp), allocatable :: czmat_out(:,:,:)
+      ! the z-matrices are now stored in local arrays
+      complex(kind=dp), allocatable :: czmat_in_loc(:,:,:)
+      complex(kind=dp), allocatable :: czmat_out_loc(:,:,:)
       complex(kind=dp), allocatable :: cham(:,:,:)
 
       integer,          allocatable :: iwork(:)
@@ -1543,11 +1565,16 @@ subroutine dis_extract()
       logical                       :: dis_converged
       complex(kind=dp) :: lambda(num_wann,num_wann) !RS:
 
-      if (timing_level>1) call io_stopwatch('dis: extract',1)
+      ! Needed to split an array on different nodes
+      integer, dimension(0:num_nodes-1) :: counts
+      integer, dimension(0:num_nodes-1) :: displs
+      integer :: nkp_loc
 
-      write(stdout,'(/1x,a)') &
+      if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract',1)
+
+      if (on_root) write(stdout,'(/1x,a)') &
            '                  Extraction of optimally-connected subspace                  '
-      write(stdout,'(1x,a)') &
+      if (on_root) write(stdout,'(1x,a)') &
            '                  ------------------------------------------                  '
 
       allocate(cwb(num_wann,num_bands),stat=ierr)
@@ -1573,12 +1600,24 @@ subroutine dis_extract()
       allocate(cz(num_bands,num_bands),stat=ierr)
       if (ierr/=0) call io_error('Error allocating cz in dis_extract')
 
+      ! for MPI
+      call comms_array_split(num_kpts,counts,displs)
+      allocate(u_matrix_opt_loc(num_bands,num_wann,counts(my_node_id)),stat=ierr)
+      if (ierr/=0) call io_error('Error allocating u_matrix_opt_loc in dis_extract')   
+      ! Copy matrix elements from global U matrix to local U matrix
+      do nkp_loc = 1, counts(my_node_id)
+         nkp = nkp_loc + displs(my_node_id)
+         u_matrix_opt_loc(:,:,nkp_loc) = u_matrix_opt(:,:,nkp)
+      enddo  
+      allocate(wkomegai1_loc(counts(my_node_id)),stat=ierr)
+      if (ierr/=0) call io_error('Error allocating wkomegai1_loc in dis_extract')
+      allocate(czmat_in_loc(num_bands,num_bands,counts(my_node_id)),stat=ierr)
+      if (ierr/=0) call io_error('Error allocating czmat_in_loc in dis_extract')
+      allocate(czmat_out_loc(num_bands,num_bands,counts(my_node_id)),stat=ierr)
+      if (ierr/=0) call io_error('Error allocating czmat_out_loc in dis_extract')
+
       allocate(wkomegai1(num_kpts),stat=ierr)
       if (ierr/=0) call io_error('Error allocating wkomegai1 in dis_extract')
-      allocate(czmat_in(num_bands,num_bands,num_kpts),stat=ierr)
-      if (ierr/=0) call io_error('Error allocating czmat_in in dis_extract')
-      allocate(czmat_out(num_bands,num_bands,num_kpts),stat=ierr)
-      if (ierr/=0) call io_error('Error allocating czmat_out in dis_extract')
 
       allocate(history(dis_conv_window),stat=ierr)
       if (ierr/=0) call io_error('Error allocating history in dis_extract')
@@ -1619,9 +1658,9 @@ subroutine dis_extract()
 
       ! DEBUG
       if (iprint>2) then
-         write(stdout,'(a,/)') '  Original eigenvalues inside outer window:'  
+         if (on_root) write(stdout,'(a,/)') '  Original eigenvalues inside outer window:'  
          do nkp = 1, num_kpts  
-            write(stdout,'(a,i3,3x,20(f9.5,1x))') '  K-point ', nkp,&
+            if (on_root) write(stdout,'(a,i3,3x,20(f9.5,1x))') '  K-point ', nkp,&
                  ( eigval_opt(i, nkp), i = 1, ndimwin (nkp) )
          enddo
       endif
@@ -1630,11 +1669,11 @@ subroutine dis_extract()
       ! TO DO: Check if this is the best place to initialize icompflag
       icompflag = 0  
 
-      write(stdout,'(1x,a)') &
+      if (on_root) write(stdout,'(1x,a)') &
            '+---------------------------------------------------------------------+<-- DIS'
-      write(stdout,'(1x,a)') &
+      if (on_root) write(stdout,'(1x,a)') &
            '|  Iter     Omega_I(i-1)      Omega_I(i)      Delta (frac.)    Time   |<-- DIS'
-      write(stdout,'(1x,a)') &
+      if (on_root) write(stdout,'(1x,a)') &
            '+---------------------------------------------------------------------+<-- DIS'
 
       dis_converged = .false.
@@ -1644,18 +1683,22 @@ subroutine dis_extract()
       ! ------------------
       do iter = 1, dis_num_iter
 
-      if (timing_level>1) call io_stopwatch('dis: extract_1',1)
+         if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract_1',1)
 
          if (iter.eq.1) then  
             ! Initialize Z matrix at k points w/ non-frozen states
-            do nkp = 1, num_kpts  
-               if (num_wann.gt.ndimfroz(nkp)) call internal_zmatrix(nkp,czmat_in(:,:,nkp))
+            do nkp_loc = 1, counts(my_node_id)
+               nkp = nkp_loc + displs(my_node_id)            
+               if (num_wann.gt.ndimfroz(nkp)) call internal_zmatrix(nkp,czmat_in_loc(:,:,nkp_loc))
             enddo
-            if (lsitesymmetry) call sitesym_symmetrize_zmatrix(czmat_in,lwindow) !RS:
+
+            if (lsitesymmetry) call sitesym_symmetrize_zmatrix(czmat_in_loc,lwindow) !RS:
+
          else  
             ! [iter.ne.1]
             ! Update Z matrix at k points with non-frozen states, using a mixing sch
-            do nkp = 1, num_kpts  
+            do nkp_loc = 1, counts(my_node_id)
+               nkp = nkp_loc + displs(my_node_id)            
                if (lsitesymmetry) then                !YN: RS: 
                   if (ir2ik(ik2ir(nkp)).ne.nkp) cycle !YN: RS: 
                endif                                  !YN: RS:
@@ -1663,20 +1706,20 @@ subroutine dis_extract()
                   ndimk = ndimwin(nkp) - ndimfroz(nkp)
                   do i=1,ndimk
                      do j=1,i
-                        czmat_in(j,i,nkp) = &
-                             cmplx(dis_mix_ratio,0.0_dp,dp) * czmat_out(j,i,nkp) &
-                             + cmplx(1.0_dp-dis_mix_ratio,0.0_dp,dp) * czmat_in(j,i,nkp)
+                        czmat_in_loc(j,i,nkp_loc) = &
+                             cmplx(dis_mix_ratio,0.0_dp,dp) * czmat_out_loc(j,i,nkp_loc) &
+                             + cmplx(1.0_dp-dis_mix_ratio,0.0_dp,dp) * czmat_in_loc(j,i,nkp_loc)
                         ! hermiticity
-                        czmat_in(i,j,nkp) = conjg(czmat_in(j,i,nkp))
+                        czmat_in_loc(i,j,nkp_loc) = conjg(czmat_in_loc(j,i,nkp_loc))
                      enddo
                   enddo
                endif
             enddo
          endif
          ! [if iter=1]
-      if (timing_level>1) call io_stopwatch('dis: extract_1',2)
+      if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract_1',2)
 
-      if (timing_level>1) call io_stopwatch('dis: extract_2',1)
+      if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract_2',1)
 
          womegai1 = 0.0_dp
          ! wkomegai1 is defined by Eq. (18) of SMV.
@@ -1684,13 +1727,14 @@ subroutine dis_extract()
          ! every k (before updating any k), so that for iter>1 overlaps are with
          ! non-frozen neighboring states from the previous iteration
 
-         wkomegai1 = real(num_wann,dp) * wbtot
+         wkomegai1_loc = real(num_wann,dp) * wbtot
          if (lsitesymmetry) then                                                                        !RS:
             do nkp=1,nkptirr                                                                            !RS:                      
-               wkomegai1(ir2ik(nkp))=wkomegai1(ir2ik(nkp))*nsymmetry/count(kptsym(:,nkp).eq.ir2ik(nkp)) !RS:   
+               wkomegai1_loc(ir2ik(nkp))=wkomegai1_loc(ir2ik(nkp))*nsymmetry/count(kptsym(:,nkp).eq.ir2ik(nkp)) !RS:   
             enddo                                                                                       !RS:
          endif                                                                                          !RS:
-         do nkp = 1, num_kpts  
+         do nkp_loc = 1, counts(my_node_id)
+            nkp = nkp_loc + displs(my_node_id)            
             if ( ndimfroz(nkp).gt.0 ) then  
                if (lsitesymmetry) call io_error('not implemented in symmetry-adapted mode') !YN: RS: 
                do nn=1,nntot
@@ -1706,43 +1750,51 @@ subroutine dis_extract()
                         rsum = rsum + real(cww(m,n),dp)**2 + aimag(cww(m,n))**2
                      enddo
                   enddo
-                  wkomegai1(nkp) = wkomegai1(nkp) - wb(nn)*rsum
+                  wkomegai1_loc(nkp_loc) = wkomegai1_loc(nkp_loc) - wb(nn)*rsum 
                enddo
             endif
          enddo
-      if (timing_level>1) call io_stopwatch('dis: extract_2',2)
+      if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract_2',2)
 
-      if (timing_level>1) call io_stopwatch('dis: extract_3',1)
+      if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract_3',1)
+
+         ! send chunks of wkomegai1 to root node
+         call comms_gatherv(wkomegai1_loc(1),counts(my_node_id),wkomegai1(1),counts,displs)
+         ! send back the whole wkomegai1 array to other nodes
+         call comms_bcast(wkomegai1(1),num_kpts)
 
          ! Refine optimal subspace at k points w/ non-frozen states
-         do nkp = 1, num_kpts  
+         do nkp_loc = 1, counts(my_node_id)
+            nkp = nkp_loc + displs(my_node_id)
             if (lsitesymmetry) then                                                     !RS: 
                if (ir2ik(ik2ir(nkp)).ne.nkp) cycle                                      !RS:
             end if                                                                      !RS:
             if (lsitesymmetry) then                                                     !RS:
-               call sitesym_dis_extract_symmetry(nkp,ndimwin(nkp),czmat_in,lambda,u_matrix_opt) !RS:
+
+               call sitesym_dis_extract_symmetry(nkp,ndimwin(nkp),czmat_in_loc,lambda,u_matrix_opt) !RS:
+
                do j=1,num_wann                                                          !RS:
-                  wkomegai1(nkp)=wkomegai1(nkp)-real(lambda(j,j),kind=dp)               !RS:
+                  wkomegai1_loc(nkp_loc)=wkomegai1_loc(nkp_loc)-real(lambda(j,j),kind=dp)   !RS: accumulate in local slice
                enddo                                                                    !RS:
             else                                                                        !RS:
                if ( num_wann.gt.ndimfroz(nkp) ) then  
                   ! Diagonalize Z matrix
                   do j = 1, ndimwin(nkp) - ndimfroz(nkp)  
                      do i = 1, j  
-                        cap(i + ( (j - 1) * j) / 2) = czmat_in(i,j,nkp)  
+                        cap(i + ( (j - 1) * j) / 2) = czmat_in_loc(i,j,nkp_loc)  
                      enddo
                   enddo
                   ndiff = ndimwin(nkp) - ndimfroz(nkp)  
                   call ZHPEVX ('V', 'A', 'U', ndiff, cap, 0.0_dp, 0.0_dp, 0, 0, &
                        -1.0_dp, m, w, cz, num_bands, cwork, rwork, iwork, ifail, info)
                   if (info.lt.0) then  
-                     write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING Z MATRIX'
-                     write(stdout,*) ' THE ',  -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE'
+                     if (on_root) write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING Z MATRIX'
+                     if (on_root) write(stdout,*) ' THE ',  -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE'
                      call io_error(' dis_extract: error')  
                   endif
                   if (info.gt.0) then  
-                     write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING Z MATRIX'
-                     write(stdout,*) info, ' EIGENVECTORS FAILED TO CONVERGE'  
+                     if (on_root) write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING Z MATRIX'
+                     if (on_root) write(stdout,*) info, ' EIGENVECTORS FAILED TO CONVERGE'  
                      call io_error(' dis_extract: error')  
                   endif
    
@@ -1752,12 +1804,12 @@ subroutine dis_extract()
                   m = ndimfroz(nkp)  
                   do j = ndimwin(nkp) - num_wann + 1, ndimwin(nkp) - ndimfroz(nkp)
                      m = m + 1  
-                     wkomegai1(nkp) = wkomegai1(nkp) - w(j)  
-                     u_matrix_opt(1:ndimwin(nkp),m,nkp) = cmplx_0
+                     wkomegai1_loc(nkp_loc) = wkomegai1_loc(nkp_loc) - w(j)  
+                     u_matrix_opt_loc(1:ndimwin(nkp),m,nkp_loc) = cmplx_0
                      ndimk=ndimwin(nkp)-ndimfroz(nkp)
                      do i=1,ndimk
                         p=indxnfroz(i,nkp)
-                        u_matrix_opt(p,m,nkp) = cz(i,j)  
+                        u_matrix_opt_loc(p,m,nkp_loc) = cz(i,j)  
                      enddo
                   enddo
                endif
@@ -1766,7 +1818,7 @@ subroutine dis_extract()
 
             ! Now that we have contribs. from both frozen and non-frozen states to
             ! wkomegai1(nkp), add it to womegai1
-            womegai1 = womegai1 + wkomegai1(nkp)  
+            womegai1 = womegai1 + wkomegai1_loc(nkp_loc)  
 
 
             if(index(devel_flag,'compspace')>0) then
@@ -1777,18 +1829,20 @@ subroutine dis_extract()
                if (iter.eq.dis_num_iter) then  
                   allocate(camp(num_bands,num_bands,num_kpts),stat=ierr)
                   if (ierr/=0) call io_error('Error allocating camp in dis_extract')
+                  allocate(camp_loc(num_bands,num_bands,counts(my_node_id)),stat=ierr)
+                  if (ierr/=0) call io_error('Error allocating camp_loc in dis_extract') 
 
                   if (ndimwin(nkp).gt.num_wann) then  
                      do j = 1, ndimwin(nkp) - num_wann  
                         if ( num_wann.gt.ndimfroz(nkp) ) then  
                            ! USE THE NON-LEADING EIGENVECTORS OF THE Z-MATRIX
-                           camp(1:ndimwin(nkp),j,nkp)=cz(1:ndimwin(nkp),j)
+                           camp_loc(1:ndimwin(nkp),j,nkp_loc)=cz(1:ndimwin(nkp),j)
                         else  
                            ! Then num_wann=NDIMFROZ(NKP)
                            ! USE THE ORIGINAL NON-FROZEN BLOCH EIGENSTATES
                            do i = 1,ndimwin(nkp)  
-                              camp(i,j,nkp) = cmplx_0  
-                              if (i.eq.indxnfroz(j,nkp)) camp(i,j,nkp) = cmplx_1
+                              camp_loc(i,j,nkp_loc) = cmplx_0  
+                              if (i.eq.indxnfroz(j,nkp)) camp_loc(i,j,nkp_loc) = cmplx_1
                            enddo
                         endif
                      enddo
@@ -1801,10 +1855,34 @@ subroutine dis_extract()
 
          enddo
          ! [Loop over k points (nkp)]
+
          if (lsitesymmetry) call sitesym_symmetrize_u_matrix(num_bands,u_matrix_opt,lwindow) !RS:
-      if (timing_level>1) call io_stopwatch('dis: extract_3',2)
 
 
+         ! send chunks of wkomegai1 to root node
+         call comms_gatherv(wkomegai1_loc(1),counts(my_node_id),wkomegai1(1),counts,displs)
+         ! send back the whole wkomegai1 array to other nodes
+         call comms_bcast(wkomegai1(1),num_kpts)
+
+         call comms_allreduce(womegai1,1,'SUM')
+
+         ! Gather unconditionally: every rank must enter these collectives, and nkp
+         ! is stale here (left over from the k-point loop), so it cannot guard them.
+         call comms_gatherv(u_matrix_opt_loc(1,1,1),num_bands*num_wann*counts(my_node_id),&
+              u_matrix_opt(1,1,1),num_bands*num_wann*counts,num_bands*num_wann*displs)
+         call comms_bcast(u_matrix_opt(1,1,1),num_bands*num_wann*num_kpts)    
+
+         if(index(devel_flag,'compspace')>0) then      
+            if (iter.eq.dis_num_iter) then
+               call comms_gatherv(camp_loc(1,1,1),num_bands*num_bands*counts(my_node_id),&
+                       camp(1,1,1),num_bands*num_bands*counts,num_bands*num_bands*displs)
+
+               call comms_bcast(camp(1,1,1),num_bands*num_bands*num_kpts)    
+            endif
+         endif      
+
+      if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract_3',2)
+
          womegai1 = womegai1 / real(num_kpts,dp)  
 
          ! DEBUG
@@ -1844,10 +1922,11 @@ subroutine dis_extract()
 
          ! Compute womegai  using the updated subspaces at all k, i.e.,
          ! replacing (i-1) by (i) in Eq. (12) SMV
-      if (timing_level>1) call io_stopwatch('dis: extract_4',1)
+      if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract_4',1)
 
          womegai = 0.0_dp
-         do nkp = 1, num_kpts  
+         do nkp_loc = 1, counts(my_node_id)
+            nkp = nkp_loc + displs(my_node_id)   
             wkomegai=0.0_dp
             do nn=1,nntot
                nkp2=nnlist(nkp,nn)
@@ -1867,41 +1946,46 @@ subroutine dis_extract()
             wkomegai = real(num_wann,dp) * wbtot - wkomegai
             womegai =  womegai + wkomegai
          enddo
+
+         call comms_allreduce(womegai,1,'SUM')
+
          womegai = womegai / real(num_kpts,dp)  
          ! [Loop over k (nkp)]
-      if (timing_level>1) call io_stopwatch('dis: extract_4',2)
+      if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract_4',2)
 
          delta_womegai = womegai1/womegai - 1.0_dp
 
-         write(stdout,124) iter,womegai1*lenconfac**2,womegai*lenconfac**2,&
-              delta_womegai,io_time()
+         if (on_root) write(stdout,124) iter,womegai1*lenconfac**2,womegai*lenconfac**2,&
+              delta_womegai,io_wallclocktime()
 
 
 124      format(2x,i6,3x,f14.8,3x,f14.8,6x,es10.3,2x,f8.2,4x,'<-- DIS')
 
          ! Construct the updated Z matrix, CZMAT_OUT, at k points w/ non-frozen s
-         do nkp = 1, num_kpts  
-            if (num_wann.gt.ndimfroz(nkp)) call internal_zmatrix(nkp,czmat_out(:,:,nkp))
+         do nkp_loc = 1, counts(my_node_id)
+            nkp = nkp_loc + displs(my_node_id)   
+            if (num_wann.gt.ndimfroz(nkp)) call internal_zmatrix(nkp,czmat_out_loc(:,:,nkp_loc))
          enddo
-         if (lsitesymmetry) call sitesym_symmetrize_zmatrix(czmat_out,lwindow) !RS:
+
+         if (lsitesymmetry) call sitesym_symmetrize_zmatrix(czmat_out_loc,lwindow) !RS:
 
          call internal_test_convergence()
          
          if (dis_converged) then
-            write(stdout,'(/13x,a,es10.3,a,i2,a)') &
+            if (on_root) write(stdout,'(/13x,a,es10.3,a,i2,a)') &
                  '<<<      Delta <',dis_conv_tol,&
                  '  over ',dis_conv_window,' iterations     >>>'
-            write(stdout,'(13x,a)')  '<<< Disentanglement convergence criteria satisfied >>>'
+            if (on_root) write(stdout,'(13x,a)')  '<<< Disentanglement convergence criteria satisfied >>>'
             exit
          endif
 
       enddo
       ! [BIG ITERATION LOOP (iter)]
 
-      deallocate(czmat_out,stat=ierr)
-      if (ierr/=0) call io_error('Error deallocating czmat_out in dis_extract')
-      deallocate(czmat_in,stat=ierr)
-      if (ierr/=0) call io_error('Error deallocating czmat_in in dis_extract')
+      deallocate(czmat_out_loc,stat=ierr)
+      if (ierr/=0) call io_error('Error deallocating czmat_out_loc in dis_extract')
+      deallocate(czmat_in_loc,stat=ierr)
+      if (ierr/=0) call io_error('Error deallocating czmat_in_loc in dis_extract')
 
       allocate(ceamp(num_bands,num_bands,num_kpts),stat=ierr)
       if (ierr/=0) call io_error('Error allocating ceamp in dis_extract')
@@ -1909,28 +1993,28 @@ subroutine dis_extract()
       if (ierr/=0) call io_error('Error allocating cham in dis_extract')
 
       if (.not.dis_converged) then
-         write(stdout,'(/5x,a)') &
+         if (on_root) write(stdout,'(/5x,a)') &
           '<<< Warning: Maximum number of disentanglement iterations reached >>>'
-         write(stdout,'(10x,a)') '<<< Disentanglement convergence criteria not satisfied >>>'
+         if (on_root) write(stdout,'(10x,a)') '<<< Disentanglement convergence criteria not satisfied >>>'
       endif
 
       if(index(devel_flag,'compspace')>0) then
 
          if (icompflag.eq.1) then
             if (iprint>2) then
-               write(stdout,('(/4x,a)')) &
+               if (on_root) write(stdout,('(/4x,a)')) &
                     'WARNING: Complement subspace has zero dimensions at the following k-points:'
                i=0
-               write(stdout,'(4x)',advance='no')
+               if (on_root) write(stdout,'(4x)',advance='no')
                do nkp=1,num_kpts
                   if (ndimwin(nkp).eq.num_wann) then  
                      i=i+1
                      if (i.le.12) then
-                        write(stdout,'(i6)',advance='no') nkp
+                        if (on_root) write(stdout,'(i6)',advance='no') nkp
                      else
                         i=1
-                        write(stdout,'(/4x)',advance='no')
-                        write(stdout,'(i6)',advance='no') nkp
+                        if (on_root) write(stdout,'(/4x)',advance='no')
+                        if (on_root) write(stdout,'(i6)',advance='no') nkp
                      endif
                   endif
                enddo
@@ -1944,7 +2028,7 @@ subroutine dis_extract()
       ! Write the final womegai. This should remain unchanged during the
       ! subsequent minimization of Omega_tilde in wannierise.f90
       ! We store it in the checkpoint file as a sanity check
-      write(stdout,'(/8x,a,f14.8,a/)') 'Final Omega_I ',&
+      if (on_root) write(stdout,'(/8x,a,f14.8,a/)') 'Final Omega_I ',&
            womegai*lenconfac**2,' ('//trim(length_unit)//'^2)'
 
       ! Set public variable omega_invariant
@@ -1973,13 +2057,13 @@ subroutine dis_extract()
               m, w, cz, num_bands, cwork, rwork, iwork, ifail, info)
 
          if (info.lt.0) then  
-            write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN'
-            write(stdout,*) ' THE ',  -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE'
+            if (on_root) write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN'
+            if (on_root) write(stdout,*) ' THE ',  -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE'
             call io_error(' dis_extract: error')   
          endif
          if (info.gt.0) then  
-            write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN'
-            write(stdout,*) info, 'EIGENVECTORS FAILED TO CONVERGE'  
+            if (on_root) write(stdout,*) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN'
+            if (on_root) write(stdout,*) info, 'EIGENVECTORS FAILED TO CONVERGE'  
             call io_error(' dis_extract: error')   
          endif
 
@@ -2001,9 +2085,9 @@ subroutine dis_extract()
 
       ! DEBUG
       if (iprint>2) then
-         write(stdout,'(/,a,/)') '  Eigenvalues inside optimal subspace:'  
+         if (on_root) write(stdout,'(/,a,/)') '  Eigenvalues inside optimal subspace:'  
          do nkp = 1, num_kpts  
-            write(stdout,'(a,i3,2x,20(f9.5,1x))') '  K-point ', &
+            if (on_root) write(stdout,'(a,i3,2x,20(f9.5,1x))') '  K-point ', &
                  nkp, (eigval_opt(i,nkp), i = 1, num_wann)
          enddo
       endif
@@ -2028,8 +2112,8 @@ subroutine dis_extract()
 
          if (icompflag.eq.1) then  
             if (iprint>2) then
-               write(stdout,*) 'AT SOME K-POINT(S) COMPLEMENT SUBSPACE HAS ZERO DIMENSIONALITY'
-               write(stdout,*) '=> DID NOT CREATE FILE COMPSPACE.DAT'  
+               if (on_root) write(stdout,*) 'AT SOME K-POINT(S) COMPLEMENT SUBSPACE HAS ZERO DIMENSIONALITY'
+               if (on_root) write(stdout,*) '=> DID NOT CREATE FILE COMPSPACE.DAT'  
             endif
          else  
             ! DIAGONALIZE THE HAMILTONIAN IN THE COMPLEMENT SUBSPACE, WRITE THE
@@ -2053,13 +2137,13 @@ subroutine dis_extract()
                call ZHPEVX ('V', 'A', 'U', ndiff, cap, 0.0_dp, 0.0_dp, 0, 0, &
                     -1.0_dp, m, w, cz, num_bands, cwork, rwork, iwork, ifail, info)
                if (info.lt.0) then  
-                  write(stdout,*) '*** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN'
-                  write(stdout,*) 'THE ',  -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE'
+                  if (on_root) write(stdout,*) '*** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN'
+                  if (on_root) write(stdout,*) 'THE ',  -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE'
                   call io_error(' dis_extract: error')   
                endif
                if (info.gt.0) then  
-                  write(stdout,*) '*** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN'
-                  write(stdout,*) info, 'EIGENVECTORS FAILED TO CONVERGE'  
+                  if (on_root) write(stdout,*) '*** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN'
+                  if (on_root) write(stdout,*) info, 'EIGENVECTORS FAILED TO CONVERGE'  
                   call io_error(' dis_extract: error')   
                endif
                ! CALCULATE AMPLITUDES OF THE ENERGY EIGENVECTORS IN THE COMPLEMENT SUBS
@@ -2095,8 +2179,16 @@ subroutine dis_extract()
          deallocate(camp,stat=ierr)
          if (ierr/=0) call io_error('Error deallocating camp in dis_extract')
       end if
+      if(allocated(camp_loc)) then
+         deallocate(camp_loc,stat=ierr)
+         if (ierr/=0) call io_error('Error deallocating camp_loc in dis_extract')
+      endif
       deallocate(ceamp,stat=ierr)
       if (ierr/=0) call io_error('Error deallocating ceamp in dis_extract')
+      deallocate(u_matrix_opt_loc,stat=ierr)
+         if (ierr/=0) call io_error('Error deallocating u_matrix_opt_loc in dis_extract')
+      deallocate(wkomegai1_loc,stat=ierr)
+      if (ierr/=0) call io_error('Error deallocating wkomegai1_loc in dis_extract')
       deallocate(wkomegai1,stat=ierr)
       if (ierr/=0) call io_error('Error deallocating wkomegai1 in dis_extract')
 
@@ -2122,10 +2214,10 @@ subroutine dis_extract()
       deallocate(cwb,stat=ierr)
       if (ierr/=0) call io_error('Error deallocating cwb in dis_extract')
 
-      write(stdout,'(1x,a/)') &
+      if (on_root) write(stdout,'(1x,a/)') &
            '+----------------------------------------------------------------------------+'
 
-      if (timing_level>1) call io_stopwatch('dis: extract',2)
+      if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract',2)
 
       return  
 
@@ -2185,7 +2277,7 @@ subroutine internal_zmatrix(nkp,cmtrx)
         integer          :: l,m,n,p,q,nn,nkp2,ndimk
         complex(kind=dp) :: csum
 
-        if (timing_level>1) call io_stopwatch('dis: extract: zmatrix',1)
+        if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract: zmatrix',1)
 
         cmtrx=cmplx_0
         ndimk=ndimwin(nkp)-ndimfroz(nkp)
@@ -2208,7 +2300,7 @@ subroutine internal_zmatrix(nkp,cmtrx)
            enddo
         enddo
 
-        if (timing_level>1) call io_stopwatch('dis: extract: zmatrix',2)
+        if (timing_level>1 .and. on_root) call io_stopwatch('dis: extract: zmatrix',2)
 
         return  
 
diff --git a/src/hamiltonian.F90 b/src/hamiltonian.F90
index acfb57461..8a74d7ee3 100644
--- a/src/hamiltonian.F90
+++ b/src/hamiltonian.F90
@@ -17,6 +17,7 @@ module w90_hamiltonian
   !! This is a simplified routine, more sophisticated properties
   !! are found in postw90 (e.g. w90_get_oper)
   use w90_constants, only : dp
+  use w90_comms, only : on_root
 
   implicit none
 
@@ -411,14 +412,15 @@ subroutine internal_translate_centres()
       ! NEVER overwrite wannier_centres
       !wannier_centres = r_home
 
-      write(stdout,'(1x,a)') 'Translated centres'
-      write(stdout,'(4x,a,3f10.6)') 'translation centre in fractional coordinate:',translation_centre_frac(:)
-      do iw=1,num_wann
-         write(stdout,888) iw,(r_home(ind,iw)*lenconfac,ind=1,3)
-      end do
-      write(stdout,'(1x,a78)') repeat('-',78)
-      write(stdout,*)
-
+      if (on_root) then
+         write(stdout,'(1x,a)') 'Translated centres'
+         write(stdout,'(4x,a,3f10.6)') 'translation centre in fractional coordinate:',translation_centre_frac(:)
+         do iw=1,num_wann
+            write(stdout,888) iw,(r_home(ind,iw)*lenconfac,ind=1,3)
+         end do
+         write(stdout,'(1x,a78)') repeat('-',78)
+         write(stdout,*)
+      endif
       wannier_centres_translated = r_home
 
       deallocate(r_frac,stat=ierr)
@@ -583,7 +585,7 @@ subroutine hamiltonian_wigner_seitz(count_pts)
     if(count_pts) return
 
 
-    if(iprint>=3) then
+    if(iprint>=3.and.on_root) then
        write(stdout,'(1x,i4,a,/)') nrpts,  ' lattice points in Wigner-Seitz supercell:'
        do i=1,nrpts
           write(stdout,'(4x,a,3(i3,1x),a,i2)') '  vector ', irvec(1,i),irvec(2,i),&
diff --git a/src/io.F90 b/src/io.F90
index d159a283f..6382bd0bd 100644
--- a/src/io.F90
+++ b/src/io.F90
@@ -17,7 +17,6 @@ module w90_io
 
 
   use w90_constants, only : dp
-
   implicit none
 
   private
@@ -59,6 +58,7 @@ module w90_io
   public :: io_print_timings
   public :: io_get_seedname
   public :: io_time
+  public :: io_wallclocktime
   public :: io_date
   public :: io_error
   public :: io_file_unit
@@ -211,23 +211,31 @@ subroutine io_error ( error_msg )
 
 #ifdef MPI
          character(len=50) :: filename
-         integer           :: stderr,ierr,whoami
+         integer           :: stderr,ierr,whoami,num_nodes
 
          call mpi_comm_rank(mpi_comm_world, whoami, ierr)
-         if(whoami>99999) then
-            write(filename,'(a,a,I0,a)')trim(seedname),'.node_',whoami,'.werr'
-         else
-            write(filename,'(a,a,I5.5,a)')trim(seedname),'.node_',whoami,'.werr'
-         endif
-         stderr=io_file_unit()
-         open(unit=stderr,file=trim(filename),form='formatted',err=105)
-         write(stderr, '(1x,a)') trim(error_msg)
-         close(stderr)
+         call mpi_comm_size(mpi_comm_world, num_nodes, ierr)
+         if(num_nodes>1) then
+            if(whoami>99999) then
+               write(filename,'(a,a,I0,a)')trim(seedname),'.node_',whoami,'.werr'
+            else
+               write(filename,'(a,a,I5.5,a)')trim(seedname),'.node_',whoami,'.werr'
+            endif
+            stderr=io_file_unit()
+            open(unit=stderr,file=trim(filename),form='formatted',err=105)
+            write(stderr, '(1x,a)') trim(error_msg)
+            close(stderr)
+         end if
 
 105      write(*,'(1x,a)') trim(error_msg)
 106      write(*,'(1x,a,I0,a)') "Error on node ", &
               whoami, ": examine the output/error files for details"
          
+         if(whoami==0) then
+            write(stdout,*)  'Exiting.......' 
+            write(stdout, '(1x,a)') trim(error_msg)
+         end if
+
          call MPI_abort(MPI_comm_world,1,ierr)
 
 #else
@@ -308,6 +316,36 @@ function io_time()
     return
   end function io_time
 
+
+    !==================================================================!
+      function io_wallclocktime()
+    !==================================================================!
+    !                                                                  !
+    ! Returns elapsed wall clock time in seconds since its first call  !
+    ! (0.0 on the first call, or if no system clock is available).     !
+    !                                                                  !
+    !==================================================================!
+    use w90_constants, only : dp
+    implicit none
+
+    real(kind=dp) :: io_wallclocktime
+    ! Wide counters: default-kind counts can wrap around during long runs
+    integer, parameter :: i64 = selected_int_kind(18)
+    integer(kind=i64) :: c1
+    integer(kind=i64), save :: c0, rate
+    logical, save :: first=.true.
+
+    io_wallclocktime = 0.0_dp
+    if (first) then
+       call system_clock(c0, rate)
+       first = .false.
+    else if (rate > 0) then
+       call system_clock(c1)
+       ! Convert in double precision; default real() drops sub-second digits
+       io_wallclocktime = real(c1 - c0, kind=dp)/real(rate, kind=dp)
+    endif
+  end function io_wallclocktime
+
   !===========================================
   function io_file_unit()
   !===========================================
diff --git a/src/kmesh.F90 b/src/kmesh.F90
index d9bd993bf..d06a71ad9 100644
--- a/src/kmesh.F90
+++ b/src/kmesh.F90
@@ -24,6 +24,7 @@ module w90_kmesh
 
   use w90_constants, only : dp
   use w90_parameters
+  use w90_comms, only : on_root
 
   implicit none
 
@@ -91,7 +92,7 @@ subroutine kmesh_get()
 
     if (timing_level>0) call io_stopwatch('kmesh: get',1)
 
-    write(stdout,'(/1x,a)') &
+    if (on_root) write(stdout,'(/1x,a)') &
       '*---------------------------------- K-MESH ----------------------------------*'  
 
     ! Sort the cell neighbours so we loop in order of distance from the home shell
@@ -131,28 +132,31 @@ subroutine kmesh_get()
        dnn1 = eta  
     enddo
 
-    write(stdout,'(1x,a)') '+----------------------------------------------------------------------------+' 
-    write(stdout,'(1x,a)') '|                    Distance to Nearest-Neighbour Shells                    |'
-    write(stdout,'(1x,a)') '|                    ------------------------------------                    |'
-    if (lenconfac.eq.1.0_dp) then
-       write(stdout,'(1x,a)') '|          Shell             Distance (Ang^-1)          Multiplicity         |'
-       write(stdout,'(1x,a)') '|          -----             -----------------          ------------         |'
-    else
-       write(stdout,'(1x,a)') '|          Shell             Distance (Bohr^-1)         Multiplicity         |'
-       write(stdout,'(1x,a)') '|          -----             ------------------         ------------         |'
+    if (on_root) then
+       write(stdout,'(1x,a)') '+----------------------------------------------------------------------------+' 
+       write(stdout,'(1x,a)') '|                    Distance to Nearest-Neighbour Shells                    |'
+       write(stdout,'(1x,a)') '|                    ------------------------------------                    |'
+       if (lenconfac.eq.1.0_dp) then
+          write(stdout,'(1x,a)') '|          Shell             Distance (Ang^-1)          Multiplicity         |'
+          write(stdout,'(1x,a)') '|          -----             -----------------          ------------         |'
+       else
+          write(stdout,'(1x,a)') '|          Shell             Distance (Bohr^-1)         Multiplicity         |'
+          write(stdout,'(1x,a)') '|          -----             ------------------         ------------         |'
+       endif
+       do ndnn = 1, ndnntot  
+          write(stdout,'(1x,a,11x,i3,17x,f10.6,19x,i4,12x,a)') '|',ndnn,dnn(ndnn)/lenconfac,multi(ndnn),'|' 
+       enddo
+       write(stdout,'(1x,a)') '+----------------------------------------------------------------------------+' 
     endif
-    do ndnn = 1, ndnntot  
-       write(stdout,'(1x,a,11x,i3,17x,f10.6,19x,i4,12x,a)') '|',ndnn,dnn(ndnn)/lenconfac,multi(ndnn),'|' 
-    enddo
-    write(stdout,'(1x,a)') '+----------------------------------------------------------------------------+' 
-
 
     if(iprint>=4) then
        ! Write out all the bvectors
-       write(stdout,'(1x,"|",76(" "),"|")') 
-       write(stdout,'(1x,a)') '|         Complete list of b-vectors and their lengths                       |' 
-       write(stdout,'(1x,"|",76(" "),"|")') 
-       write(stdout,'(1x,"+",76("-"),"+")') 
+       if (on_root) then 
+          write(stdout,'(1x,"|",76(" "),"|")') 
+          write(stdout,'(1x,a)') '|         Complete list of b-vectors and their lengths                       |' 
+          write(stdout,'(1x,"|",76(" "),"|")') 
+          write(stdout,'(1x,"+",76("-"),"+")') 
+       endif
 
        allocate( bvec_tmp(3,maxval(multi)),stat=ierr)
        if (ierr/=0) call io_error('Error allocating bvec_tmp in kmesh_get')
@@ -162,14 +166,14 @@ subroutine kmesh_get()
           call kmesh_get_bvectors(multi(shell),1,dnn(shell),bvec_tmp(:,1:multi(shell)))
           do loop=1,multi(shell)
              counter=counter+1
-             write(stdout,'(a,I4,1x,a,2x,3f12.6,2x,a,2x,f12.6,a)') ' | b-vector  ',counter,': (', &
+             if (on_root)write(stdout,'(a,I4,1x,a,2x,3f12.6,2x,a,2x,f12.6,a)') ' | b-vector  ',counter,': (', &
                   bvec_tmp(:,loop)/lenconfac,')',dnn(shell)/lenconfac,'  |'
           end do
        end do
        deallocate( bvec_tmp)
        if (ierr/=0) call io_error('Error deallocating bvec_tmp in kmesh_get')
-       write(stdout,'(1x,"|",76(" "),"|")') 
-       write(stdout,'(1x,"+",76("-"),"+")') 
+       if (on_root)write(stdout,'(1x,"|",76(" "),"|")') 
+       if (on_root)write(stdout,'(1x,"+",76("-"),"+")') 
     end if
 
 
@@ -183,19 +187,20 @@ subroutine kmesh_get()
           call kmesh_shell_fixed(multi,dnn,bweight)
        end if
 
-       write(stdout,'(1x,a)',advance='no') '| The following shells are used: '
-       do ndnn=1,num_shells
-          if (ndnn.eq.num_shells) then
-             write(stdout,'(i3,1x)',advance='no') shell_list(ndnn)
-          else
-             write(stdout,'(i3,",")',advance='no') shell_list(ndnn)
-          endif
-       enddo
-       do l=1,11-num_shells
-          write(stdout,'(4x)',advance='no')
-       enddo
-       write(stdout,'("|")')
-
+       if (on_root) then
+          write(stdout,'(1x,a)',advance='no') '| The following shells are used: '
+          do ndnn=1,num_shells
+             if (ndnn.eq.num_shells) then
+                write(stdout,'(i3,1x)',advance='no') shell_list(ndnn)
+             else
+                write(stdout,'(i3,",")',advance='no') shell_list(ndnn)
+             endif
+          enddo
+          do l=1,11-num_shells
+             write(stdout,'(4x)',advance='no')
+          enddo
+          write(stdout,'("|")')
+       endif
     end if
        
     nntot=0
@@ -204,26 +209,28 @@ subroutine kmesh_get()
     end do
 
     if(nntot>num_nnmax) then
-    write(stdout,'(a,i2,a)') ' **WARNING: kmesh has found >',num_nnmax,' nearest neighbours**'
-    write(stdout,'(a)') ' '
-    write(stdout,'(a)') ' This is probably caused by an error in your unit cell specification'
-    write(stdout,'(a)') ' '
-    write(stdout,'(a)') ' If you think this is not the problem; please send your *.win file to the '
-    write(stdout,'(a)') ' wannier90 developers'
-    write(stdout,'(a)') ' '
-    write(stdout,'(a)') ' The problem may be caused by having accidentally degenerate shells of '
-    write(stdout,'(a)') ' kpoints. The solution is then to rerun wannier90 specifying the b-vectors '
-    write(stdout,'(a)') ' in each shell.  Give devel_flag=kmesh_degen in the *.win file'
-    write(stdout,'(a)') ' and create a *.kshell file:'
-    write(stdout,'(a)') ' '
-    write(stdout,'(a)') ' $>   cat hexagonal.kshell'
-    write(stdout,'(a)') ' $>   1 2'
-    write(stdout,'(a)') ' $>   5 6 7 8'
-    write(stdout,'(a)') ' '
-    write(stdout,'(a)') ' Where each line is a new shell (so num_shells in total)'
-    write(stdout,'(a)') ' The elements are the bvectors labelled according to the following '
-    write(stdout,'(a)') ' list (last column is distance)'
-    write(stdout,'(a)') ' '
+    if (on_root) then
+       write(stdout,'(a,i2,a)') ' **WARNING: kmesh has found >',num_nnmax,' nearest neighbours**'
+       write(stdout,'(a)') ' '
+       write(stdout,'(a)') ' This is probably caused by an error in your unit cell specification'
+       write(stdout,'(a)') ' '
+       write(stdout,'(a)') ' If you think this is not the problem; please send your *.win file to the '
+       write(stdout,'(a)') ' wannier90 developers'
+       write(stdout,'(a)') ' '
+       write(stdout,'(a)') ' The problem may be caused by having accidentally degenerate shells of '
+       write(stdout,'(a)') ' kpoints. The solution is then to rerun wannier90 specifying the b-vectors '
+       write(stdout,'(a)') ' in each shell.  Give devel_flag=kmesh_degen in the *.win file'
+       write(stdout,'(a)') ' and create a *.kshell file:'
+       write(stdout,'(a)') ' '
+       write(stdout,'(a)') ' $>   cat hexagonal.kshell'
+       write(stdout,'(a)') ' $>   1 2'
+       write(stdout,'(a)') ' $>   5 6 7 8'
+       write(stdout,'(a)') ' '
+       write(stdout,'(a)') ' Where each line is a new shell (so num_shells in total)'
+       write(stdout,'(a)') ' The elements are the bvectors labelled according to the following '
+       write(stdout,'(a)') ' list (last column is distance)'
+       write(stdout,'(a)') ' '
+    endif
     
     allocate( bvec_tmp(3,maxval(multi)),stat=ierr)
     if (ierr/=0) call io_error('Error allocating bvec_tmp in kmesh_get')
@@ -233,11 +240,11 @@ subroutine kmesh_get()
        call kmesh_get_bvectors(multi(shell),1,dnn(shell),bvec_tmp(:,1:multi(shell)))
        do loop=1,multi(shell)
           counter=counter+1
-          write(stdout,'(a,I4,1x,a,2x,3f12.6,2x,a,2x,f12.6,a)') ' | b-vector  ',counter,': (', &
+          if (on_root) write(stdout,'(a,I4,1x,a,2x,3f12.6,2x,a,2x,f12.6,a)') ' | b-vector  ',counter,': (', &
                bvec_tmp(:,loop)/lenconfac,')',dnn(shell)/lenconfac,'  |'
        end do
     end do
-    write(stdout,'(a)') ' '
+    if (on_root) write(stdout,'(a)') ' '
     deallocate( bvec_tmp)
     if (ierr/=0) call io_error('Error deallocating bvec_tmp in kmesh_get')
 
@@ -275,9 +282,11 @@ subroutine kmesh_get()
     ! Comment: Now we have bk(3,nntot,num_kps) 09/04/2006
 
 
-    write(stdout,'(1x,a)') '+----------------------------------------------------------------------------+' 
-    write(stdout,'(1x,a)') '|                        Shell   # Nearest-Neighbours                        |'
-    write(stdout,'(1x,a)') '|                        -----   --------------------                        |'
+    if (on_root) then
+       write(stdout,'(1x,a)') '+----------------------------------------------------------------------------+' 
+       write(stdout,'(1x,a)') '|                        Shell   # Nearest-Neighbours                        |'
+       write(stdout,'(1x,a)') '|                        -----   --------------------                        |'
+    endif
     if(index(devel_flag,'kmesh_degen')==0) then
        !
        ! Standard routine
@@ -354,9 +363,9 @@ subroutine kmesh_get()
 
     do ndnnx=1, num_shells
        ndnn = shell_list(ndnnx)
-       write(stdout,'(1x,a,24x,i3,13x,i3,33x,a)') '|',ndnn,nnshell(1,ndnn),'|'
+       if (on_root) write(stdout,'(1x,a,24x,i3,13x,i3,33x,a)') '|',ndnn,nnshell(1,ndnn),'|'
     end do
-    write(stdout,'(1x,"+",76("-"),"+")') 
+    if (on_root) write(stdout,'(1x,"+",76("-"),"+")') 
 
 
     do nkp = 1, num_kpts  
@@ -372,7 +381,7 @@ subroutine kmesh_get()
                 bbn = bbn + bk_local(i,nnx,nkp) * bk_local(i,nnx,nkp)  
              enddo
              if (abs(sqrt(bb1)-sqrt(bbn)).gt.kmesh_tol) then  
-                write(stdout,'(1x,2f10.6)') bb1,bbn
+                if (on_root) write(stdout,'(1x,2f10.6)') bb1,bbn
                 call io_error('Non-symmetric k-point neighbours!')
              endif
           enddo
@@ -398,11 +407,11 @@ subroutine kmesh_get()
                    enddo
                 enddo
                 if ( (i.eq.j) .and. (abs(ddelta-1.0_dp).gt.kmesh_tol) ) then
-                   write(stdout,'(1x,2i3,f12.8)') i,j,ddelta
+                   if (on_root) write(stdout,'(1x,2i3,f12.8)') i,j,ddelta
                    call io_error('Eq. (B1) not satisfied in kmesh_get (1)')  
                 endif
                 if ( (i.ne.j) .and. (abs(ddelta).gt.kmesh_tol) ) then  
-                   write(stdout,'(1x,2i3,f12.8)') i,j,ddelta
+                   if (on_root) write(stdout,'(1x,2i3,f12.8)') i,j,ddelta
                    call io_error('Eq. (B1) not satisfied in kmesh_get (2)')  
                 endif
              enddo
@@ -410,8 +419,8 @@ subroutine kmesh_get()
        enddo
     end if
        
-    write(stdout,'(1x,a)') '| Completeness relation is fully satisfied [Eq. (B1), PRB 56, 12847 (1997)]  |'  
-    write(stdout,'(1x,"+",76("-"),"+")') 
+    if (on_root) write(stdout,'(1x,a)') '| Completeness relation is fully satisfied [Eq. (B1), PRB 56, 12847 (1997)]  |'  
+    if (on_root) write(stdout,'(1x,"+",76("-"),"+")') 
 
     !
     wbtot = 0.0_dp  
@@ -447,35 +456,36 @@ subroutine kmesh_get()
     enddo
     if (na.ne.nnh) call io_error('Did not find right number of bk directions')
 
-
-    if (lenconfac.eq.1.0_dp) then
-       write(stdout,'(1x,a)') '|                  b_k Vectors (Ang^-1) and Weights (Ang^2)                  |'
-       write(stdout,'(1x,a)') '|                  ----------------------------------------                  |'
-    else
-       write(stdout,'(1x,a)') '|                 b_k Vectors (Bohr^-1) and Weights (Bohr^2)                 |'
-       write(stdout,'(1x,a)') '|                 ------------------------------------------                 |'
-    endif
-    write(stdout,'(1x,a)') '|            No.         b_k(x)      b_k(y)      b_k(z)        w_b           |'
-    write(stdout,'(1x,a)') '|            ---        --------------------------------     --------        |'
-    do i = 1, nntot  
-       write (stdout,'(1x,"|",11x,i3,5x,3f12.6,3x,f10.6,8x,"|")') &
-            i,(bk_local(j,i,1)/lenconfac,j=1,3),wb_local(i)*lenconfac**2
-    enddo
-    write(stdout,'(1x,"+",76("-"),"+")') 
-    if (lenconfac.eq.1.0_dp) then
-       write(stdout,'(1x,a)') '|                           b_k Directions (Ang^-1)                          |'
-       write(stdout,'(1x,a)') '|                           -----------------------                          |'
-    else
-       write(stdout,'(1x,a)') '|                           b_k Directions (Bohr^-1)                         |'
-       write(stdout,'(1x,a)') '|                           ------------------------                         |'
+    if (on_root) then
+       if (lenconfac.eq.1.0_dp) then
+          write(stdout,'(1x,a)') '|                  b_k Vectors (Ang^-1) and Weights (Ang^2)                  |'
+          write(stdout,'(1x,a)') '|                  ----------------------------------------                  |'
+       else
+          write(stdout,'(1x,a)') '|                 b_k Vectors (Bohr^-1) and Weights (Bohr^2)                 |'
+          write(stdout,'(1x,a)') '|                 ------------------------------------------                 |'
+       endif
+       write(stdout,'(1x,a)') '|            No.         b_k(x)      b_k(y)      b_k(z)        w_b           |'
+       write(stdout,'(1x,a)') '|            ---        --------------------------------     --------        |'
+       do i = 1, nntot  
+          write (stdout,'(1x,"|",11x,i3,5x,3f12.6,3x,f10.6,8x,"|")') &
+               i,(bk_local(j,i,1)/lenconfac,j=1,3),wb_local(i)*lenconfac**2
+       enddo
+       write(stdout,'(1x,"+",76("-"),"+")') 
+       if (lenconfac.eq.1.0_dp) then
+          write(stdout,'(1x,a)') '|                           b_k Directions (Ang^-1)                          |'
+          write(stdout,'(1x,a)') '|                           -----------------------                          |'
+       else
+          write(stdout,'(1x,a)') '|                           b_k Directions (Bohr^-1)                         |'
+          write(stdout,'(1x,a)') '|                           ------------------------                         |'
+       endif
+       write(stdout,'(1x,a)') '|            No.           x           y           z                         |'
+       write(stdout,'(1x,a)') '|            ---        --------------------------------                     |'
+       do i = 1, nnh  
+          write(stdout,'(1x,"|",11x,i3,5x,3f12.6,21x,"|")') i,(bka(j,i)/lenconfac,j=1,3)  
+       enddo
+       write(stdout,'(1x,"+",76("-"),"+")') 
+       write(stdout,*) ' '  
     endif
-    write(stdout,'(1x,a)') '|            No.           x           y           z                         |'
-    write(stdout,'(1x,a)') '|            ---        --------------------------------                     |'
-    do i = 1, nnh  
-       write(stdout,'(1x,"|",11x,i3,5x,3f12.6,21x,"|")') i,(bka(j,i)/lenconfac,j=1,3)  
-    enddo
-    write(stdout,'(1x,"+",76("-"),"+")') 
-    write(stdout,*) ' '  
 
 
     ! find index array
@@ -490,7 +500,7 @@ subroutine kmesh_get()
           enddo
           ! check found
           if (neigh(nkp,na).eq.0) then  
-             write(stdout,*) ' nkp,na=',nkp,na  
+             if (on_root) write(stdout,*) ' nkp,na=',nkp,na  
              call io_error('kmesh_get: failed to find neighbours for this kpoint')  
           endif
        enddo
@@ -573,24 +583,26 @@ subroutine kmesh_get()
 
        if (na.ne.nnh) call io_error('Did not find right number of b-vectors in gamma_only option')
 
-       write(stdout,'(1x,"+",76("-"),"+")')
-       write(stdout,'(1x,a)') '|        Gamma-point: number of the b-vectors is reduced by half             |'
-       write(stdout,'(1x,"+",76("-"),"+")')
-       if (lenconfac.eq.1.0_dp) then
-          write(stdout,'(1x,a)') '|                  b_k Vectors (Ang^-1) and Weights (Ang^2)                  |'
-          write(stdout,'(1x,a)') '|                  ----------------------------------------                  |'
-       else
-          write(stdout,'(1x,a)') '|                 b_k Vectors (Bohr^-1) and Weights (Bohr^2)                 |'
-          write(stdout,'(1x,a)') '|                 ------------------------------------------                 |'
+       if (on_root) then 
+          write(stdout,'(1x,"+",76("-"),"+")')
+          write(stdout,'(1x,a)') '|        Gamma-point: number of the b-vectors is reduced by half             |'
+          write(stdout,'(1x,"+",76("-"),"+")')
+          if (lenconfac.eq.1.0_dp) then
+             write(stdout,'(1x,a)') '|                  b_k Vectors (Ang^-1) and Weights (Ang^2)                  |'
+             write(stdout,'(1x,a)') '|                  ----------------------------------------                  |'
+          else
+             write(stdout,'(1x,a)') '|                 b_k Vectors (Bohr^-1) and Weights (Bohr^2)                 |'
+             write(stdout,'(1x,a)') '|                 ------------------------------------------                 |'
+          endif
+          write(stdout,'(1x,a)') '|            No.         b_k(x)      b_k(y)      b_k(z)        w_b           |'
+          write(stdout,'(1x,a)') '|            ---        --------------------------------     --------        |'
+          do i = 1, nntot
+             write (stdout,'(1x,"|",11x,i3,5x,3f12.6,3x,f10.6,8x,"|")') &
+                  i,(bk(j,i,1)/lenconfac,j=1,3),wb(i)*lenconfac**2
+          enddo
+          write(stdout,'(1x,"+",76("-"),"+")')
+          write(stdout,*) ' '
        endif
-       write(stdout,'(1x,a)') '|            No.         b_k(x)      b_k(y)      b_k(z)        w_b           |'
-       write(stdout,'(1x,a)') '|            ---        --------------------------------     --------        |'
-       do i = 1, nntot
-          write (stdout,'(1x,"|",11x,i3,5x,3f12.6,3x,f10.6,8x,"|")') &
-               i,(bk(j,i,1)/lenconfac,j=1,3),wb(i)*lenconfac**2
-       enddo
-       write(stdout,'(1x,"+",76("-"),"+")')
-       write(stdout,*) ' '
 
        deallocate(nnlist_tmp, stat=ierr )
        if (ierr/=0) call io_error('Error in deallocating nnlist_tmp in kmesh_get')
@@ -760,31 +772,41 @@ subroutine kmesh_dealloc()
     !========================================
     !                                       
     !!  Release memory from the kmesh module 
-    !                                      
+    !   This routine now checks to see if arrays
+    !   are allocated, as there are some code
+    !   paths that will not allocate on all nodes
     !========================================
     use w90_io,   only : io_error
     implicit none
     integer :: ierr
 
     ! Deallocate real arrays that are public
-    if (.not. explicit_nnkpts) then
-        deallocate(bk, stat=ierr )
-        if (ierr/=0) call io_error('Error in deallocating bk in kmesh_dealloc')
-        deallocate(bka, stat=ierr )
-        if (ierr/=0) call io_error('Error in deallocating bka in kmesh_dealloc')
-        deallocate(wb, stat=ierr )
-        if (ierr/=0) call io_error('Error in deallocating wb in kmesh_dealloc')
+    if(allocated(bk))then
+       deallocate(bk, stat=ierr )
+       if (ierr/=0) call io_error('Error in deallocating bk in kmesh_dealloc')
+    endif
+    if(allocated(bka))then
+       deallocate(bka, stat=ierr )
+       if (ierr/=0) call io_error('Error in deallocating bka in kmesh_dealloc')
+    endif
+    if(allocated(wb))then
+       deallocate(wb, stat=ierr )
+       if (ierr/=0) call io_error('Error in deallocating wb in kmesh_dealloc')
     end if
 
     ! Deallocate integer arrays that are public
-    if (.not. explicit_nnkpts) then
-        deallocate(neigh, stat=ierr )
-        if (ierr/=0) call io_error('Error in deallocating neigh in kmesh_dealloc')
+    if(allocated(neigh))then
+       deallocate(neigh, stat=ierr )
+       if (ierr/=0) call io_error('Error in deallocating neigh in kmesh_dealloc')
     end if
-    deallocate(nncell, stat=ierr )
-    if (ierr/=0) call io_error('Error in deallocating nncell in kmesh_dealloc')
-    deallocate(nnlist, stat=ierr )
-    if (ierr/=0) call io_error('Error in deallocating nnlist in kmesh_dealloc')
+    if(allocated(nncell))then
+       deallocate(nncell, stat=ierr )
+       if (ierr/=0) call io_error('Error in deallocating nncell in kmesh_dealloc')
+    endif
+    if(allocated(nnlist))then
+       deallocate(nnlist, stat=ierr )
+       if (ierr/=0) call io_error('Error in deallocating nnlist in kmesh_dealloc')
+    endif
 
     return
 
@@ -930,7 +952,7 @@ subroutine kmesh_shell_automatic(multi,dnn,bweight)
        if (ierr/=0) call io_error('Error allocating bvector in kmesh_shell_automatic')
     bvector=0.0_dp;bweight=0.0_dp
 
-    write(stdout,'(1x,a)') '| The b-vectors are chosen automatically                                     |'
+    if (on_root) write(stdout,'(1x,a)') '| The b-vectors are chosen automatically                                     |'
 
     b1sat=.false.
     do shell=1,search_shells
@@ -939,7 +961,7 @@ subroutine kmesh_shell_automatic(multi,dnn,bweight)
        ! get the b vectors for the new shell
        call kmesh_get_bvectors(multi(shell),1,dnn(shell),bvector(:,1:multi(shell),cur_shell))
 
-       if(iprint>=3) then
+       if(iprint>=3.and.on_root) then
           write(stdout,'(1x,a8,1x,I2,a14,1x,I2,49x,a)') '| Shell:',shell,' Multiplicity:',multi(shell), '|'
              do loop=1,multi(shell)
                 write(stdout,'(1x,a10,I2,1x,a1,4x,3f12.6,5x,a9,9x,a)') '| b-vector ',loop,':', &
@@ -963,7 +985,7 @@ subroutine kmesh_shell_automatic(multi,dnn,bweight)
        end if
 
        if(lpar) then
-          if(iprint>=3) then
+          if(iprint>=3.and.on_root) then
              write(stdout,'(1x,a)') '| This shell is linearly dependent on existing shells: Trying next shell     |'
           end if
           cycle
@@ -1007,7 +1029,7 @@ subroutine kmesh_shell_automatic(multi,dnn,bweight)
        info=0
        call dgesvd('A','A',max_shells,num_shells,amat,max_shells,singv,umat,max_shells,vmat,num_shells,work,lwork,info)
        if(info<0) then
-          write(stdout,'(1x,a,1x,I1,1x,a)') 'kmesh_shell_automatic: Argument',abs(info),'of dgesvd is incorrect'
+          if (on_root) write(stdout,'(1x,a,1x,I1,1x,a)') 'kmesh_shell_automatic: Argument',abs(info),'of dgesvd is incorrect'
           call io_error('kmesh_shell_automatic: Problem with Singular Value Decomposition')
        else if (info>0) then
           call io_error('kmesh_shell_automatic: Singular Value Decomposition did not converge')
@@ -1017,7 +1039,7 @@ subroutine kmesh_shell_automatic(multi,dnn,bweight)
          if(num_shells==1)  then 
              call io_error('kmesh_shell_automatic: Singular Value Decomposition has found a very small singular value')
          else
-            write(stdout,'(1x,a)') '| SVD found small singular value, Rejecting this shell and trying the next   |'
+            if (on_root) write(stdout,'(1x,a)') '| SVD found small singular value, Rejecting this shell and trying the next   |'
             b1sat=.false.
             num_shells=num_shells-1
             goto 200
@@ -1038,7 +1060,7 @@ subroutine kmesh_shell_automatic(multi,dnn,bweight)
        tmp3 = matmul(transpose(vmat),tmp2)
        bweight(1:num_shells) = tmp3
 
-       if(iprint>=2) then
+       if(iprint>=2.and.on_root) then
           do loop_s=1,num_shells
              write(stdout,'(1x,a,I2,a,f12.7,5x,a8,36x,a)') '| Shell: ',loop_s,&
                   ' w_b ', bweight(loop_s)*lenconfac**2,'('//trim(length_unit)//'^2)','|'
@@ -1066,13 +1088,13 @@ subroutine kmesh_shell_automatic(multi,dnn,bweight)
 
        if(.not.b1sat) then
           if(shell<search_shells .and. iprint>=3) then
-             write(stdout,'(1x,a,24x,a1)') '| B1 condition is not satisfied: Adding another shell','|'
+             if (on_root) write(stdout,'(1x,a,24x,a1)') '| B1 condition is not satisfied: Adding another shell','|'
           elseif(shell==search_shells) then
-             write(stdout,*) ' '
-       write(stdout,'(1x,a,i3,a)') 'Unable to satisfy B1 with any of the first ',search_shells,' shells'
-       write(stdout,'(1x,a)') 'Your cell might be very long, or you may have an irregular MP grid'
-       write(stdout,'(1x,a)') 'Try increasing the parameter search_shells in the win file (default=12)'
-       write(stdout,*) ' '
+             if (on_root) write(stdout,*) ' '
+             if (on_root) write(stdout,'(1x,a,i3,a)') 'Unable to satisfy B1 with any of the first ',search_shells,' shells'
+             if (on_root) write(stdout,'(1x,a)') 'Your cell might be very long, or you may have an irregular MP grid'
+             if (on_root) write(stdout,'(1x,a)') 'Try increasing the parameter search_shells in the win file (default=12)'
+             if (on_root) write(stdout,*) ' '
              call io_error('kmesh_get_automatic')
           end if
        end if
@@ -1102,11 +1124,11 @@ subroutine kmesh_shell_automatic(multi,dnn,bweight)
     end do
 
     if(.not. b1sat)  then
-       write(stdout,*) ' '
-       write(stdout,'(1x,a,i3,a)') 'Unable to satisfy B1 with any of the first ',search_shells,' shells'
-       write(stdout,'(1x,a)') 'Your cell might be very long, or you may have an irregular MP grid'
-       write(stdout,'(1x,a)') 'Try increasing the parameter search_shells in the win file (default=12)'
-       write(stdout,*) ' '
+       if (on_root) write(stdout,*) ' '
+       if (on_root) write(stdout,'(1x,a,i3,a)') 'Unable to satisfy B1 with any of the first ',search_shells,' shells'
+       if (on_root) write(stdout,'(1x,a)') 'Your cell might be very long, or you may have an irregular MP grid'
+       if (on_root) write(stdout,'(1x,a)') 'Try increasing the parameter search_shells in the win file (default=12)'
+       if (on_root) write(stdout,*) ' '
        call io_error('kmesh_get_automatic')
     end if
 
@@ -1156,7 +1178,7 @@ subroutine kmesh_shell_fixed(multi,dnn,bweight)
     bvector=0.0_dp;bweight=0.0_dp
     amat=0.0_dp;umat=0.0_dp;vmat=0.0_dp;smat=0.0_dp;singv=0.0_dp
 
-    write(stdout,'(1x,a)') '| The b-vectors are set in the win file                                      |'
+    if (on_root) write(stdout,'(1x,a)') '| The b-vectors are set in the win file                                      |'
 
 
     do shell=1,num_shells
@@ -1165,7 +1187,7 @@ subroutine kmesh_shell_fixed(multi,dnn,bweight)
             bvector(:,1:multi(shell_list(shell)),shell))
     end do
 
-    if(iprint>=3) then
+    if(iprint>=3.and.on_root) then
        do shell=1,num_shells
           write(stdout,'(1x,a8,1x,I2,a14,1x,I2,49x,a)') '| Shell:',shell,' Multiplicity:',multi(shell_list(shell)), '|'
           do loop=1,multi(shell_list(shell))
@@ -1190,7 +1212,7 @@ subroutine kmesh_shell_fixed(multi,dnn,bweight)
     info=0
     call dgesvd('A','A',max_shells,num_shells,amat,max_shells,singv,umat,max_shells,vmat,num_shells,work,lwork,info)
     if(info<0) then
-       write(stdout,'(1x,a,1x,I1,1x,a)') 'kmesh_shell_fixed: Argument',abs(info),'of dgesvd is incorrect'
+       if (on_root) write(stdout,'(1x,a,1x,I1,1x,a)') 'kmesh_shell_fixed: Argument',abs(info),'of dgesvd is incorrect'
        call io_error('kmesh_shell_fixed: Problem with Singular Value Decomposition')
     else if (info>0) then
        call io_error('kmesh_shell_fixed: Singular Value Decomposition did not converge')
@@ -1205,7 +1227,7 @@ subroutine kmesh_shell_fixed(multi,dnn,bweight)
     end do
 
     bweight(1:num_shells)=matmul(transpose(vmat),matmul(smat,matmul(transpose(umat),target)))
-    if(iprint>=2) then
+    if(iprint>=2.and.on_root) then
        do loop_s=1,num_shells
 !          write(stdout,'(1x,a,I2,a,f12.7,49x,a)') '| Shell: ',loop_s,' w_b ', bweight(loop_s),'|'
           write(stdout,'(1x,a,I2,a,f12.7,5x,a8,36x,a)') '| Shell: ',loop_s,&
@@ -1284,7 +1306,7 @@ subroutine kmesh_shell_from_file(multi,dnn,bweight)
        if (ierr/=0) call io_error('Error allocating bvector in kmesh_shell_fixed') 
     bvector=0.0_dp;bweight=0.0_dp
 
-    write(stdout,'(1x,a)') '| The b-vectors are defined in the kshell file                               |'
+    if (on_root) write(stdout,'(1x,a)') '| The b-vectors are defined in the kshell file                               |'
 
     counter=1
     do shell=1,search_shells
@@ -1352,7 +1374,7 @@ subroutine kmesh_shell_from_file(multi,dnn,bweight)
 
 
 
-    if(iprint>=3) then
+    if(iprint>=3.and.on_root) then
        do shell=1,num_shells
           write(stdout,'(1x,a8,1x,I2,a14,1x,I2,49x,a)') '| Shell:',shell,' Multiplicity:',multi(shell), '|'
           do loop=1,multi(shell)
@@ -1390,7 +1412,7 @@ subroutine kmesh_shell_from_file(multi,dnn,bweight)
     info=0
     call dgesvd('A','A',max_shells,num_shells,amat,max_shells,singv,umat,max_shells,vmat,num_shells,work,lwork,info)
     if(info<0) then
-       write(stdout,'(1x,a,1x,I1,1x,a)') 'kmesh_shell_fixed: Argument',abs(info),'of dgesvd is incorrect'
+       if (on_root) write(stdout,'(1x,a,1x,I1,1x,a)') 'kmesh_shell_fixed: Argument',abs(info),'of dgesvd is incorrect'
        call io_error('kmesh_shell_fixed: Problem with Singular Value Decomposition')
     else if (info>0) then
        call io_error('kmesh_shell_fixed: Singular Value Decomposition did not converge')
@@ -1405,7 +1427,7 @@ subroutine kmesh_shell_from_file(multi,dnn,bweight)
     end do
 
     bweight(1:num_shells)=matmul(transpose(vmat),matmul(smat,matmul(transpose(umat),target)))
-    if(iprint>=2) then
+    if(iprint>=2.and.on_root) then
        do loop_s=1,num_shells
           write(stdout,'(1x,a,I2,a,f12.7,5x,a8,36x,a)') '| Shell: ',loop_s,&
                ' w_b ', bweight(loop_s)*lenconfac**2,'('//trim(length_unit)//'^2)','|'
diff --git a/src/overlap.F90 b/src/overlap.F90
index 8e0e0eed8..ea7e43c9d 100644
--- a/src/overlap.F90
+++ b/src/overlap.F90
@@ -12,6 +12,7 @@
 ! https://github.com/wannier-developers/wannier90            !
 !------------------------------------------------------------!
 
+
 module w90_overlap
   !! This module reads in the overlap (Mmn) and Projections (Amn)
   !! and performs simple operations on them.
@@ -19,6 +20,7 @@ module w90_overlap
   use w90_constants, only : dp,cmplx_0,cmplx_1
   use w90_parameters, only : disentanglement
   use w90_io, only : stdout
+  use w90_comms, only : on_root,comms_bcast
 
   implicit none
  
@@ -83,35 +85,18 @@ subroutine overlap_read( )
     endif
 
 
-    if (index(devel_flag,'f77input')>0) then
-       ! This block left for the short term as a means
-       ! to quickly benchmark against the old f77 code
-       ! Read U_matrix and M_matrix from file 
-       open(20,file='wannier0.dat',form='formatted',status='unknown')
-       do i=1,num_wann
-          do j=1,num_wann
-             do nkp=1,num_kpts
-                read(20,*) u_matrix(i,j,nkp)
-                do nn=1,nntot
-                   read(20,*) m_matrix(i,j,nn,nkp)
-                end do
-             end do
-          end do
-       end do
-       close(20)
-
-    else
+    if(on_root) then
 
        ! Read M_matrix_orig from file
        mmn_in=io_file_unit()
        open(unit=mmn_in,file=trim(seedname)//'.mmn',&
             form='formatted',status='old',action='read',err=101)
               
-       write(stdout,'(/a)',advance='no') ' Reading overlaps from '//trim(seedname)//'.mmn    : '
+       if(on_root) write(stdout,'(/a)',advance='no') ' Reading overlaps from '//trim(seedname)//'.mmn    : '
 
        ! Read the comment line
        read(mmn_in,'(a)',err=103,end=103) dummy
-       write(stdout,'(a)') trim(dummy)
+       if(on_root) write(stdout,'(a)') trim(dummy)
 
        ! Read the number of bands, k-points and nearest neighbours
        read(mmn_in,*,err=103,end=103) nb_tmp,nkp_tmp,nntot_tmp
@@ -153,7 +138,7 @@ subroutine overlap_read( )
              endif
           end do
           if (nn.eq.0) then
-             write(stdout,'(/a,i8,2i5,i4,2x,3i3)') &
+             if(on_root) write(stdout,'(/a,i8,2i5,i4,2x,3i3)') &
                   ' Error reading '//trim(seedname)//'.mmn:',ncount,nkp,nkp2,nn,nnl,nnm,nnn
              call io_error('Neighbour not found')
           end if
@@ -166,21 +151,27 @@ subroutine overlap_read( )
        end do
        deallocate(mmn_tmp,stat=ierr)
        if (ierr/=0) call io_error('Error in deallocating mmn_tmp in overlap_read')
- 
        close(mmn_in)
+    endif
+    
+    if(disentanglement) then
+       call comms_bcast(m_matrix_orig(1,1,1,1),num_bands*num_bands*nntot*num_kpts)
+    else
+       call comms_bcast(m_matrix(1,1,1,1),num_wann*num_wann*nntot*num_kpts)
+    endif
 
-
-       if(.not. use_bloch_phases) then
+    if(.not. use_bloch_phases) then
+       if(on_root) then
 
           ! Read A_matrix from file wannier.amn
           amn_in=io_file_unit()
           open(unit=amn_in,file=trim(seedname)//'.amn',form='formatted',status='old',err=102)
           
-          write(stdout,'(/a)',advance='no') ' Reading projections from '//trim(seedname)//'.amn : '
+          if(on_root) write(stdout,'(/a)',advance='no') ' Reading projections from '//trim(seedname)//'.amn : '
           
           ! Read the comment line
           read(amn_in,'(a)',err=104,end=104) dummy
-          write(stdout,'(a)') trim(dummy)
+          if(on_root) write(stdout,'(a)') trim(dummy)
           
           ! Read the number of bands, k-points and wannier functions
           read(amn_in,*,err=104,end=104) nb_tmp, nkp_tmp, nw_tmp
@@ -206,18 +197,24 @@ subroutine overlap_read( )
                 u_matrix(m,n,nkp) = cmplx(a_real,a_imag,kind=dp)
              end do
           end if
-          
           close(amn_in)
-          
+       endif
+
+       if(disentanglement) then
+          call comms_bcast(a_matrix(1,1,1),num_bands*num_wann*num_kpts)
        else
-          
-          do n=1,num_kpts
-             do m=1,num_wann
-                u_matrix(m,m,n)=cmplx_1
-             end do
-          end do
+          call comms_bcast(u_matrix(1,1,1),num_wann*num_wann*num_kpts)
+       endif
 
-       end if
+    else
+       
+       do n=1,num_kpts
+          do m=1,num_wann
+             u_matrix(m,m,n)=cmplx_1
+          end do
+       end do
+       
+    end if
        
        ! If post-processing a Car-Parinello calculation (gamma only)
        ! then rotate M and A to the basis of Kohn-Sham eigenstates
@@ -261,7 +258,6 @@ subroutine overlap_read( )
  !~      end if
 ![ysl-e]
 
-    endif
 
     if (timing_level>0) call io_stopwatch('overlap: read',2)
 
diff --git a/src/parameters.F90 b/src/parameters.F90
index 69453efce..23ab9e188 100644
--- a/src/parameters.F90
+++ b/src/parameters.F90
@@ -12,12 +12,14 @@
 ! https://github.com/wannier-developers/wannier90            !
 !------------------------------------------------------------!
 
+
 module w90_parameters
   !! This module contains parameters to control the actions of wannier90.
   !! Also routines to read the parameters and write them out again.
 
   use w90_constants, only : dp
   use w90_io,        only : stdout,maxlen
+  use w90_comms,     only : on_root,num_nodes
 
   implicit none
 
@@ -450,6 +452,8 @@ module w90_parameters
   public :: param_lib_set_atoms
   public :: param_memory_estimate
   public :: param_get_smearing_type
+  public :: param_dist
+  public :: param_chkpt_dist
 
 contains
 
@@ -487,6 +491,10 @@ subroutine param_read ( )
 
     ! default value is symmetrize_eps=0.001
     call param_get_keyword('symmetrize_eps',found,r_value=symmetrize_eps)!YN:
+!jry    if (lsitesymmetry.and.num_nodes>1) then
+!jry       call io_error('Error: site symmetry can not be used in parallel mode')
+!jry    end if
+
 
     !%%%%%%%%%%%%%%%%
     ! Transport 
@@ -571,7 +579,7 @@ subroutine param_read ( )
     ! AAM_2016-09-16: some changes to logic to patch a problem with uninitialised num_bands in library mode
 !    num_bands       =   -1   
     call param_get_keyword('num_bands',found,i_value=i_temp)
-    if(found.and.library) write(stdout,'(/a)') ' Ignoring <num_bands> in input file'
+    if(found.and.library.and.on_root) write(stdout,'(/a)') ' Ignoring <num_bands> in input file'
     if (.not. library .and. .not.effective_model) then
        if(found) num_bands=i_temp
        if(.not.found) num_bands=num_wann
@@ -596,7 +604,7 @@ subroutine param_read ( )
 
 !    mp_grid=-99
     call param_get_keyword_vector('mp_grid',found,3,i_value=iv_temp)
-    if(found.and.library) write(stdout,'(a)') ' Ignoring <mp_grid> in input file'
+    if(found.and.library.and.on_root) write(stdout,'(a)') ' Ignoring <mp_grid> in input file'
     if(.not.library .and. .not.effective_model) then
        if(found) mp_grid=iv_temp
        if (.not. found) then
@@ -615,7 +623,7 @@ subroutine param_read ( )
        if ( gamma_only .and. (num_kpts.ne.1) ) &
             call io_error('Error: gamma_only is true, but num_kpts > 1')
     else
-       if (found) write(stdout,'(a)') ' Ignoring <gamma_only> in input file'
+       if (found.and.on_root) write(stdout,'(a)') ' Ignoring <gamma_only> in input file'
     endif
 ![ysl-e]
 
@@ -662,7 +670,7 @@ subroutine param_read ( )
     if (.not.library) then
        spinors=ltmp
     else
-       if (found) write(stdout,'(a)') ' Ignoring <spinors> in input file'
+       if (found.and.on_root) write(stdout,'(a)') ' Ignoring <spinors> in input file'
     endif
 !    if(spinors .and. (2*(num_wann/2))/=num_wann) &
 !       call io_error('Error: For spinor WF num_wann must be even')
@@ -1383,7 +1391,7 @@ subroutine param_read ( )
              do k=1,num_kpts
                 do n=1,num_bands
                    read(eig_unit,*,err=106,end=106) i,j,eigval(n,k)
-                   if ((i.ne.n).or.(j.ne.k)) then
+                   if ((((i.ne.n).or.(j.ne.k))).and.on_root) then
                       write(stdout,'(a)') 'Found a mismatch in '//trim(seedname)//'.eig' 
                       write(stdout,'(a,i0,a,i0)') 'Wanted band  : ',n,' found band  : ',i
                       write(stdout,'(a,i0,a,i0)') 'Wanted kpoint: ',k,' found kpoint: ',j
@@ -1765,7 +1773,7 @@ subroutine param_read ( )
     call param_get_keyword('skip_b1_tests', found, l_value=skip_B1_tests)
     
     call param_get_keyword_block('unit_cell_cart',found,3,3,r_value=real_lattice_tmp)
-    if(found.and.library) write(stdout,'(a)') ' Ignoring <unit_cell_cart> in input file'
+    if(found.and.library.and.on_root) write(stdout,'(a)') ' Ignoring <unit_cell_cart> in input file'
     if (.not. library) then
        real_lattice=transpose(real_lattice_tmp)
        if(.not. found) call io_error('Error: Did not find the cell information in the input file')
@@ -1783,7 +1791,7 @@ subroutine param_read ( )
     end if
 
     call param_get_keyword_block('kpoints',found,num_kpts,3,r_value=kpt_cart)
-    if(found.and.library) write(stdout,'(a)') ' Ignoring <kpoints> in input file'
+    if(found.and.library.and.on_root) write(stdout,'(a)') ' Ignoring <kpoints> in input file'
     if (.not. library .and. .not.effective_model) then
        kpt_latt=kpt_cart
        if(.not. found) call io_error('Error: Did not find the kpoint information in the input file')
@@ -1898,9 +1906,9 @@ subroutine param_read ( )
     ! Atoms
     if (.not.library) num_atoms=0
     call param_get_block_length('atoms_frac',found,i_temp)
-    if (found.and.library) write(stdout,'(a)') ' Ignoring <atoms_frac> in input file'
+    if (found.and.library.and.on_root) write(stdout,'(a)') ' Ignoring <atoms_frac> in input file'
     call param_get_block_length('atoms_cart',found2,i_temp2,lunits)
-    if (found2.and.library) write(stdout,'(a)') ' Ignoring <atoms_cart> in input file'
+    if (found2.and.library.and.on_root) write(stdout,'(a)') ' Ignoring <atoms_cart> in input file'
     if (.not.library) then
        if (found.and.found2) call io_error('Error: Cannot specify both atoms_frac and atoms_cart')
        if (found .and. i_temp>0) then
@@ -1925,7 +1933,7 @@ subroutine param_read ( )
 
 302  continue
 
-    if ( any(len_trim(in_data(:))>0 )) then
+    if ( any(len_trim(in_data(:))>0 ).and.on_root) then
        write(stdout,'(1x,a)') 'The following section of file '//trim(seedname)//'.win contained unrecognised keywords'
        write(stdout,*) 
        do loop=1,num_lines
@@ -3308,14 +3316,14 @@ subroutine param_read_chkpt()
     real(kind=dp) :: tmp_latt(3,3), tmp_kpt_latt(3,num_kpts)
     integer :: tmp_excl_bands(1:num_exclude_bands),tmp_mp_grid(1:3)
 
-    write(stdout,'(1x,3a)') 'Reading restart information from file ',trim(seedname),'.chk :'
+    if (on_root) write(stdout,'(1x,3a)') 'Reading restart information from file ',trim(seedname),'.chk :'
 
     chk_unit=io_file_unit()
     open(unit=chk_unit,file=trim(seedname)//'.chk',status='old',form='unformatted',err=121)
 
     ! Read comment line
     read(chk_unit) header
-    write(stdout,'(1x,a)',advance='no') trim(header)
+    if (on_root) write(stdout,'(1x,a)',advance='no') trim(header)
 
     ! Consistency checks
     read(chk_unit) ntmp                           ! Number of bands
@@ -3419,7 +3427,7 @@ subroutine param_read_chkpt()
 
     close(chk_unit)
 
-    write(stdout,'(a/)') ' ... done'
+    if (on_root) write(stdout,'(a/)') ' ... done'
 
     return
 
@@ -3439,6 +3447,74 @@ subroutine param_read_chkpt()
   end subroutine param_read_chkpt
 
 
+  !===========================================================!
+  subroutine param_chkpt_dist
+  !===========================================================!
+  !! Pass the checkpoint data through comms_bcast, allocating !
+  !! the receiving arrays on non-root processes if they are   !
+  !! not allocated yet.                                       !
+  !===========================================================!
+
+    ! Only the names actually used by this routine are imported.
+    use w90_io,         only : io_error
+    use w90_comms,      only : on_root,comms_bcast
+
+    implicit none
+
+    ! Status flag returned by the allocate statements
+    integer :: ierr
+
+    call comms_bcast(checkpoint,len(checkpoint))
+
+    if (.not.on_root .and. .not.allocated(u_matrix)) then
+       allocate(u_matrix(num_wann,num_wann,num_kpts),stat=ierr)
+       if (ierr/=0)&
+            call io_error('Error allocating u_matrix in param_chkpt_dist')
+    endif
+    call comms_bcast(u_matrix(1,1,1),num_wann*num_wann*num_kpts)
+
+    if (.not.on_root .and. .not.allocated(m_matrix)) then
+       allocate(m_matrix(num_wann,num_wann,nntot,num_kpts),stat=ierr)
+       if (ierr/=0)&
+            call io_error('Error allocating m_matrix in param_chkpt_dist')
+    endif
+    call comms_bcast(m_matrix(1,1,1,1),num_wann*num_wann*nntot*num_kpts)
+
+    call comms_bcast(have_disentangled,1)
+
+    if (have_disentangled) then
+       if(.not.on_root) then
+          ! Disentanglement arrays must exist before they can be received
+          if (.not.allocated(u_matrix_opt)) then
+             allocate(u_matrix_opt(num_bands,num_wann,num_kpts),stat=ierr)
+             if (ierr/=0)&
+              call io_error('Error allocating u_matrix_opt in param_chkpt_dist')
+          endif
+
+          if (.not.allocated(lwindow)) then
+             allocate(lwindow(num_bands,num_kpts),stat=ierr)
+             if (ierr/=0)&
+                  call io_error('Error allocating lwindow in param_chkpt_dist')
+          endif
+
+          if (.not.allocated(ndimwin)) then
+             allocate(ndimwin(num_kpts),stat=ierr)
+             if (ierr/=0)&
+                  call io_error('Error allocating ndimwin in param_chkpt_dist')
+          endif
+
+       end if
+
+       call comms_bcast(u_matrix_opt(1,1,1),num_bands*num_wann*num_kpts)
+       call comms_bcast(lwindow(1,1),num_bands*num_kpts)
+       call comms_bcast(ndimwin(1),num_kpts)
+       call comms_bcast(omega_invariant,1)
+    end if
+    call comms_bcast(wannier_centres(1,1),3*num_wann)
+    call comms_bcast(wannier_spreads(1),num_wann)
+
+  end subroutine param_chkpt_dist
+
   !=======================================!
   subroutine param_in_file
     !=======================================!
@@ -5280,37 +5356,37 @@ subroutine param_memory_estimate
     if(disentanglement) &
          mem_wan= mem_wan+ num_wann*num_wann*nntot*num_kpts*size_cmplx       !m_matrix
 
-     write(stdout,'(1x,a)') '*============================================================================*'
-     write(stdout,'(1x,a)')  '|                              MEMORY ESTIMATE                               |'
-     write(stdout,'(1x,a)')  '|         Maximum RAM allocated during each phase of the calculation         |'
-     write(stdout,'(1x,a)')  '*============================================================================*'
-     if(disentanglement) &
-          write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Disentanglement:',(mem_param+mem_dis)/(1024**2),' Mb'
-     write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Wannierise:',(mem_param+mem_wan)/(1024**2),' Mb'
-     if(optimisation>0 .and. iprint>1 ) then
-        write(stdout,'(1x,a)')  '|                                                                            |'
-        write(stdout,'(1x,a)')  '|   N.B. by setting optimisation=0 memory usage will be reduced to:          |'
-        if (disentanglement) &
-        write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Disentanglement:',(mem_param+mem_dis- &
-             max(mem_dis1,mem_dis2)+mem_dis1)/(1024**2),' Mb'
-        if(gamma_only) then
-         write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Wannierise:',(mem_param+mem_wan)/(1024**2),' Mb'
-        else
-         write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Wannierise:',(mem_param+mem_wan-mem_wan1)/(1024**2),' Mb'
-        end if
-     write(stdout,'(1x,a)')  '|   However, this will result in more i/o and slow down the calculation      |'
-     endif
-
-     write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'plot_wannier:',(mem_param+mem_wan)/(1024**2),' Mb'
-
-     if (ispostw90) then
-        if (boltzwann) &
-             write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'BoltzWann:',(mem_param+mem_bw)/(1024**2),' Mb'
-     end if
+     if (on_root) then 
+        write(stdout,'(1x,a)') '*============================================================================*'
+        write(stdout,'(1x,a)')  '|                              MEMORY ESTIMATE                               |'
+        write(stdout,'(1x,a)')  '|         Maximum RAM allocated during each phase of the calculation         |'
+        write(stdout,'(1x,a)')  '*============================================================================*'
+        if(disentanglement) &
+             write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Disentanglement:',(mem_param+mem_dis)/(1024**2),' Mb'
+        write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Wannierise:',(mem_param+mem_wan)/(1024**2),' Mb'
+        if(optimisation>0 .and. iprint>1 ) then
+           write(stdout,'(1x,a)')  '|                                                                            |'
+           write(stdout,'(1x,a)')  '|   N.B. by setting optimisation=0 memory usage will be reduced to:          |'
+           if (disentanglement) &
+                write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Disentanglement:',(mem_param+mem_dis- &
+                max(mem_dis1,mem_dis2)+mem_dis1)/(1024**2),' Mb'
+           if(gamma_only) then
+              write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Wannierise:',(mem_param+mem_wan)/(1024**2),' Mb'
+           else
+              write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'Wannierise:',(mem_param+mem_wan-mem_wan1)/(1024**2),' Mb'
+           end if
+           write(stdout,'(1x,a)')  '|   However, this will result in more i/o and slow down the calculation      |'
+        endif
 
-     write(stdout,'(1x,a)')  '*----------------------------------------------------------------------------*'
-     write(stdout,*) ' '
+        if (ispostw90) then
+           if (boltzwann) &
+              write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'BoltzWann:',(mem_param+mem_bw)/(1024**2),' Mb'
+        end if
 
+        write(stdout,'(1x,"|",24x,a15,f16.2,a,18x,"|")') 'plot_wannier:',(mem_param+mem_wan)/(1024**2),' Mb'
+        write(stdout,'(1x,a)')  '*----------------------------------------------------------------------------*'
+        write(stdout,*) ' '
+     endif
 
 !    if(disentanglement) then
 !       write(*,'(a12,f12.4,a)') 'Disentangle',(mem_param+mem_dis)/(1024**2),' Mb'
@@ -5322,4 +5398,369 @@ subroutine param_memory_estimate
   end subroutine param_memory_estimate
 
 
+  !===========================================================!
+  subroutine param_dist
+  !===========================================================!
+  !! Pass every run-time parameter through comms_bcast,       !
+  !! allocating the allocatable arrays on the non-root        !
+  !! processes before they are used as broadcast buffers.     !
+  !===========================================================!
+
+    use w90_constants,  only : dp
+    use w90_io,         only : io_error
+    use w90_comms,      only : comms_bcast
+
+    ! Status flag returned by the allocate statements
+    integer :: ierr
+
+    call comms_bcast(effective_model,1)
+    call comms_bcast(eig_found,1)
+    call comms_bcast(postproc_setup,1)
+    if(.not.effective_model) then
+       call comms_bcast(mp_grid(1),3)
+       call comms_bcast(num_kpts,1)
+       call comms_bcast(num_bands,1)
+    endif
+    call comms_bcast(num_wann,1)
+    call comms_bcast(timing_level,1)
+    call comms_bcast(iprint,1)
+    call comms_bcast(energy_unit,len(energy_unit)) ! whole string, not only its first character
+    call comms_bcast(length_unit,len(length_unit)) ! whole string, not only its first character
+    call comms_bcast(wvfn_formatted,1)
+    call comms_bcast(spn_formatted,1)
+    call comms_bcast(berry_uHu_formatted,1)
+    call comms_bcast(spin,1)
+    call comms_bcast(num_dump_cycles,1)
+    call comms_bcast(num_print_cycles,1)
+    call comms_bcast(num_atoms,1)   ! Ivo: not used in postw90, right?
+    call comms_bcast(num_species,1) ! Ivo: not used in postw90, right?
+    call comms_bcast(real_lattice(1,1),9)
+    call comms_bcast(recip_lattice(1,1),9)
+    call comms_bcast(real_metric(1,1),9)
+    call comms_bcast(recip_metric(1,1),9)
+    call comms_bcast(cell_volume,1)
+    call comms_bcast(dos_energy_step,1)
+    call comms_bcast(dos_adpt_smr,1)
+    call comms_bcast(dos_smr_index,1)
+    call comms_bcast(dos_kmesh_spacing,1)
+    call comms_bcast(dos_kmesh(1),3)
+    call comms_bcast(dos_adpt_smr_max,1)
+    call comms_bcast(dos_smr_fixed_en_width,1)
+    call comms_bcast(dos_adpt_smr_fac,1)
+    call comms_bcast(num_dos_project,1)
+    call comms_bcast(num_exclude_bands,1)
+    if(num_exclude_bands>0) then
+       if(.not.on_root) then
+          allocate(exclude_bands(num_exclude_bands), stat=ierr )
+          if (ierr/=0) &
+               call io_error('Error in allocating exclude_bands in param_dist')
+       endif
+       call comms_bcast(exclude_bands(1),num_exclude_bands)
+    end if
+
+    call comms_bcast(gamma_only,1)
+    call comms_bcast(dis_win_min,1)
+    call comms_bcast(dis_win_max,1)
+    call comms_bcast(dis_froz_min,1)
+    call comms_bcast(dis_froz_max,1)
+    call comms_bcast(dis_num_iter,1)
+    call comms_bcast(dis_mix_ratio,1)
+    call comms_bcast(dis_conv_tol,1)
+    call comms_bcast(dis_conv_window,1)
+    call comms_bcast(dis_spheres_first_wann,1)
+    call comms_bcast(dis_spheres_num,1)
+    if(dis_spheres_num>0) then
+       if(.not.on_root) then
+          allocate(dis_spheres(4,dis_spheres_num), stat=ierr )
+          if (ierr/=0) &
+               call io_error('Error in allocating dis_spheres in param_dist')
+       endif
+       call comms_bcast(dis_spheres(1,1),4*dis_spheres_num)
+    end if
+    call comms_bcast(num_iter,1)
+    call comms_bcast(num_cg_steps,1)
+    call comms_bcast(conv_tol,1)
+    call comms_bcast(conv_window,1)
+    call comms_bcast(wannier_plot,1)
+    call comms_bcast(num_wannier_plot,1)
+    if(num_wannier_plot>0) then
+       if(.not.on_root) then
+          allocate(wannier_plot_list(num_wannier_plot), stat=ierr )
+          if (ierr/=0) &
+               call io_error('Error in allocating wannier_plot_list in param_dist')
+       endif
+       call comms_bcast(wannier_plot_list(1),num_wannier_plot)
+    end if
+    call comms_bcast(wannier_plot_supercell(1),3)
+    call comms_bcast(wannier_plot_format,len(wannier_plot_format))
+    call comms_bcast(wannier_plot_mode,len(wannier_plot_mode))
+    call comms_bcast(write_u_matrices,1)
+    call comms_bcast(bands_plot,1)
+    call comms_bcast(bands_num_points,1)
+    call comms_bcast(bands_plot_format,len(bands_plot_format))
+    call comms_bcast(bands_plot_mode,len(bands_plot_mode))
+    call comms_bcast(num_bands_project,1)
+
+    if(num_bands_project>0) then
+       if(.not.on_root) then
+          allocate(bands_plot_project(num_bands_project), stat=ierr )
+          if (ierr/=0) &
+               call io_error('Error in allocating bands_plot_project in param_dist')
+       endif
+       call comms_bcast(bands_plot_project(1),num_bands_project)
+    end if
+    call comms_bcast(bands_plot_dim,1)
+    call comms_bcast(write_hr,1)
+    call comms_bcast(write_rmn,1)
+    call comms_bcast(write_tb,1)
+    call comms_bcast(hr_cutoff,1)
+    call comms_bcast(dist_cutoff,1)
+    call comms_bcast(dist_cutoff_mode,len(dist_cutoff_mode))
+    call comms_bcast(dist_cutoff_hc,1)
+    call comms_bcast(one_dim_axis,len(one_dim_axis))
+    call comms_bcast(use_ws_distance,1)
+!    call comms_bcast(ws_distance_tol,1)
+    call comms_bcast(fermi_surface_plot,1)
+    call comms_bcast(fermi_surface_num_points,1)
+    call comms_bcast(fermi_surface_plot_format,len(fermi_surface_plot_format))
+    call comms_bcast(fermi_energy,1) !! used?
+    call comms_bcast(berry,1)
+    call comms_bcast(berry_task,len(berry_task))
+    call comms_bcast(berry_kmesh_spacing,1)
+    call comms_bcast(berry_kmesh(1),3)
+    call comms_bcast(berry_curv_adpt_kmesh,1)
+    call comms_bcast(berry_curv_adpt_kmesh_thresh,1)
+    call comms_bcast(berry_curv_unit,len(berry_curv_unit))
+    call comms_bcast(kubo_adpt_smr,1)
+    call comms_bcast(kubo_adpt_smr_fac,1)
+    call comms_bcast(kubo_adpt_smr_max,1)
+    call comms_bcast(kubo_smr_fixed_en_width,1)
+    call comms_bcast(kubo_smr_index,1)
+    call comms_bcast(kubo_eigval_max,1)
+    call comms_bcast(kubo_nfreq,1)
+    call comms_bcast(nfermi,1)
+    call comms_bcast(dos_energy_min,1)
+    call comms_bcast(dos_energy_max,1)
+    call comms_bcast(spin_kmesh_spacing,1)
+    call comms_bcast(spin_kmesh(1),3)
+    call comms_bcast(wanint_kpoint_file,1)
+
+    call comms_bcast(devel_flag,len(devel_flag))
+    call comms_bcast(spin_moment,1)
+    call comms_bcast(spin_axis_polar,1)
+    call comms_bcast(spin_axis_azimuth,1)
+    call comms_bcast(spin_decomp,1)
+    call comms_bcast(use_degen_pert,1)
+    call comms_bcast(degen_thr,1)
+    call comms_bcast(num_valence_bands,1)
+    call comms_bcast(dos,1)
+    call comms_bcast(dos_task,len(dos_task))
+    call comms_bcast(kpath,1)
+    call comms_bcast(kpath_task,len(kpath_task))
+    call comms_bcast(kpath_bands_colour,len(kpath_bands_colour))
+    call comms_bcast(kslice,1)
+    call comms_bcast(kslice_task,len(kslice_task))
+    call comms_bcast(transl_inv,1)
+    call comms_bcast(num_elec_per_state,1)
+    call comms_bcast(scissors_shift,1)
+    !
+
+! ----------------------------------------------
+    call comms_bcast(geninterp,1)
+    call comms_bcast(geninterp_alsofirstder,1)
+    call comms_bcast(geninterp_single_file,1)
+    ! [gp-begin, Apr 12, 2012]
+    ! BoltzWann variables
+    call comms_bcast(boltzwann,1)
+    call comms_bcast(boltz_calc_also_dos,1)
+    call comms_bcast(boltz_2d_dir_num,1)
+    call comms_bcast(boltz_dos_energy_step,1)
+    call comms_bcast(boltz_dos_energy_min,1)
+    call comms_bcast(boltz_dos_energy_max,1)
+    call comms_bcast(boltz_dos_adpt_smr,1)
+    call comms_bcast(boltz_dos_smr_fixed_en_width,1)
+    call comms_bcast(boltz_dos_adpt_smr_fac,1)
+    call comms_bcast(boltz_dos_adpt_smr_max,1)
+    call comms_bcast(boltz_mu_min,1)
+    call comms_bcast(boltz_mu_max,1)
+    call comms_bcast(boltz_mu_step,1)
+    call comms_bcast(boltz_temp_min,1)
+    call comms_bcast(boltz_temp_max,1)
+    call comms_bcast(boltz_temp_step,1)
+    call comms_bcast(boltz_kmesh_spacing,1)
+    call comms_bcast(boltz_kmesh(1),3)
+    call comms_bcast(boltz_tdf_energy_step,1)
+    call comms_bcast(boltz_relax_time,1)
+    call comms_bcast(boltz_TDF_smr_fixed_en_width,1)
+    call comms_bcast(boltz_TDF_smr_index,1)
+    call comms_bcast(boltz_dos_smr_index,1)
+    call comms_bcast(boltz_bandshift,1)
+    call comms_bcast(boltz_bandshift_firstband,1)
+    call comms_bcast(boltz_bandshift_energyshift,1)
+    ! [gp-end]
+    ! use_ws_distance was already broadcast above
+    call comms_bcast(disentanglement,1)
+
+
+    call comms_bcast(transport,1)
+    call comms_bcast(tran_easy_fix,1)
+    call comms_bcast(transport_mode,len(transport_mode))
+    call comms_bcast(tran_win_min,1)
+    call comms_bcast(tran_win_max,1)
+    call comms_bcast(tran_energy_step,1)
+    call comms_bcast(tran_num_bb,1)
+    call comms_bcast(tran_num_ll,1)
+    call comms_bcast(tran_num_rr,1)
+    call comms_bcast(tran_num_cc,1)
+    call comms_bcast(tran_num_lc,1)
+    call comms_bcast(tran_num_cr,1)
+    call comms_bcast(tran_num_bandc,1)
+    call comms_bcast(tran_write_ht,1)
+    call comms_bcast(tran_read_ht ,1)
+    call comms_bcast(tran_use_same_lead,1)
+    call comms_bcast(tran_num_cell_ll,1)
+    call comms_bcast(tran_num_cell_rr,1)
+    call comms_bcast(tran_group_threshold,1)
+    call comms_bcast(translation_centre_frac(1),3)
+    call comms_bcast(num_shells,1)
+    call comms_bcast(skip_B1_tests,1)
+    call comms_bcast(explicit_nnkpts,1)
+
+
+    call comms_bcast(calc_only_A,1)
+    call comms_bcast(use_bloch_phases,1)
+    call comms_bcast(restart,len(restart))
+    call comms_bcast(write_r2mn,1)
+    call comms_bcast(num_guide_cycles,1)
+    call comms_bcast(num_no_guide_iter,1)
+    call comms_bcast(fixed_step,1)
+    call comms_bcast(trial_step,1)
+    call comms_bcast(precond,1)
+    call comms_bcast(write_proj,1)
+    ! timing_level was already broadcast above
+    call comms_bcast(spinors,1)
+    ! num_elec_per_state was already broadcast above
+    call comms_bcast(translate_home_cell,1)
+    call comms_bcast(write_xyz,1)
+    call comms_bcast(write_hr_diag,1)
+    call comms_bcast(conv_noise_amp,1)
+    call comms_bcast(conv_noise_num,1)
+    call comms_bcast(wannier_plot_radius,1)
+    call comms_bcast(kmesh_tol,1)
+    call comms_bcast(optimisation,1)
+    call comms_bcast(write_vdw_data,1)
+    call comms_bcast(lenconfac,1)
+    call comms_bcast(lfixstep,1)
+    call comms_bcast(lsitesymmetry,1)
+    call comms_bcast(frozen_states,1)
+
+    call comms_bcast(num_proj,1)
+    if(num_proj>0) then
+       if(.not.on_root) then
+          allocate( proj_site(3,num_proj),stat=ierr)
+          if (ierr/=0) call io_error('Error allocating proj_site in param_dist')
+       endif
+       call comms_bcast(proj_site(1,1),3*num_proj)
+    endif
+
+
+    ! These variables are different from the ones above in that they are
+    ! allocatable, and in param_read they were allocated on the root node only
+    !
+    if(.not.on_root) then
+       allocate(fermi_energy_list(nfermi),stat=ierr)
+       if (ierr/=0) call io_error(&
+            'Error allocating fermi_energy_list in param_dist')
+       allocate(kubo_freq_list(kubo_nfreq),stat=ierr)
+       if (ierr/=0) call io_error(&
+            'Error allocating kubo_freq_list in param_dist')
+       allocate(dos_project(num_dos_project),stat=ierr)
+       if (ierr/=0)&
+            call io_error('Error allocating dos_project in param_dist')
+       if(.not.effective_model) then
+          if (eig_found) then
+             allocate(eigval(num_bands,num_kpts),stat=ierr)
+             if (ierr/=0)&
+                  call io_error('Error allocating eigval in param_dist')
+          end if
+          allocate(kpt_latt(3,num_kpts),stat=ierr)
+          if (ierr/=0)&
+               call io_error('Error allocating kpt_latt in param_dist')
+       endif
+    end if
+    if(nfermi>0) call comms_bcast(fermi_energy_list(1),nfermi)
+    if(kubo_nfreq>0) call comms_bcast(kubo_freq_list(1),kubo_nfreq)
+    if(num_dos_project>0) call comms_bcast(dos_project(1),num_dos_project)
+    if(.not.effective_model) then
+       if (eig_found) then
+          call comms_bcast(eigval(1,1),num_bands*num_kpts)
+       end if
+       call comms_bcast(kpt_latt(1,1),3*num_kpts)
+    endif
+
+
+
+    if(.not.effective_model.and..not.explicit_nnkpts) then
+
+       call comms_bcast(nnh,1)
+       call comms_bcast(nntot,1)
+       call comms_bcast(wbtot,1)
+
+       if(.not. on_root) then
+          allocate(nnlist(num_kpts,nntot), stat=ierr )
+          if (ierr/=0)&
+               call io_error('Error in allocating nnlist in param_dist')
+          allocate(neigh(num_kpts,nntot/2), stat=ierr )
+          if (ierr/=0)&
+               call io_error('Error in allocating neigh in param_dist')
+          allocate(nncell(3,num_kpts,nntot), stat=ierr )
+          if (ierr/=0)&
+               call io_error('Error in allocating nncell in param_dist')
+          allocate(wb(nntot), stat=ierr )
+          if (ierr/=0)&
+               call io_error('Error in allocating wb in param_dist')
+          allocate(bka(3,nntot/2), stat=ierr )
+          if (ierr/=0)&
+               call io_error('Error in allocating bka in param_dist')
+          allocate(bk(3,nntot,num_kpts), stat=ierr )
+          if (ierr/=0)&
+               call io_error('Error in allocating bk in param_dist')
+       end if
+
+       call comms_bcast(nnlist(1,1),num_kpts*nntot)
+       call comms_bcast(neigh(1,1),num_kpts*nntot/2)
+       call comms_bcast(nncell(1,1,1),3*num_kpts*nntot)
+       call comms_bcast(wb(1),nntot)
+       call comms_bcast(bka(1,1),3*nntot/2)
+       call comms_bcast(bk(1,1,1),3*nntot*num_kpts)
+
+    endif
+
+    call comms_bcast(omega_total,1)
+    call comms_bcast(omega_tilde,1)
+    call comms_bcast(omega_invariant,1)
+    call comms_bcast(have_disentangled,1)
+
+    if(.not.on_root) then
+       allocate(wannier_centres(3,num_wann),stat=ierr)
+       if (ierr/=0) call io_error('Error allocating wannier_centres in param_dist')
+       wannier_centres=0.0_dp
+       allocate(wannier_spreads(num_wann),stat=ierr)
+       if (ierr/=0) call io_error('Error in allocating wannier_spreads in param_dist')
+       wannier_spreads=0.0_dp
+       if (disentanglement) then
+          allocate(ndimwin(num_kpts),stat=ierr)
+          if (ierr/=0) call io_error('Error allocating ndimwin in param_dist')
+          allocate(lwindow(num_bands,num_kpts),stat=ierr)
+          if (ierr/=0) call io_error('Error allocating lwindow in param_dist')
+       endif
+    endif
+
+
+
+
+  end subroutine param_dist
+
+
+
 end module w90_parameters
diff --git a/src/postw90/comms.F90 b/src/postw90/comms.F90
index a9cd4975f..513fcf42b 100644
--- a/src/postw90/comms.F90
+++ b/src/postw90/comms.F90
@@ -18,7 +18,6 @@
 !                                                            !
 !------------------------------------------------------------!
 
-
 module w90_comms
   !! This module handles all of the communications
 
@@ -99,7 +98,7 @@ module w90_comms
   interface comms_gatherv
 !     module procedure comms_gatherv_int    ! to be done
      module procedure comms_gatherv_real
-!     module procedure comms_gatherv_cmplx
+     module procedure comms_gatherv_cmplx
   end interface comms_gatherv
 
   interface comms_scatterv
@@ -846,6 +845,47 @@ subroutine comms_gatherv_real(array,localcount,rootglobalarray,counts,displs)
 
   end subroutine comms_gatherv_real
 
+
+  ! Gather complex(kind=dp) data from every node onto the root node.
+  ! array: local data to send; localcount elements are sent to the root node
+  ! rootglobalarray: array on the root node to which data will be sent
+  ! counts, displs : how data should be partitioned, see MPI documentation or
+  !                  function comms_array_split
+  subroutine comms_gatherv_cmplx(array,localcount,rootglobalarray,counts,displs)
+
+    implicit none
+
+    complex(kind=dp), intent(inout)           :: array            ! start of the local send buffer
+    integer, intent(in)                       :: localcount       ! number of elements this node sends
+    complex(kind=dp), intent(inout)           :: rootglobalarray  ! start of the receive buffer
+    integer, dimension(num_nodes), intent(in) :: counts           ! one entry per node
+    integer, dimension(num_nodes), intent(in) :: displs           ! one entry per node
+
+#ifdef MPI
+    integer :: error   ! MPI return code
+
+    call MPI_gatherv(array,localcount,MPI_double_complex,rootglobalarray,counts,&
+         displs,MPI_double_complex,root_id,mpi_comm_world,error)
+
+    if(error.ne.MPI_success) then   ! report any MPI failure through io_error
+       call io_error('Error in comms_gatherv_cmplx')
+    end if
+
+#else
+    call zcopy(localcount,array,1,rootglobalarray,1)   ! built without MPI: plain unit-stride copy
+#endif
+
+    return
+
+  end subroutine comms_gatherv_cmplx
+
+
+  ! Array: local array for getting data; localcount elements will be fetched
+  !        from the root node
+  ! rootglobalarray: array on the root node from which data will be sent
+  ! counts, displs : how data should be partitioned, see MPI documentation or
+  !                  function comms_array_split
+
   subroutine comms_scatterv_real(array,localcount,rootglobalarray,counts,displs)
     !! Scatter data from root node
     implicit none
diff --git a/src/postw90/postw90_common.F90 b/src/postw90/postw90_common.F90
index 21557c6d5..720f7c8b9 100644
--- a/src/postw90/postw90_common.F90
+++ b/src/postw90/postw90_common.F90
@@ -372,9 +372,7 @@ subroutine pw90common_wanint_param_dist
                call io_error('Error allocating kpt_latt in postw90_param_dist')
        endif
     end if
-    if (nfermi /= 0) then
-       call comms_bcast(fermi_energy_list(1),nfermi)
-    end if
+    if(nfermi>0) call comms_bcast(fermi_energy_list(1),nfermi)
     call comms_bcast(kubo_freq_list(1),kubo_nfreq)
     call comms_bcast(dos_project(1),num_dos_project)
     if(.not.effective_model) then
diff --git a/src/wannier_prog.F90 b/src/wannier_prog.F90
index 4bcd948ba..4885ad034 100644
--- a/src/wannier_prog.F90
+++ b/src/wannier_prog.F90
@@ -15,6 +15,7 @@
 !       functions",                                          !
 !       Computer Physics Communications 185, 2309 (2014),    !
 !       http://dx.doi.org/10.1016/j.cpc.2014.05.003          !
+
 !                                                            !
 ! in any publications arising from the use of this code.     !
 !                                                            !
@@ -64,103 +65,138 @@ program wannier
   use w90_wannierise
   use w90_plot
   use w90_transport
+  use w90_comms, only : on_root,num_nodes, comms_setup, comms_end, comms_bcast, my_node_id
   use w90_sitesym !YN:
+
  
   implicit none
 
   real(kind=dp) time0,time1,time2
   character(len=9) :: stat,pos,cdate,ctime
   logical :: wout_found
+  integer :: len_seedname
 
-  time0=io_time()
+  call comms_setup
 
   library = .false.
 
-  call io_get_seedname()
+  time0=io_time()
+  
+  if (on_root) then
+     call io_get_seedname()
+     len_seedname = len(seedname)
+  end if
+  call comms_bcast(len_seedname,1)
+  call comms_bcast(seedname,len_seedname)
 
-  stdout=io_file_unit()
-  open(unit=stdout,file=trim(seedname)//'.werr')
-  call io_date(cdate,ctime)
-  write(stdout,*)  'Wannier90: Execution started on ',cdate,' at ',ctime
-  call param_read()
-  close(stdout,status='delete')
 
-  if (restart.eq.' ') then
-     stat='replace'
-     pos ='rewind'
-  else
-     inquire(file=trim(seedname)//'.wout',exist=wout_found)
-     if (wout_found) then
-        stat='old'
-     else
+
+  if(on_root) then 
+     stdout=io_file_unit()
+     open(unit=stdout,file=trim(seedname)//'.werr')
+     call io_date(cdate,ctime)
+     write(stdout,*) 'Wannier90: Execution started on ',cdate,' at ',ctime
+
+     call param_read
+     close(stdout,status='delete')     
+
+     if (restart.eq.' ') then
         stat='replace'
+        pos ='rewind'
+     else
+        inquire(file=trim(seedname)//'.wout',exist=wout_found)
+        if (wout_found) then
+           stat='old'
+        else
+           stat='replace'
+        endif
+        pos='append'
      endif
-     pos='append'
-  endif
-
-  stdout=io_file_unit()
-  open(unit=stdout,file=trim(seedname)//'.wout',status=trim(stat),position=trim(pos))
-  call param_write_header()
-  call param_write()
 
-  time1=io_time()
-  write(stdout,'(1x,a25,f11.3,a)') 'Time to read parameters  ',time1-time0,' (sec)'
+     stdout=io_file_unit()
+     open(unit=stdout,file=trim(seedname)//'.wout',status=trim(stat),position=trim(pos))
+     call param_write_header()
+       if(num_nodes==1) then
+#ifdef MPI        
+          write(stdout,'(/,1x,a)') 'Running in serial (with parallel executable)'
+#else
+          write(stdout,'(/,1x,a)') 'Running in serial (with serial executable)'
+#endif
+       else
+          write(stdout,'(/,1x,a,i3,a/)')&
+               'Running in parallel on ',num_nodes,' CPUs'
+       endif
+       call param_write()
+       
+       time1=io_time()
+       write(stdout,'(1x,a25,f11.3,a)') 'Time to read parameters  ',time1-time0,' (sec)'
+       
+       
+       if (.not. explicit_nnkpts) call kmesh_get
+       time2=io_time()
+       write(stdout,'(1x,a25,f11.3,a)')&
+            'Time to get kmesh        ',time2-time1,' (sec)'
+
+       call param_memory_estimate
+    end if
+
+  ! We now distribute the parameters to the other nodes
+    call param_dist
+    if(gamma_only.and.num_nodes>1) &
+         call io_error('Gamma point branch is serial only at the moment')
 
   if (transport .and. tran_read_ht) goto 3003
 
-  if (.not. explicit_nnkpts) call kmesh_get()
-  call param_memory_estimate()
-
   ! Sort out restarts
   if (restart.eq.' ') then  ! start a fresh calculation
-     write(stdout,'(1x,a/)') 'Starting a new Wannier90 calculation ...'
+     if (on_root) write(stdout,'(1x,a/)') 'Starting a new Wannier90 calculation ...'
   else                      ! restart a previous calculation
-     call param_read_chkpt()
-!~     call param_read_um
+     if(on_root) call param_read_chkpt()
+     call param_chkpt_dist
+
      select case (restart)
         case ('default')    ! continue from where last checkpoint was written
-           write(stdout,'(/1x,a)',advance='no') 'Resuming a previous Wannier90 calculation '
+           if (on_root) write(stdout,'(/1x,a)',advance='no') 'Resuming a previous Wannier90 calculation '
            if (checkpoint.eq.'postdis') then 
-              write(stdout,'(a/)') 'from wannierisation ...'
+              if (on_root) write(stdout,'(a/)') 'from wannierisation ...'
               goto 1001         ! go to wann_main
            elseif (checkpoint.eq.'postwann') then
-              write(stdout,'(a/)') 'from plotting ...'
+              if (on_root) write(stdout,'(a/)') 'from plotting ...'
               goto 2002         ! go to plot_main
            else
-              write(stdout,'(/a/)')
+              if (on_root) write(stdout,'(/a/)')
               call io_error('Value of checkpoint not recognised in wann_prog')
            endif
         case ('wannierise') ! continue from wann_main irrespective of value of last checkpoint
-           write(stdout,'(1x,a/)') 'Restarting Wannier90 from wannierisation ...'
+           if (on_root) write(stdout,'(1x,a/)') 'Restarting Wannier90 from wannierisation ...'
            goto 1001
         case ('plot')       ! continue from plot_main irrespective of value of last checkpoint 
-           write(stdout,'(1x,a/)') 'Restarting Wannier90 from plotting routines ...'
+           if (on_root) write(stdout,'(1x,a/)') 'Restarting Wannier90 from plotting routines ...'
            goto 2002       
         case ('transport')   ! continue from tran_main irrespective of value of last checkpoint 
-           write(stdout,'(1x,a/)') 'Restarting Wannier90 from transport routines ...'
+           if (on_root) write(stdout,'(1x,a/)') 'Restarting Wannier90 from transport routines ...'
            goto 3003       
         case default        ! for completeness... (it is already trapped in param_read)
            call io_error('Value of restart not recognised in wann_prog')
      end select
   endif
 
+
   if (postproc_setup) then
-     call kmesh_write()
+     if(on_root) call kmesh_write()
      call kmesh_dealloc()
      call param_dealloc()
-     write(stdout,'(1x,a25,f11.3,a)') 'Time to write kmesh      ',io_time(),' (sec)'
-     write(stdout,'(/a)') ' Exiting... '//trim(seedname)//'.nnkp written.'
+     if (on_root) write(stdout,'(1x,a25,f11.3,a)') 'Time to write kmesh      ',io_time(),' (sec)'
+     if (on_root) write(stdout,'(/a)') ' Exiting... '//trim(seedname)//'.nnkp written.'
+     call comms_end
      stop
   endif
 
-  time2=io_time()
-  write(stdout,'(1x,a25,f11.3,a)') 'Time to get kmesh        ',time2-time1,' (sec)'
-
-  if (lsitesymmetry) call sitesym_read()   !YN:
+  if (lsitesymmetry) call sitesym_read()   ! update this to read on root and bcast - JRY
   call overlap_read()
 
   time1=io_time()
-  write(stdout,'(/1x,a25,f11.3,a)') 'Time to read overlaps    ',time1-time2,' (sec)'
+  if (on_root) write(stdout,'(/1x,a25,f11.3,a)') 'Time to read overlaps    ',time1-time2,' (sec)'
 
   have_disentangled = .false.
 
@@ -168,10 +204,10 @@ program wannier
      call dis_main()
      have_disentangled=.true.
      time2=io_time()
-     write(stdout,'(1x,a25,f11.3,a)') 'Time to disentangle bands',time2-time1,' (sec)'     
+     if(on_root) write(stdout,'(1x,a25,f11.3,a)') 'Time to disentangle bands',time2-time1,' (sec)'     
   endif
 
-  call param_write_chkpt('postdis')
+  if (on_root) call param_write_chkpt('postdis')
 !~  call param_write_um
 
 1001 time2=io_time()
@@ -183,26 +219,30 @@ program wannier
   end if
 
   time1=io_time()
-  write(stdout,'(1x,a25,f11.3,a)') 'Time for wannierise      ',time1-time2,' (sec)'     
-
-  call param_write_chkpt('postwann')
-
-2002 time2=io_time()
-
-  if (wannier_plot .or. bands_plot .or. fermi_surface_plot .or. write_hr) then
-     call plot_main()
-     time1=io_time()
-     write(stdout,'(1x,a25,f11.3,a)') 'Time for plotting        ',time1-time2,' (sec)'
-  end if
-
-3003 time2=io_time()
+  if (on_root) write(stdout,'(1x,a25,f11.3,a)') 'Time for wannierise      ',time1-time2,' (sec)'     
+
+  if (on_root) call param_write_chkpt('postwann')
+
+2002 continue
+  if (on_root) then
+    time2=io_time()
+    if (wannier_plot .or. bands_plot .or. fermi_surface_plot .or. write_hr) then
+       call plot_main()
+       time1=io_time()
+       write(stdout,'(1x,a25,f11.3,a)') 'Time for plotting        ',time1-time2,' (sec)'
+    end if
+  endif
 
-  if (transport) then
-     call tran_main()
-     time1=io_time()
-     write(stdout,'(1x,a25,f11.3,a)') 'Time for transport       ',time1-time2,' (sec)'
-     if (tran_read_ht) goto 4004
-  end if
+3003 continue
+  if (on_root) then
+     time2=io_time()
+     if (transport) then
+        call tran_main()
+        time1=io_time()
+        write(stdout,'(1x,a25,f11.3,a)') 'Time for transport       ',time1-time2,' (sec)'
+        if (tran_read_ht) goto 4004
+     end if
+  endif
 
   call tran_dealloc()
   call hamiltonian_dealloc()
@@ -213,15 +253,18 @@ program wannier
 
 4004 continue 
 
-  write(stdout,'(1x,a25,f11.3,a)') 'Total Execution Time     ',io_time(),' (sec)'
+  if (on_root) then
+    write(stdout,'(1x,a25,f11.3,a)') 'Total Execution Time     ',io_time(),' (sec)'
 
-  if (timing_level>0) call io_print_timings()
+    if (timing_level>0) call io_print_timings()
 
-  write(stdout,*) 
-  write(stdout,'(1x,a)') 'All done: wannier90 exiting'
+    write(stdout,*) 
+    write(stdout,'(1x,a)') 'All done: wannier90 exiting'
  
-  close(stdout)
+    close(stdout)
+  endif
 
+  call comms_end
 
 
 end program wannier
diff --git a/src/wannierise.F90 b/src/wannierise.F90
index 91eb46c92..95c32acda 100644
--- a/src/wannierise.F90
+++ b/src/wannierise.F90
@@ -16,6 +16,9 @@ module w90_wannierise
   !! Main routines for the minimisation of the spread
 
   use w90_constants
+  use w90_comms, only : on_root, my_node_id, num_nodes,&
+                        comms_bcast, comms_array_split,&
+                        comms_gatherv, comms_allreduce
 
   implicit none
 
@@ -26,12 +29,30 @@ module w90_wannierise
 
   ! Data to avoid large allocation within iteration loop
   real(kind=dp),    allocatable  :: rnkb (:,:,:)   
+  real(kind=dp),    allocatable  :: rnkb_loc (:,:,:)   
   real(kind=dp),    allocatable  :: ln_tmp(:,:,:)
 
+  real(kind=dp),    allocatable  :: ln_tmp_loc(:,:,:)
+
+  ! for MPI
+  complex(kind=dp), allocatable  :: u_matrix_loc(:,:,:)
+  complex(kind=dp), allocatable  :: m_matrix_loc(:,:,:,:)
+  complex(kind=dp), allocatable  :: m_matrix_1b(:,:,:)
+  complex(kind=dp), allocatable  :: m_matrix_1b_loc(:,:,:)
+  complex(kind=dp), allocatable  :: cdq_loc(:,:,:) ! the only large array sent
+                                                   ! from process to process
+                                                   ! in the main loop
+  complex(kind=dp), allocatable  :: cdodq_loc(:,:,:)
+  integer,          allocatable  :: counts(:)
+  integer,          allocatable  :: displs(:)
+    
   logical :: first_pass
   !! Used to trigger the calculation of the invarient spread
   !! we only need to do this on entering wann_main (_gamma)
 
+#ifdef MPI
+  include 'mpif.h'
+#endif
 
   type localisation_vars
      !! Contributions to the spread
@@ -59,7 +80,7 @@ subroutine wann_main
     !                                                                  !
     !===================================================================  
     use w90_constants,  only : dp,cmplx_1,cmplx_0
-    use w90_io,         only : stdout,io_error,io_time,io_stopwatch &
+    use w90_io,         only : stdout,io_error,io_wallclocktime,io_stopwatch &
          ,io_file_unit
     use w90_parameters, only : num_wann,num_cg_steps,num_iter,nnlist, &
          nntot,wbtot,u_matrix,m_matrix,num_kpts,iprint,num_print_cycles, &
@@ -96,6 +117,7 @@ subroutine wann_main
     complex(kind=dp), allocatable :: cdodq_r(:,:,:)
     complex(kind=dp), allocatable :: k_to_r(:,:)
     complex(kind=dp), allocatable :: cdodq_precond(:,:,:)
+    complex(kind=dp), allocatable :: cdodq_precond_loc(:,:,:)
     real(kind=dp),    allocatable :: sheet (:,:,:)
     real(kind=dp),    allocatable :: rave(:,:),r2ave(:),rave2(:)
     real(kind=dp), dimension(3) :: rvec_cart
@@ -103,10 +125,14 @@ subroutine wann_main
     !local arrays not passed into subroutines
     complex(kind=dp), allocatable  :: cwschur1 (:), cwschur2 (:)  
     complex(kind=dp), allocatable  :: cwschur3 (:), cwschur4 (:)  
-    complex(kind=dp), allocatable  :: cdq(:,:,:),cdqkeep(:,:,:)  
+    complex(kind=dp), allocatable  :: cdq(:,:,:)!,cdqkeep(:,:,:)
+    ! cdqkeep is replaced by cdqkeep_loc
+    complex(kind=dp), allocatable  :: cdqkeep_loc(:,:,:)
     complex(kind=dp), allocatable  :: cz (:,:)  
     complex(kind=dp), allocatable  :: cmtmp(:,:),tmp_cdq(:,:) 
-    complex(kind=dp), allocatable  :: m0(:,:,:,:),u0(:,:,:)
+    ! complex(kind=dp), allocatable  :: m0(:,:,:,:),u0(:,:,:)
+    ! m0 and u0 are replaced by m0_loc and u0_loc
+    complex(kind=dp), allocatable  :: m0_loc(:,:,:,:),u0_loc(:,:,:)
     complex(kind=dp), allocatable  :: cwork(:)
     real(kind=dp),    allocatable  :: evals(:)
     real(kind=dp),    allocatable  :: rwork(:)
@@ -114,7 +140,7 @@ subroutine wann_main
     real(kind=dp) :: doda0
     real(kind=dp) :: falphamin,alphamin
     real(kind=dp) :: gcfac,gcnorm1,gcnorm0
-    integer       :: i,n,iter,ind,ierr,iw,ncg,info
+    integer       :: i,n,iter,ind,ierr,iw,ncg,info,nkp,nkp_loc,nn
     logical       :: lprint,ldump,lquad
     real(kind=dp), allocatable :: history(:)
     real(kind=dp)              :: save_spread
@@ -124,7 +150,7 @@ subroutine wann_main
     real(kind=dp) :: alpha_precond
     integer :: irpt,loop_kpt
 
-    if (timing_level>0) call io_stopwatch('wann: main',1)
+    if (timing_level>0.and.on_root) call io_stopwatch('wann: main',1)
 
     first_pass=.true.
 
@@ -134,12 +160,12 @@ subroutine wann_main
     if (ierr/=0) call io_error('Error allocating history in wann_main')
 
     ! module data
-    if(optimisation>0) then
-       allocate(  m0 (num_wann, num_wann, nntot, num_kpts),stat=ierr)
-    end if
-    if (ierr/=0) call io_error('Error in allocating m0 in wann_main')
-    allocate(  u0 (num_wann, num_wann, num_kpts),stat=ierr)
-    if (ierr/=0) call io_error('Error in allocating u0 in wann_main')
+!    if(optimisation>0) then
+!       allocate(  m0 (num_wann, num_wann, nntot, num_kpts),stat=ierr)
+!    end if
+!    if (ierr/=0) call io_error('Error in allocating m0 in wann_main')
+!    allocate(  u0 (num_wann, num_wann, num_kpts),stat=ierr)
+!    if (ierr/=0) call io_error('Error in allocating u0 in wann_main')
     allocate( rnkb (num_wann, nntot, num_kpts),stat=ierr    )     
     if (ierr/=0) call io_error('Error in allocating rnkb in wann_main')
     allocate( ln_tmp (num_wann, nntot, num_kpts), stat=ierr    )
@@ -173,7 +199,7 @@ subroutine wann_main
        ! this method of computing the preconditioning is much more efficient, but requires more RAM
        if(optimisation >= 3) then
           allocate(k_to_r(num_kpts,nrpts),stat=ierr)
-          if (ierr/=0) call io_error('Error in allocating cdodq_precond in wann_main')
+          if (ierr/=0) call io_error('Error in allocating k_to_r in wann_main')
           
           do irpt=1,nrpts
              do loop_kpt=1,num_kpts
@@ -194,12 +220,53 @@ subroutine wann_main
     if (ierr/=0) call io_error('Error in allocating cwshur3 in wann_main')
     allocate( cdq (num_wann, num_wann, num_kpts),stat=ierr ) 
     if (ierr/=0) call io_error('Error in allocating cdq in wann_main')
+
+    ! for MPI
+    allocate( counts(0:num_nodes-1), displs(0:num_nodes-1), stat=ierr )
+    if (ierr/=0) call io_error('Error in allocating counts and displs in wann_main')
+    call comms_array_split(num_kpts,counts,displs)
+    allocate( rnkb_loc (num_wann, nntot, counts(my_node_id)),stat=ierr    )     
+    if (ierr/=0) call io_error('Error in allocating rnkb_loc in wann_main')
+    allocate( ln_tmp_loc (num_wann, nntot, counts(my_node_id)), stat=ierr    )
+    if (ierr/=0) call io_error('Error in allocating ln_tmp_loc in wann_main')
+    allocate( u_matrix_loc (num_wann, num_wann, counts(my_node_id)),stat=ierr ) 
+    if (ierr/=0) call io_error('Error in allocating u_matrix_loc in wann_main')   
+    allocate( m_matrix_loc (num_wann, num_wann, nntot, counts(my_node_id)),stat=ierr ) 
+    if (ierr/=0) call io_error('Error in allocating m_matrix_loc in wann_main')
+    allocate( m_matrix_1b  (num_wann, num_wann, num_kpts),stat=ierr ) 
+    if (ierr/=0) call io_error('Error in allocating m_matrix_1b in wann_main')
+    allocate( m_matrix_1b_loc  (num_wann, num_wann, counts(my_node_id)),stat=ierr ) 
+    if (ierr/=0) call io_error('Error in allocating m_matrix_1b_loc in wann_main')
+    if(precond) then
+       allocate(cdodq_precond_loc(num_wann,num_wann,counts(my_node_id)),stat=ierr)
+       if (ierr/=0) call io_error('Error in allocating cdodq_precond_loc in wann_main')
+    end if
+    ! initialize local u and m matrices with global ones
+    do nkp_loc = 1, counts(my_node_id)
+       nkp = nkp_loc + displs(my_node_id)
+       m_matrix_loc (:,:,:, nkp_loc) = &
+           m_matrix (:,:,:, nkp)
+       u_matrix_loc (:,:, nkp_loc) = &
+           u_matrix (:,:, nkp)
+    end do
+
+    allocate( cdq_loc (num_wann, num_wann, counts(my_node_id)),stat=ierr ) 
+    if (ierr/=0) call io_error('Error in allocating cdq_loc in wann_main')
+    allocate( cdodq_loc (num_wann, num_wann, counts(my_node_id)),stat=ierr ) 
+    if (ierr/=0) call io_error('Error in allocating cdodq_loc in wann_main')
+    allocate( cdqkeep_loc (num_wann, num_wann, counts(my_node_id)),stat=ierr  )
+    if (ierr/=0) call io_error('Error in allocating cdqkeep_loc in wann_main')
+    if(optimisation>0) then   ! m0_loc backs up M in memory; with optimisation<=0 M is paged to disk instead
+       allocate(  m0_loc (num_wann, num_wann, nntot, counts(my_node_id)),stat=ierr)
+       if (ierr/=0) call io_error('Error in allocating m0_loc in wann_main')
+    end if
+    allocate(  u0_loc (num_wann, num_wann, counts(my_node_id)),stat=ierr)
+    if (ierr/=0) call io_error('Error in allocating u0_loc in wann_main')
+
     allocate( cz (num_wann, num_wann),stat=ierr  )
     if (ierr/=0) call io_error('Error in allocating cz in wann_main')
     allocate( cmtmp (num_wann, num_wann),stat=ierr  )
     if (ierr/=0) call io_error('Error in allocating cmtmp in wann_main')
-    allocate( cdqkeep (num_wann, num_wann, num_kpts),stat=ierr  )
-    if (ierr/=0) call io_error('Error in allocating cdqkeep in wann_main')
     allocate(tmp_cdq(num_wann,num_wann),stat=ierr)
     if (ierr/=0) call io_error('Error in allocating tmp_cdq in wann_main')
     allocate( evals (num_wann),stat=ierr)
@@ -211,7 +278,7 @@ subroutine wann_main
 
 
     cwschur1=cmplx_0; cwschur2=cmplx_0; cwschur3=cmplx_0; cwschur4=cmplx_0
-    cdq=cmplx_0; cz=cmplx_0; cmtmp=cmplx_0; cdqkeep=cmplx_0
+    cdq=cmplx_0; cz=cmplx_0; cmtmp=cmplx_0; cdqkeep_loc=cmplx_0; cdq_loc=cmplx_0;! buff=cmplx_0;
     
     gcnorm1=0.0_dp; gcnorm0=0.0_dp
 
@@ -227,17 +294,18 @@ subroutine wann_main
 !       end if
     end if
 
-    write(stdout,*)
-    write(stdout,'(1x,a)') '*------------------------------- WANNIERISE ---------------------------------*'
-    write(stdout,'(1x,a)') '+--------------------------------------------------------------------+<-- CONV'
-    if (lenconfac.eq.1.0_dp) then
-       write(stdout,'(1x,a)') '| Iter  Delta Spread     RMS Gradient      Spread (Ang^2)      Time  |<-- CONV'
-    else
-       write(stdout,'(1x,a)') '| Iter  Delta Spread     RMS Gradient      Spread (Bohr^2)     Time  |<-- CONV'
+    if (on_root) then
+       write(stdout,*)
+       write(stdout,'(1x,a)') '*------------------------------- WANNIERISE ---------------------------------*'
+       write(stdout,'(1x,a)') '+--------------------------------------------------------------------+<-- CONV'
+       if (lenconfac.eq.1.0_dp) then
+          write(stdout,'(1x,a)') '| Iter  Delta Spread     RMS Gradient      Spread (Ang^2)      Time  |<-- CONV'
+       else
+          write(stdout,'(1x,a)') '| Iter  Delta Spread     RMS Gradient      Spread (Bohr^2)     Time  |<-- CONV'
+       endif
+       write(stdout,'(1x,a)') '+--------------------------------------------------------------------+<-- CONV'
+       write(stdout,*)
     endif
-    write(stdout,'(1x,a)') '+--------------------------------------------------------------------+<-- CONV'
-    write(stdout,*)
-
 
     irguide=0
     if (guiding_centres.and.(num_no_guide_iter.le.0)) then
@@ -263,21 +331,23 @@ subroutine wann_main
     old_spread%om_tot = 0.0_dp
 
     ! print initial state
-    write(stdout,'(1x,a78)') repeat('-',78) 
-    write(stdout,'(1x,a)') 'Initial State'
-    do iw=1,num_wann
-       write(stdout,1000) iw,(rave(ind,iw)*lenconfac,ind=1,3),&
-            (r2ave(iw) - rave2(iw))*lenconfac**2
-    end do
-    write(stdout,1001) (sum(rave(ind,:))*lenconfac,ind=1,3), (sum(r2ave)-sum(rave2))*lenconfac**2
-    write(stdout,*)
-    write(stdout,'(1x,i6,2x,E12.3,2x,F15.10,2x,F18.10,3x,F8.2,2x,a)') &
-         iter,(wann_spread%om_tot-old_spread%om_tot)*lenconfac**2,sqrt(abs(gcnorm1))*lenconfac,&
-         wann_spread%om_tot*lenconfac**2,io_time(),'<-- CONV'
-    write(stdout,'(8x,a,F15.7,a,F15.7,a,F15.7,a)') &
-         'O_D=',wann_spread%om_d*lenconfac**2,' O_OD=',wann_spread%om_od*lenconfac**2,&
-         ' O_TOT=',wann_spread%om_tot*lenconfac**2,' <-- SPRD'
-    write(stdout,'(1x,a78)') repeat('-',78) 
+    if (on_root) then
+       write(stdout,'(1x,a78)') repeat('-',78) 
+       write(stdout,'(1x,a)') 'Initial State'
+       do iw=1,num_wann
+          write(stdout,1000) iw,(rave(ind,iw)*lenconfac,ind=1,3),&
+               (r2ave(iw) - rave2(iw))*lenconfac**2
+       end do
+       write(stdout,1001) (sum(rave(ind,:))*lenconfac,ind=1,3), (sum(r2ave)-sum(rave2))*lenconfac**2
+       write(stdout,*)
+       write(stdout,'(1x,i6,2x,E12.3,2x,F15.10,2x,F18.10,3x,F8.2,2x,a)') &
+            iter,(wann_spread%om_tot-old_spread%om_tot)*lenconfac**2,sqrt(abs(gcnorm1))*lenconfac,&
+            wann_spread%om_tot*lenconfac**2,io_wallclocktime(),'<-- CONV'
+       write(stdout,'(8x,a,F15.7,a,F15.7,a,F15.7,a)') &
+            'O_D=',wann_spread%om_d*lenconfac**2,' O_OD=',wann_spread%om_od*lenconfac**2,&
+            ' O_TOT=',wann_spread%om_tot*lenconfac**2,' <-- SPRD'
+       write(stdout,'(1x,a78)') repeat('-',78) 
+    endif
 
     lconverged=.false. ; lfirst=.true. ; lrandom=.false.
     conv_count=0 ; noise_count=0
@@ -287,6 +357,7 @@ subroutine wann_main
        open(unit=page_unit,status='scratch',form='unformatted')
     endif
 
+
     ! main iteration loop
     do iter=1,num_iter
 
@@ -297,7 +368,7 @@ subroutine wann_main
        ldump=.false.
        if ( (num_dump_cycles.gt.0) .and. (mod(iter,num_dump_cycles).eq.0) ) ldump=.true.
 
-       if(lprint) write(stdout,'(1x,a,i6)') 'Cycle: ',iter
+       if(lprint.and.on_root) write(stdout,'(1x,a,i6)') 'Cycle: ',iter
 
        if ( guiding_centres.and.(iter.gt.num_no_guide_iter) & 
             .and.(mod(iter,num_guide_cycles).eq.0) ) then
@@ -306,9 +377,15 @@ subroutine wann_main
        endif
 
        ! calculate gradient of omega
-       call wann_domega(csheet,sheet,rave,cdodq)
+       
+       if (lsitesymmetry.or.precond) then
+          call wann_domega(csheet,sheet,rave,cdodq) 
+       else
+          call wann_domega(csheet,sheet,rave)!,cdodq)  fills only cdodq_loc
+       endif
 
-       if ( lprint .and. iprint>2 ) &
+
+       if ( lprint .and. iprint>2 .and. on_root) &
             write(stdout,*) ' LINE --> Iteration                     :',iter
 
        ! calculate search direction (cdq)
@@ -316,7 +393,7 @@ subroutine wann_main
        if (lsitesymmetry) call sitesym_symmetrize_gradient(2,cdq) !RS:
 
        ! save search direction 
-       cdqkeep(:,:,:) = cdq(:,:,:)
+       cdqkeep_loc(:,:,:) = cdq_loc(:,:,:)
 
        ! check whether we're doing fixed step lengths
        if (lfixstep) then
@@ -327,16 +404,16 @@ subroutine wann_main
        else
 
           ! take trial step
-          cdq(:,:,:)=cdqkeep(:,:,:)*( trial_step / (4.0_dp*wbtot) ) 
+          cdq_loc(:,:,:)=cdqkeep_loc(:,:,:)*( trial_step / (4.0_dp*wbtot) ) 
           
           ! store original U and M before rotating
-          u0=u_matrix 
+          u0_loc=u_matrix_loc
 
           if(optimisation<=0) then
              write(page_unit)   m_matrix
              rewind(page_unit)
           else
-             m0=m_matrix
+             m0_loc=m_matrix_loc
           endif
 
           ! update U and M
@@ -351,7 +428,7 @@ subroutine wann_main
        endif
 
        ! print line search information
-       if ( lprint .and. iprint>2 ) then
+       if ( lprint .and. iprint>2 .and. on_root) then
           write(stdout,*) ' LINE --> Spread at initial point       :',wann_spread%om_tot*lenconfac**2
           if (.not.lfixstep) &
                write(stdout,*) ' LINE --> Spread at trial step          :',trial_spread%om_tot*lenconfac**2
@@ -374,22 +451,22 @@ subroutine wann_main
        if (lfixstep.or.lquad) then
 
           ! take optimal step
-          cdq(:,:,:) = cdqkeep(:,:,:) * ( alphamin / (4.0_dp*wbtot) ) 
+          cdq_loc(:,:,:) = cdqkeep_loc(:,:,:) * ( alphamin / (4.0_dp*wbtot) ) 
           
           ! if doing a line search then restore original U and M before rotating 
           if (.not.lfixstep) then 
-             u_matrix=u0
+             u_matrix_loc=u0_loc
              if(optimisation<=0) then
                 read(page_unit)  m_matrix
                 rewind(page_unit)
              else
-                m_matrix=m0
+                m_matrix_loc=m0_loc
              endif
           endif
 
           ! update U and M
           call internal_new_u_and_m()
-          
+
           call wann_spread_copy(wann_spread,old_spread)
           
           ! calculate the new centers and spread
@@ -405,7 +482,7 @@ subroutine wann_main
  
 
        ! print the new centers and spreads
-       if(lprint) then
+       if(lprint .and. on_root) then
           do iw=1,num_wann
              write(stdout,1000) iw,(rave(ind,iw)*lenconfac,ind=1,3),&
                   (r2ave(iw) - rave2(iw))*lenconfac**2
@@ -416,7 +493,7 @@ subroutine wann_main
           write(stdout,'(1x,i6,2x,E12.3,2x,F15.10,2x,F18.10,3x,F8.2,2x,a)') &
                iter,(wann_spread%om_tot-old_spread%om_tot)*lenconfac**2,&
                sqrt(abs(gcnorm1))*lenconfac,&
-               wann_spread%om_tot*lenconfac**2,io_time(),'<-- CONV'
+               wann_spread%om_tot*lenconfac**2,io_wallclocktime(),'<-- CONV'
           write(stdout,'(8x,a,F15.7,a,F15.7,a,F15.7,a)') &
                'O_D=',wann_spread%om_d*lenconfac**2,&
                ' O_OD=',wann_spread%om_od*lenconfac**2,&
@@ -436,7 +513,7 @@ subroutine wann_main
        omega_total = wann_spread%om_tot
        omega_tilde = wann_spread%om_d + wann_spread%om_od
 
-       if (ldump) call param_write_chkpt('postdis')
+       if (ldump.and.on_root) call param_write_chkpt('postdis')
 
        if (conv_window.gt.1) call internal_test_convergence()
 
@@ -451,38 +528,55 @@ subroutine wann_main
     enddo
     ! end of the minimization loop
 
-
-    write(stdout,'(1x,a)') 'Final State'
-    do iw=1,num_wann
-       write(stdout,1000) iw,(rave(ind,iw)*lenconfac,ind=1,3),&
-            (r2ave(iw) - rave2(iw))*lenconfac**2
-    end do
-    write(stdout,1001) (sum(rave(ind,:))*lenconfac,ind=1,3),&
-         (sum(r2ave)-sum(rave2))*lenconfac**2
-    write(stdout,*)
-    write(stdout,'(3x,a21,a,f15.9)') '     Spreads ('//trim(length_unit)//'^2)',&
-         '       Omega I      = ',wann_spread%om_i*lenconfac**2
-    write(stdout,'(3x,a,f15.9)') '     ================       Omega D      = ',&
-         wann_spread%om_d*lenconfac**2
-    write(stdout,'(3x,a,f15.9)') '                            Omega OD     = ',&
-         wann_spread%om_od*lenconfac**2
-    write(stdout,'(3x,a21,a,f15.9)') 'Final Spread ('//trim(length_unit)//'^2)',&
-         '       Omega Total  = ',wann_spread%om_tot*lenconfac**2  
-    write(stdout,'(1x,a78)') repeat('-',78) 
+    ! m_matrix_loc is gathered into m_matrix_1b and broadcast one nn slice at a time to avoid huge arrays
+    do nn = 1, nntot
+      m_matrix_1b_loc=m_matrix_loc(:,:,nn,:)
+      call comms_gatherv(m_matrix_1b_loc(1,1,1),num_wann*num_wann*counts(my_node_id),&
+                 m_matrix_1b(1,1,1),num_wann*num_wann*counts,num_wann*num_wann*displs)
+      call comms_bcast(m_matrix_1b(1,1,1),num_wann*num_wann*num_kpts)
+      m_matrix(:,:,nn,:)=m_matrix_1b(:,:,:)
+    end do!nn
+     
+    ! send u matrix
+    call comms_gatherv(u_matrix_loc(1,1,1),num_wann*num_wann*counts(my_node_id),&
+               u_matrix(1,1,1),num_wann*num_wann*counts,num_wann*num_wann*displs)
+    call comms_bcast(u_matrix(1,1,1),num_wann*num_wann*num_kpts)    
+
+    if (on_root) then
+       write(stdout,'(1x,a)') 'Final State'
+       do iw=1,num_wann
+          write(stdout,1000) iw,(rave(ind,iw)*lenconfac,ind=1,3),&
+               (r2ave(iw) - rave2(iw))*lenconfac**2
+       end do
+       write(stdout,1001) (sum(rave(ind,:))*lenconfac,ind=1,3),&
+            (sum(r2ave)-sum(rave2))*lenconfac**2
+       write(stdout,*)
+       write(stdout,'(3x,a21,a,f15.9)') '     Spreads ('//trim(length_unit)//'^2)',&
+            '       Omega I      = ',wann_spread%om_i*lenconfac**2
+       write(stdout,'(3x,a,f15.9)') '     ================       Omega D      = ',&
+            wann_spread%om_d*lenconfac**2
+       write(stdout,'(3x,a,f15.9)') '                            Omega OD     = ',&
+            wann_spread%om_od*lenconfac**2
+       write(stdout,'(3x,a21,a,f15.9)') 'Final Spread ('//trim(length_unit)//'^2)',&
+            '       Omega Total  = ',wann_spread%om_tot*lenconfac**2  
+       write(stdout,'(1x,a78)') repeat('-',78) 
+    endif
 
     if (write_xyz) call wann_write_xyz()
 
     if(write_hr_diag) then
        call hamiltonian_setup()
        call hamiltonian_get_hr()
-       write(stdout,*)
-       write(stdout,'(1x,a)') 'On-site Hamiltonian matrix elements'
-       write(stdout,'(3x,a)') '  n        <0n|H|0n> (eV)'
-       write(stdout,'(3x,a)') '-------------------------'
-       do i=1,num_wann
-          write(stdout,'(3x,i3,5x,f12.6)') i,real(ham_r(i,i,rpt_origin),kind=dp)
-       enddo
-       write(stdout,*)
+       if (on_root) then
+          write(stdout,*)
+          write(stdout,'(1x,a)') 'On-site Hamiltonian matrix elements'
+          write(stdout,'(3x,a)') '  n        <0n|H|0n> (eV)'
+          write(stdout,'(3x,a)') '-------------------------'
+          do i=1,num_wann
+             write(stdout,'(3x,i3,5x,f12.6)') i,real(ham_r(i,i,rpt_origin),kind=dp)
+          enddo
+          write(stdout,*)
+       endif
     endif
 
     if (guiding_centres) call wann_phases(csheet,sheet,rguide,irguide)
@@ -514,14 +608,33 @@ subroutine wann_main
     if (ierr/=0) call io_error('Error in deallocating evals in wann_main')
     deallocate(tmp_cdq,stat=ierr)
     if (ierr/=0) call io_error('Error in deallocating tmp_cdq in wann_main')
-    deallocate(cdqkeep,stat=ierr)
-    if (ierr/=0) call io_error('Error in deallocating cdqkeep in wann_main')
     deallocate(cmtmp,stat=ierr)
     if (ierr/=0) call io_error('Error in deallocating cmtmp in wann_main')
     deallocate(cz,stat=ierr)
     if (ierr/=0) call io_error('Error in deallocating cz in wann_main')
     deallocate(cdq,stat=ierr)
     if (ierr/=0) call io_error('Error in deallocating cdq in wann_main')
+
+    ! for MPI
+    deallocate( ln_tmp_loc , stat=ierr  )
+    if (ierr/=0) call io_error('Error in deallocating ln_tmp_loc in wann_main')
+    deallocate( rnkb_loc,stat=ierr  )
+    if (ierr/=0) call io_error('Error in deallocating rnkb_loc in wann_main')
+    deallocate(u_matrix_loc,stat=ierr)
+    if (ierr/=0) call io_error('Error in deallocating u_matrix_loc in wann_main')
+    deallocate(m_matrix_loc,stat=ierr)
+    if (ierr/=0) call io_error('Error in deallocating m_matrix_loc in wann_main')
+    deallocate(m_matrix_1b,stat=ierr)
+    if (ierr/=0) call io_error('Error in deallocating m_matrix_1b in wann_main')
+    deallocate(m_matrix_1b_loc,stat=ierr)
+    if (ierr/=0) call io_error('Error in deallocating m_matrix_1b_loc in wann_main')
+    deallocate(cdq_loc,stat=ierr)
+    if (ierr/=0) call io_error('Error in deallocating cdq_loc in wann_main')
+    deallocate(cdodq_loc,stat=ierr)
+    if (ierr/=0) call io_error('Error in deallocating cdodq_loc in wann_main')
+    deallocate(cdqkeep_loc,stat=ierr)
+    if (ierr/=0) call io_error('Error in deallocating cdqkeep_loc in wann_main')
+
     deallocate(cwschur3,stat=ierr)
     if (ierr/=0) call io_error('Error in deallocating cwschur3 in wann_main')
     deallocate(cwschur1,stat=ierr)
@@ -535,6 +648,8 @@ subroutine wann_main
        if (ierr/=0) call io_error('Error in deallocating cdodq_r in wann_main')
        deallocate(cdodq_precond,stat=ierr)
        if (ierr/=0) call io_error('Error in deallocating cdodq_precond in wann_main')
+       deallocate(cdodq_precond_loc,stat=ierr)
+       if (ierr/=0) call io_error('Error in deallocating cdodq_precond_loc in wann_main')
     end if
 
     ! deallocate sub vars passed into other subs
@@ -557,17 +672,17 @@ subroutine wann_main
     deallocate( rnkb,stat=ierr  )
     if (ierr/=0) call io_error('Error in deallocating rnkb in wann_main')
 
-    deallocate(u0, stat=ierr)
-    if (ierr/=0) call io_error('Error in deallocating u0 in wann_main')
+    deallocate(u0_loc, stat=ierr)
+    if (ierr/=0) call io_error('Error in deallocating u0_loc in wann_main')
     if(optimisation>0) then
-       deallocate(m0, stat=ierr)
-       if (ierr/=0) call io_error('Error in deallocating m0 in wann_main')
+       deallocate(m0_loc, stat=ierr)
+       if (ierr/=0) call io_error('Error in deallocating m0_loc in wann_main')
     end if
 
     deallocate(history,stat=ierr)
     if (ierr/=0) call io_error('Error deallocating history in wann_main')
 
-    if (timing_level>0) call io_stopwatch('wann: main',2)
+    if (timing_level>0.and.on_root) call io_stopwatch('wann: main',2)
 
     return
 
@@ -679,7 +794,7 @@ subroutine internal_random_noise()
       ! cdq is a num_wann x num_wann x num_kpts anti-hermitian array
       ! to which we add a random anti-hermitian matrix 
 
-      do ikp=1,num_kpts
+      do ikp=1,counts(my_node_id)
          do iw=1,num_wann
             call random_seed()
             call random_number(noise_real(:,iw))
@@ -697,7 +812,7 @@ subroutine internal_random_noise()
             enddo
          enddo
          ! Add noise to search direction
-         cdq(:,:,ikp) = cdq(:,:,ikp) + conv_noise_amp * cnoise(:,:)
+         cdq_loc(:,:,ikp) = cdq_loc(:,:,ikp) + conv_noise_amp * cnoise(:,:)
       enddo
 
       ! Deallocate
@@ -729,13 +844,18 @@ subroutine internal_search_direction()
 
       complex(kind=dp) :: zdotc
 
-      if (timing_level>1) call io_stopwatch('wann: main: search_direction',1)
+      if (timing_level>1.and.on_root) call io_stopwatch('wann: main: search_direction',1)
+
+      ! gcnorm1 = Tr[gradient . gradient] -- NB gradient is anti-Hermitian      
+      ! gcnorm1 = real(zdotc(num_kpts*num_wann*num_wann,cdodq,1,cdodq,1),dp)
 
       if (precond) then
          ! compute cdodq_precond
          
          cdodq_r(:,:,:) = 0 ! intermediary gradient in R space
          cdodq_precond(:,:,:) = 0
+         cdodq_precond_loc(:,:,:) = 0
+!         cdodq_precond(:,:,:) = complx_0
 
          ! convert to real space in cdodq_r
          ! Two algorithms: either double loop or GEMM. GEMM is much more efficient but requires more RAM
@@ -786,14 +906,18 @@ subroutine internal_search_direction()
                enddo
             enddo
          end if
+         cdodq_precond_loc(:,:,1:counts(my_node_id))=cdodq_precond(:,:,1+displs(my_node_id):displs(my_node_id)+counts(my_node_id))
+
       end if
 
       ! gcnorm1 = Tr[gradient . gradient] -- NB gradient is anti-Hermitian
       if(precond) then
-         gcnorm1 = real(zdotc(num_kpts*num_wann*num_wann,cdodq_precond,1,cdodq,1),dp)
+!         gcnorm1 = real(zdotc(num_kpts*num_wann*num_wann,cdodq_precond,1,cdodq,1),dp)
+         gcnorm1 = real(zdotc(counts(my_node_id)*num_wann*num_wann,cdodq_precond_loc,1,cdodq_loc,1),dp)
       else
-         gcnorm1 = real(zdotc(num_kpts*num_wann*num_wann,cdodq,1,cdodq,1),dp)
+         gcnorm1 = real(zdotc(counts(my_node_id)*num_wann*num_wann,cdodq_loc,1,cdodq_loc,1),dp)
       end if
+      call comms_allreduce(gcnorm1,1,'SUM')
 
       ! calculate cg_coefficient
       if ( (iter.eq.1) .or. (ncg.ge.num_cg_steps) ) then
@@ -804,7 +928,7 @@ subroutine internal_search_direction()
             gcfac = gcnorm1/gcnorm0     ! Fletcher-Reeves CG coefficient
             ! prevent CG coefficient from getting too large
             if (gcfac.gt.3.0_dp) then
-               if ( lprint .and. iprint>2 ) &
+               if ( lprint .and. iprint>2 .and. on_root) &
                     write(stdout,*) ' LINE --> CG coeff too large. Resetting :',gcfac
                gcfac = 0.0_dp
                ncg = 0
@@ -821,48 +945,56 @@ subroutine internal_search_direction()
       gcnorm0 = gcnorm1
 
       ! calculate search direction
+
       if(precond) then
-         cdq(:,:,:) = cdodq_precond(:,:,:) + cdqkeep(:,:,:) * gcfac
+         cdq_loc(:,:,:) = cdodq_precond_loc(:,:,:) + cdqkeep_loc(:,:,:) * gcfac !! JRY not MPI
       else
-         cdq(:,:,:) = cdodq(:,:,:) + cdqkeep(:,:,:) * gcfac
+        cdq_loc(:,:,:) = cdodq_loc(:,:,:) + cdqkeep_loc(:,:,:) * gcfac   
       end if
 
+
       ! add some random noise to search direction, if required
       if (lrandom) then
-         write(stdout,'(a,i3,a,i3,a)') &
+         if (on_root) write(stdout,'(a,i3,a,i3,a)') &
               ' [ Adding random noise to search direction. Time ',noise_count,' / ',conv_noise_num,' ]'
          call internal_random_noise()
       endif
       ! calculate gradient along search direction - Tr[gradient . search direction]
       ! NB gradient is anti-hermitian
-      doda0 = -real(zdotc(num_kpts*num_wann*num_wann,cdodq,1,cdq,1),dp)
+      doda0 = -real(zdotc(counts(my_node_id)*num_wann*num_wann,cdodq_loc,1,cdq_loc,1),dp)
+
+      call comms_allreduce(doda0,1,'SUM')
+
       doda0 = doda0 / (4.0_dp*wbtot)
 
       ! check search direction is not uphill
       if (doda0.gt.0.0_dp) then
          ! if doing a CG step then reset CG
          if (ncg.gt.0) then
-            if ( lprint .and. iprint>2 ) &
+            if ( lprint .and. iprint>2 .and. on_root) &
                  write(stdout,*) ' LINE --> Search direction uphill: resetting CG'
-            cdq(:,:,:) = cdodq(:,:,:)
+            cdq_loc(:,:,:) = cdodq_loc(:,:,:)
             if (lrandom) call internal_random_noise()
             ncg = 0
             gcfac = 0.0_dp
             ! re-calculate gradient along search direction
-            doda0 = -real(zdotc(num_kpts*num_wann*num_wann,cdodq,1,cdq,1),dp)
+            doda0 = -real(zdotc(counts(my_node_id)*num_wann*num_wann,cdodq_loc,1,cdq_loc,1),dp)
+
+            call comms_allreduce(doda0,1,'SUM')
+
             doda0 = doda0 / (4.0_dp*wbtot)
             ! if search direction still uphill then reverse search direction
             if (doda0.gt.0.0_dp) then
-               if ( lprint .and. iprint>2 ) &
+               if ( lprint .and. iprint>2 .and. on_root) &
                     write(stdout,*) ' LINE --> Search direction still uphill: reversing'
-               cdq(:,:,:) = -cdq(:,:,:)
+               cdq_loc(:,:,:) = -cdq_loc(:,:,:)
                doda0 = -doda0
             endif
             ! if doing a SD step then reverse search direction
          else
-            if ( lprint .and. iprint>2 ) &
+            if ( lprint .and. iprint>2 .and.on_root ) &
                  write(stdout,*) ' LINE --> Search direction uphill: reversing'
-            cdq(:,:,:) = -cdq(:,:,:)
+            cdq_loc(:,:,:) = -cdq_loc(:,:,:)
             doda0 = -doda0
          endif
       endif
@@ -870,7 +1002,7 @@ subroutine internal_search_direction()
  !~     ! calculate search direction
  !~     cdq(:,:,:) = cdodq(:,:,:) + cdqkeep(:,:,:) * gcfac
 
-      if (timing_level>1) call io_stopwatch('wann: main: search_direction',2)
+      if (timing_level>1.and.on_root) call io_stopwatch('wann: main: search_direction',2)
 
       lrandom=.false.
 
@@ -892,7 +1024,7 @@ subroutine internal_optimal_step()
 
       real(kind=dp) :: fac,shift,eqa,eqb
 
-      if (timing_level>1) call io_stopwatch('wann: main: optimal_step',1)
+      if (timing_level>1.and.on_root) call io_stopwatch('wann: main: optimal_step',1)
 
       fac = trial_spread%om_tot - wann_spread%om_tot
       if ( abs(fac) .gt. tiny(1.0_dp) ) then
@@ -910,7 +1042,7 @@ subroutine internal_optimal_step()
          falphamin = wann_spread%om_tot &
               - 0.25_dp * eqb * eqb / (fac * eqa) * (trial_step**2)
       else
-         if ( lprint .and. iprint>2 ) write(stdout,*) &
+         if ( lprint .and. iprint>2 .and. on_root ) write(stdout,*) &
               ' LINE --> Parabolic line search unstable: using trial step'
          lquad=.false.
          alphamin  = trial_step
@@ -918,14 +1050,14 @@ subroutine internal_optimal_step()
       endif
 
       if (doda0*alphamin.gt.0.0_dp) then
-         if ( lprint .and. iprint>2 ) write(stdout,*) &
+         if ( lprint .and. iprint>2 .and. on_root ) write(stdout,*) &
               ' LINE --> Line search unstable : using trial step'
          lquad=.false.
          alphamin=trial_step
          falphamin=trial_spread%om_tot
       endif
 
-      if (timing_level>1) call io_stopwatch('wann: main: optimal_step',2)
+      if (timing_level>1.and.on_root) call io_stopwatch('wann: main: optimal_step',2)
 
       return
 
@@ -944,30 +1076,31 @@ subroutine internal_new_u_and_m()
 
       implicit none
 
-      integer :: nkp,nn,nkp2,nsdim
+      integer :: nkp,nn,nkp2,nsdim,nkp_loc
       logical :: ltmp
 
-      if (timing_level>1) call io_stopwatch('wann: main: u_and_m',1)
+      if (timing_level>1.and.on_root) call io_stopwatch('wann: main: u_and_m',1)
 
-      do nkp=1,num_kpts
+      do nkp_loc = 1, counts(my_node_id)
+         nkp = nkp_loc + displs(my_node_id)
          if (lsitesymmetry) then                !YN: RS:
             if (ir2ik(ik2ir(nkp)).ne.nkp) cycle !YN: RS:
          end if                                 !YN: RS:
          ! cdq(nkp) is anti-Hermitian; tmp_cdq = i*cdq  is Hermitian
-         tmp_cdq(:,:) = cmplx_i * cdq(:,:,nkp)
+         tmp_cdq(:,:) = cmplx_i * cdq_loc(:,:,nkp_loc)
          ! Hermitian matrix eigen-solver
          call zheev('V','U',num_wann,tmp_cdq,num_wann,evals,cwork,4*num_wann,rwork,info)
          if (info.ne.0) then  
-            write(stdout,*) &
+            if (on_root) write(stdout,*) &
                  'wann_main: ZHEEV in internal_new_u_and_m failed, info= ',info
-            write(stdout,*) '           trying Schur decomposition instead'
-!~            call io_error('wann_main: problem in ZHEEV in internal_new_u_and_m') 
-            tmp_cdq(:,:) = cdq(:,:,nkp)
+            if (on_root) write(stdout,*) '           trying Schur decomposition instead'
+!!$            call io_error('wann_main: problem in ZHEEV in internal_new_u_and_m') 
+            tmp_cdq(:,:) = cdq_loc(:,:,nkp_loc)
             call zgees ('V', 'N', ltmp, num_wann, tmp_cdq, num_wann, nsdim, &
                  cwschur1, cz, num_wann, cwschur2, 10 * num_wann, cwschur3, &
                  cwschur4, info)
             if (info.ne.0) then  
-               write(stdout,*) 'wann_main: SCHUR failed, info= ', info  
+               if (on_root) write(stdout,*) 'wann_main: SCHUR failed, info= ', info  
                call io_error('wann_main: problem computing schur form 1') 
             endif
             do i=1,num_wann
@@ -975,50 +1108,63 @@ subroutine internal_new_u_and_m()
             enddo
             ! cmtmp   = tmp_cdq . cz^{dagger}
             call utility_zgemm(cmtmp,tmp_cdq,'N',cz,'C',num_wann)
-            cdq(:,:,nkp)=cmtmp(:,:)
+            cdq_loc(:,:,nkp_loc)=cmtmp(:,:)
          else
             do i=1,num_wann
                cmtmp(:,i) = tmp_cdq(:,i) * exp(-cmplx_i * evals(i))
             enddo
             ! cdq(nkp)   = cmtmp . tmp_cdq^{dagger}
-            call utility_zgemm(cdq(:,:,nkp),cmtmp,'N',tmp_cdq,'C',num_wann)
+            call utility_zgemm(cdq_loc(:,:,nkp_loc),cmtmp,'N',tmp_cdq,'C',num_wann)
          endif
       enddo
 
-!~      do nkp = 1, num_kpts  
-!~         tmp_cdq(:,:) = cdq(:,:,nkp)
-!~         call zgees ('V', 'N', ltmp, num_wann, tmp_cdq, num_wann, nsdim, &
-!~              cwschur1, cz, num_wann, cwschur2, 10 * num_wann, cwschur3, &
-!~              cwschur4, info)
-!~         if (info.ne.0) then  
-!~            write(stdout,*) 'SCHUR: ', info  
-!~            call io_error('wann_main: problem computing schur form 1') 
-!~         endif
-!~         do i=1,num_wann
-!~            tmp_cdq(:,i) = cz(:,i) * exp(cwschur1(i))
-!~         enddo
-!~         ! cmtmp   = tmp_cdq . cz^{dagger}
-!~         call utility_zgemm(cmtmp,tmp_cdq,'N',cz,'C',num_wann)
-!~         cdq(:,:,nkp)=cmtmp(:,:)
-!~      enddo
+      ! each process communicates its result to other processes
+      ! it would be enough to copy only next neighbors
+      call comms_gatherv(cdq_loc(1,1,1),num_wann*num_wann*counts(my_node_id),&
+                 cdq(1,1,1),num_wann*num_wann*counts,num_wann*num_wann*displs)
+      call comms_bcast(cdq(1,1,1),num_wann*num_wann*num_kpts)   
+
+
+!!$      do nkp = 1, num_kpts  
+!!$         tmp_cdq(:,:) = cdq(:,:,nkp)
+!!$         call zgees ('V', 'N', ltmp, num_wann, tmp_cdq, num_wann, nsdim, &
+!!$              cwschur1, cz, num_wann, cwschur2, 10 * num_wann, cwschur3, &
+!!$              cwschur4, info)
+!!$         if (info.ne.0) then  
+!!$            write(stdout,*) 'SCHUR: ', info  
+!!$            call io_error('wann_main: problem computing schur form 1') 
+!!$         endif
+!!$         do i=1,num_wann
+!!$            tmp_cdq(:,i) = cz(:,i) * exp(cwschur1(i))
+!!$         enddo
+!!$         ! cmtmp   = tmp_cdq . cz^{dagger}
+!!$         call utility_zgemm(cmtmp,tmp_cdq,'N',cz,'C',num_wann)
+!!$         cdq(:,:,nkp)=cmtmp(:,:)
+!!$      enddo
+
+      if (lsitesymmetry) then
+         call sitesym_symmetrize_rotation(cdq) !RS: calculate cdq(Rk) from k
+         cdq_loc(:,:,1:counts(my_node_id))=cdq(:,:,1+displs(my_node_id):displs(my_node_id)+counts(my_node_id))
+      endif
 
-      if (lsitesymmetry) call sitesym_symmetrize_rotation(cdq) !RS: calculate cdq(Rk) from k
       ! the orbitals are rotated
-      do nkp=1,num_kpts
+      do nkp_loc = 1, counts(my_node_id)
+         nkp = nkp_loc + displs(my_node_id)   
          ! cmtmp = U(k) . cdq(k)
-         call utility_zgemm(cmtmp,u_matrix(:,:,nkp),'N',cdq(:,:,nkp),'N',num_wann)
-         u_matrix(:,:,nkp)=cmtmp(:,:)
+         call utility_zgemm(cmtmp,u_matrix_loc(:,:,nkp_loc),'N',cdq_loc(:,:,nkp_loc),'N',num_wann)
+         u_matrix_loc(:,:,nkp_loc)=cmtmp(:,:)
       enddo
 
       ! and the M_ij are updated
-      do nkp = 1, num_kpts  
+      do nkp_loc = 1, counts(my_node_id)
+         nkp = nkp_loc + displs(my_node_id)  
          do nn = 1, nntot  
             nkp2 = nnlist (nkp, nn)  
             ! tmp_cdq = cdq^{dagger} . M
-            call utility_zgemm(tmp_cdq,cdq(:,:,nkp),'C',m_matrix(:,:,nn,nkp),'N',num_wann)
+            call utility_zgemm(tmp_cdq,cdq(:,:,nkp),'C',m_matrix_loc(:,:,nn,nkp_loc),'N',num_wann)
             ! cmtmp = tmp_cdq . cdq
             call utility_zgemm(cmtmp,tmp_cdq,'N',cdq(:,:,nkp2),'N',num_wann)
-            m_matrix(:,:,nn,nkp) = cmtmp(:,:)
+            m_matrix_loc(:,:,nn,nkp_loc) = cmtmp(:,:)
          enddo
       enddo
 
@@ -1250,10 +1396,10 @@ subroutine wann_phases (csheet, sheet, rguide, irguide, m_w)
     real(kind=dp)    :: smat(3,3),svec(3),sinv(3,3)  
     real(kind=dp)    :: xx0,det,brn
     complex(kind=dp) :: csumt
-    integer :: loop_wann,na,nkp,i,j,nn,ind,m
+    integer :: loop_wann,na,nkp,i,j,nn,ind,m,nkp_loc
 
 
-    if (timing_level>1) call io_stopwatch('wann: phases',1)
+    if (timing_level>1.and.on_root) call io_stopwatch('wann: phases',1)
 
 
     csum=cmplx_0; xx=0.0_dp
@@ -1268,9 +1414,10 @@ subroutine wann_phases (csheet, sheet, rguide, irguide, m_w)
           ! get average phase for each unique bk direction
           do na = 1, nnh  
              csum (na) = cmplx_0
-             do nkp = 1, num_kpts  
+             do nkp_loc = 1, counts(my_node_id)
+                nkp = nkp_loc + displs(my_node_id)
                 nn = neigh (nkp, na)  
-                csum (na) = csum (na) + m_matrix (loop_wann, loop_wann, nn, nkp)  
+                csum (na) = csum (na) + m_matrix_loc (loop_wann, loop_wann, nn, nkp_loc)  
              enddo
           enddo
 
@@ -1278,7 +1425,8 @@ subroutine wann_phases (csheet, sheet, rguide, irguide, m_w)
        
           do na = 1, nnh  
              csum (na) = cmplx_0
-             do nkp = 1, num_kpts  
+             do nkp_loc = 1, counts(my_node_id)
+                nkp = nkp_loc + displs(my_node_id)
                 nn = neigh (nkp, na)  
                 csum (na) = csum (na) &
                 + cmplx(m_w(loop_wann,loop_wann,2*nn-1),m_w(loop_wann,loop_wann,2*nn),dp)  
@@ -1286,6 +1434,8 @@ subroutine wann_phases (csheet, sheet, rguide, irguide, m_w)
           enddo
 
        end if
+
+       call comms_allreduce(csum(1),nnh,'SUM')
           
        ! now analyze that information to get good guess at
        ! wannier center
@@ -1415,7 +1565,7 @@ subroutine wann_phases (csheet, sheet, rguide, irguide, m_w)
 !       enddo
 !    enddo
 
-    if (timing_level>1) call io_stopwatch('wann: phases',2)
+    if (timing_level>1.and.on_root) call io_stopwatch('wann: phases',2)
 
     return  
 
@@ -1446,17 +1596,18 @@ subroutine wann_omega(csheet,sheet,rave,r2ave,rave2,wann_spread)
     !local variables
     real(kind=dp) :: summ,mnn2
     real(kind=dp) :: brn
-    integer :: ind,nkp,nn,m,n,iw
+    integer :: ind,nkp,nn,m,n,iw,nkp_loc
 
-    if (timing_level>1) call io_stopwatch('wann: omega',1)
+    if (timing_level>1.and.on_root) call io_stopwatch('wann: omega',1)
 
 
-    do nkp = 1, num_kpts
+    do nkp_loc = 1, counts(my_node_id)
+       nkp = nkp_loc + displs(my_node_id)
        do nn = 1, nntot
           do n = 1, num_wann
              ! Note that this ln_tmp is defined differently wrt the one in wann_domega
-             ln_tmp(n,nn,nkp)=( aimag(log(csheet(n,nn,nkp) &
-                     * m_matrix(n,n,nn,nkp))) - sheet(n,nn,nkp) )
+             ln_tmp_loc(n,nn,nkp_loc)=( aimag(log(csheet(n,nn,nkp) &
+                     * m_matrix_loc(n,n,nn,nkp_loc))) - sheet(n,nn,nkp) )
           end do
       end do
     end do
@@ -1465,14 +1616,18 @@ subroutine wann_omega(csheet,sheet,rave,r2ave,rave2,wann_spread)
     rave  = 0.0_dp
     do iw = 1, num_wann  
        do ind = 1, 3  
-          do nkp = 1, num_kpts  
+         do nkp_loc = 1, counts(my_node_id)
+             nkp = nkp_loc + displs(my_node_id)
              do nn = 1, nntot  
                 rave(ind,iw) = rave(ind,iw) + wb(nn) * bk(ind,nn,nkp) &
-                      *ln_tmp(iw,nn,nkp)
+                      *ln_tmp_loc(iw,nn,nkp_loc)
              enddo
           enddo
        enddo
     enddo
+
+    call comms_allreduce(rave(1,1),num_wann*3,'SUM')
+ 
     rave = -rave/real(num_kpts,dp)
 
     rave2 = 0.0_dp
@@ -1490,13 +1645,17 @@ subroutine wann_omega(csheet,sheet,rave,r2ave,rave2,wann_spread)
 
     r2ave = 0.0_dp
     do iw = 1, num_wann  
-       do nkp = 1, num_kpts  
+       do nkp_loc = 1, counts(my_node_id)
+          nkp = nkp_loc + displs(my_node_id)
           do nn = 1, nntot  
-             mnn2 = real(m_matrix(iw,iw,nn,nkp)*conjg(m_matrix(iw,iw,nn,nkp)),kind=dp)
-             r2ave(iw) = r2ave(iw) + wb(nn) * ( 1.0_dp - mnn2 + ln_tmp(iw,nn,nkp)**2 )
+             mnn2 = real(m_matrix_loc(iw,iw,nn,nkp_loc)*conjg(m_matrix_loc(iw,iw,nn,nkp_loc)),kind=dp)
+             r2ave(iw) = r2ave(iw) + wb(nn) * ( 1.0_dp - mnn2 + ln_tmp_loc(iw,nn,nkp_loc)**2 )
           enddo
        enddo
     enddo
+
+    call comms_allreduce(r2ave(1),num_wann,'SUM')
+
     r2ave = r2ave/real(num_kpts,dp)
 
 !~    wann_spread%om_1 = 0.0_dp  
@@ -1555,19 +1714,23 @@ subroutine wann_omega(csheet,sheet,rave,r2ave,rave2,wann_spread)
     ! on subsequent passes it may be set to omega_invariant
     if (first_pass) then
        wann_spread%om_i = 0.0_dp  
-       do nkp = 1, num_kpts  
+       do nkp_loc = 1, counts(my_node_id)
+          nkp = nkp_loc + displs(my_node_id)  ! offset index, set inside the loop as in the other loops (unused here)
           do nn = 1, nntot  
              summ = 0.0_dp  
              do m = 1, num_wann  
                 do n = 1, num_wann  
                    summ = summ &
-                        + real(m_matrix(n,m,nn,nkp)*conjg(m_matrix(n,m,nn,nkp)),kind=dp)
+                        + real(m_matrix_loc(n,m,nn,nkp_loc)*conjg(m_matrix_loc(n,m,nn,nkp_loc)),kind=dp)
                 enddo
              enddo
              wann_spread%om_i = wann_spread%om_i &
                   + wb(nn) * (real(num_wann,dp) - summ)
           enddo
        enddo
+
+       call comms_allreduce(wann_spread%om_i,1,'SUM')
+
        wann_spread%om_i = wann_spread%om_i / real(num_kpts,dp)
        first_pass=.false.
     else 
@@ -1575,35 +1738,43 @@ subroutine wann_omega(csheet,sheet,rave,r2ave,rave2,wann_spread)
     endif
 
     wann_spread%om_od = 0.0_dp  
-    do nkp = 1, num_kpts  
+    do nkp_loc = 1, counts(my_node_id)
+       nkp = nkp_loc + displs(my_node_id)
        do nn = 1, nntot  
           do m = 1, num_wann  
              do n = 1, num_wann  
                 if (m.ne.n) wann_spread%om_od = wann_spread%om_od &
-                     + wb(nn) * real( m_matrix(n,m,nn,nkp) &
-                     * conjg(m_matrix(n,m,nn,nkp)), kind=dp )
+                     + wb(nn) * real( m_matrix_loc(n,m,nn,nkp_loc) &
+                     * conjg(m_matrix_loc(n,m,nn,nkp_loc)), kind=dp )
              enddo
           enddo
        enddo
     enddo
+
+    call comms_allreduce(wann_spread%om_od,1,'SUM')
+
     wann_spread%om_od = wann_spread%om_od / real(num_kpts,dp)  
 
 
     wann_spread%om_d = 0.0_dp  
-    do nkp = 1, num_kpts  
+    do nkp_loc = 1, counts(my_node_id)
+       nkp = nkp_loc + displs(my_node_id)
        do nn = 1, nntot  
           do n = 1, num_wann  
              brn = sum(bk(:,nn,nkp)*rave(:,n))
              wann_spread%om_d = wann_spread%om_d + wb(nn) &
-                  * ( ln_tmp(n,nn,nkp) + brn)**2
+                  * ( ln_tmp_loc(n,nn,nkp_loc) + brn)**2
           enddo
        enddo
     enddo
+
+    call comms_allreduce(wann_spread%om_d,1,'SUM')
+
     wann_spread%om_d = wann_spread%om_d / real(num_kpts,dp)  
 
     wann_spread%om_tot = wann_spread%om_i + wann_spread%om_d + wann_spread%om_od
 
-    if (timing_level>1) call io_stopwatch('wann: omega',2)
+    if (timing_level>1.and.on_root) call io_stopwatch('wann: omega',2)
 
     return  
 
@@ -1628,17 +1799,19 @@ subroutine wann_domega(csheet,sheet,rave,cdodq)
     complex(kind=dp), intent(in)  :: csheet (:,:,:)    
     real(kind=dp),    intent(in)  :: sheet (:,:,:)     
     real(kind=dp),    intent(out) :: rave (:,:)        
-    complex(kind=dp), intent(out) :: cdodq (:,:,:)     
+    ! as we work on the local cdodq, returning the full cdodq array is now
+    ! made optional
+    complex(kind=dp), intent(out), optional :: cdodq (:,:,:)     
 
     !local
     complex(kind=dp), allocatable  :: cr (:,:)   
     complex(kind=dp), allocatable  :: crt (:,:)  
 
     ! local
-    integer :: iw,ind,nkp,nn,m,n,ierr
+    integer :: iw,ind,nkp,nn,m,n,ierr,nkp_loc
     complex(kind=dp) :: mnn
 
-    if (timing_level>1) call io_stopwatch('wann: domega',1)
+    if (timing_level>1.and.on_root) call io_stopwatch('wann: domega',1)
 
     allocate(  cr (num_wann, num_wann),stat=ierr ) 
     if (ierr/=0) call io_error('Error in allocating cr in wann_domega')
@@ -1646,73 +1819,99 @@ subroutine wann_domega(csheet,sheet,rave,cdodq)
     if (ierr/=0) call io_error('Error in allocating crt in wann_domega')
 
 
-    do nkp = 1, num_kpts
+    do nkp_loc = 1, counts(my_node_id)
+       nkp = nkp_loc + displs(my_node_id)
        do nn = 1, nntot
           do n = 1, num_wann
              ! Note that this ln_tmp is defined differently wrt the one in wann_omega
-             ln_tmp(n,nn,nkp)=wb(nn)*( aimag(log(csheet(n,nn,nkp) &
-                     * m_matrix(n,n,nn,nkp))) - sheet(n,nn,nkp) )
+             ln_tmp_loc(n,nn,nkp_loc)=wb(nn)*( aimag(log(csheet(n,nn,nkp) &
+                     * m_matrix_loc(n,n,nn,nkp_loc))) - sheet(n,nn,nkp) )
           end do
       end do
     end do
 
+
+
     ! recalculate rave
     rave = 0.0_dp
     do iw = 1, num_wann  
        do ind = 1, 3  
-          do nkp = 1, num_kpts  
+          do nkp_loc = 1, counts(my_node_id)
+             nkp = nkp_loc + displs(my_node_id)
              do nn = 1, nntot  
                 rave(ind,iw) = rave(ind,iw) +  bk(ind,nn,nkp) &
-                     * ln_tmp(iw,nn,nkp)
+                     * ln_tmp_loc(iw,nn,nkp_loc)
              enddo
           enddo
        enddo
     enddo
     rave = -rave/real(num_kpts,dp)
 
+    call comms_allreduce(rave(1,1),num_wann*3,'SUM')
+
     ! R_mn=M_mn/M_nn and q_m^{k,b} = Im phi_m^{k,b} + b.r_n are calculated
     rnkb = 0.0_dp
-    do nkp=1,num_kpts
+    rnkb_loc = 0.0_dp
+    do nkp_loc = 1, counts(my_node_id)
+       nkp = nkp_loc + displs(my_node_id)
        do nn=1,nntot
           do n=1,num_wann
-             rnkb(n,nn,nkp) = sum(bk(:,nn,nkp)*rave(:,n))
+             rnkb_loc(n,nn,nkp_loc) = sum(bk(:,nn,nkp)*rave(:,n))
           enddo
        enddo
     enddo
 
     ! cd0dq(m,n,nkp) is calculated
-    cdodq=cmplx_0
-    do nkp = 1, num_kpts  
+    cdodq_loc=cmplx_0
+    do nkp_loc = 1, counts(my_node_id)
+       nkp = nkp_loc + displs(my_node_id)
        do nn = 1, nntot  
           do n=1,num_wann
-             mnn = m_matrix(n,n,nn,nkp)
-             crt(:,n) = m_matrix(:,n,nn,nkp) / mnn
-             cr(:,n)  = m_matrix(:,n,nn,nkp) * conjg(mnn)
+             mnn = m_matrix_loc(n,n,nn,nkp_loc)
+             crt(:,n) = m_matrix_loc(:,n,nn,nkp_loc) / mnn
+             cr(:,n)  = m_matrix_loc(:,n,nn,nkp_loc) * conjg(mnn)
           enddo
 
           do n = 1, num_wann  
              do m = 1, num_wann  
                 ! A[R^{k,b}]=(R-Rdag)/2
-                cdodq(m,n,nkp) = cdodq(m,n,nkp) &
+                cdodq_loc(m,n,nkp_loc) = cdodq_loc(m,n,nkp_loc) &
                      + wb(nn) * 0.5_dp &
                      *( cr(m,n) - conjg(cr(n,m)) )
                 ! -S[T^{k,b}]=-(T+Tdag)/2i ; T_mn = Rt_mn q_n
-                cdodq(m,n,nkp) = cdodq(m,n,nkp) -  &
-                      ( crt(m,n) * ln_tmp(n,nn,nkp)  &
-                     + conjg( crt(n,m) * ln_tmp(m,nn,nkp) ) ) &
+                cdodq_loc(m,n,nkp_loc) = cdodq_loc(m,n,nkp_loc) -  &
+                      ( crt(m,n) * ln_tmp_loc(n,nn,nkp_loc)  &
+                     + conjg( crt(n,m) * ln_tmp_loc(m,nn,nkp_loc) ) ) &
                      * cmplx(0.0_dp,-0.5_dp,kind=dp)
-                cdodq(m,n,nkp) = cdodq(m,n,nkp) - wb(nn) &
-                     * ( crt(m,n) * rnkb(n,nn,nkp) + conjg(crt(n,m) &
-                     * rnkb(m,nn,nkp)) ) * cmplx(0.0_dp,-0.5_dp,kind=dp)
+                cdodq_loc(m,n,nkp_loc) = cdodq_loc(m,n,nkp_loc) - wb(nn) &
+                     * ( crt(m,n) * rnkb_loc(n,nn,nkp_loc) + conjg(crt(n,m) &
+                     * rnkb_loc(m,nn,nkp_loc)) ) * cmplx(0.0_dp,-0.5_dp,kind=dp)
              enddo
           enddo
        enddo
     enddo
-    cdodq = cdodq / real(num_kpts,dp) * 4.0_dp
+    cdodq_loc = cdodq_loc / real(num_kpts,dp) * 4.0_dp
+
+    if(present(cdodq)) then
+       ! each process communicates its result to other processes
+       call comms_gatherv(cdodq_loc(1,1,1),num_wann*num_wann*counts(my_node_id),&
+            cdodq(1,1,1),num_wann*num_wann*counts,num_wann*num_wann*displs)
+       call comms_bcast(cdodq(1,1,1),num_wann*num_wann*num_kpts)   
+       if (lsitesymmetry) then
+          call sitesym_symmetrize_gradient(1,cdodq) !RS:
+          cdodq_loc(:,:,1:counts(my_node_id))=cdodq(:,:,displs(my_node_id)+1:displs(my_node_id)+counts(my_node_id))
+       endif
+    end if
+
+
+    deallocate( cr, stat=ierr ) 
+    if (ierr/=0) call io_error('Error in deallocating cr in wann_domega')
+    deallocate( crt, stat=ierr ) 
+    if (ierr/=0) call io_error('Error in deallocating crt in wann_domega')
 
-    if (lsitesymmetry) call sitesym_symmetrize_gradient(1,cdodq) !RS:
 
-    if (timing_level>1) call io_stopwatch('wann: domega',2)
+
+    if (timing_level>1.and.on_root) call io_stopwatch('wann: domega',2)
 
     return  
 
@@ -1762,14 +1961,16 @@ subroutine wann_calc_projection()
     integer :: nw,nb,nkp,counter
     real(kind=dp) :: summ
 
-    if (timing_level>1) call io_stopwatch('wann: calc_projection',1)
+    if (timing_level>1.and.on_root) call io_stopwatch('wann: calc_projection',1)
 
-    write(stdout,'(/1x,a78)') repeat('-',78)
-    write(stdout,'(1x,9x,a)') &
-         'Projection of Bands in Outer Window on all Wannier Functions'
-    write(stdout,'(1x,8x,62a)') repeat('-',62)
-    write(stdout,'(1x,16x,a)') '   Kpt  Band      Eigval      |Projection|^2'
-    write(stdout,'(1x,16x,a47)') repeat('-',47) 
+    if (on_root) then
+       write(stdout,'(/1x,a78)') repeat('-',78)
+       write(stdout,'(1x,9x,a)') &
+            'Projection of Bands in Outer Window on all Wannier Functions'
+       write(stdout,'(1x,8x,62a)') repeat('-',62)
+       write(stdout,'(1x,16x,a)') '   Kpt  Band      Eigval      |Projection|^2'
+       write(stdout,'(1x,16x,a47)') repeat('-',47) 
+    endif
 
     do nkp=1,num_kpts
        counter=0
@@ -1780,14 +1981,14 @@ subroutine wann_calc_projection()
              do nw=1,num_wann
                 summ=summ+abs(u_matrix_opt(counter,nw,nkp))**2
              enddo
-             write(stdout,'(1x,16x,i5,1x,i5,1x,f14.6,2x,f14.8)') &
+             if (on_root) write(stdout,'(1x,16x,i5,1x,i5,1x,f14.6,2x,f14.8)') &
                   nkp,nb,eigval(nb,nkp),summ
           endif
        enddo
     enddo
-    write(stdout,'(1x,a78/)') repeat('-',78)
+    if (on_root) write(stdout,'(1x,a78/)') repeat('-',78)
 
-    if (timing_level>1) call io_stopwatch('wann: calc_projection',2)
+    if (timing_level>1.and.on_root) call io_stopwatch('wann: calc_projection',2)
 
     return
 
@@ -2022,7 +2223,7 @@ subroutine wann_check_unitarity()
     integer :: nkp,i,j,m
     complex(kind=dp) :: ctmp1,ctmp2
 
-    if (timing_level>1) call io_stopwatch('wann: check_unitarity',1)
+    if (timing_level>1.and.on_root) call io_stopwatch('wann: check_unitarity',1)
 
     do nkp = 1, num_kpts  
        do i = 1, num_wann  
@@ -2035,23 +2236,23 @@ subroutine wann_check_unitarity()
              enddo
              if ( (i.eq.j) .and. (abs (ctmp1 - cmplx_1 ) .gt. eps5) ) &
                   then
-                write ( stdout , * ) ' ERROR: unitariety of final U', nkp, i, j, &
+                if (on_root) write ( stdout , * ) ' ERROR: unitariety of final U', nkp, i, j, &
                      ctmp1
                 call io_error('wann_check_unitarity: error 1')  
              endif
              if ( (i.eq.j) .and. (abs (ctmp2 - cmplx_1 ) .gt. eps5) ) &
                   then
-                write ( stdout , * ) ' ERROR: unitariety of final U', nkp, i, j, &
+                if (on_root) write ( stdout , * ) ' ERROR: unitariety of final U', nkp, i, j, &
                      ctmp2
                 call io_error('wann_check_unitarity: error 2')  
              endif
              if ( (i.ne.j) .and. (abs (ctmp1) .gt. eps5) ) then  
-                write ( stdout , * ) ' ERROR: unitariety of final U', nkp, i, j, &
+                if (on_root) write ( stdout , * ) ' ERROR: unitariety of final U', nkp, i, j, &
                      ctmp1
                 call io_error('wann_check_unitarity: error 3')  
              endif
              if ( (i.ne.j) .and. (abs (ctmp2) .gt. eps5) ) then  
-                write ( stdout , * ) ' ERROR: unitariety of final U', nkp, i, j, &
+                if (on_root) write ( stdout , * ) ' ERROR: unitariety of final U', nkp, i, j, &
                      ctmp2
                 call io_error('wann_check_unitarity: error 4')  
              endif
@@ -2059,7 +2260,7 @@ subroutine wann_check_unitarity()
        enddo
     enddo
 
-    if (timing_level>1) call io_stopwatch('wann: check_unitarity',2)
+    if (timing_level>1.and.on_root) call io_stopwatch('wann: check_unitarity',2)
 
     return
 
@@ -2136,7 +2337,7 @@ subroutine wann_svd_omega_i()
     integer :: nkp,nn,nb,na,ind
     real(kind=dp) :: omt1,omt2,omt3
 
-    if (timing_level>1) call io_stopwatch('wann: svd_omega_i',1)
+    if (timing_level>1.and.on_root) call io_stopwatch('wann: svd_omega_i',1)
 
     allocate( cw1 (10 * num_wann),stat=ierr  )
     if (ierr/=0) call io_error('Error in allocating cw1 in wann_svd_omega_i')
@@ -2181,13 +2382,15 @@ subroutine wann_svd_omega_i()
     omt1 = omt1 / real(num_kpts,dp)  
     omt2 = omt2 / real(num_kpts,dp)  
     omt3 = omt3 / real(num_kpts,dp)  
-    write ( stdout , * ) ' '  
-    write(stdout,'(2x,a,f15.9,1x,a)') 'Omega Invariant:   1-s^2 = ',&
-         omt1*lenconfac**2,'('//trim(length_unit)//'^2)'
-    write(stdout,'(2x,a,f15.9,1x,a)') '                 -2log s = ',&
-         omt2*lenconfac**2,'('//trim(length_unit)//'^2)'
-    write(stdout,'(2x,a,f15.9,1x,a)') '                  acos^2 = ',&
-         omt3*lenconfac**2,'('//trim(length_unit)//'^2)'
+    if (on_root) then
+       write ( stdout , * ) ' '  
+       write(stdout,'(2x,a,f15.9,1x,a)') 'Omega Invariant:   1-s^2 = ',&
+            omt1*lenconfac**2,'('//trim(length_unit)//'^2)'
+       write(stdout,'(2x,a,f15.9,1x,a)') '                 -2log s = ',&
+            omt2*lenconfac**2,'('//trim(length_unit)//'^2)'
+       write(stdout,'(2x,a,f15.9,1x,a)') '                  acos^2 = ',&
+            omt3*lenconfac**2,'('//trim(length_unit)//'^2)'
+    endif
 
     deallocate(cpad1,stat=ierr)
     if (ierr/=0) call io_error('Error in deallocating cpad1 in wann_svd_omega_i')
@@ -2202,7 +2405,7 @@ subroutine wann_svd_omega_i()
     deallocate(cw1,stat=ierr)
     if (ierr/=0) call io_error('Error in deallocating cw1 in wann_svd_omega_i')
 
-    if (timing_level>1) call io_stopwatch('wann: svd_omega_i',2)
+    if (timing_level>1.and.on_root) call io_stopwatch('wann: svd_omega_i',2)
 
     return
     
@@ -2419,7 +2622,7 @@ subroutine wann_main_gamma
        ldump=.false.
        if ( (num_dump_cycles.gt.0) .and. (mod(iter,num_dump_cycles).eq.0) ) ldump=.true.
 
-       if(lprint) write(stdout,'(1x,a,i6)') 'Cycle: ',iter
+       if(lprint.and.on_root) write(stdout,'(1x,a,i6)') 'Cycle: ',iter
 
 !~       ! initialize rguide as rave for use_bloch_phases
 !~       if ( (iter.gt.num_no_guide_iter) .and. lguide ) then 
diff --git a/test-suite/Makefile b/test-suite/Makefile
index 05a476fe6..2a519b14b 100644
--- a/test-suite/Makefile
+++ b/test-suite/Makefile
@@ -50,7 +50,7 @@ run-tests-wannier-serial : clean prolog
 	        env QE_USE_MPI=0 ${TESTCODE_DIR}/bin/testcode.py --verbose --category=wannier_all
 
 run-tests-wannier-parallel : clean prolog 
-	        env QE_USE_MPI=1 ${TESTCODE_DIR}/bin/testcode.py --verbose --category=wannier_all
+	        env QE_USE_MPI=1 ${TESTCODE_DIR}/bin/testcode.py --verbose --category=wannier_par
 
 run-custom-test-serial : clean prolog 
 	@if test -d $(testdir); then \
diff --git a/test-suite/config/extract/extract-wannier.x b/test-suite/config/extract/extract-wannier.x
index bff4d9c5f..756d29aab 100755
--- a/test-suite/config/extract/extract-wannier.x
+++ b/test-suite/config/extract/extract-wannier.x
@@ -34,9 +34,9 @@ omegaT=`grep "  Omega Total " $fname | awk '{print $7}'`
 # Wannier -pp
 nearn=`sed -n '/ Distance (Ang^-1)/{n;n;p;n;p;n;p;n;p;n;p;n;p;n;p;n;p;n;p;n;p;n;p;n;p;}'\
  $fname | awk '{print $2}; {print $3}; {print $4}'`
-compl=`sed -n '/ b_k(x) /{n;n;p;n;p;n;p;n;p;n;p;n;p;}' \
+compl=`sed -n '/ Completeness relation is fully satisfied /{n;n;n;n;n;n;p;n;p;n;p;n;p;n;p;n;p;}'\
  $fname | awk '{print $2; print $3; print $4; print $5; print $6}'`
-
+# Previous anchor for compl was: sed -n '/ b_k(x) /{n;n;p;n;p;n;p;n;p;n;p;n;p;}'
 
 
 proji=`sed -n '/ Projections/{n;p;n;p;n;p;n;p;}'\
diff --git a/test-suite/jobconfig b/test-suite/jobconfig
index 49d4d5ecd..311b32058 100644
--- a/test-suite/jobconfig
+++ b/test-suite/jobconfig
@@ -118,6 +118,7 @@ inputs_args = ('gaas1.win', '1'),('gaas2.win', '1')
 
 [categories]
 wannier_all = example*??  test_* wan_*
+wannier_par = example*?? wan_l* test_b* test_n* test_p*
 interface_all = pw_example*??
 test_all = example*??  pw_example*?? test_* wan_*
 _default_ =  example*?? test* wan_*
diff --git a/test-suite/run_tests_travis.sh b/test-suite/run_tests_travis.sh
index bfb594c4a..bbc893482 100755
--- a/test-suite/run_tests_travis.sh
+++ b/test-suite/run_tests_travis.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+set -e
+
 ## Set here, if needed, the location of the executables
 THISDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 export ESPRESSO_ROOT="${THISDIR}/external-codes/espresso/"
@@ -16,6 +18,10 @@ elif [ "$W90TESTSWITHINTERFACE" == "false" ]
 then
     # Only wannier tests
     make run-tests
+    if [ "$W90BINARYPARALLEL" == "true" ]
+    then
+      make run-tests-parallel
+    fi
 else
     # By default: run both
     make run-tests-all
diff --git a/test-suite/travis_copy_make.inc.sh b/test-suite/travis_copy_make.inc.sh
new file mode 100755
index 000000000..723e02385
--- /dev/null
+++ b/test-suite/travis_copy_make.inc.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Install the make.inc used by the Travis CI build: copies
+# config/make.inc.gfort.traviscimpi when W90BINARYPARALLEL is "true",
+# and config/make.inc.gfort.travisci otherwise.
+
+# Stop the full script if one line crashes
+set -e
+
+# Resolve the repository root from this script's own location (it lives in
+# test-suite/), so the copy works whatever the caller's working directory is.
+REPO_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )"
+
+if [ "$W90BINARYPARALLEL" == "true" ]
+then
+  cp "${REPO_ROOT}/config/make.inc.gfort.traviscimpi" "${REPO_ROOT}/make.inc"
+else
+  cp "${REPO_ROOT}/config/make.inc.gfort.travisci" "${REPO_ROOT}/make.inc"
+fi