Merge pull request #1170 from ye-luo/delayed-update-merge

Delayed update on CPU. It would be nice to show the old algorithm is actually preserved but this is difficult without a test of the original algorithm. But it doesn't break the CI so I'm merging this. Improvements to DiracDeterminant's API are certainly an issue for the future.
QMCPACK · Nov 28, 2018 · 90b9f83 · 90b9f83
2 parents 4e650e0 + 95ec0ee
commit 90b9f83
Show file tree

Hide file tree

Showing 30 changed files with 947 additions and 315 deletions.
diff --git a/manual/bibliography.bib b/manual/bibliography.bib
@@ -30,6 +30,23 @@ @article{CeperleyAlderPRL1980
 	url = {http://dx.doi.org/10.1103/PhysRevLett.45.566}
 }
 
+
+@article{Fahy1990,
+abstract = {A new method of calculating total energies of solids and atoms using nonlocal pseudopotentials in conjunction with the variational quantum Monte Carlo approach is presented in detail. The many-electron wave function is of the form of a Jastrow exponential factor multiplying a Slater determinant. By using pseudopotentials, the large fluctuations of the energies in the core region of the atoms which occur in quantum Monte Carlo all-electron calculations are avoided. The method is applied to calculate the binding energy and structural properties of diamond, graphite, and silicon. The results are in excellent agreement with experiment. Excellent results are also obtained for the electron affinities and ionization potentials of the carbon and silicon atoms.},
+author = {Fahy, S. and Wang, X. W. and Louie, Steven G.},
+doi = {10.1103/PhysRevB.42.3503},
+isbn = {038794527X},
+OPTissn = {01631829},
+journal = {Physical Review B},
+mendeley-groups = {QMC-methods/Update},
+number = {6},
+pages = {3503--3522},
+title = {{Variational quantum Monte Carlo nonlocal pseudopotential approach to solids: Formulation and application to diamond, graphite, and silicon}},
+volume = {42},
+year = {1990}
+}
+
+
 @article{Drummond2004,
   author = {Drummond, N. D. and Towler, M. D. and Needs, R. J.},
   doi = {10.1103/PhysRevB.70.235119},
@@ -585,4 +602,27 @@ @article{Luo2018hyb
   year={2018},
 }
 
+@Article{McDaniel2017,
+  author    = {T. McDaniel and E. F. D'Azevedo and Y. W. Li and K. Wong and P. R. C. Kent},
+  title     = {Delayed Slater determinant update algorithms for high efficiency quantum Monte Carlo},
+  journal   = {The Journal of Chemical Physics},
+  year      = {2017},
+  volume    = {147},
+  number    = {17},
+  pages     = {174107},
+  month     = {nov},
+  doi       = {10.1063/1.4998616},
+  publisher = {{AIP} Publishing},
+}
+
+@article{Luo2018delayedupdate,
+  author={Ye Luo and Jeongnim Kim},
+  title={An highly efficient delayed update algorithm for evaluating Slater determinants in quantum Monte Carlo},
+  journal={in preparation},
+  volume={},
+  number={},
+  pages={},
+  year={2018},
+}
+
 @Comment{jabref-meta: databaseType:bibtex;}
diff --git a/manual/qmcpack_manual.tex b/manual/qmcpack_manual.tex
@@ -178,6 +178,7 @@ \chapter{Specifying the system to be simulated}
 
 \chapter{Trial wavefunction specification}
 \input{intro_wavefunction}
+\input{singledeterminant}
 \input{spo}
 \input{jastrow}
 \input{multideterminants}

diff --git a/manual/qmcpack_papers.bib b/manual/qmcpack_papers.bib
@@ -1430,6 +1430,17 @@ @article{Luo2018hyb
   year={2018},
 }
 
+@article{Luo2018delayedupdate,
+  author={Ye Luo and Jeongnim Kim},
+  title={An highly efficient delayed update algorithm for evaluating Slater determinants in quantum Monte Carlo},
+  journal={in preparation},
+  volume={},
+  number={},
+  pages={},
+  year={2018},
+}
+
+
 @Comment{jabref-meta: databaseType:bibtex;}
 
 @Comment{jabref-meta: saveOrderConfig:specified;year;false;month;false;author;false;}
diff --git a/manual/singledeterminant.tex b/manual/singledeterminant.tex
@@ -0,0 +1,53 @@
+\section{Single determinant wavefunctions}
+\label{sec:singledeterminant}
+Placing a single determinant for each spin is the most used ansatz for the antisymmetric part of a trial wavefunction.
+The input xml block for \texttt{slaterdeterminant} is given in Listing~\ref{listing:singledet}. A list of options is given in
+Table~\ref{table:singledet}.
+
+\begin{table}[h]
+\begin{center}
+\begin{tabularx}{\textwidth}{l l l l l l }
+\hline
+\multicolumn{6}{l}{\texttt{slaterdeterminant} element} \\
+\hline
+\multicolumn{2}{l}{parent elements:} & \multicolumn{4}{l}{\texttt{determinantset}}\\
+\multicolumn{2}{l}{child  elements:} & \multicolumn{4}{l}{\texttt{determinant}}\\
+\multicolumn{2}{l}{attribute      :} & \multicolumn{4}{l}{}\\
+   &   \bfseries name       & \bfseries datatype & \bfseries values & \bfseries default & \bfseries description \\
+   &   \texttt{delay\_rank} &  integer           &  $>0$            & 1           &  The number of delayed updates. \\
+   &   \texttt{optimize}    &  text              &  yes/no          & yes         &  Enable orbital optimization. \\
+  \hline
+\end{tabularx}
+\end{center}
+\caption{Options for the \texttt{slaterdeterminant} xml-block.}
+\label{table:singledet}
+\end{table}
+
+\begin{lstlisting}[caption=slaterdeterminant set XML element.\label{listing:singledet}]
+  <slaterdeterminant delay_rank="32">
+    <determinant id="updet" size="208">
+      <occupation mode="ground" spindataset="0">
+      </occupation>
+    </determinant>
+    <determinant id="downdet" size="208">
+      <occupation mode="ground" spindataset="0">
+      </occupation>
+    </determinant>
+  </slaterdeterminant>
+\end{lstlisting}
+
+Additional information:
+\begin{itemize}
+\item \texttt{delay\_rank}. This option enables delayed updates of the Slater matrix inverse when particle-by-particle moves are used.
+By default, \texttt{delay\_rank=1} uses Fahy's variant~\cite{Fahy1990} of the Sherman-Morrison rank-1 update, which mostly uses memory-bandwidth-bound BLAS-2 calls.
+With \texttt{delay\_rank>1}, the delayed update algorithm~\cite{Luo2018delayedupdate,McDaniel2017} turns most of the computation into compute-bound BLAS-3 calls.
+Tuning this parameter is highly recommended to gain the best performance on medium to large problem sizes ($>200$ electrons).
+We have seen up to an order of magnitude speed-up on large problem sizes.
+When studying the performance of QMCPACK, a scan of this parameter is required, and we recommend starting from 32.
+The best \texttt{delay\_rank} giving the maximal speed-up depends on the problem size.
+Usually, a larger problem size calls for a larger \texttt{delay\_rank}.
+On CPUs, \texttt{delay\_rank} must be chosen as a multiple of the SIMD vector length for good performance of BLAS libraries.
+The best \texttt{delay\_rank} also depends on the processor microarchitecture.
+The GPU support is currently under development.
+\end{itemize}
+
diff --git a/src/QMCDrivers/CorrelatedSampling/CSVMCUpdatePbyP.cpp b/src/QMCDrivers/CorrelatedSampling/CSVMCUpdatePbyP.cpp
@@ -106,6 +106,8 @@ void CSVMCUpdatePbyP::advanceWalker(Walker_t& thisWalker, bool recompute)
     {
       ++nAllRejected;
     }
+    for(int ipsi=0; ipsi<nPsi; ipsi++)
+      Psi1[ipsi]->completeUpdates();
   }
 //  myTimers[1]->stop();
 //  myTimers[2]->start();

diff --git a/src/QMCDrivers/DMC/DMCUpdatePbyPFast.cpp b/src/QMCDrivers/DMC/DMCUpdatePbyPFast.cpp
@@ -133,7 +133,7 @@ void DMCUpdatePbyPWithRejectionFast::advanceWalker(Walker_t& thisWalker, bool re
       }
     }
   }
-
+  Psi.completeUpdates();
   W.donePbyP();
   myTimers[DMC_movePbyP]->stop();
 

diff --git a/src/QMCDrivers/RMC/RMCUpdatePbyP.cpp b/src/QMCDrivers/RMC/RMCUpdatePbyP.cpp
@@ -211,6 +211,7 @@ namespace qmcplusplus
 	  }
       }
     myTimers[1]->stop ();
+    Psi.completeUpdates();
     W.donePbyP();
 
     if (nAcceptTemp > 0)
@@ -344,6 +345,7 @@ namespace qmcplusplus
 	  }
       }
     myTimers[1]->stop ();
+    Psi.completeUpdates();
     W.donePbyP();
 // In the rare case that all proposed moves fail, we bounce.
     if (nAcceptTemp == 0)

diff --git a/src/QMCDrivers/VMC/VMCUpdatePbyP.cpp b/src/QMCDrivers/VMC/VMCUpdatePbyP.cpp
@@ -117,6 +117,7 @@ void VMCUpdatePbyP::advanceWalker(Walker_t& thisWalker, bool recompute)
         }
       }
     }
+    Psi.completeUpdates();
   }
   W.donePbyP();
   myTimers[1]->stop();

diff --git a/src/QMCHamiltonians/NonLocalECPotential.cpp b/src/QMCHamiltonians/NonLocalECPotential.cpp
@@ -313,6 +313,10 @@ NonLocalECPotential::makeNonLocalMovesPbyP(ParticleSet& P)
       }
     }
   }
+
+  if(NonLocalMoveAccepted>0)
+    Psi.completeUpdates();
+
   return NonLocalMoveAccepted;
 }
 

diff --git a/src/QMCWaveFunctions/Fermion/DelayedUpdate.h b/src/QMCWaveFunctions/Fermion/DelayedUpdate.h
@@ -0,0 +1,209 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source License.
+// See LICENSE file in top directory for details.
+//
+// Copyright (c) 2017 QMCPACK developers.
+//
+// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//
+// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//////////////////////////////////////////////////////////////////////////////////////
+
+#ifndef QMCPLUSPLUS_DELAYED_UPDATE_H
+#define QMCPLUSPLUS_DELAYED_UPDATE_H
+
+#include "Numerics/Blasf.h"
+#include <OhmmsPETE/OhmmsVector.h>
+#include <OhmmsPETE/OhmmsMatrix.h>
+#include <simd/simd.hpp>
+
+namespace qmcplusplus {
+
+  template<typename T>
+    class DelayedUpdate // records accepted single-row changes in U/V/Binv and applies them to Ainv in one batch (see acceptRow/updateInvMat)
+    {
+      /// orbital values (psiV) of the delayed electrons, one row per accepted move
+      Matrix<T> U;
+      /// rows of Ainv corresponding to delayed electrons; row delay_count is also used as scratch by getInvRow
+      Matrix<T> V;
+      /// Matrix inverse of B, at maximum KxK where K is the delay passed to resize()
+      Matrix<T> Binv;
+      /// scratch space, used by updateInvMat when more than one move is pending
+      Matrix<T> tempMat;
+      /// temporary scratch vector (length norb) used by the rank-1 Sherman-Morrison branch of updateInvMat
+      Vector<T> temp;
+      /// new column of B; also used as scratch by getInvRow
+      Vector<T> p;
+      /// row indices (rowchanged) of the delayed electrons, in order of acceptance
+      std::vector<int> delay_list;
+      /// current number of delays, increase one for each acceptance, reset to 0 after updating Ainv
+      int delay_count;
+
+      /// pointer to the row of up-to-date Ainv; points either into Ainv itself or into V[delay_count]
+      const T* Ainv_row_ptr;
+      /// electron id of the up-to-date Ainv_row, checked by ratioGrad; -1 means no valid cached row
+      int Ainv_row_ind;
+      /// determinant ratio from the last ratio()/ratioGrad() call. NOTE(review): not initialized by the constructor
+      T curRatio;
+
+    public:
+      /// default constructor: no pending delays and no cached row
+      DelayedUpdate(): delay_count(0), Ainv_row_ptr(nullptr), Ainv_row_ind(-1) {}
+
+      /// resize the internal storage
+      /** resize the internal storage
+       * @param norb number of electrons/orbitals
+       * @param delay maximum delay, 0<delay<=norb
+       *
+       * NOTE(review): delay_count and the cached row are not reset here; presumably this is
+       * only called before any move has been accepted -- confirm against callers.
+       */
+      inline void resize(int norb, int delay)
+      {
+        V.resize(delay, norb);
+        U.resize(delay, norb);
+        p.resize(delay);
+        temp.resize(norb);
+        tempMat.resize(norb, delay);
+        Binv.resize(delay, delay);
+        delay_list.resize(delay);
+      }
+
+      /** compute the row of up-to-date Ainv
+       * @param Ainv inverse matrix
+       * @param rowchanged the row id corresponding to the proposed electron
+       *
+       * Records rowchanged in Ainv_row_ind and sets Ainv_row_ptr. With no pending delays the
+       * pointer refers directly to the row of Ainv. Otherwise the row is copied into the scratch
+       * row V[delay_count] and corrected there using U, Binv and the first delay_count rows of V;
+       * p and row delay_count of Binv are overwritten as intermediates.
+       * NOTE(review): acceptRow reads Binv[delay_count][i] before writing it, so it appears to
+       * depend on this function having been called for the same row first -- confirm.
+       */
+      inline void getInvRow(const Matrix<T>& Ainv, int rowchanged)
+      {
+        Ainv_row_ind = rowchanged;
+        if ( delay_count == 0 )
+        {
+          // Ainv is fresh, directly access Ainv
+          Ainv_row_ptr = Ainv[rowchanged];
+          return;
+        }
+        const T cone(1);
+        const T czero(0);
+        const T* AinvRow = Ainv[rowchanged];
+        const int norb = Ainv.rows();
+        const int lda_Binv = Binv.cols();
+        // copy the not-yet-updated Ainv row into the scratch row V[delay_count]
+        simd::copy_n(AinvRow, norb, V[delay_count]);
+        // multiply V (NxK) Binv(KxK) U(KxN) AinvRow right to the left, then subtract the product from the copied row
+        BLAS::gemv('T', norb, delay_count, cone, U.data(), norb, AinvRow, 1, czero, p.data(), 1);
+        BLAS::gemv('N', delay_count, delay_count, cone, Binv.data(), lda_Binv, p.data(), 1, czero, Binv[delay_count], 1);
+        BLAS::gemv('N', norb, delay_count, -cone, V.data(), norb, Binv[delay_count], 1, cone, V[delay_count], 1);
+        Ainv_row_ptr = V[delay_count];
+      }
+
+      /** compute determinant ratio of new determinant
+       * @param Ainv inverse matrix
+       * @param rowchanged the row id corresponding to the proposed electron
+       * @param psiV new orbital values
+       * @return dot product of the up-to-date Ainv row with psiV; also stored in curRatio
+       *
+       * Always refreshes the cached row through getInvRow().
+       */
+      template<typename VVT>
+      inline T ratio(const Matrix<T>& Ainv, int rowchanged, const VVT& psiV)
+      {
+        getInvRow(Ainv, rowchanged);
+        return curRatio = simd::dot(Ainv_row_ptr,psiV.data(),Ainv.cols());
+      }
+
+      /** compute the old gradient
+       * @param Ainv inverse matrix
+       * @param rowchanged the row id corresponding to the proposed electron
+       * @param dpsiV old orbital derivatives
+       * @return dot product of the up-to-date Ainv row with dpsiV
+       *
+       * Always refreshes the cached row through getInvRow(); curRatio is not modified.
+       */
+      template<typename GT>
+      inline GT evalGrad(const Matrix<T>& Ainv, int rowchanged, const GT* dpsiV)
+      {
+        getInvRow(Ainv, rowchanged);
+        return simd::dot(Ainv_row_ptr,dpsiV,Ainv.cols());
+      }
+
+      /** compute determinant ratio and gradients of new determinant
+       * @param Ainv inverse matrix
+       * @param rowchanged the row id corresponding to the proposed electron
+       * @param psiV new orbital values
+       * @param dpsiV new orbital derivatives
+       * @param g new gradients
+       * @return the determinant ratio, also stored in curRatio
+       *
+       * The cached row is reused when Ainv_row_ind already equals rowchanged.
+       */
+      template<typename VVT, typename GGT, typename GT>
+      inline T ratioGrad(const Matrix<T>& Ainv, int rowchanged, const VVT& psiV, const GGT& dpsiV, GT& g)
+      {
+        // recompute the row only if the cached one belongs to a different electron (or was invalidated)
+        if(Ainv_row_ind != rowchanged)
+          getInvRow(Ainv, rowchanged);
+        g = simd::dot(Ainv_row_ptr,dpsiV.data(),Ainv.cols());
+        return curRatio = simd::dot(Ainv_row_ptr,psiV.data(),Ainv.cols());
+      }
+
+      /** accept a move with the update delayed
+       * @param Ainv inverse matrix
+       * @param rowchanged the row id corresponding to the proposed electron
+       * @param psiV new orbital values
+       *
+       * Before delay_count reaches the maximum delay, only Binv is updated with a recursive algorithm.
+       * Once delay_count equals Binv.cols(), updateInvMat() is called and Ainv itself is updated.
+       * NOTE(review): row delay_count of Binv is read before it is written here, so this appears
+       * to require a preceding getInvRow()/ratio()/ratioGrad() call for the same row -- confirm.
+       */
+      template<typename VVT>
+      inline void acceptRow(Matrix<T>& Ainv, int rowchanged, const VVT& psiV)
+      {
+        // invalidate the cached row so the next ratioGrad() recomputes it
+        Ainv_row_ind = -1;
+
+        const T cminusone(-1);
+        const T czero(0);
+        const int norb = Ainv.rows();
+        const int lda_Binv = Binv.cols();
+        simd::copy_n(Ainv[rowchanged], norb, V[delay_count]); // overwrites the scratch row with the unmodified Ainv row
+        simd::copy_n(psiV.data(), norb, U[delay_count]);
+        delay_list[delay_count] = rowchanged;
+        // the new Binv is [[X Y] [Z x]]
+        BLAS::gemv('T', norb, delay_count+1, cminusone, V.data(), norb, psiV.data(), 1, czero, p.data(), 1);
+        // x: the new diagonal element, stored as a reciprocal
+        T y = -p[delay_count];
+        for(int i=0; i<delay_count; i++)
+          y += Binv[delay_count][i] * p[i];
+        Binv[delay_count][delay_count] = y = T(1) / y;
+        // Y: written into column delay_count of Binv (stride lda_Binv)
+        BLAS::gemv('T', delay_count, delay_count, y, Binv.data(), lda_Binv, p.data(), 1, czero, Binv.data()+delay_count, lda_Binv);
+        // X: rank-1 correction of the existing delay_count x delay_count block
+        BLAS::ger(delay_count, delay_count, cminusone, Binv[delay_count], 1, Binv.data()+delay_count, lda_Binv, Binv.data(), lda_Binv);
+        // Z: scale row delay_count of Binv in place
+        for(int i=0; i<delay_count; i++)
+          Binv[delay_count][i] *= -y;
+        delay_count++;
+        // update Ainv when maximal delay is reached
+        if(delay_count==lda_Binv) updateInvMat(Ainv);
+      }
+
+      /** update the full Ainv and reset delay_count
+       * @param Ainv inverse matrix
+       *
+       * Does nothing when no moves are pending. On completion delay_count is 0 and the
+       * cached row is invalidated. U is overwritten as scratch in the multi-delay branch.
+       */
+      inline void updateInvMat(Matrix<T>& Ainv)
+      {
+        if(delay_count==0) return;
+        // update the inverse matrix
+        const T cone(1);
+        const T czero(0);
+        const int norb=Ainv.rows();
+        if(delay_count==1)
+        {
+          // this is a special case invoking the Fahy's variant of Sherman-Morrison update.
+          // The vector temp (length norb) is used as the temporary array; tempMat is not touched here
+          BLAS::gemv('T', norb, norb, cone, Ainv.data(), norb, U[0], 1, czero, temp.data(), 1);
+          temp[delay_list[0]] -= cone;
+          BLAS::ger(norb,norb,-Binv[0][0],V[0],1,temp.data(),1,Ainv.data(),norb);
+        }
+        else
+        {
+          const int lda_Binv=Binv.cols();
+          BLAS::gemm('T', 'N', delay_count, norb, norb, cone, U.data(), norb, Ainv.data(), norb, czero, tempMat.data(), lda_Binv);
+          for(int i=0; i<delay_count; i++) tempMat(delay_list[i], i) -= cone;
+          BLAS::gemm('N', 'N', norb, delay_count, delay_count, cone, V.data(), norb, Binv.data(), lda_Binv, czero, U.data(), norb);
+          BLAS::gemm('N', 'N', norb, norb, delay_count, -cone, U.data(), norb, tempMat.data(), lda_Binv, cone, Ainv.data(), norb);
+        }
+        delay_count = 0;
+        Ainv_row_ind = -1;
+      }
+    };
+}
+
+#endif // QMCPLUSPLUS_DELAYED_UPDATE_H
+
-Original file line number
+Diff line change
@@ Expand Up @@
           }
         }
       }
+      Psi.completeUpdates();
       W.donePbyP();
       myTimers[DMC_movePbyP]->stop();
@@ Expand Down @@