Matrix update engines direct inversion #3470

Merged

Changes from 2 commits

Commits (23)
7d14b2c
integration of direct inversion code
PDoakORNL Sep 22, 2021
78253d2
attempted fix for checkLogandGL in mixed precision
PDoakORNL Sep 22, 2021
555d3d8
Merge branch 'develop' into matrix_update_engines_direct_inversion
PDoakORNL Sep 27, 2021
3a99e69
[NFC] Change mw_computeInvertAndLog to non template.
ye-luo Sep 28, 2021
56a7086
Rename mw_computeInvertAndLog_stride
ye-luo Sep 28, 2021
60184be
Use explicit type.
ye-luo Sep 28, 2021
cab62cf
Fix full precision padding issue.
ye-luo Sep 28, 2021
13d5553
Rename assignUpperRight to assignUpperLeft
ye-luo Sep 28, 2021
90ed060
Remove unused bits and apply formatting.
ye-luo Sep 28, 2021
401baca
Restrict assignUpperLeft to Matrix type.
ye-luo Sep 28, 2021
f1ad689
Expand unit test to cover Matrix<>::assignUpperLeft
ye-luo Sep 28, 2021
f624f75
Update comments.
ye-luo Sep 28, 2021
3dd3a41
Notes for DiracMatrixComputeCUDA
ye-luo Sep 28, 2021
5e5d251
Merge pull request #30 from ye-luo/matrix_update_engines_direct_inver…
PDoakORNL Sep 28, 2021
85f34df
Correct timer scope in DiracDeterminantBatched
ye-luo Sep 29, 2021
d6aa22f
remove compute_mask from lower level inversion functions
PDoakORNL Sep 29, 2021
4e8b3f4
Merge branch 'develop' into matrix_update_engines_direct_inversion
PDoakORNL Oct 1, 2021
fea01c4
Merge branch 'develop' into matrix_update_engines_direct_2
PDoakORNL Oct 6, 2021
22cc717
propagating mw_invertPsiM const API changes
PDoakORNL Oct 6, 2021
7eb3c29
Merge branch 'develop' into matrix_update_engines_direct_inversion
PDoakORNL Oct 8, 2021
eeeb5c9
Merge branch 'develop' into matrix_update_engines_direct_inversion
ye-luo Oct 19, 2021
a9076b9
Merge branch 'develop' into matrix_update_engines_direct_inversion
PDoakORNL Oct 20, 2021
82530dd
Merge remote-tracking branch 'origin/develop' into matrix_update_engi…
ye-luo Oct 20, 2021
20 changes: 20 additions & 0 deletions src/Containers/tests/makeRngSpdMatrix.hpp
@@ -123,6 +123,26 @@ class MakeRngSpdMatrix
testing::RandomForTest<RngValueType<T>> rng;
};

/** make a random Vector
*/
template<typename T, typename = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
void makeRngVector(testing::RandomForTest<RngValueType<T>>& rng, Vector<T>& vec)
{
rng.fillBufferRng(vec.data(), vec.size());
}

/** Functor to provide scope for rng when making a random Vector for testing.
*/
template<typename T>
class MakeRngVector
{
public:
void operator()(Vector<T>& vec) { makeRngVector(rng, vec); }
private:
testing::RandomForTest<RngValueType<T>> rng;
};

extern template class MakeRngSpdMatrix<double>;
extern template class MakeRngSpdMatrix<float>;
extern template class MakeRngSpdMatrix<std::complex<double>>;
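For context, a possible usage sketch of the new helpers (illustrative only, not part of the diff; assumes the Vector container and testing::RandomForTest machinery declared in this header are visible):

// Hypothetical test fragment: fill a Vector<double> with reproducible pseudo-random values.
void fillExampleVector()
{
  Vector<double> vec(16);          // 16 entries to be filled
  MakeRngVector<double> make_vec;  // owns a RandomForTest rng internally
  make_vec(vec);                   // forwards to makeRngVector(rng, vec)
}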
8 changes: 7 additions & 1 deletion src/QMCDrivers/QMCDriverNew.cpp
@@ -530,7 +530,13 @@ bool QMCDriverNew::checkLogAndGL(Crowd& crowd)
ps_dispatcher.flex_update(walker_elecs);
twf_dispatcher.flex_evaluateLog(walker_twfs, walker_elecs);

const RealType threshold = 100 * std::numeric_limits<float>::epsilon();
RealType threshold;
// mixed precision cannot pass this check with CUDA direct inversion
if constexpr (std::is_same<RealType, FullPrecRealType>::value)
threshold = 100 * std::numeric_limits<float>::epsilon();
else
threshold = 0.5e-5;

for (int iw = 0; iw < log_values.size(); iw++)
{
auto& ref_G = walker_twfs[iw].G;
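For scale, the two tolerances selected above evaluate to roughly 1.2e-05 and 5e-06; a standalone snippet (illustrative only, not part of the PR) that prints them:

#include <iostream>
#include <limits>

int main()
{
  // Full-precision builds (RealType == FullPrecRealType): 100 * float epsilon ~= 1.19e-05.
  std::cout << 100 * std::numeric_limits<float>::epsilon() << '\n';
  // Mixed-precision builds: the fixed fallback threshold, 5e-06.
  std::cout << 0.5e-5 << '\n';
  return 0;
}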
60 changes: 34 additions & 26 deletions src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.cpp
@@ -26,17 +26,20 @@ namespace qmcplusplus
*@param first index of the first particle
*/
template<typename DET_ENGINE>
DiracDeterminantBatched<DET_ENGINE>::DiracDeterminantBatched(std::shared_ptr<SPOSet>&& spos, int first, int last, int ndelay)
DiracDeterminantBatched<DET_ENGINE>::DiracDeterminantBatched(std::shared_ptr<SPOSet>&& spos,
int first,
int last,
int ndelay)
: DiracDeterminantBase("DiracDeterminantBatched", std::move(spos), first, last),
ndelay_(ndelay),
D2HTimer(*timer_manager.createTimer("DiracDeterminantBatched::D2H", timer_level_fine)),
H2DTimer(*timer_manager.createTimer("DiracDeterminantBatched::H2D", timer_level_fine))
H2DTimer(*timer_manager.createTimer("DiracDeterminantBatched::H2D", timer_level_fine)),
batched_inverse_timer_(*timer_manager.createTimer("DiracDeterminantBatched::batched_inverse", timer_level_fine))
{
static_assert(std::is_same<SPOSet::ValueType, typename DET_ENGINE::Value>::value);
resize(NumPtcls, NumPtcls);
if (Optimizable)
Phi->buildOptVariables(NumPtcls);

}

template<typename DET_ENGINE>
@@ -54,12 +57,12 @@ void DiracDeterminantBatched<DET_ENGINE>::invertPsiM(const DualMatrix<Value>& ps

template<typename DET_ENGINE>
void DiracDeterminantBatched<DET_ENGINE>::mw_invertPsiM(const RefVectorWithLeader<WaveFunctionComponent>& wfc_list,
RefVector<const DualMatrix<Value>>& logdetT_list,
RefVector<DualMatrix<Value>>& logdetT_list,
RefVector<DualMatrix<Value>>& a_inv_list,
const std::vector<bool>& compute_mask) const
{
auto& wfc_leader = wfc_list.getCastedLeader<DiracDeterminantBatched<DET_ENGINE>>();
ScopedTimer inverse_timer(wfc_leader.InverseTimer);
ScopedTimer inverse_timer(wfc_leader.batched_inverse_timer_);
const auto nw = wfc_list.size();

RefVectorWithLeader<DET_ENGINE> engine_list(wfc_leader.det_engine_);
@@ -122,7 +125,7 @@ typename DiracDeterminantBatched<DET_ENGINE>::Grad DiracDeterminantBatched<DET_E
{
ScopedTimer local_timer(RatioTimer);
const int WorkingIndex = iat - FirstIndex;
Grad g = simd::dot(det_engine_.get_ref_psiMinv()[WorkingIndex], dpsiM[WorkingIndex], NumOrbitals);
Grad g = simd::dot(det_engine_.get_psiMinv()[WorkingIndex], dpsiM[WorkingIndex], NumOrbitals);
assert(checkG(g));
return g;
}
@@ -174,11 +177,11 @@ typename DiracDeterminantBatched<DET_ENGINE>::PsiValue DiracDeterminantBatched<D

{
ScopedTimer local_timer(RatioTimer);
auto& psiMinv = det_engine_.get_ref_psiMinv();
auto& psiMinv = det_engine_.get_psiMinv();
const int WorkingIndex = iat - FirstIndex;
curRatio = simd::dot(psiMinv[WorkingIndex], psiV.data(), NumOrbitals);
grad_iat += static_cast<Value>(static_cast<PsiValue>(1.0) / curRatio) *
simd::dot(det_engine_.get_ref_psiMinv()[WorkingIndex], dpsiV.data(), NumOrbitals);
simd::dot(psiMinv[WorkingIndex], dpsiV.data(), NumOrbitals);
}
return curRatio;
}
@@ -218,7 +221,7 @@ void DiracDeterminantBatched<DET_ENGINE>::mw_ratioGrad(const RefVectorWithLeader
grad_new_local.resize(wfc_list.size());

VectorSoaContainer<Value, DIM + 2> phi_vgl_v_view(phi_vgl_v.data(), NumOrbitals * wfc_list.size(),
phi_vgl_v.capacity());
phi_vgl_v.capacity());
wfc_leader.Phi->mw_evaluateVGLandDetRatioGrads(phi_list, p_list, iat, psiMinv_row_dev_ptr_list, phi_vgl_v_view,
ratios_local, grad_new_local);
}
@@ -293,7 +296,7 @@ void DiracDeterminantBatched<DET_ENGINE>::mw_accept_rejectMove(
psiM_g_dev_ptr_list[count] = det.psiM_vgl.device_data() + psiM_vgl.capacity() + NumOrbitals * WorkingIndex * DIM;
psiM_l_dev_ptr_list[count] = det.psiM_vgl.device_data() + psiM_vgl.capacity() * 4 + NumOrbitals * WorkingIndex;
det.log_value_ += convertValueToLog(det.curRatio);
count++;
count++;
}
det.curRatio = 1.0;
}
@@ -536,7 +539,7 @@ typename DiracDeterminantBatched<DET_ENGINE>::PsiValue DiracDeterminantBatched<D
Phi->evaluateValue(P, iat, psiV_host_view);
}
{
auto& psiMinv = det_engine_.get_ref_psiMinv();
auto& psiMinv = det_engine_.get_psiMinv();
ScopedTimer local_timer(RatioTimer);
curRatio = simd::dot(psiMinv[WorkingIndex], psiV.data(), NumOrbitals);
}
@@ -638,7 +641,7 @@ void DiracDeterminantBatched<DET_ENGINE>::mw_evaluateRatios(
if (Phi->isOMPoffload())
invRow_ptr_list.push_back(det.det_engine_.getRow_psiMinv_offload(WorkingIndex));
else
invRow_ptr_list.push_back(det.det_engine_.get_ref_psiMinv()[WorkingIndex]);
invRow_ptr_list.push_back(det.det_engine_.get_psiMinv()[WorkingIndex]);
}
}

@@ -684,7 +687,7 @@ typename DiracDeterminantBatched<DET_ENGINE>::Grad DiracDeterminantBatched<DET_E
{
resizeScratchObjectsForIonDerivs();
Phi->evaluateGradSource(P, FirstIndex, LastIndex, source, iat, grad_source_psiM);
auto& psiMinv = det_engine_.get_ref_psiMinv();
auto& psiMinv = det_engine_.get_psiMinv();
// psiMinv columns have padding but grad_source_psiM ones don't
for (int i = 0; i < psiMinv.rows(); i++)
g += simd::dot(psiMinv[i], grad_source_psiM[i], NumOrbitals);
@@ -736,7 +739,7 @@ typename DiracDeterminantBatched<DET_ENGINE>::Grad DiracDeterminantBatched<DET_E
Phi->evaluateGradSource(P, FirstIndex, LastIndex, source, iat, grad_source_psiM, grad_grad_source_psiM,
grad_lapl_source_psiM);

auto& psiMinv = det_engine_.get_ref_psiMinv();
auto& psiMinv = det_engine_.get_psiMinv();

// Compute matrices
phi_alpha_Minv = 0.0;
@@ -789,8 +792,7 @@ typename DiracDeterminantBatched<DET_ENGINE>::Grad DiracDeterminantBatched<DET_E
// Second term, eq 9
if (j == i)
for (int dim_el = 0; dim_el < OHMMS_DIM; dim_el++)
lapl_grad[dim][iel] -=
(Real)2.0 * grad_phi_alpha_Minv(j, i)(dim, dim_el) * grad_phi_Minv(i, j)[dim_el];
lapl_grad[dim][iel] -= (Real)2.0 * grad_phi_alpha_Minv(j, i)(dim, dim_el) * grad_phi_Minv(i, j)[dim_el];
// Third term, eq 9
// First term, eq 10
lapl_grad[dim][iel] -= phi_alpha_Minv(j, i)[dim] * lapl_phi_Minv(i, j);
@@ -870,9 +872,11 @@ void DiracDeterminantBatched<DET_ENGINE>::mw_recompute(const RefVectorWithLeader
RefVectorWithLeader<ParticleSet> p_filtered_list(p_list.getLeader());
RefVectorWithLeader<SPOSet> phi_list(*wfc_leader.Phi);
std::vector<Matrix<Value>> psiM_host_views;
RefVector<Matrix<Value>> psiM_temp_list;
RefVector<DualMatrix<Value>> psiM_temp_list;
RefVector<Matrix<Value>> psiM_host_list;
RefVector<Matrix<Grad>> dpsiM_list;
RefVector<Matrix<Value>> d2psiM_list;
RefVector<DualMatrix<Value>> psiMinv_list;

wfc_filtered_list.reserve(nw);
p_filtered_list.reserve(nw);
@@ -881,6 +885,7 @@
psiM_temp_list.reserve(nw);
dpsiM_list.reserve(nw);
d2psiM_list.reserve(nw);
psiMinv_list.reserve(nw);

for (int iw = 0; iw < nw; iw++)
if (recompute[iw])
@@ -890,37 +895,39 @@

auto& det = wfc_list.getCastedElement<DiracDeterminantBatched<DET_ENGINE>>(iw);
phi_list.push_back(*det.Phi);
psiM_host_views.emplace_back(det.psiM_temp.data(), det.psiM_temp.rows(), det.psiM_temp.cols());
psiM_temp_list.push_back(psiM_host_views.back());
psiM_temp_list.push_back(det.psiM_temp);
psiM_host_list.push_back(det.psiM_host);
dpsiM_list.push_back(det.dpsiM);
d2psiM_list.push_back(det.d2psiM);
// We need get_ref_psiMinv because C++ can't deduce the correct overload from
// return type.
psiMinv_list.push_back(det.get_det_engine().get_ref_psiMinv());
}

if (!wfc_filtered_list.size())
return;

{
ScopedTimer spo_timer(wfc_leader.SPOVGLTimer);
// Through the OMPtarget offload path, passing psiM_host_list appears to leave psiM_temp already
// updated on the device. dpsiM_list and d2psiM_list, however, appear to be computed only on the CPU,
// which is the reason for the odd-looking omp target update below.
wfc_leader.Phi->mw_evaluate_notranspose(phi_list, p_filtered_list, wfc_leader.FirstIndex, wfc_leader.LastIndex,
psiM_temp_list, dpsiM_list, d2psiM_list);
psiM_host_list, dpsiM_list, d2psiM_list);
}

{ // transfer dpsiM, d2psiM, psiMinv to device
ScopedTimer d2h(H2DTimer);

RefVector<const DualMatrix<Value>> const_psiM_temp_list;
RefVector<DualMatrix<Value>> psiMinv_list;

for (int iw = 0; iw < wfc_filtered_list.size(); iw++)
{
auto& det = wfc_filtered_list.getCastedElement<DiracDeterminantBatched<DET_ENGINE>>(iw);
auto* psiM_vgl_ptr = det.psiM_vgl.data();
size_t stride = wfc_leader.psiM_vgl.capacity();
PRAGMA_OFFLOAD("omp target update to(psiM_vgl_ptr[stride:stride*4]) nowait")
const_psiM_temp_list.push_back(det.psiM_temp);
psiMinv_list.push_back(det.get_det_engine().get_ref_psiMinv());
}
mw_invertPsiM(wfc_filtered_list, const_psiM_temp_list, psiMinv_list, recompute);
mw_invertPsiM(wfc_filtered_list, psiM_temp_list, psiMinv_list, recompute);
PRAGMA_OFFLOAD("omp taskwait")
}
}
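The block above queues one asynchronous host-to-device update per walker and fences them once at the end. A minimal sketch of the same OpenMP offload pattern with plain pragmas instead of the PRAGMA_OFFLOAD macro (illustrative; assumes each pointer is already mapped to the device, e.g. by an enclosing target data region):

#include <cstddef>
#include <vector>

void updateGradLaplOnDevice(const std::vector<double*>& psiM_vgl_ptrs, std::size_t stride)
{
  for (double* psiM_vgl_ptr : psiM_vgl_ptrs)
  {
    // Queue an asynchronous host-to-device update of the gradient and laplacian sections
    // (offsets [stride, 5*stride) of the VGL buffer), without blocking the host thread.
#pragma omp target update to(psiM_vgl_ptr[stride : stride * 4]) nowait
  }
  // One fence for all queued transfers, mirroring PRAGMA_OFFLOAD("omp taskwait").
#pragma omp taskwait
}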
@@ -937,7 +944,8 @@ void DiracDeterminantBatched<DET_ENGINE>::evaluateDerivatives(ParticleSet& P,
template<typename DET_ENGINE>
DiracDeterminantBatched<DET_ENGINE>* DiracDeterminantBatched<DET_ENGINE>::makeCopy(std::shared_ptr<SPOSet>&& spo) const
{
DiracDeterminantBatched<DET_ENGINE>* dclone = new DiracDeterminantBatched<DET_ENGINE>(std::move(spo), FirstIndex, LastIndex, ndelay_);
DiracDeterminantBatched<DET_ENGINE>* dclone =
new DiracDeterminantBatched<DET_ENGINE>(std::move(spo), FirstIndex, LastIndex, ndelay_);
return dclone;
}

4 changes: 2 additions & 2 deletions src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.h
@@ -279,7 +279,7 @@ class DiracDeterminantBatched : public DiracDeterminantBase
* the compute mask. See future PR for those changes, or drop of compute_mask argument.
*/
void mw_invertPsiM(const RefVectorWithLeader<WaveFunctionComponent>& wfc_list,
RefVector<const DualMatrix<Value>>& logdetT_list,
RefVector<DualMatrix<Value>>& logdetT_list,
RefVector<DualMatrix<Value>>& a_inv_list,
const std::vector<bool>& compute_mask) const;

@@ -311,7 +311,7 @@ class DiracDeterminantBatched : public DiracDeterminantBase
const int ndelay_;

/// timers
NewTimer &D2HTimer, &H2DTimer;
NewTimer &D2HTimer, &H2DTimer, &batched_inverse_timer_;
};

extern template class DiracDeterminantBatched<>;
46 changes: 21 additions & 25 deletions src/QMCWaveFunctions/Fermion/DiracMatrixComputeCUDA.hpp
@@ -44,8 +44,8 @@ template<typename VALUE_FP>
class DiracMatrixComputeCUDA : public Resource
{
using FullPrecReal = RealAlias<VALUE_FP>;
using LogValue = std::complex<FullPrecReal>;
using LogValue = std::complex<FullPrecReal>;

template<typename T>
using DualMatrix = Matrix<T, PinnedDualAllocator<T>>;

@@ -102,7 +102,7 @@ class DiracMatrixComputeCUDA : public Resource
int ldinv = inv_a_mats[0].get().cols();
Vector<const VALUE_FP*> M_ptr_buffer(reinterpret_cast<const TMAT**>(psiM_ptrs_.data()), nw);
Vector<const VALUE_FP*> invM_ptr_buffer(reinterpret_cast<const TMAT**>(invM_ptrs_.data()), nw);
cudaStream_t hstream = cuda_handles.hstream;
cudaStream_t hstream = cuda_handles.hstream;
cublasHandle_t h_cublas = cuda_handles.h_cublas;

for (int iw = 0; iw < nw; ++iw)
Expand Down Expand Up @@ -141,9 +141,8 @@ class DiracMatrixComputeCUDA : public Resource
inv_a_mats[iw].get().size() * sizeof(TMAT), cudaMemcpyDeviceToHost, hstream),
"cudaMemcpyAsync failed copying DiracMatrixBatch::inv_psiM to host");
}
cudaErrorCheck(cudaMemcpyAsync(log_values.data(), log_values.device_data(),
log_values.size() * sizeof(LogValue), cudaMemcpyDeviceToHost,
hstream),
cudaErrorCheck(cudaMemcpyAsync(log_values.data(), log_values.device_data(), log_values.size() * sizeof(LogValue),
cudaMemcpyDeviceToHost, hstream),
"cudaMemcpyAsync log_values failed!");
cudaErrorCheck(cudaStreamSynchronize(hstream), "cudaStreamSynchronize failed!");
}
@@ -182,7 +181,7 @@ class DiracMatrixComputeCUDA : public Resource
infos_.resize(nw);
LU_diags_fp_.resize(n * nw);

cudaStream_t hstream = cuda_handles.hstream;
cudaStream_t hstream = cuda_handles.hstream;
cublasHandle_t h_cublas = cuda_handles.h_cublas;
cudaErrorCheck(cudaMemcpyAsync(psi_Ms.device_data(), psi_Ms.data(), psi_Ms.size() * sizeof(VALUE_FP),
cudaMemcpyHostToDevice, hstream),
Expand Down Expand Up @@ -210,9 +209,8 @@ class DiracMatrixComputeCUDA : public Resource
cudaErrorCheck(cudaMemcpyAsync(inv_Ms.data(), inv_Ms.device_data(), inv_Ms.size() * sizeof(VALUE_FP),
cudaMemcpyDeviceToHost, hstream),
"cudaMemcpyAsync failed copying back DiracMatrixBatch::invM_fp from device");
cudaErrorCheck(cudaMemcpyAsync(log_values.data(), log_values.device_data(),
log_values.size() * sizeof(LogValue), cudaMemcpyDeviceToHost,
hstream),
cudaErrorCheck(cudaMemcpyAsync(log_values.data(), log_values.device_data(), log_values.size() * sizeof(LogValue),
cudaMemcpyDeviceToHost, hstream),
"cudaMemcpyAsync log_values failed!");
cudaErrorCheck(cudaStreamSynchronize(hstream), "cudaStreamSynchronize failed!");
}
@@ -242,15 +240,14 @@ class DiracMatrixComputeCUDA : public Resource
DualMatrix<TMAT>& inv_a_mat,
DualVector<LogValue>& log_values)
{
const int n = a_mat.rows();
const int lda = a_mat.cols();
const int n = a_mat.rows();
const int lda = a_mat.cols();
psiM_fp_.resize(n * lda);
invM_fp_.resize(n * lda);
std::fill(log_values.begin(), log_values.end(), LogValue{0.0, 0.0});
// making sure we know the log_values are zero'd on the device.
cudaErrorCheck(cudaMemcpyAsync(log_values.device_data(), log_values.data(),
log_values.size() * sizeof(LogValue), cudaMemcpyHostToDevice,
cuda_handles.hstream),
cudaErrorCheck(cudaMemcpyAsync(log_values.device_data(), log_values.data(), log_values.size() * sizeof(LogValue),
cudaMemcpyHostToDevice, cuda_handles.hstream),
"cudaMemcpyAsync failed copying DiracMatrixBatch::log_values to device");
simd::transpose(a_mat.data(), n, lda, psiM_fp_.data(), n, lda);
cudaErrorCheck(cudaMemcpyAsync(psiM_fp_.device_data(), psiM_fp_.data(), psiM_fp_.size() * sizeof(VALUE_FP),
@@ -282,21 +279,20 @@ class DiracMatrixComputeCUDA : public Resource
const std::vector<bool>& compute_mask)
{
assert(log_values.size() == a_mats.size());
int nw = a_mats.size();
const int n = a_mats[0].get().rows();
const int lda = a_mats[0].get().cols();
size_t nsqr = n * n;
int nw = a_mats.size();
const int n = a_mats[0].get().rows();
const int lda = a_mats[0].get().cols();
size_t nsqr = n * n;
psiM_fp_.resize(n * lda * nw);
invM_fp_.resize(n * lda * nw);
std::fill(log_values.begin(), log_values.end(), LogValue{0.0, 0.0});
// making sure we know the log_values are zero'd on the device.
cudaErrorCheck(cudaMemcpyAsync(log_values.device_data(), log_values.data(),
log_values.size() * sizeof(LogValue), cudaMemcpyHostToDevice,
cuda_handles.hstream),
cudaErrorCheck(cudaMemcpyAsync(log_values.device_data(), log_values.data(), log_values.size() * sizeof(LogValue),
cudaMemcpyHostToDevice, cuda_handles.hstream),
"cudaMemcpyAsync failed copying DiracMatrixBatch::log_values to device");
for (int iw = 0; iw < nw; ++iw)
simd::transpose(a_mats[iw].get().data(), n, a_mats[iw].get().cols(), psiM_fp_.data() + nsqr * iw, n, lda);
mw_computeInvertAndLog(cuda_handles.h_cublas, psiM_fp_, invM_fp_, n, lda, log_values);
mw_computeInvertAndLog(cuda_handles, psiM_fp_, invM_fp_, n, lda, log_values);
for (int iw = 0; iw < a_mats.size(); ++iw)
{
DualMatrix<VALUE_FP> data_ref_matrix;
Expand All @@ -313,8 +309,8 @@ class DiracMatrixComputeCUDA : public Resource

/** Batched inversion and calculation of log determinants.
* When TMAT is full precision we can use the a_mat and inv_mat directly
* Side effect of this is after this call a_mats contains the LU factorization
* matrix.
* Side effect of this is after this call the device copy of a_mats contains
* the LU factorization matrix.
*/
template<typename TMAT>
inline std::enable_if_t<std::is_same<VALUE_FP, TMAT>::value> mw_invertTranspose(
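As background for the comment above on the LU side effect, a hedged host-side sketch of how a log-determinant is commonly accumulated from getrf-style LU factors (real-valued case; names and layout here are illustrative, not the device kernels used by this class):

#include <cmath>
#include <complex>
#include <vector>

// 'lu' holds the n x n LU factors (row-major) and 'pivots' the 1-based pivot indices,
// as produced by a getrf-style routine. log|det| is the sum of log|U_ii|; every row swap
// or negative diagonal flips the sign, which appears as pi in the imaginary part.
std::complex<double> logDetFromLU(const std::vector<double>& lu, const std::vector<int>& pivots, int n)
{
  double log_abs = 0.0;
  int sign_flips = 0;
  for (int i = 0; i < n; ++i)
  {
    const double diag = lu[i * n + i];
    log_abs += std::log(std::abs(diag));
    if (pivots[i] != i + 1)
      ++sign_flips;
    if (diag < 0)
      ++sign_flips;
  }
  const double pi = std::acos(-1.0);
  return {log_abs, (sign_flips % 2) ? pi : 0.0};
}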