From 517dea7090ca3f75ba6e2e235fa2008d6816917f Mon Sep 17 00:00:00 2001
From: Ye Luo
Date: Thu, 30 Sep 2021 01:14:49 -0500
Subject: [PATCH 1/2] Cure non-deterministic offload J2.

Reproducer: NiO a64 batched_driver performance test. Run 1 VMC step with
1 thread over and over; scalar.dat is not deterministic and the Kinetic
value differs between runs.

In mw_updateVGL, injecting prints before and after the offload region
(walker 13, electron 741) shows that sometimes the value is not updated
even though the walker is accepted.
---
 src/QMCWaveFunctions/Jastrow/BsplineFunctor.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h b/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h
index 9358bbd087..d6ea578045 100644
--- a/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h
+++ b/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h
@@ -218,6 +218,7 @@ struct BsplineFunctor : public OptimizableFunctorBase
       PRAGMA_OFFLOAD("omp parallel for reduction(+: val_sum, grad_x, grad_y, grad_z, lapl)")
       for (int j = 0; j < n_src; j++)
       {
+        if (j == iat) continue;
         const int ig = grp_ids[j];
         const T* coefs = mw_coefs[ig];
         T DeltaRInv = mw_DeltaRInv[ig];
@@ -227,7 +228,7 @@ struct BsplineFunctor : public OptimizableFunctorBase
         T u(0);
         T dudr(0);
         T d2udr2(0);
-        if (j != iat && r < cutoff_radius)
+        if (r < cutoff_radius)
         {
           u = evaluate_impl(dist[j], coefs, DeltaRInv, dudr, d2udr2);
           dudr *= T(1) / r;
@@ -541,7 +542,7 @@ struct BsplineFunctor : public OptimizableFunctorBase
       T* mw_DeltaRInv = reinterpret_cast<T*>(transfer_buffer_ptr + sizeof(T*) * num_groups);
       T* mw_cutoff_radius = mw_DeltaRInv + num_groups;
       int* accepted_indices = reinterpret_cast<int*>(transfer_buffer_ptr + (sizeof(T*) + sizeof(T) * 2) * num_groups);
-      int ip = accepted_indices[iw];
+      const int ip = accepted_indices[iw];
 
       const T* dist_new = mw_dist + ip * dist_stride;
       const T* dipl_x_new = dist_new + n_padded;
@@ -564,6 +565,7 @@ struct BsplineFunctor : public OptimizableFunctorBase
       PRAGMA_OFFLOAD("omp parallel for")
       for (int j = 0; j < n_src; j++)
       {
+        if (j == iat) continue;
         const int ig = grp_ids[j];
         const T* coefs = mw_coefs[ig];
         T DeltaRInv = mw_DeltaRInv[ig];
@@ -573,7 +575,7 @@ struct BsplineFunctor : public OptimizableFunctorBase
         T u(0);
         T dudr(0);
         T d2udr2(0);
-        if (j != iat && r < cutoff_radius)
+        if (r < cutoff_radius)
        {
           u = evaluate_impl(dist_old[j], coefs, DeltaRInv, dudr, d2udr2);
           dudr *= T(1) / r;
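Note on the change above: instead of folding the self-pair test into the cutoff
check (j != iat && r < cutoff_radius), the self-pair iteration is now skipped up
front with a continue, so every iteration that reaches the offloaded reduction
produces a well-defined (possibly zero) contribution. Below is a minimal
standalone sketch of this loop pattern; it uses plain host OpenMP and made-up
values for n_src, iat, dist, and cutoff, and is not QMCPACK code.

// Standalone sketch of the restructured reduction loop (hypothetical data, host OpenMP).
#include <cstdio>
#include <vector>

int main()
{
  const int n_src     = 8;   // number of source particles (made up)
  const int iat       = 3;   // moved particle; its self-pair must be skipped
  const double cutoff = 2.0; // stand-in for cutoff_radius
  const std::vector<double> dist = {0.5, 1.2, 3.1, 0.0, 0.8, 2.5, 1.9, 0.3};

  double val_sum = 0.0;

#pragma omp parallel for reduction(+ : val_sum)
  for (int j = 0; j < n_src; j++)
  {
    if (j == iat) continue; // skip the self-pair before doing any work
    double u = 0.0;
    if (dist[j] < cutoff)   // the cutoff alone now decides whether to add a contribution
      u = 1.0 / dist[j];    // stand-in for evaluate_impl()
    val_sum += u;
  }

  std::printf("val_sum = %g\n", val_sum);
  return 0;
}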
From d546e4e3730648aaa8a458191d79cb718a1e43fe Mon Sep 17 00:00:00 2001
From: Ye Luo
Date: Thu, 30 Sep 2021 22:19:53 -0500
Subject: [PATCH 2/2] Minimize recompute in J2.

---
 src/Containers/OhmmsSoA/VectorSoaContainer.h |  3 +-
 src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp | 41 +++++++++++++++++---
 2 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/src/Containers/OhmmsSoA/VectorSoaContainer.h b/src/Containers/OhmmsSoA/VectorSoaContainer.h
index 40f0021139..86aad416c5 100644
--- a/src/Containers/OhmmsSoA/VectorSoaContainer.h
+++ b/src/Containers/OhmmsSoA/VectorSoaContainer.h
@@ -62,7 +62,8 @@ struct VectorSoaContainer
   {
     if (myData != in.myData)
     {
-      resize(in.nLocal);
+      if (nLocal != in.nLocal)
+        resize(in.nLocal);
       std::copy_n(in.myData, nGhosts * D, myData);
     }
     return *this;
diff --git a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp
index 61fad92901..1ca950b601 100644
--- a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp
+++ b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp
@@ -69,12 +69,23 @@ void J2OMPTarget<FT>::acquireResource(ResourceCollection& collection,
   mw_allUat.resize(N_padded * (DIM + 2) * nw);
   for (size_t iw = 0; iw < nw; iw++)
   {
-    size_t offset = N_padded * (DIM + 2) * iw;
-    auto& wfc     = wfc_list.getCastedElement<J2OMPTarget<FT>>(iw);
+    // copy per walker Uat, dUat, d2Uat to shared buffer and attach buffer
+    auto& wfc = wfc_list.getCastedElement<J2OMPTarget<FT>>(iw);
+
+    Vector<valT, aligned_allocator<valT>> Uat_view(mw_allUat.data() + iw * N_padded, N);
+    Uat_view = wfc.Uat;
     wfc.Uat.free();
     wfc.Uat.attachReference(mw_allUat.data() + iw * N_padded, N);
+
+    VectorSoaContainer<valT, DIM, aligned_allocator<valT>> dUat_view(mw_allUat.data() + nw * N_padded +
+                                                                         iw * N_padded * DIM, N, N_padded);
+    dUat_view = wfc.dUat;
     wfc.dUat.free();
     wfc.dUat.attachReference(N, N_padded, mw_allUat.data() + nw * N_padded + iw * N_padded * DIM);
+
+    Vector<valT, aligned_allocator<valT>> d2Uat_view(mw_allUat.data() + nw * N_padded * (DIM + 1) + iw * N_padded, N);
+    d2Uat_view = wfc.d2Uat;
     wfc.d2Uat.free();
     wfc.d2Uat.attachReference(mw_allUat.data() + nw * N_padded * (DIM + 1) + iw * N_padded, N);
   }
@@ -86,14 +97,31 @@ void J2OMPTarget<FT>::releaseResource(ResourceCollection& collection,
                                       const RefVectorWithLeader<WaveFunctionComponent>& wfc_list) const
 {
   auto& wfc_leader = wfc_list.getCastedLeader<J2OMPTarget<FT>>();
-  collection.takebackResource(std::move(wfc_leader.mw_mem_));
-  for (size_t iw = 0; iw < wfc_list.size(); iw++)
+  const size_t nw = wfc_list.size();
+  auto& mw_allUat = wfc_leader.mw_mem_->mw_allUat;
+  for (size_t iw = 0; iw < nw; iw++)
   {
+    // detach buffer and copy per walker Uat, dUat, d2Uat from shared buffer
     auto& wfc = wfc_list.getCastedElement<J2OMPTarget<FT>>(iw);
+
+    Vector<valT, aligned_allocator<valT>> Uat_view(mw_allUat.data() + iw * N_padded, N);
     wfc.Uat.free();
+    wfc.Uat.resize(N);
+    wfc.Uat = Uat_view;
+
+    VectorSoaContainer<valT, DIM, aligned_allocator<valT>> dUat_view(mw_allUat.data() + nw * N_padded +
+                                                                         iw * N_padded * DIM, N, N_padded);
     wfc.dUat.free();
+    wfc.dUat.resize(N);
+    wfc.dUat = dUat_view;
+
+    Vector<valT, aligned_allocator<valT>> d2Uat_view(mw_allUat.data() + nw * N_padded * (DIM + 1) + iw * N_padded, N);
     wfc.d2Uat.free();
+    wfc.d2Uat.resize(N);
+    wfc.d2Uat = d2Uat_view;
   }
+  collection.takebackResource(std::move(wfc_leader.mw_mem_));
 }
 
 template<typename FT>
@@ -674,7 +702,8 @@ void J2OMPTarget<FT>::mw_recompute(const RefVectorWithLeader<WaveFunctionCompo
+  wfc_leader.mw_mem_->mw_allUat.updateTo();
 }
@@ -738,7 +767,7 @@ void J2OMPTarget<FT>::mw_evaluateGL(const RefVectorWithLeader<WaveFunctionCompo
-    auto& wfc = wfc_list.getCastedElement<J2OMPTarget<FT>>(iw);
+    auto& wfc = wfc_list.getCastedElement<J2OMPTarget<FT>>(iw);
     wfc.log_value_ = wfc.computeGL(G_list[iw], L_list[iw]);
   }
 }
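Note on the change above: the J2OMPTarget hunks implement a copy-in/copy-out
scheme. acquireResource copies each walker's Uat, dUat, and d2Uat into the
contiguous mw_allUat buffer and re-attaches the per-walker containers as
references into it, while releaseResource copies the data back into
walker-owned storage before the shared resource is handed back. The
VectorSoaContainer guard skips resize() when the destination already has the
right number of elements, which keeps assignment into an attached view of
matching size from reallocating it. Below is a minimal standalone sketch of
the same acquire/release idea using plain std::vector; the names (Walker,
SharedBuffer, acquire, release) and data layout are invented for illustration
and are not the QMCPACK API.

// Sketch of the acquire/release copy-in/copy-out pattern (invented types, not the QMCPACK API).
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

struct Walker
{
  std::vector<double> Uat; // per-walker data, owned by the walker between resource acquisitions
};

struct SharedBuffer
{
  std::vector<double> all_Uat; // one contiguous multi-walker buffer, e.g. for a single host-device transfer
};

// Gather every walker's Uat into the shared buffer so the whole crowd sits at
// fixed offsets of one allocation.
void acquire(SharedBuffer& buf, const std::vector<Walker>& walkers, std::size_t n)
{
  buf.all_Uat.resize(n * walkers.size());
  for (std::size_t iw = 0; iw < walkers.size(); ++iw)
  {
    assert(walkers[iw].Uat.size() == n);
    std::copy_n(walkers[iw].Uat.data(), n, buf.all_Uat.data() + iw * n);
  }
}

// Scatter the (possibly updated) data back so each walker owns its state again
// once the shared resource is handed back.
void release(const SharedBuffer& buf, std::vector<Walker>& walkers, std::size_t n)
{
  for (std::size_t iw = 0; iw < walkers.size(); ++iw)
    std::copy_n(buf.all_Uat.data() + iw * n, n, walkers[iw].Uat.data());
}

int main()
{
  const std::size_t n = 4;
  std::vector<Walker> walkers{{std::vector<double>(n, 1.0)}, {std::vector<double>(n, 2.0)}};
  SharedBuffer buf;

  acquire(buf, walkers, n);
  buf.all_Uat[n] = 5.0; // pretend a multi-walker kernel updated walker 1 in the shared buffer
  release(buf, walkers, n);

  std::printf("walker 1 Uat[0] = %g\n", walkers[1].Uat[0]); // prints 5
  return 0;
}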