From 521a6a26856db62e1cfbe26dd9268551412d067b Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Sat, 24 Jul 2021 17:56:41 -0500 Subject: [PATCH 01/13] Change VP list in API to carry leader. --- src/Particle/VirtualParticleSet.cpp | 5 ++++- src/QMCHamiltonians/NonLocalECPComponent.cpp | 2 +- .../BsplineFactory/SplineC2COMPTarget.cpp | 2 +- .../BsplineFactory/SplineC2COMPTarget.h | 2 +- .../BsplineFactory/SplineC2ROMPTarget.cpp | 2 +- .../BsplineFactory/SplineC2ROMPTarget.h | 2 +- src/QMCWaveFunctions/Fermion/DiracDeterminant.cpp | 2 +- src/QMCWaveFunctions/Fermion/DiracDeterminant.h | 2 +- .../Fermion/DiracDeterminantBatched.cpp | 2 +- .../Fermion/DiracDeterminantBatched.h | 2 +- src/QMCWaveFunctions/Fermion/SlaterDet.h | 11 +++++++---- src/QMCWaveFunctions/SPOSet.cpp | 2 +- src/QMCWaveFunctions/SPOSet.h | 2 +- src/QMCWaveFunctions/TWFdispatcher.cpp | 2 +- src/QMCWaveFunctions/TWFdispatcher.h | 2 +- src/QMCWaveFunctions/TrialWaveFunction.cpp | 2 +- src/QMCWaveFunctions/TrialWaveFunction.h | 2 +- src/QMCWaveFunctions/WaveFunctionComponent.cpp | 2 +- src/QMCWaveFunctions/WaveFunctionComponent.h | 6 +++--- 19 files changed, 30 insertions(+), 24 deletions(-) diff --git a/src/Particle/VirtualParticleSet.cpp b/src/Particle/VirtualParticleSet.cpp index 4dc95c8a4b..6d46fa9830 100644 --- a/src/Particle/VirtualParticleSet.cpp +++ b/src/Particle/VirtualParticleSet.cpp @@ -61,7 +61,10 @@ void VirtualParticleSet::mw_makeMoves(const RefVectorWithLeader>& joblist, bool sphere) { - RefVectorWithLeader p_list(vp_list.getLeader()); + auto& vp_leader = vp_list.getLeader(); + vp_leader.onSphere = sphere; + + RefVectorWithLeader p_list(vp_leader); p_list.reserve(vp_list.size()); for (int iw = 0; iw < vp_list.size(); iw++) diff --git a/src/QMCHamiltonians/NonLocalECPComponent.cpp b/src/QMCHamiltonians/NonLocalECPComponent.cpp index cda87f995c..b7af5a2c27 100644 --- a/src/QMCHamiltonians/NonLocalECPComponent.cpp +++ b/src/QMCHamiltonians/NonLocalECPComponent.cpp @@ -199,7 +199,7 @@ void 
NonLocalECPComponent::mw_evaluateOne(const RefVectorWithLeader vp_list(*ecp_component_leader.VP); - RefVector const_vp_list; + RefVectorWithLeader const_vp_list(*ecp_component_leader.VP); RefVector> deltaV_list; RefVector> psiratios_list; vp_list.reserve(ecp_component_list.size()); diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp index f04f187277..676f872712 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.cpp @@ -299,7 +299,7 @@ void SplineC2COMPTarget::evaluateDetRatios(const VirtualParticleSet& VP, template void SplineC2COMPTarget::mw_evaluateDetRatios(const RefVectorWithLeader& spo_list, - const RefVector& vp_list, + const RefVectorWithLeader& vp_list, const RefVector& psi_list, const std::vector& invRow_ptr_list, std::vector>& ratios_list) const diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h index 92c67939b6..1b81d13042 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h @@ -253,7 +253,7 @@ class SplineC2COMPTarget : public BsplineSet std::vector& ratios) override; virtual void mw_evaluateDetRatios(const RefVectorWithLeader& spo_list, - const RefVector& vp_list, + const RefVectorWithLeader& vp_list, const RefVector& psi_list, const std::vector& invRow_ptr_list, std::vector>& ratios_list) const override; diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp index 1279a312c2..0b316f0374 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.cpp @@ -381,7 +381,7 @@ void SplineC2ROMPTarget::evaluateDetRatios(const VirtualParticleSet& VP, template void 
SplineC2ROMPTarget::mw_evaluateDetRatios(const RefVectorWithLeader& spo_list, - const RefVector& vp_list, + const RefVectorWithLeader& vp_list, const RefVector& psi_list, const std::vector& invRow_ptr_list, std::vector>& ratios_list) const diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h index e559d8e2df..fb0b02f2f9 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h @@ -261,7 +261,7 @@ class SplineC2ROMPTarget : public BsplineSet std::vector& ratios) override; virtual void mw_evaluateDetRatios(const RefVectorWithLeader& spo_list, - const RefVector& vp_list, + const RefVectorWithLeader& vp_list, const RefVector& psi_list, const std::vector& invRow_ptr_list, std::vector>& ratios_list) const override; diff --git a/src/QMCWaveFunctions/Fermion/DiracDeterminant.cpp b/src/QMCWaveFunctions/Fermion/DiracDeterminant.cpp index f6ea51c97b..3b90835b03 100644 --- a/src/QMCWaveFunctions/Fermion/DiracDeterminant.cpp +++ b/src/QMCWaveFunctions/Fermion/DiracDeterminant.cpp @@ -414,7 +414,7 @@ void DiracDeterminant::evaluateRatios(const VirtualParticleSet& VP, std template void DiracDeterminant::mw_evaluateRatios(const RefVectorWithLeader& wfc_list, - const RefVector& vp_list, + const RefVectorWithLeader& vp_list, std::vector>& ratios) const { const size_t nw = wfc_list.size(); diff --git a/src/QMCWaveFunctions/Fermion/DiracDeterminant.h b/src/QMCWaveFunctions/Fermion/DiracDeterminant.h index f0f4d760db..b3b2e33a7a 100644 --- a/src/QMCWaveFunctions/Fermion/DiracDeterminant.h +++ b/src/QMCWaveFunctions/Fermion/DiracDeterminant.h @@ -97,7 +97,7 @@ class DiracDeterminant : public DiracDeterminantBase void evaluateRatios(const VirtualParticleSet& VP, std::vector& ratios) override; void mw_evaluateRatios(const RefVectorWithLeader& wfc_list, - const RefVector& vp_list, + const RefVectorWithLeader& vp_list, std::vector>& ratios) 
const override; PsiValueType ratioGrad(ParticleSet& P, int iat, GradType& grad_iat) override; diff --git a/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.cpp b/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.cpp index cf68a758ce..5eb197482e 100644 --- a/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.cpp +++ b/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.cpp @@ -592,7 +592,7 @@ void DiracDeterminantBatched::evaluateRatios(const VirtualParticleSe template void DiracDeterminantBatched::mw_evaluateRatios(const RefVectorWithLeader& wfc_list, - const RefVector& vp_list, + const RefVectorWithLeader& vp_list, std::vector>& ratios) const { assert(this == &wfc_list.getLeader()); diff --git a/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.h b/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.h index 6bfcb1a024..268bd4e3ba 100644 --- a/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.h +++ b/src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.h @@ -119,7 +119,7 @@ class DiracDeterminantBatched : public DiracDeterminantBase void evaluateRatios(const VirtualParticleSet& VP, std::vector& ratios) override; void mw_evaluateRatios(const RefVectorWithLeader& wfc_list, - const RefVector& vp_list, + const RefVectorWithLeader& vp_list, std::vector>& ratios) const override; PsiValueType ratioGrad(ParticleSet& P, int iat, GradType& grad_iat) override; diff --git a/src/QMCWaveFunctions/Fermion/SlaterDet.h b/src/QMCWaveFunctions/Fermion/SlaterDet.h index 8f21e807a6..f1d294d80f 100644 --- a/src/QMCWaveFunctions/Fermion/SlaterDet.h +++ b/src/QMCWaveFunctions/Fermion/SlaterDet.h @@ -109,12 +109,15 @@ class SlaterDet : public WaveFunctionComponent } inline void mw_evaluateRatios(const RefVectorWithLeader& wfc_list, - const RefVector& vp_list, + const RefVectorWithLeader& vp_list, std::vector>& ratios) const override { - // assuming all the VP.refPtcl are identical - const int det_id = getDetID(vp_list[0].get().refPtcl); - return 
Dets[det_id]->mw_evaluateRatios(extract_DetRef_list(wfc_list, det_id), vp_list, ratios); + if (wfc_list.size()) + { + // assuming all the VP.refPtcl are identical + const int det_id = getDetID(vp_list[0].refPtcl); + Dets[det_id]->mw_evaluateRatios(extract_DetRef_list(wfc_list, det_id), vp_list, ratios); + } } PsiValueType ratioGrad(ParticleSet& P, int iat, GradType& grad_iat) override; diff --git a/src/QMCWaveFunctions/SPOSet.cpp b/src/QMCWaveFunctions/SPOSet.cpp index 255cec5782..d5081883ea 100644 --- a/src/QMCWaveFunctions/SPOSet.cpp +++ b/src/QMCWaveFunctions/SPOSet.cpp @@ -46,7 +46,7 @@ void SPOSet::evaluateDetRatios(const VirtualParticleSet& VP, } void SPOSet::mw_evaluateDetRatios(const RefVectorWithLeader& spo_list, - const RefVector& vp_list, + const RefVectorWithLeader& vp_list, const RefVector& psi_list, const std::vector& invRow_ptr_list, std::vector>& ratios_list) const diff --git a/src/QMCWaveFunctions/SPOSet.h b/src/QMCWaveFunctions/SPOSet.h index cf4a1255dc..b533dc55d3 100644 --- a/src/QMCWaveFunctions/SPOSet.h +++ b/src/QMCWaveFunctions/SPOSet.h @@ -214,7 +214,7 @@ class SPOSet : public QMCTraits * @param ratios_list a list of returning determinant ratios */ virtual void mw_evaluateDetRatios(const RefVectorWithLeader& spo_list, - const RefVector& vp_list, + const RefVectorWithLeader& vp_list, const RefVector& psi_list, const std::vector& invRow_ptr_list, std::vector>& ratios_list) const; diff --git a/src/QMCWaveFunctions/TWFdispatcher.cpp b/src/QMCWaveFunctions/TWFdispatcher.cpp index 1cd2ba4b51..c433d1f800 100644 --- a/src/QMCWaveFunctions/TWFdispatcher.cpp +++ b/src/QMCWaveFunctions/TWFdispatcher.cpp @@ -147,7 +147,7 @@ void TWFdispatcher::flex_evaluateGL(const RefVectorWithLeader } void TWFdispatcher::flex_evaluateRatios(const RefVectorWithLeader& wf_list, - const RefVector& vp_list, + const RefVectorWithLeader& vp_list, const RefVector>& ratios_list, ComputeType ct) const { diff --git a/src/QMCWaveFunctions/TWFdispatcher.h 
b/src/QMCWaveFunctions/TWFdispatcher.h index ec5b938daa..998a2afe5f 100644 --- a/src/QMCWaveFunctions/TWFdispatcher.h +++ b/src/QMCWaveFunctions/TWFdispatcher.h @@ -73,7 +73,7 @@ class TWFdispatcher bool fromscratch) const; void flex_evaluateRatios(const RefVectorWithLeader& wf_list, - const RefVector& vp_list, + const RefVectorWithLeader& vp_list, const RefVector>& ratios_list, ComputeType ct) const; diff --git a/src/QMCWaveFunctions/TrialWaveFunction.cpp b/src/QMCWaveFunctions/TrialWaveFunction.cpp index fa91ad361d..79df11a5ba 100644 --- a/src/QMCWaveFunctions/TrialWaveFunction.cpp +++ b/src/QMCWaveFunctions/TrialWaveFunction.cpp @@ -998,7 +998,7 @@ void TrialWaveFunction::evaluateRatios(const VirtualParticleSet& VP, std::vector } void TrialWaveFunction::mw_evaluateRatios(const RefVectorWithLeader& wf_list, - const RefVector& vp_list, + const RefVectorWithLeader& vp_list, const RefVector>& ratios_list, ComputeType ct) { diff --git a/src/QMCWaveFunctions/TrialWaveFunction.h b/src/QMCWaveFunctions/TrialWaveFunction.h index b7ad73a380..142d90055a 100644 --- a/src/QMCWaveFunctions/TrialWaveFunction.h +++ b/src/QMCWaveFunctions/TrialWaveFunction.h @@ -277,7 +277,7 @@ class TrialWaveFunction * Note: unlike other mw_ static functions, *this is the batch leader instead of wf_list[0]. 
*/ static void mw_evaluateRatios(const RefVectorWithLeader& wf_list, - const RefVector& Vp_list, + const RefVectorWithLeader& Vp_list, const RefVector>& ratios_list, ComputeType ct = ComputeType::ALL); diff --git a/src/QMCWaveFunctions/WaveFunctionComponent.cpp b/src/QMCWaveFunctions/WaveFunctionComponent.cpp index 5f420e7ec7..87ce9fe3f2 100644 --- a/src/QMCWaveFunctions/WaveFunctionComponent.cpp +++ b/src/QMCWaveFunctions/WaveFunctionComponent.cpp @@ -208,7 +208,7 @@ void WaveFunctionComponent::evaluateRatios(const VirtualParticleSet& P, std::vec } void WaveFunctionComponent::mw_evaluateRatios(const RefVectorWithLeader& wfc_list, - const RefVector& vp_list, + const RefVectorWithLeader& vp_list, std::vector>& ratios) const { assert(this == &wfc_list.getLeader()); diff --git a/src/QMCWaveFunctions/WaveFunctionComponent.h b/src/QMCWaveFunctions/WaveFunctionComponent.h index 8a098fde7d..1ae9075718 100644 --- a/src/QMCWaveFunctions/WaveFunctionComponent.h +++ b/src/QMCWaveFunctions/WaveFunctionComponent.h @@ -17,6 +17,8 @@ #ifndef QMCPLUSPLUS_WAVEFUNCTIONCOMPONENT_H #define QMCPLUSPLUS_WAVEFUNCTIONCOMPONENT_H + +#include #include "Message/Communicate.h" #include "Configuration.h" #include "Particle/ParticleSet.h" @@ -30,8 +32,6 @@ #include "type_traits/CUDATypes.h" #endif -#include - /**@file WaveFunctionComponent.h *@brief Declaration of WaveFunctionComponent */ @@ -513,7 +513,7 @@ struct WaveFunctionComponent : public QMCTraits * @param ratios of all the virtual moves of all the walkers */ virtual void mw_evaluateRatios(const RefVectorWithLeader& wfc_list, - const RefVector& vp_list, + const RefVectorWithLeader& vp_list, std::vector>& ratios) const; /** evaluate ratios to evaluate the non-local PP From e565f99d6d172bd8de4f57cfa8947514799c71a3 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Sat, 17 Jul 2021 17:28:12 -0500 Subject: [PATCH 02/13] Split J2KECorrection.h off J2OrbitalSoA.h --- src/QMCWaveFunctions/Jastrow/J2KECorrection.h | 88 +++++++++++++++++++ 
src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.h | 66 +------------- 2 files changed, 89 insertions(+), 65 deletions(-) create mode 100644 src/QMCWaveFunctions/Jastrow/J2KECorrection.h diff --git a/src/QMCWaveFunctions/Jastrow/J2KECorrection.h b/src/QMCWaveFunctions/Jastrow/J2KECorrection.h new file mode 100644 index 0000000000..09b7e18309 --- /dev/null +++ b/src/QMCWaveFunctions/Jastrow/J2KECorrection.h @@ -0,0 +1,88 @@ +////////////////////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source License. +// See LICENSE file in top directory for details. +// +// Copyright (c) 2021 QMCPACK developers. +// +// File developed by: Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp. +// Amrita Mathuriya, amrita.mathuriya@intel.com, Intel Corp. +// Ye Luo, yeluo@anl.gov, Argonne National Laboratory +// +// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory +////////////////////////////////////////////////////////////////////////////////////// +// -*- C++ -*- +#ifndef QMCPLUSPLUS_J2KECORRECTION_H +#define QMCPLUSPLUS_J2KECORRECTION_H + +#include +#include +#include + +namespace qmcplusplus +{ +// helper class to activate KEcorr during optimizing Jastrow +template +class J2KECorrection +{ + size_t num_groups_; + std::vector num_elec_in_groups_; + RT num_elecs_; + RT vol; + RT G0mag; + const std::vector& F_; + bool SK_enabled; + +public: + J2KECorrection(const ParticleSet& targetPtcl, const std::vector& F) + : num_groups_(targetPtcl.groups()), + num_elecs_(targetPtcl.getTotalNum()), + vol(targetPtcl.Lattice.Volume), + F_(F), + SK_enabled(targetPtcl.SK != nullptr) + { + // compute num_elec_in_groups_ + num_elec_in_groups_.reserve(3); + for (int i = 0; i < num_groups_; i++) + num_elec_in_groups_.push_back(targetPtcl.last(i) - targetPtcl.first(i)); + + if (SK_enabled) + G0mag = std::sqrt(targetPtcl.SK->KLists.ksq[0]); + } + + RT computeKEcorr() + { + if (!SK_enabled) + return 
0; + + const int numPoints = 1000; + RT uk = 0.0; + RT a = 1.0; + + for (int i = 0; i < num_groups_; i++) + { + int Ni = num_elec_in_groups_[i]; + for (int j = 0; j < num_groups_; j++) + { + int Nj = num_elec_in_groups_[j]; + if (F_[i * num_groups_ + j]) + { + FT& ufunc = *(F_[i * num_groups_ + j]); + RT radius = ufunc.cutoff_radius; + RT k = G0mag; + RT dr = radius / (RT)(numPoints - 1); + for (int ir = 0; ir < numPoints; ir++) + { + RT r = dr * (RT)ir; + RT u = ufunc.evaluate(r); + uk += 0.5 * 4.0 * M_PI * r * std::sin(k * r) / k * u * dr * (RT)Nj / (RT)(Ni + Nj); + } + } + } + } + for (int iter = 0; iter < 20; iter++) + a = uk / (4.0 * M_PI * (1.0 / (G0mag * G0mag) - 1.0 / (G0mag * G0mag + 1.0 / a))); + return 4.0 * M_PI * a / (4.0 * vol) * num_elecs_; + } +}; +} // namespace qmcplusplus +#endif diff --git a/src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.h b/src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.h index b9fe5e893a..74f966e744 100644 --- a/src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.h +++ b/src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.h @@ -24,74 +24,10 @@ #include "Particle/DistanceTableData.h" #include "LongRange/StructFact.h" #include "CPU/SIMD/aligned_allocator.hpp" +#include "J2KECorrection.h" namespace qmcplusplus { -// helper class to activate KEcorr during optimizing Jastrow -template -class J2KECorrection -{ - size_t num_groups_; - std::vector num_elec_in_groups_; - RT num_elecs_; - RT vol; - RT G0mag; - const std::vector& F_; - bool SK_enabled; - -public: - J2KECorrection(const ParticleSet& targetPtcl, const std::vector& F) - : num_groups_(targetPtcl.groups()), - num_elecs_(targetPtcl.getTotalNum()), - vol(targetPtcl.Lattice.Volume), - F_(F), - SK_enabled(targetPtcl.SK != nullptr) - { - // compute num_elec_in_groups_ - num_elec_in_groups_.reserve(3); - for (int i = 0; i < num_groups_; i++) - num_elec_in_groups_.push_back(targetPtcl.last(i) - targetPtcl.first(i)); - - if (SK_enabled) - G0mag = std::sqrt(targetPtcl.SK->KLists.ksq[0]); - } - - RT computeKEcorr() 
- { - if (!SK_enabled) - return 0; - - const int numPoints = 1000; - RT uk = 0.0; - RT a = 1.0; - - for (int i = 0; i < num_groups_; i++) - { - int Ni = num_elec_in_groups_[i]; - for (int j = 0; j < num_groups_; j++) - { - int Nj = num_elec_in_groups_[j]; - if (F_[i * num_groups_ + j]) - { - FT& ufunc = *(F_[i * num_groups_ + j]); - RT radius = ufunc.cutoff_radius; - RT k = G0mag; - RT dr = radius / (RT)(numPoints - 1); - for (int ir = 0; ir < numPoints; ir++) - { - RT r = dr * (RT)ir; - RT u = ufunc.evaluate(r); - uk += 0.5 * 4.0 * M_PI * r * std::sin(k * r) / k * u * dr * (RT)Nj / (RT)(Ni + Nj); - } - } - } - } - for (int iter = 0; iter < 20; iter++) - a = uk / (4.0 * M_PI * (1.0 / (G0mag * G0mag) - 1.0 / (G0mag * G0mag + 1.0 / a))); - return 4.0 * M_PI * a / (4.0 * vol) * num_elecs_; - } -}; - /** @ingroup WaveFunctionComponent * @brief Specialization for two-body Jastrow function using multiple functors * From 3e58e928195ae6bccb42625e8b7d4874c2d2c213 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Sat, 17 Jul 2021 18:48:59 -0500 Subject: [PATCH 03/13] Add J2OMPTarget --- src/QMCWaveFunctions/CMakeLists.txt | 4 + .../Fermion/SlaterDetBuilder.cpp | 11 +- src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp | 510 ++++++++++++++++++ src/QMCWaveFunctions/Jastrow/J2OMPTarget.h | 179 ++++++ .../Jastrow/RadialJastrowBuilder.cpp | 116 ++-- .../Jastrow/RadialJastrowBuilder.h | 12 +- .../tests/test_bspline_jastrow.cpp | 2 +- 7 files changed, 789 insertions(+), 45 deletions(-) create mode 100644 src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp create mode 100644 src/QMCWaveFunctions/Jastrow/J2OMPTarget.h diff --git a/src/QMCWaveFunctions/CMakeLists.txt b/src/QMCWaveFunctions/CMakeLists.txt index 256372efdb..a31d4b30ab 100644 --- a/src/QMCWaveFunctions/CMakeLists.txt +++ b/src/QMCWaveFunctions/CMakeLists.txt @@ -59,6 +59,10 @@ set(JASTROW_SRCS Fermion/SPOSetProxy.cpp Fermion/SPOSetProxyForMSD.cpp) +if(ENABLE_OFFLOAD) + set(JASTROW_SRCS ${JASTROW_SRCS} Jastrow/J2OMPTarget.cpp) +endif() + 
if(QMC_COMPLEX) set(FERMION_SRCS ${FERMION_SRCS} ElectronGas/ElectronGasComplexOrbitalBuilder.cpp) else(QMC_COMPLEX) diff --git a/src/QMCWaveFunctions/Fermion/SlaterDetBuilder.cpp b/src/QMCWaveFunctions/Fermion/SlaterDetBuilder.cpp index b1221ce0fe..1714ce8054 100644 --- a/src/QMCWaveFunctions/Fermion/SlaterDetBuilder.cpp +++ b/src/QMCWaveFunctions/Fermion/SlaterDetBuilder.cpp @@ -390,17 +390,16 @@ bool SlaterDetBuilder::putDeterminant(xmlNodePtr cur, int spin_group) #else std::string use_batch("no"); #endif -#if defined(ENABLE_CUDA) - std::string useGPU("yes"); -#else - std::string useGPU("no"); -#endif + std::string useGPU; int delay_rank(0); + OhmmsAttributeSet sdAttrib; sdAttrib.add(delay_rank, "delay_rank"); sdAttrib.add(optimize, "optimize"); sdAttrib.add(use_batch, "batch"); - sdAttrib.add(useGPU, "gpu"); +#if defined(ENABLE_CUDA) || defined(ENABLE_OFFLOAD) + sdAttrib.add(useGPU, "gpu", {"yes", "no"}); +#endif sdAttrib.put(cur->parent); { //check determinant@group diff --git a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp new file mode 100644 index 0000000000..c92919841e --- /dev/null +++ b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp @@ -0,0 +1,510 @@ +////////////////////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source License. +// See LICENSE file in top directory for details. +// +// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. +// +// File developed by: Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp. +// Amrita Mathuriya, amrita.mathuriya@intel.com, Intel Corp. +// Ye Luo, yeluo@anl.gov, Argonne National Laboratory +// +// File created by: Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp. 
+////////////////////////////////////////////////////////////////////////////////////// +// -*- C++ -*- + + +#include "J2OMPTarget.h" +#include "CPU/SIMD/algorithm.hpp" +#include "BsplineFunctor.h" +#include "PadeFunctors.h" +#include "UserFunctor.h" + +namespace qmcplusplus +{ +template +void J2OMPTarget::checkInVariables(opt_variables_type& active) +{ + myVars.clear(); + auto it(J2Unique.begin()), it_end(J2Unique.end()); + while (it != it_end) + { + (*it).second->checkInVariables(active); + (*it).second->checkInVariables(myVars); + ++it; + } +} + +template +void J2OMPTarget::checkOutVariables(const opt_variables_type& active) +{ + myVars.getIndex(active); + Optimizable = myVars.is_optimizable(); + auto it(J2Unique.begin()), it_end(J2Unique.end()); + while (it != it_end) + { + (*it).second->checkOutVariables(active); + ++it; + } + if (dPsi) + dPsi->checkOutVariables(active); +} + +template +void J2OMPTarget::resetParameters(const opt_variables_type& active) +{ + if (!Optimizable) + return; + auto it(J2Unique.begin()), it_end(J2Unique.end()); + while (it != it_end) + { + (*it).second->resetParameters(active); + ++it; + } + if (dPsi) + dPsi->resetParameters(active); + for (int i = 0; i < myVars.size(); ++i) + { + int ii = myVars.Index[i]; + if (ii >= 0) + myVars[i] = active[ii]; + } +} + +template +void J2OMPTarget::reportStatus(std::ostream& os) +{ + auto it(J2Unique.begin()), it_end(J2Unique.end()); + while (it != it_end) + { + (*it).second->myVars.print(os); + ++it; + } +} + +template +void J2OMPTarget::evaluateRatios(const VirtualParticleSet& VP, std::vector& ratios) +{ + for (int k = 0; k < ratios.size(); ++k) + ratios[k] = std::exp(Uat[VP.refPtcl] - computeU(VP.refPS, VP.refPtcl, VP.getDistTable(my_table_ID_).getDistRow(k))); +} + +template +void J2OMPTarget::registerData(ParticleSet& P, WFBufferType& buf) +{ + if (Bytes_in_WFBuffer == 0) + { + Bytes_in_WFBuffer = buf.current(); + buf.add(Uat.begin(), Uat.end()); + buf.add(dUat.data(), dUat.end()); + 
buf.add(d2Uat.begin(), d2Uat.end()); + Bytes_in_WFBuffer = buf.current() - Bytes_in_WFBuffer; + // free local space + Uat.free(); + dUat.free(); + d2Uat.free(); + } + else + { + buf.forward(Bytes_in_WFBuffer); + } +} + +template +void J2OMPTarget::copyFromBuffer(ParticleSet& P, WFBufferType& buf) +{ + Uat.attachReference(buf.lendReference(N), N); + dUat.attachReference(N, N_padded, buf.lendReference(N_padded * OHMMS_DIM)); + d2Uat.attachReference(buf.lendReference(N), N); +} + +template +typename J2OMPTarget::LogValueType J2OMPTarget::updateBuffer(ParticleSet& P, + WFBufferType& buf, + bool fromscratch) +{ + evaluateGL(P, P.G, P.L, false); + buf.forward(Bytes_in_WFBuffer); + return LogValue; +} + +template +typename J2OMPTarget::valT J2OMPTarget::computeU(const ParticleSet& P, int iat, const DistRow& dist) +{ + valT curUat(0); + const int igt = P.GroupID[iat] * NumGroups; + for (int jg = 0; jg < NumGroups; ++jg) + { + const FuncType& f2(*F[igt + jg]); + int iStart = P.first(jg); + int iEnd = P.last(jg); + curUat += f2.evaluateV(iat, iStart, iEnd, dist.data(), DistCompressed.data()); + } + return curUat; +} + +template +typename J2OMPTarget::posT J2OMPTarget::accumulateG(const valT* restrict du, const DisplRow& displ) const +{ + posT grad; + for (int idim = 0; idim < OHMMS_DIM; ++idim) + { + const valT* restrict dX = displ.data(idim); + valT s = valT(); + +#pragma omp simd reduction(+ : s) aligned(du, dX : QMC_SIMD_ALIGNMENT) + for (int jat = 0; jat < N; ++jat) + s += du[jat] * dX[jat]; + grad[idim] = s; + } + return grad; +} + +template +J2OMPTarget::J2OMPTarget(const std::string& obj_name, ParticleSet& p) + : WaveFunctionComponent("J2OMPTarget", obj_name), my_table_ID_(p.addTable(p, DTModes::NEED_TEMP_DATA_ON_HOST)), j2_ke_corr_helper(p, F) +{ + if (myName.empty()) + throw std::runtime_error("J2OMPTarget object name cannot be empty!"); + init(p); + KEcorr = 0.0; +} + +template +J2OMPTarget::~J2OMPTarget() = default; + +template +void J2OMPTarget::init(ParticleSet& 
p) +{ + N = p.getTotalNum(); + N_padded = getAlignedSize(N); + NumGroups = p.groups(); + + Uat.resize(N); + dUat.resize(N); + d2Uat.resize(N); + cur_u.resize(N); + cur_du.resize(N); + cur_d2u.resize(N); + old_u.resize(N); + old_du.resize(N); + old_d2u.resize(N); + F.resize(NumGroups * NumGroups, nullptr); + DistCompressed.resize(N); + DistIndice.resize(N); +} + +template +void J2OMPTarget::addFunc(int ia, int ib, std::unique_ptr j) +{ + assert(ia < NumGroups); + assert(ib < NumGroups); + if (ia == ib) + { + if (ia == 0) //first time, assign everything + { + int ij = 0; + for (int ig = 0; ig < NumGroups; ++ig) + for (int jg = 0; jg < NumGroups; ++jg, ++ij) + if (F[ij] == nullptr) + F[ij] = j.get(); + } + else + F[ia * NumGroups + ib] = j.get(); + } + else + { + // a very special case, 1 particle of each type (e.g. 1 up + 1 down) + // uu/dd/etc. was prevented by the builder + if (N == NumGroups) + for (int ig = 0; ig < NumGroups; ++ig) + F[ig * NumGroups + ig] = j.get(); + // generic case + F[ia * NumGroups + ib] = j.get(); + F[ib * NumGroups + ia] = j.get(); + } + std::stringstream aname; + aname << ia << ib; + J2Unique[aname.str()] = std::move(j); +} + +template +std::unique_ptr J2OMPTarget::makeClone(ParticleSet& tqp) const +{ + auto j2copy = std::make_unique>(myName, tqp); + if (dPsi) + j2copy->dPsi = dPsi->makeClone(tqp); + std::map fcmap; + for (int ig = 0; ig < NumGroups; ++ig) + for (int jg = ig; jg < NumGroups; ++jg) + { + int ij = ig * NumGroups + jg; + if (F[ij] == 0) + continue; + typename std::map::iterator fit = fcmap.find(F[ij]); + if (fit == fcmap.end()) + { + auto fc = std::make_unique(*F[ij]); + fcmap[F[ij]] = fc.get(); + j2copy->addFunc(ig, jg, std::move(fc)); + } + } + j2copy->KEcorr = KEcorr; + j2copy->Optimizable = Optimizable; + return j2copy; +} + +/** intenal function to compute \f$\sum_j u(r_j), du/dr, d2u/dr2\f$ + * @param P particleset + * @param iat particle index + * @param dist starting distance + * @param u starting value + * @param du 
starting first deriv + * @param d2u starting second deriv + */ +template +void J2OMPTarget::computeU3(const ParticleSet& P, + int iat, + const DistRow& dist, + RealType* restrict u, + RealType* restrict du, + RealType* restrict d2u, + bool triangle) +{ + const int jelmax = triangle ? iat : N; + constexpr valT czero(0); + std::fill_n(u, jelmax, czero); + std::fill_n(du, jelmax, czero); + std::fill_n(d2u, jelmax, czero); + + const int igt = P.GroupID[iat] * NumGroups; + for (int jg = 0; jg < NumGroups; ++jg) + { + const FuncType& f2(*F[igt + jg]); + int iStart = P.first(jg); + int iEnd = std::min(jelmax, P.last(jg)); + f2.evaluateVGL(iat, iStart, iEnd, dist.data(), u, du, d2u, DistCompressed.data(), DistIndice.data()); + } + //u[iat]=czero; + //du[iat]=czero; + //d2u[iat]=czero; +} + +template +typename J2OMPTarget::PsiValueType J2OMPTarget::ratio(ParticleSet& P, int iat) +{ + //only ratio, ready to compute it again + UpdateMode = ORB_PBYP_RATIO; + cur_Uat = computeU(P, iat, P.getDistTable(my_table_ID_).getTempDists()); + return std::exp(static_cast(Uat[iat] - cur_Uat)); +} + +template +void J2OMPTarget::evaluateRatiosAlltoOne(ParticleSet& P, std::vector& ratios) +{ + const auto& d_table = P.getDistTable(my_table_ID_); + const auto& dist = d_table.getTempDists(); + + for (int ig = 0; ig < NumGroups; ++ig) + { + const int igt = ig * NumGroups; + valT sumU(0); + for (int jg = 0; jg < NumGroups; ++jg) + { + const FuncType& f2(*F[igt + jg]); + int iStart = P.first(jg); + int iEnd = P.last(jg); + sumU += f2.evaluateV(-1, iStart, iEnd, dist.data(), DistCompressed.data()); + } + + for (int i = P.first(ig); i < P.last(ig); ++i) + { + // remove self-interaction + const valT Uself = F[igt + ig]->evaluate(dist[i]); + ratios[i] = std::exp(Uat[i] + Uself - sumU); + } + } +} + +template +typename J2OMPTarget::GradType J2OMPTarget::evalGrad(ParticleSet& P, int iat) +{ + return GradType(dUat[iat]); +} + +template +typename J2OMPTarget::PsiValueType 
J2OMPTarget::ratioGrad(ParticleSet& P, int iat, GradType& grad_iat) +{ + UpdateMode = ORB_PBYP_PARTIAL; + + computeU3(P, iat, P.getDistTable(my_table_ID_).getTempDists(), cur_u.data(), cur_du.data(), cur_d2u.data()); + cur_Uat = simd::accumulate_n(cur_u.data(), N, valT()); + DiffVal = Uat[iat] - cur_Uat; + grad_iat += accumulateG(cur_du.data(), P.getDistTable(my_table_ID_).getTempDispls()); + return std::exp(static_cast(DiffVal)); +} + +template +void J2OMPTarget::acceptMove(ParticleSet& P, int iat, bool safe_to_delay) +{ + // get the old u, du, d2u + const auto& d_table = P.getDistTable(my_table_ID_); + computeU3(P, iat, d_table.getOldDists(), old_u.data(), old_du.data(), old_d2u.data()); + if (UpdateMode == ORB_PBYP_RATIO) + { //ratio-only during the move; need to compute derivatives + const auto& dist = d_table.getTempDists(); + computeU3(P, iat, dist, cur_u.data(), cur_du.data(), cur_d2u.data()); + } + + valT cur_d2Uat(0); + const auto& new_dr = d_table.getTempDispls(); + const auto& old_dr = d_table.getOldDispls(); + constexpr valT lapfac = OHMMS_DIM - RealType(1); +#pragma omp simd reduction(+ : cur_d2Uat) + for (int jat = 0; jat < N; jat++) + { + const valT du = cur_u[jat] - old_u[jat]; + const valT newl = cur_d2u[jat] + lapfac * cur_du[jat]; + const valT dl = old_d2u[jat] + lapfac * old_du[jat] - newl; + Uat[jat] += du; + d2Uat[jat] += dl; + cur_d2Uat -= newl; + } + posT cur_dUat; + for (int idim = 0; idim < OHMMS_DIM; ++idim) + { + const valT* restrict new_dX = new_dr.data(idim); + const valT* restrict old_dX = old_dr.data(idim); + const valT* restrict cur_du_pt = cur_du.data(); + const valT* restrict old_du_pt = old_du.data(); + valT* restrict save_g = dUat.data(idim); + valT cur_g = cur_dUat[idim]; +#pragma omp simd reduction(+ : cur_g) aligned(old_dX, new_dX, save_g, cur_du_pt, old_du_pt : QMC_SIMD_ALIGNMENT) + for (int jat = 0; jat < N; jat++) + { + const valT newg = cur_du_pt[jat] * new_dX[jat]; + const valT dg = newg - old_du_pt[jat] * old_dX[jat]; + 
save_g[jat] -= dg; + cur_g += newg; + } + cur_dUat[idim] = cur_g; + } + LogValue += Uat[iat] - cur_Uat; + Uat[iat] = cur_Uat; + dUat(iat) = cur_dUat; + d2Uat[iat] = cur_d2Uat; +} + +template +void J2OMPTarget::recompute(const ParticleSet& P) +{ + const auto& d_table = P.getDistTable(my_table_ID_); + for (int ig = 0; ig < NumGroups; ++ig) + { + for (int iat = P.first(ig), last = P.last(ig); iat < last; ++iat) + { + computeU3(P, iat, d_table.getDistRow(iat), cur_u.data(), cur_du.data(), cur_d2u.data(), true); + Uat[iat] = simd::accumulate_n(cur_u.data(), iat, valT()); + posT grad; + valT lap(0); + const valT* restrict u = cur_u.data(); + const valT* restrict du = cur_du.data(); + const valT* restrict d2u = cur_d2u.data(); + const auto& displ = d_table.getDisplRow(iat); + constexpr valT lapfac = OHMMS_DIM - RealType(1); +#pragma omp simd reduction(+ : lap) aligned(du, d2u : QMC_SIMD_ALIGNMENT) + for (int jat = 0; jat < iat; ++jat) + lap += d2u[jat] + lapfac * du[jat]; + for (int idim = 0; idim < OHMMS_DIM; ++idim) + { + const valT* restrict dX = displ.data(idim); + valT s = valT(); +#pragma omp simd reduction(+ : s) aligned(du, dX : QMC_SIMD_ALIGNMENT) + for (int jat = 0; jat < iat; ++jat) + s += du[jat] * dX[jat]; + grad[idim] = s; + } + dUat(iat) = grad; + d2Uat[iat] = -lap; +// add the contribution from the upper triangle +#pragma omp simd aligned(u, du, d2u : QMC_SIMD_ALIGNMENT) + for (int jat = 0; jat < iat; jat++) + { + Uat[jat] += u[jat]; + d2Uat[jat] -= d2u[jat] + lapfac * du[jat]; + } + for (int idim = 0; idim < OHMMS_DIM; ++idim) + { + valT* restrict save_g = dUat.data(idim); + const valT* restrict dX = displ.data(idim); +#pragma omp simd aligned(save_g, du, dX : QMC_SIMD_ALIGNMENT) + for (int jat = 0; jat < iat; jat++) + save_g[jat] -= du[jat] * dX[jat]; + } + } + } +} + +template +typename J2OMPTarget::LogValueType J2OMPTarget::evaluateLog(const ParticleSet& P, + ParticleSet::ParticleGradient_t& G, + ParticleSet::ParticleLaplacian_t& L) +{ + return 
evaluateGL(P, G, L, true); +} + +template +WaveFunctionComponent::LogValueType J2OMPTarget::evaluateGL(const ParticleSet& P, + ParticleSet::ParticleGradient_t& G, + ParticleSet::ParticleLaplacian_t& L, + bool fromscratch) +{ + if (fromscratch) + recompute(P); + LogValue = valT(0); + for (int iat = 0; iat < N; ++iat) + { + LogValue += Uat[iat]; + G[iat] += dUat[iat]; + L[iat] += d2Uat[iat]; + } + + return LogValue = -LogValue * 0.5; +} + +template +void J2OMPTarget::evaluateHessian(ParticleSet& P, HessVector_t& grad_grad_psi) +{ + LogValue = 0.0; + const DistanceTableData& d_ee(P.getDistTable(my_table_ID_)); + valT dudr, d2udr2; + + Tensor ident; + grad_grad_psi = 0.0; + ident.diagonal(1.0); + + for (int i = 1; i < N; ++i) + { + const auto& dist = d_ee.getDistRow(i); + const auto& displ = d_ee.getDisplRow(i); + auto ig = P.GroupID[i]; + const int igt = ig * NumGroups; + for (int j = 0; j < i; ++j) + { + auto r = dist[j]; + auto rinv = 1.0 / r; + auto dr = displ[j]; + auto jg = P.GroupID[j]; + auto uij = F[igt + jg]->evaluate(r, dudr, d2udr2); + LogValue -= uij; + auto hess = rinv * rinv * outerProduct(dr, dr) * (d2udr2 - dudr * rinv) + ident * dudr * rinv; + grad_grad_psi[i] -= hess; + grad_grad_psi[j] -= hess; + } + } +} + +template class J2OMPTarget>; +template class J2OMPTarget>; +template class J2OMPTarget>; + +} // namespace qmcplusplus diff --git a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.h b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.h new file mode 100644 index 0000000000..057a5890a4 --- /dev/null +++ b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.h @@ -0,0 +1,179 @@ +////////////////////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source License. +// See LICENSE file in top directory for details. +// +// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. +// +// File developed by: Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp. 
+// Amrita Mathuriya, amrita.mathuriya@intel.com, Intel Corp. +// Ye Luo, yeluo@anl.gov, Argonne National Laboratory +// +// File created by: Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp. +////////////////////////////////////////////////////////////////////////////////////// +// -*- C++ -*- +#ifndef QMCPLUSPLUS_TWOBODYJASTROW_OMPTARGET_H +#define QMCPLUSPLUS_TWOBODYJASTROW_OMPTARGET_H + +#include +#include +#include "Configuration.h" +#if !defined(QMC_BUILD_SANDBOX_ONLY) +#include "QMCWaveFunctions/WaveFunctionComponent.h" +#include "QMCWaveFunctions/Jastrow/DiffTwoBodyJastrowOrbital.h" +#endif +#include "Particle/DistanceTableData.h" +#include "LongRange/StructFact.h" +#include "CPU/SIMD/aligned_allocator.hpp" +#include "J2KECorrection.h" + +namespace qmcplusplus +{ +/** @ingroup WaveFunctionComponent + * @brief Specialization for two-body Jastrow function using multiple functors + * + * Each pair-type can have distinct function \f$u(r_{ij})\f$. + * For electrons, distinct pair correlation functions are used + * for spins up-up/down-down and up-down/down-up. + * + * Based on J2OMPTarget.h with these considerations + * - DistanceTableData using SoA containers + * - support mixed precision: FT::real_type != OHMMS_PRECISION + * - loops over the groups: eliminated PairID + * - support simd function + * - double the loop counts + * - Memory use is O(N). 
+ */ +template +class J2OMPTarget : public WaveFunctionComponent +{ +public: + ///alias FuncType + using FuncType = FT; + ///type of each component U, dU, d2U; + using valT = typename FT::real_type; + ///element position type + using posT = TinyVector; + ///use the same container + using DistRow = DistanceTableData::DistRow; + using DisplRow = DistanceTableData::DisplRow; + using gContainer_type = VectorSoaContainer; + +protected: + ///number of particles + size_t N; + ///number of particles + padded + size_t N_padded; + ///number of groups of the target particleset + size_t NumGroups; + ///diff value + RealType DiffVal; + ///Correction + RealType KEcorr; + ///\f$Uat[i] = sum_(j) u_{i,j}\f$ + Vector Uat; + ///\f$dUat[i] = sum_(j) du_{i,j}\f$ + gContainer_type dUat; + ///\f$d2Uat[i] = sum_(j) d2u_{i,j}\f$ + Vector d2Uat; + valT cur_Uat; + aligned_vector cur_u, cur_du, cur_d2u; + aligned_vector old_u, old_du, old_d2u; + aligned_vector DistCompressed; + aligned_vector DistIndice; + ///Unique J2 set for cleanup + std::map> J2Unique; + ///Container for \f$F[ig*NumGroups+jg]\f$. treat every pointer as a reference. 
+ std::vector F; + /// e-e table ID + const int my_table_ID_; + // helper for compute J2 Chiesa KE correction + J2KECorrection j2_ke_corr_helper; + +public: + J2OMPTarget(const std::string& obj_name, ParticleSet& p); + J2OMPTarget(const J2OMPTarget& rhs) = delete; + ~J2OMPTarget() override; + + /* initialize storage */ + void init(ParticleSet& p); + + /** add functor for (ia,ib) pair */ + void addFunc(int ia, int ib, std::unique_ptr j); + + /** check in an optimizable parameter + * @param o a super set of optimizable variables + */ + void checkInVariables(opt_variables_type& active) override; + + /** check out optimizable variables + */ + void checkOutVariables(const opt_variables_type& active) override; + + ///reset the value of all the unique Two-Body Jastrow functions + void resetParameters(const opt_variables_type& active) override; + + inline void finalizeOptimization() override { KEcorr = j2_ke_corr_helper.computeKEcorr(); } + + /** print the state, e.g., optimizables */ + void reportStatus(std::ostream& os) override; + + std::unique_ptr makeClone(ParticleSet& tqp) const override; + + LogValueType evaluateLog(const ParticleSet& P, + ParticleSet::ParticleGradient_t& G, + ParticleSet::ParticleLaplacian_t& L) override; + + void evaluateHessian(ParticleSet& P, HessVector_t& grad_grad_psi) override; + + /** recompute internal data assuming distance table is fully ready */ + void recompute(const ParticleSet& P) override; + + PsiValueType ratio(ParticleSet& P, int iat) override; + void evaluateRatios(const VirtualParticleSet& VP, std::vector& ratios) override; + void evaluateRatiosAlltoOne(ParticleSet& P, std::vector& ratios) override; + + GradType evalGrad(ParticleSet& P, int iat) override; + + PsiValueType ratioGrad(ParticleSet& P, int iat, GradType& grad_iat) override; + + void acceptMove(ParticleSet& P, int iat, bool safe_to_delay = false) override; + inline void restore(int iat) override {} + + /** compute G and L after the sweep + */ + LogValueType 
evaluateGL(const ParticleSet& P, + ParticleSet::ParticleGradient_t& G, + ParticleSet::ParticleLaplacian_t& L, + bool fromscratch = false) override; + + void registerData(ParticleSet& P, WFBufferType& buf) override; + + void copyFromBuffer(ParticleSet& P, WFBufferType& buf) override; + + LogValueType updateBuffer(ParticleSet& P, WFBufferType& buf, bool fromscratch = false) override; + + /*@{ internal compute engines*/ + valT computeU(const ParticleSet& P, int iat, const DistRow& dist); + + void computeU3(const ParticleSet& P, + int iat, + const DistRow& dist, + RealType* restrict u, + RealType* restrict du, + RealType* restrict d2u, + bool triangle = false); + + /** compute gradient + */ + posT accumulateG(const valT* restrict du, const DisplRow& displ) const; + /**@} */ + + inline RealType ChiesaKEcorrection() { return KEcorr = j2_ke_corr_helper.computeKEcorr(); } + + inline RealType KECorrection() override { return KEcorr; } + + const std::vector& getPairFunctions() const { return F; } +}; + +} // namespace qmcplusplus +#endif diff --git a/src/QMCWaveFunctions/Jastrow/RadialJastrowBuilder.cpp b/src/QMCWaveFunctions/Jastrow/RadialJastrowBuilder.cpp index 6887b5d651..770a12c9be 100644 --- a/src/QMCWaveFunctions/Jastrow/RadialJastrowBuilder.cpp +++ b/src/QMCWaveFunctions/Jastrow/RadialJastrowBuilder.cpp @@ -15,6 +15,10 @@ #include "QMCWaveFunctions/Jastrow/J1OrbitalSoA.h" #include "QMCWaveFunctions/Jastrow/J2OrbitalSoA.h" +#if defined(ENABLE_OFFLOAD) +#include "QMCWaveFunctions/Jastrow/J2OMPTarget.h" +#endif + #if defined(QMC_CUDA) #include "QMCWaveFunctions/Jastrow/OneBodyJastrowOrbitalBspline.h" #include "QMCWaveFunctions/Jastrow/TwoBodyJastrowOrbitalBspline.h" @@ -37,6 +41,54 @@ namespace qmcplusplus { + +// quick helper class to allow use of RPA +class RPAFunctor +{}; + +// helper class to simplify and localize ugly ifdef stuff for types +template +class JastrowTypeHelper +{ +public: + using J1OrbitalType = J1OrbitalSoA; + using J2OrbitalType = J2OrbitalSoA; + 
using DiffJ2OrbitalType = DiffTwoBodyJastrowOrbital; +}; + +#if defined(QMC_CUDA) +template<> +class JastrowTypeHelper, RadialJastrowBuilder::detail::CUDA_LEGACY> +{ +public: + using RadFuncType = BsplineFunctor; + using J1OrbitalType = OneBodyJastrowOrbitalBspline; + using J2OrbitalType = TwoBodyJastrowOrbitalBspline; + using DiffJ2OrbitalType = DiffTwoBodyJastrowOrbital; +}; +#endif + +template<> +class JastrowTypeHelper, RadialJastrowBuilder::detail::CPU> +{ +public: + using RadFuncType = BsplineFunctor; + using J1OrbitalType = J1OrbitalSoA; + using J2OrbitalType = J2OrbitalSoA; + using DiffJ2OrbitalType = DiffTwoBodyJastrowOrbital; +}; + +#if defined(ENABLE_OFFLOAD) +template<> +class JastrowTypeHelper, RadialJastrowBuilder::detail::OMPTARGET> +{ +public: + using RadFuncType = BsplineFunctor; + using J2OrbitalType = J2OMPTarget; + using DiffJ2OrbitalType = DiffTwoBodyJastrowOrbital; +}; +#endif + RadialJastrowBuilder::RadialJastrowBuilder(Communicate* comm, ParticleSet& target, ParticleSet& source) : WaveFunctionComponentBuilder(comm, target), SourcePtcl(&source) { @@ -77,36 +129,6 @@ void RadialJastrowBuilder::guardAgainstPBC() } } -// quick helper class to allow use of RPA -class RPAFunctor -{}; - -// helper class to simplify and localize ugly ifdef stuff for types -template -class JastrowTypeHelper -{ -public: - using J1OrbitalType = J1OrbitalSoA; - using J2OrbitalType = J2OrbitalSoA; - using DiffJ2OrbitalType = DiffTwoBodyJastrowOrbital; -}; - -template<> -class JastrowTypeHelper> -{ -public: - using RadFuncType = BsplineFunctor; -#if defined(QMC_CUDA) - using J1OrbitalType = OneBodyJastrowOrbitalBspline; - using J2OrbitalType = TwoBodyJastrowOrbitalBspline; -#endif -#if !defined(QMC_CUDA) - using J1OrbitalType = J1OrbitalSoA; - using J2OrbitalType = J2OrbitalSoA; -#endif - using DiffJ2OrbitalType = DiffTwoBodyJastrowOrbital; -}; - template void RadialJastrowBuilder::initTwoBodyFunctor(RadFuncType& functor, double fac) {} @@ -142,13 +164,13 @@ void 
RadialJastrowBuilder::initTwoBodyFunctor(BsplineFunctor& bfunc, d } -template +template std::unique_ptr RadialJastrowBuilder::createJ2(xmlNodePtr cur) { ReportEngine PRE(ClassName, "createJ2(xmlNodePtr)"); using Real = typename RadFuncType::real_type; - using J2OrbitalType = typename JastrowTypeHelper::J2OrbitalType; - using DiffJ2OrbitalType = typename JastrowTypeHelper::DiffJ2OrbitalType; + using J2OrbitalType = typename JastrowTypeHelper::J2OrbitalType; + using DiffJ2OrbitalType = typename JastrowTypeHelper::DiffJ2OrbitalType; XMLAttrString input_name(cur, "name"); std::string j2name = input_name.empty() ? "J2_" + Jastfunction : input_name; @@ -326,12 +348,12 @@ std::unique_ptr RadialJastrowBuilder::createJ2 +template std::unique_ptr RadialJastrowBuilder::createJ1(xmlNodePtr cur) { ReportEngine PRE(ClassName, "createJ1(xmlNodePtr)"); using Real = typename RadFuncType::real_type; - using J1OrbitalType = typename JastrowTypeHelper::J1OrbitalType; + using J1OrbitalType = typename JastrowTypeHelper::J1OrbitalType; XMLAttrString input_name(cur, "name"); std::string jname = input_name.empty() ? 
Jastfunction : input_name; @@ -494,11 +516,15 @@ std::unique_ptr RadialJastrowBuilder::createJ1 RadialJastrowBuilder::buildComponent(xmlNodePtr cur) { ReportEngine PRE(ClassName, "put(xmlNodePtr)"); + std::string useGPU; OhmmsAttributeSet aAttrib; aAttrib.add(NameOpt, "name"); aAttrib.add(TypeOpt, "type"); aAttrib.add(Jastfunction, "function"); aAttrib.add(SpinOpt, "spin"); +#if defined(ENABLE_OFFLOAD) + aAttrib.add(useGPU, "gpu", {"yes", "no"}); +#endif aAttrib.put(cur); tolower(NameOpt); tolower(TypeOpt); @@ -513,7 +539,11 @@ std::unique_ptr RadialJastrowBuilder::buildComponent(xmlN // it's a one body jastrow factor if (Jastfunction == "bspline") { +#if defined(QMC_CUDA) + return createJ1, detail::CUDA_LEGACY>(cur); +#else return createJ1>(cur); +#endif } else if (Jastfunction == "pade") { @@ -551,7 +581,21 @@ std::unique_ptr RadialJastrowBuilder::buildComponent(xmlN // it's a two body jastrow factor if (Jastfunction == "bspline") { - return createJ2>(cur); +#if defined(QMC_CUDA) + return createJ2, detail::CUDA_LEGACY>(cur); +#else +#if defined(ENABLE_OFFLOAD) + if (useGPU == "yes") + { + static_assert(std::is_same, OMPTARGET>::J2OrbitalType, + J2OMPTarget>>::value, "check consistent type"); + app_summary() << " Running on an accelerator via OpenMP offload." 
<< std::endl; + return createJ2, detail::OMPTARGET>(cur); + } + else +#endif + return createJ2>(cur); +#endif } else if (Jastfunction == "pade") { diff --git a/src/QMCWaveFunctions/Jastrow/RadialJastrowBuilder.h b/src/QMCWaveFunctions/Jastrow/RadialJastrowBuilder.h index 56e376ae4a..5f686491cb 100644 --- a/src/QMCWaveFunctions/Jastrow/RadialJastrowBuilder.h +++ b/src/QMCWaveFunctions/Jastrow/RadialJastrowBuilder.h @@ -33,6 +33,14 @@ namespace qmcplusplus class RadialJastrowBuilder : public WaveFunctionComponentBuilder { public: + enum detail + { + CPU, + CUDA_LEGACY, + CUDA, + OMPTARGET + }; + // one body constructor RadialJastrowBuilder(Communicate* comm, ParticleSet& target, ParticleSet& source); // two body constructor @@ -53,10 +61,10 @@ class RadialJastrowBuilder : public WaveFunctionComponentBuilder ParticleSet* SourcePtcl; // has a specialization for RPAFunctor in cpp file - template + template std::unique_ptr createJ1(xmlNodePtr cur); - template + template std::unique_ptr createJ2(xmlNodePtr cur); template diff --git a/src/QMCWaveFunctions/tests/test_bspline_jastrow.cpp b/src/QMCWaveFunctions/tests/test_bspline_jastrow.cpp index 47c1380bbc..0d3e433fa0 100644 --- a/src/QMCWaveFunctions/tests/test_bspline_jastrow.cpp +++ b/src/QMCWaveFunctions/tests/test_bspline_jastrow.cpp @@ -90,7 +90,7 @@ TEST_CASE("BSpline builder Jastrow J2", "[wavefunction]") elec_.resetGroups(); const char* particles = " \ - \ + \ \ 0.02904699284 -0.1004179 -0.1752703883 -0.2232576505 -0.2728029201 -0.3253286875 -0.3624525145 -0.3958223107 -0.4268582166 -0.4394531176 \ \ From 9cbcb1a80bebd704a6af298ce13bdcc2a3d42cd7 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Sat, 17 Jul 2021 20:46:42 -0500 Subject: [PATCH 04/13] Split test_J2_bspline from test_bspline_jastrow. 
--- src/QMCWaveFunctions/tests/CMakeLists.txt | 1 + .../tests/test_J2_bspline.cpp | 234 ++++++++++++++++++ .../tests/test_bspline_jastrow.cpp | 201 +-------------- 3 files changed, 236 insertions(+), 200 deletions(-) create mode 100644 src/QMCWaveFunctions/tests/test_J2_bspline.cpp diff --git a/src/QMCWaveFunctions/tests/CMakeLists.txt b/src/QMCWaveFunctions/tests/CMakeLists.txt index 0521f4dbf3..fa84ccbb91 100644 --- a/src/QMCWaveFunctions/tests/CMakeLists.txt +++ b/src/QMCWaveFunctions/tests/CMakeLists.txt @@ -93,6 +93,7 @@ set(JASTROW_SRC test_pade_jastrow.cpp test_short_range_cusp_jastrow.cpp test_J1OrbitalSoA.cpp + test_J2_bspline.cpp test_DiffTwoBodyJastrowOrbital.cpp) set(DETERMINANT_SRC FakeSPO.cpp diff --git a/src/QMCWaveFunctions/tests/test_J2_bspline.cpp b/src/QMCWaveFunctions/tests/test_J2_bspline.cpp new file mode 100644 index 0000000000..323c3caca4 --- /dev/null +++ b/src/QMCWaveFunctions/tests/test_J2_bspline.cpp @@ -0,0 +1,234 @@ +////////////////////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source License. +// See LICENSE file in top directory for details. +// +// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. 
+// +// File developed by: Mark Dewing, markdewing@gmail.com, University of Illinois at Urbana-Champaign +// +// File created by: Mark Dewing, markdewing@gmail.com, University of Illinois at Urbana-Champaign +////////////////////////////////////////////////////////////////////////////////////// + + +#include "catch.hpp" + +#include "OhmmsData/Libxml2Doc.h" +#include "OhmmsPETE/OhmmsMatrix.h" +#include "Particle/ParticleSet.h" +#include "QMCWaveFunctions/WaveFunctionComponent.h" +#include "QMCWaveFunctions/Jastrow/BsplineFunctor.h" +#include "QMCWaveFunctions/Jastrow/RadialJastrowBuilder.h" +#include "ParticleBase/ParticleAttribOps.h" +#include "QMCWaveFunctions/Jastrow/J2OrbitalSoA.h" + +#include +#include + + +// Uncomment to print information and values from the underlying functor +//#define PRINT_SPLINE_DATA + +using std::string; + +namespace qmcplusplus +{ +using RealType = WaveFunctionComponent::RealType; +using PsiValueType = WaveFunctionComponent::PsiValueType; + +TEST_CASE("BSpline builder Jastrow J2", "[wavefunction]") +{ + Communicate* c; + c = OHMMS::Controller; + + ParticleSet ions_; + ParticleSet elec_; + + ions_.setName("ion"); + ions_.create(1); + ions_.R[0][0] = 2.0; + ions_.R[0][1] = 0.0; + ions_.R[0][2] = 0.0; + + elec_.setName("elec"); + std::vector ud(2); + ud[0] = ud[1] = 1; + elec_.create(ud); + elec_.R[0][0] = 1.00; + elec_.R[0][1] = 0.0; + elec_.R[0][2] = 0.0; + elec_.R[1][0] = 0.0; + elec_.R[1][1] = 0.0; + elec_.R[1][2] = 0.0; + + SpeciesSet& tspecies = elec_.getSpeciesSet(); + int upIdx = tspecies.addSpecies("u"); + int downIdx = tspecies.addSpecies("d"); + int chargeIdx = tspecies.addAttribute("charge"); + tspecies(chargeIdx, upIdx) = -1; + tspecies(chargeIdx, downIdx) = -1; + elec_.resetGroups(); + + const char* particles = " \ + \ + \ + 0.02904699284 -0.1004179 -0.1752703883 -0.2232576505 -0.2728029201 -0.3253286875 -0.3624525145 -0.3958223107 -0.4268582166 -0.4394531176 \ + \ + \ + \ +"; + Libxml2Document doc; + bool okay = 
doc.parseFromString(particles); + REQUIRE(okay); + + xmlNodePtr root = doc.getRoot(); + + xmlNodePtr jas1 = xmlFirstElementChild(root); + + RadialJastrowBuilder jastrow(c, elec_); + + typedef J2OrbitalSoA> J2Type; + auto j2_uptr = jastrow.buildComponent(jas1); + J2Type* j2 = dynamic_cast(j2_uptr.get()); + REQUIRE(j2); + + // update all distance tables + elec_.update(); + + double logpsi_real = std::real(j2->evaluateLog(elec_, elec_.G, elec_.L)); + REQUIRE(logpsi_real == Approx(0.1012632641)); // note: number not validated + + double KE = -0.5 * (Dot(elec_.G, elec_.G) + Sum(elec_.L)); + REQUIRE(KE == Approx(-0.1616624771)); // note: number not validated + + + // now test evaluateHessian + WaveFunctionComponent::HessVector_t grad_grad_psi; + grad_grad_psi.resize(elec_.getTotalNum()); + grad_grad_psi = 0.0; + + std::cout << "eval hess" << std::endl; + j2->evaluateHessian(elec_, grad_grad_psi); + std::vector hess_values = { + -0.0627236, 0, 0, 0, 0.10652, 0, 0, 0, 0.10652, -0.0627236, 0, 0, 0, 0.10652, 0, 0, 0, 0.10652, + }; + + int m = 0; + for (int n = 0; n < elec_.getTotalNum(); n++) + for (int i = 0; i < OHMMS_DIM; i++) + for (int j = 0; j < OHMMS_DIM; j++, m++) + { + REQUIRE(std::real(grad_grad_psi[n](i, j)) == Approx(hess_values[m])); + } + + + struct JValues + { + double r; + double u; + double du; + double ddu; + }; + + // Cut and paste from output of gen_bspline_jastrow.py + const int N = 20; + JValues Vals[N] = {{0.00, 0.1374071801, -0.5, 0.7866949593}, + {0.60, -0.04952403966, -0.1706645865, 0.3110897524}, + {1.20, -0.121361995, -0.09471371432, 0.055337302}, + {1.80, -0.1695590431, -0.06815900213, 0.0331784053}, + {2.40, -0.2058414025, -0.05505192964, 0.01049597156}, + {3.00, -0.2382237097, -0.05422744821, -0.002401552969}, + {3.60, -0.2712606182, -0.05600918024, -0.003537553803}, + {4.20, -0.3047843679, -0.05428535477, 0.0101841028}, + {4.80, -0.3347515004, -0.04506573714, 0.01469003611}, + {5.40, -0.3597048574, -0.03904232165, 0.005388015505}, + {6.00, 
-0.3823503292, -0.03657502025, 0.003511355265}, + {6.60, -0.4036800017, -0.03415678101, 0.007891305516}, + {7.20, -0.4219818468, -0.02556305518, 0.02075444724}, + {7.80, -0.4192355508, 0.06799438701, 0.3266190181}, + {8.40, -0.3019238309, 0.32586994, 0.2880861726}, + {9.00, -0.09726352421, 0.2851358014, -0.4238666348}, + {9.60, -0.006239062395, 0.04679296796, -0.2339648398}, + {10.20, 0, 0, 0}, + {10.80, 0, 0, 0}, + {11.40, 0, 0, 0}}; + + + BsplineFunctor* bf = j2->getPairFunctions()[0]; + + for (int i = 0; i < N; i++) + { + RealType dv = 0.0; + RealType ddv = 0.0; + RealType val = bf->evaluate(Vals[i].r, dv, ddv); + REQUIRE(Vals[i].u == Approx(val)); + REQUIRE(Vals[i].du == Approx(dv)); + REQUIRE(Vals[i].ddu == Approx(ddv)); + } + +#ifdef PRINT_SPLINE_DATA + // write out values of the Bspline functor + //BsplineFunctor *bf = j2->F[0]; + printf("NumParams = %d\n", bf->NumParams); + printf("CuspValue = %g\n", bf->CuspValue); + printf("DeltaR = %g\n", bf->DeltaR); + printf("SplineCoeffs size = %d\n", bf->SplineCoefs.size()); + for (int j = 0; j < bf->SplineCoefs.size(); j++) + { + printf("%d %g\n", j, bf->SplineCoefs[j]); + } + printf("\n"); + + for (int i = 0; i < 20; i++) + { + double r = 0.6 * i; + elec_.R[0][0] = r; + elec_.update(); + double logpsi_real = std::real(j2->evaluateLog(elec_, elec_.G, elec_.L)); + //double alt_val = bf->evaluate(r); + double dv = 0.0; + double ddv = 0.0; + double alt_val = bf->evaluate(r, dv, ddv); + printf("%g %g %g %g %g\n", r, logpsi_real, alt_val, dv, ddv); + } +#endif + + typedef QMCTraits::ValueType ValueType; + typedef QMCTraits::PosType PosType; + + // set virtutal particle position + PosType newpos(0.3, 0.2, 0.5); + + elec_.makeVirtualMoves(newpos); + std::vector ratios(elec_.getTotalNum()); + j2->evaluateRatiosAlltoOne(elec_, ratios); + + REQUIRE(std::real(ratios[0]) == Approx(0.9522052017)); + REQUIRE(std::real(ratios[1]) == Approx(0.9871985577)); + + elec_.makeMove(0, newpos - elec_.R[0]); + PsiValueType ratio_0 = 
j2->ratio(elec_, 0); + elec_.rejectMove(0); + + REQUIRE(std::real(ratio_0) == Approx(0.9522052017)); + + VirtualParticleSet VP(elec_, 2); + std::vector newpos2(2); + std::vector ratios2(2); + newpos2[0] = newpos - elec_.R[1]; + newpos2[1] = PosType(0.2, 0.5, 0.3) - elec_.R[1]; + VP.makeMoves(1, elec_.R[1], newpos2); + j2->evaluateRatios(VP, ratios2); + + REQUIRE(std::real(ratios2[0]) == Approx(0.9871985577)); + REQUIRE(std::real(ratios2[1]) == Approx(0.9989268241)); + + //test acceptMove + elec_.makeMove(1, newpos - elec_.R[1]); + PsiValueType ratio_1 = j2->ratio(elec_, 1); + j2->acceptMove(elec_, 1); + elec_.acceptMove(1); + + REQUIRE(std::real(ratio_1) == Approx(0.9871985577)); + REQUIRE(std::real(j2->LogValue) == Approx(0.0883791773)); +} +} // namespace qmcplusplus diff --git a/src/QMCWaveFunctions/tests/test_bspline_jastrow.cpp b/src/QMCWaveFunctions/tests/test_bspline_jastrow.cpp index 0d3e433fa0..9e140a0508 100644 --- a/src/QMCWaveFunctions/tests/test_bspline_jastrow.cpp +++ b/src/QMCWaveFunctions/tests/test_bspline_jastrow.cpp @@ -13,16 +13,14 @@ #include "catch.hpp" #include "OhmmsData/Libxml2Doc.h" -#include "OhmmsPETE/OhmmsMatrix.h" #include "Particle/ParticleSet.h" #include "QMCWaveFunctions/WaveFunctionComponent.h" #include "QMCWaveFunctions/Jastrow/BsplineFunctor.h" #include "QMCWaveFunctions/Jastrow/RadialJastrowBuilder.h" #include "ParticleBase/ParticleAttribOps.h" -#include "QMCWaveFunctions/Jastrow/J2OrbitalSoA.h" #include "QMCWaveFunctions/Jastrow/J1OrbitalSoA.h" -#include +#include #include @@ -56,203 +54,6 @@ TEST_CASE("BSpline functor one", "[wavefunction]") REQUIRE(u == 0.0); } -TEST_CASE("BSpline builder Jastrow J2", "[wavefunction]") -{ - Communicate* c; - c = OHMMS::Controller; - - ParticleSet ions_; - ParticleSet elec_; - - ions_.setName("ion"); - ions_.create(1); - ions_.R[0][0] = 2.0; - ions_.R[0][1] = 0.0; - ions_.R[0][2] = 0.0; - - elec_.setName("elec"); - std::vector ud(2); - ud[0] = ud[1] = 1; - elec_.create(ud); - elec_.R[0][0] = 
1.00; - elec_.R[0][1] = 0.0; - elec_.R[0][2] = 0.0; - elec_.R[1][0] = 0.0; - elec_.R[1][1] = 0.0; - elec_.R[1][2] = 0.0; - - SpeciesSet& tspecies = elec_.getSpeciesSet(); - int upIdx = tspecies.addSpecies("u"); - int downIdx = tspecies.addSpecies("d"); - int chargeIdx = tspecies.addAttribute("charge"); - tspecies(chargeIdx, upIdx) = -1; - tspecies(chargeIdx, downIdx) = -1; - elec_.resetGroups(); - - const char* particles = " \ - \ - \ - 0.02904699284 -0.1004179 -0.1752703883 -0.2232576505 -0.2728029201 -0.3253286875 -0.3624525145 -0.3958223107 -0.4268582166 -0.4394531176 \ - \ - \ - \ -"; - Libxml2Document doc; - bool okay = doc.parseFromString(particles); - REQUIRE(okay); - - xmlNodePtr root = doc.getRoot(); - - xmlNodePtr jas1 = xmlFirstElementChild(root); - - RadialJastrowBuilder jastrow(c, elec_); - - typedef J2OrbitalSoA> J2Type; - auto j2_uptr = jastrow.buildComponent(jas1); - J2Type* j2 = dynamic_cast(j2_uptr.get()); - REQUIRE(j2); - - // update all distance tables - elec_.update(); - - double logpsi_real = std::real(j2->evaluateLog(elec_, elec_.G, elec_.L)); - REQUIRE(logpsi_real == Approx(0.1012632641)); // note: number not validated - - double KE = -0.5 * (Dot(elec_.G, elec_.G) + Sum(elec_.L)); - REQUIRE(KE == Approx(-0.1616624771)); // note: number not validated - - - // now test evaluateHessian - WaveFunctionComponent::HessVector_t grad_grad_psi; - grad_grad_psi.resize(elec_.getTotalNum()); - grad_grad_psi = 0.0; - - std::cout << "eval hess" << std::endl; - j2->evaluateHessian(elec_, grad_grad_psi); - std::vector hess_values = { - -0.0627236, 0, 0, 0, 0.10652, 0, 0, 0, 0.10652, -0.0627236, 0, 0, 0, 0.10652, 0, 0, 0, 0.10652, - }; - - int m = 0; - for (int n = 0; n < elec_.getTotalNum(); n++) - for (int i = 0; i < OHMMS_DIM; i++) - for (int j = 0; j < OHMMS_DIM; j++, m++) - { - REQUIRE(std::real(grad_grad_psi[n](i, j)) == Approx(hess_values[m])); - } - - - struct JValues - { - double r; - double u; - double du; - double ddu; - }; - - // Cut and paste 
from output of gen_bspline_jastrow.py - const int N = 20; - JValues Vals[N] = {{0.00, 0.1374071801, -0.5, 0.7866949593}, - {0.60, -0.04952403966, -0.1706645865, 0.3110897524}, - {1.20, -0.121361995, -0.09471371432, 0.055337302}, - {1.80, -0.1695590431, -0.06815900213, 0.0331784053}, - {2.40, -0.2058414025, -0.05505192964, 0.01049597156}, - {3.00, -0.2382237097, -0.05422744821, -0.002401552969}, - {3.60, -0.2712606182, -0.05600918024, -0.003537553803}, - {4.20, -0.3047843679, -0.05428535477, 0.0101841028}, - {4.80, -0.3347515004, -0.04506573714, 0.01469003611}, - {5.40, -0.3597048574, -0.03904232165, 0.005388015505}, - {6.00, -0.3823503292, -0.03657502025, 0.003511355265}, - {6.60, -0.4036800017, -0.03415678101, 0.007891305516}, - {7.20, -0.4219818468, -0.02556305518, 0.02075444724}, - {7.80, -0.4192355508, 0.06799438701, 0.3266190181}, - {8.40, -0.3019238309, 0.32586994, 0.2880861726}, - {9.00, -0.09726352421, 0.2851358014, -0.4238666348}, - {9.60, -0.006239062395, 0.04679296796, -0.2339648398}, - {10.20, 0, 0, 0}, - {10.80, 0, 0, 0}, - {11.40, 0, 0, 0}}; - - - BsplineFunctor* bf = j2->getPairFunctions()[0]; - - for (int i = 0; i < N; i++) - { - RealType dv = 0.0; - RealType ddv = 0.0; - RealType val = bf->evaluate(Vals[i].r, dv, ddv); - REQUIRE(Vals[i].u == Approx(val)); - REQUIRE(Vals[i].du == Approx(dv)); - REQUIRE(Vals[i].ddu == Approx(ddv)); - } - -#ifdef PRINT_SPLINE_DATA - // write out values of the Bspline functor - //BsplineFunctor *bf = j2->F[0]; - printf("NumParams = %d\n", bf->NumParams); - printf("CuspValue = %g\n", bf->CuspValue); - printf("DeltaR = %g\n", bf->DeltaR); - printf("SplineCoeffs size = %d\n", bf->SplineCoefs.size()); - for (int j = 0; j < bf->SplineCoefs.size(); j++) - { - printf("%d %g\n", j, bf->SplineCoefs[j]); - } - printf("\n"); - - for (int i = 0; i < 20; i++) - { - double r = 0.6 * i; - elec_.R[0][0] = r; - elec_.update(); - double logpsi_real = std::real(j2->evaluateLog(elec_, elec_.G, elec_.L)); - //double alt_val = 
bf->evaluate(r); - double dv = 0.0; - double ddv = 0.0; - double alt_val = bf->evaluate(r, dv, ddv); - printf("%g %g %g %g %g\n", r, logpsi_real, alt_val, dv, ddv); - } -#endif - - typedef QMCTraits::ValueType ValueType; - typedef QMCTraits::PosType PosType; - - // set virtutal particle position - PosType newpos(0.3, 0.2, 0.5); - - elec_.makeVirtualMoves(newpos); - std::vector ratios(elec_.getTotalNum()); - j2->evaluateRatiosAlltoOne(elec_, ratios); - - REQUIRE(std::real(ratios[0]) == Approx(0.9522052017)); - REQUIRE(std::real(ratios[1]) == Approx(0.9871985577)); - - elec_.makeMove(0, newpos - elec_.R[0]); - PsiValueType ratio_0 = j2->ratio(elec_, 0); - elec_.rejectMove(0); - - REQUIRE(std::real(ratio_0) == Approx(0.9522052017)); - - VirtualParticleSet VP(elec_, 2); - std::vector newpos2(2); - std::vector ratios2(2); - newpos2[0] = newpos - elec_.R[1]; - newpos2[1] = PosType(0.2, 0.5, 0.3) - elec_.R[1]; - VP.makeMoves(1, elec_.R[1], newpos2); - j2->evaluateRatios(VP, ratios2); - - REQUIRE(std::real(ratios2[0]) == Approx(0.9871985577)); - REQUIRE(std::real(ratios2[1]) == Approx(0.9989268241)); - - //test acceptMove - elec_.makeMove(1, newpos - elec_.R[1]); - PsiValueType ratio_1 = j2->ratio(elec_, 1); - j2->acceptMove(elec_, 1); - elec_.acceptMove(1); - - REQUIRE(std::real(ratio_1) == Approx(0.9871985577)); - REQUIRE(std::real(j2->LogValue) == Approx(0.0883791773)); -} - TEST_CASE("BSpline builder Jastrow J1", "[wavefunction]") { Communicate* c; From 33717f2eecf851a5a42f05233c311e85f327146d Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Sat, 17 Jul 2021 22:57:58 -0500 Subject: [PATCH 05/13] Use static constexpr in BsplineFunctor --- .../Jastrow/BsplineFunctor.cpp | 72 ----- src/QMCWaveFunctions/Jastrow/BsplineFunctor.h | 248 +++++++++--------- 2 files changed, 126 insertions(+), 194 deletions(-) delete mode 100644 src/QMCWaveFunctions/Jastrow/BsplineFunctor.cpp diff --git a/src/QMCWaveFunctions/Jastrow/BsplineFunctor.cpp 
b/src/QMCWaveFunctions/Jastrow/BsplineFunctor.cpp deleted file mode 100644 index f7752092e2..0000000000 --- a/src/QMCWaveFunctions/Jastrow/BsplineFunctor.cpp +++ /dev/null @@ -1,72 +0,0 @@ -////////////////////////////////////////////////////////////////////////////////////// -// This file is distributed under the University of Illinois/NCSA Open Source License. -// See LICENSE file in top directory for details. -// -// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. -// -// File developed by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign -// Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign -// Jeremy McMinnis, jmcminis@gmail.com, University of Illinois at Urbana-Champaign -// -// File created by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign -////////////////////////////////////////////////////////////////////////////////////// - - -#include "Configuration.h" -#include "BsplineFunctor.h" - -namespace qmcplusplus -{ -template<> -const double BsplineFunctor::A[16] = {-1.0 / 6.0, - 3.0 / 6.0, - -3.0 / 6.0, - 1.0 / 6.0, - 3.0 / 6.0, - -6.0 / 6.0, - 0.0 / 6.0, - 4.0 / 6.0, - -3.0 / 6.0, - 3.0 / 6.0, - 3.0 / 6.0, - 1.0 / 6.0, - 1.0 / 6.0, - 0.0 / 6.0, - 0.0 / 6.0, - 0.0 / 6.0}; - -template<> -const double BsplineFunctor::dA[16] = - {0.0, -0.5, 1.0, -0.5, 0.0, 1.5, -2.0, 0.0, 0.0, -1.5, 1.0, 0.5, 0.0, 0.5, 0.0, 0.0}; - -template<> -const double BsplineFunctor::d2A[16] = - {0.0, 0.0, -1.0, 1.0, 0.0, 0.0, 3.0, -2.0, 0.0, 0.0, -3.0, 1.0, 0.0, 0.0, 1.0, 0.0}; - - -template<> -const double BsplineFunctor::d3A[16] = - {0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, -3.0, 0.0, 0.0, 0.0, 1.0}; - -// template<> -// const float BsplineFunctor::A[16] = -// { -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0, -// 3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0, -// -3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0, -// 1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0 }; - -// template<> -// const float BsplineFunctor::dA[16] = -// { 
0.0, -0.5, 1.0, -0.5, -// 0.0, 1.5, -2.0, 0.0, -// 0.0, -1.5, 1.0, 0.5, -// 0.0, 0.5, 0.0, 0.0 }; - -// template<> -// const float BsplineFunctor::d2A[16] = -// { 0.0, 0.0, -1.0, 1.0, -// 0.0, 0.0, 3.0, -2.0, -// 0.0, 0.0, -3.0, 1.0, -// 0.0, 0.0, 1.0, 0.0 }; - -} // namespace qmcplusplus diff --git a/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h b/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h index 9da51f1a33..6afeb33571 100644 --- a/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h +++ b/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h @@ -20,29 +20,55 @@ #ifndef QMCPLUSPLUS_BSPLINE_FUNCTOR_H #define QMCPLUSPLUS_BSPLINE_FUNCTOR_H + +#include +#include #include "Numerics/OptimizableFunctorBase.h" #include "Utilities/ProgressReportEngine.h" #include "OhmmsData/AttributeSet.h" +#include "OhmmsPETE/OhmmsVector.h" #include "Numerics/LinearFit.h" -#include "CPU/SIMD/aligned_allocator.hpp" -#include +#include "OMPTarget/OMPallocator.hpp" +#include "Platforms/PinnedAllocator.h" namespace qmcplusplus { template struct BsplineFunctor : public OptimizableFunctorBase { - typedef real_type value_type; - int NumParams; - int Dummy; - const real_type A[16], dA[16], d2A[16], d3A[16]; - aligned_vector SplineCoefs; + using value_type = real_type; + template + using OffloadAllocator = OMPallocator>; + template + using OffloadPinnedAllocator = OMPallocator>; + + static constexpr real_type A0 = -1.0 / 6.0, A1 = 3.0 / 6.0, A2 = -3.0 / 6.0, A3 = 1.0 / 6.0; + static constexpr real_type A4 = 3.0 / 6.0, A5 = -6.0 / 6.0, A6 = 0.0 / 6.0, A7 = 4.0 / 6.0; + static constexpr real_type A8 = -3.0 / 6.0, A9 = 3.0 / 6.0, A10 = 3.0 / 6.0, A11 = 1.0 / 6.0; + static constexpr real_type A12 = 1.0 / 6.0, A13 = 0.0 / 6.0, A14 = 0.0 / 6.0, A15 = 0.0 / 6.0; + + static constexpr real_type dA0 = 0.0, dA1 = -0.5, dA2 = 1.0, dA3 = -0.5; + static constexpr real_type dA4 = 0.0, dA5 = 1.5, dA6 = -2.0, dA7 = 0.0; + static constexpr real_type dA8 = 0.0, dA9 = -1.5, dA10 = 1.0, dA11 = 0.5; + static constexpr real_type 
dA12 = 0.0, dA13 = 0.5, dA14 = 0.0, dA15 = 0.0; + + static constexpr real_type d2A0 = 0.0, d2A1 = 0.0, d2A2 = -1.0, d2A3 = 1.0; + static constexpr real_type d2A4 = 0.0, d2A5 = 0.0, d2A6 = 3.0, d2A7 = -2.0; + static constexpr real_type d2A8 = 0.0, d2A9 = 0.0, d2A10 = -3.0, d2A11 = 1.0; + static constexpr real_type d2A12 = 0.0, d2A13 = 0.0, d2A14 = 1.0, d2A15 = 0.0; + + static constexpr real_type d3A0 = 0.0, d3A1 = 0.0, d3A2 = 0.0, d3A3 = -1.0; + static constexpr real_type d3A4 = 0.0, d3A5 = 0.0, d3A6 = 0.0, d3A7 = 3.0; + static constexpr real_type d3A8 = 0.0, d3A9 = 0.0, d3A10 = 0.0, d3A11 = -3.0; + static constexpr real_type d3A12 = 0.0, d3A13 = 0.0, d3A14 = 0.0, d3A15 = 1.0; + + std::shared_ptr>> spline_coefs_; - //static const real_type A[16], dA[16], d2A[16]; + int NumParams; real_type DeltaR, DeltaRInv; real_type CuspValue; real_type Y, dY, d2Y; - // Stores the derivatives w.r.t. SplineCoefs + // Stores the derivatives w.r.t. coefs // of the u, du/dr, and d2u/dr2 std::vector> SplineDerivs; std::vector Parameters; @@ -50,36 +76,11 @@ struct BsplineFunctor : public OptimizableFunctorBase std::string elementType, pairType; std::string fileName; - int ResetCount; bool notOpt; bool periodic; ///constructor - BsplineFunctor(real_type cusp = 0.0) - : NumParams(0), - A{-1.0 / 6.0, - 3.0 / 6.0, - -3.0 / 6.0, - 1.0 / 6.0, - 3.0 / 6.0, - -6.0 / 6.0, - 0.0 / 6.0, - 4.0 / 6.0, - -3.0 / 6.0, - 3.0 / 6.0, - 3.0 / 6.0, - 1.0 / 6.0, - 1.0 / 6.0, - 0.0 / 6.0, - 0.0 / 6.0, - 0.0 / 6.0}, - dA{0.0, -0.5, 1.0, -0.5, 0.0, 1.5, -2.0, 0.0, 0.0, -1.5, 1.0, 0.5, 0.0, 0.5, 0.0, 0.0}, - d2A{0.0, 0.0, -1.0, 1.0, 0.0, 0.0, 3.0, -2.0, 0.0, 0.0, -3.0, 1.0, 0.0, 0.0, 1.0, 0.0}, - d3A{0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, -3.0, 0.0, 0.0, 0.0, 1.0}, - CuspValue(cusp), - ResetCount(0), - notOpt(false), - periodic(true) + BsplineFunctor(real_type cusp = 0.0) : NumParams(0), CuspValue(cusp), notOpt(false), periodic(true) { cutoff_radius = 0.0; } @@ -98,24 +99,28 @@ struct 
BsplineFunctor : public OptimizableFunctorBase DeltaR = cutoff_radius / (real_type)(numKnots - 1); DeltaRInv = 1.0 / DeltaR; Parameters.resize(n); - SplineCoefs.resize(numCoefs); + spline_coefs_ = std::make_shared>>(numCoefs); SplineDerivs.resize(numCoefs); } + /** reset coefs from Parameters + */ void reset() override { - int numCoefs = NumParams + 4; - int numKnots = numCoefs - 2; + const int numCoefs = NumParams + 4; + const int numKnots = numCoefs - 2; DeltaR = cutoff_radius / (real_type)(numKnots - 1); DeltaRInv = 1.0 / DeltaR; - for (int i = 0; i < SplineCoefs.size(); i++) - SplineCoefs[i] = 0.0; + auto& coefs = *spline_coefs_; + for (int i = 0; i < coefs.size(); i++) + coefs[i] = 0.0; // Ensure that cusp conditions is satisfied at the origin - SplineCoefs[1] = Parameters[0]; - SplineCoefs[2] = Parameters[1]; - SplineCoefs[0] = Parameters[1] - 2.0 * DeltaR * CuspValue; + coefs[1] = Parameters[0]; + coefs[2] = Parameters[1]; + coefs[0] = Parameters[1] - 2.0 * DeltaR * CuspValue; for (int i = 2; i < Parameters.size(); i++) - SplineCoefs[i + 1] = Parameters[i]; + coefs[i + 1] = Parameters[i]; + coefs.updateTo(); } /** compute value, gradient and laplacian for [iStart, iEnd) pairs @@ -166,10 +171,11 @@ struct BsplineFunctor : public OptimizableFunctorBase tp[1] = t * t; tp[2] = t; tp[3] = 1.0; - return (SplineCoefs[i + 0] * (A[0] * tp[0] + A[1] * tp[1] + A[2] * tp[2] + A[3] * tp[3]) + - SplineCoefs[i + 1] * (A[4] * tp[0] + A[5] * tp[1] + A[6] * tp[2] + A[7] * tp[3]) + - SplineCoefs[i + 2] * (A[8] * tp[0] + A[9] * tp[1] + A[10] * tp[2] + A[11] * tp[3]) + - SplineCoefs[i + 3] * (A[12] * tp[0] + A[13] * tp[1] + A[14] * tp[2] + A[15] * tp[3])); + auto& coefs = *spline_coefs_; + return (coefs[i + 0] * (A0 * tp[0] + A1 * tp[1] + A2 * tp[2] + A3 * tp[3]) + + coefs[i + 1] * (A4 * tp[0] + A5 * tp[1] + A6 * tp[2] + A7 * tp[3]) + + coefs[i + 2] * (A8 * tp[0] + A9 * tp[1] + A10 * tp[2] + A11 * tp[3]) + + coefs[i + 3] * (A12 * tp[0] + A13 * tp[1] + A14 * tp[2] + A15 * 
tp[3])); } inline real_type evaluate(real_type r, real_type rinv) { return Y = evaluate(r, dY, d2Y); } @@ -194,26 +200,27 @@ struct BsplineFunctor : public OptimizableFunctorBase tp[1] = t * t; tp[2] = t; tp[3] = 1.0; + auto& coefs = *spline_coefs_; d2udr2 = DeltaRInv * DeltaRInv * - (SplineCoefs[i + 0] * (d2A[0] * tp[0] + d2A[1] * tp[1] + d2A[2] * tp[2] + d2A[3] * tp[3]) + - SplineCoefs[i + 1] * (d2A[4] * tp[0] + d2A[5] * tp[1] + d2A[6] * tp[2] + d2A[7] * tp[3]) + - SplineCoefs[i + 2] * (d2A[8] * tp[0] + d2A[9] * tp[1] + d2A[10] * tp[2] + d2A[11] * tp[3]) + - SplineCoefs[i + 3] * (d2A[12] * tp[0] + d2A[13] * tp[1] + d2A[14] * tp[2] + d2A[15] * tp[3])); + (coefs[i + 0] * (d2A0 * tp[0] + d2A1 * tp[1] + d2A2 * tp[2] + d2A3 * tp[3]) + + coefs[i + 1] * (d2A4 * tp[0] + d2A5 * tp[1] + d2A6 * tp[2] + d2A7 * tp[3]) + + coefs[i + 2] * (d2A8 * tp[0] + d2A9 * tp[1] + d2A10 * tp[2] + d2A11 * tp[3]) + + coefs[i + 3] * (d2A12 * tp[0] + d2A13 * tp[1] + d2A14 * tp[2] + d2A15 * tp[3])); dudr = DeltaRInv * - (SplineCoefs[i + 0] * (dA[0] * tp[0] + dA[1] * tp[1] + dA[2] * tp[2] + dA[3] * tp[3]) + - SplineCoefs[i + 1] * (dA[4] * tp[0] + dA[5] * tp[1] + dA[6] * tp[2] + dA[7] * tp[3]) + - SplineCoefs[i + 2] * (dA[8] * tp[0] + dA[9] * tp[1] + dA[10] * tp[2] + dA[11] * tp[3]) + - SplineCoefs[i + 3] * (dA[12] * tp[0] + dA[13] * tp[1] + dA[14] * tp[2] + dA[15] * tp[3])); + (coefs[i + 0] * (dA0 * tp[0] + dA1 * tp[1] + dA2 * tp[2] + dA3 * tp[3]) + + coefs[i + 1] * (dA4 * tp[0] + dA5 * tp[1] + dA6 * tp[2] + dA7 * tp[3]) + + coefs[i + 2] * (dA8 * tp[0] + dA9 * tp[1] + dA10 * tp[2] + dA11 * tp[3]) + + coefs[i + 3] * (dA12 * tp[0] + dA13 * tp[1] + dA14 * tp[2] + dA15 * tp[3])); // if (std::abs(dudr_FD-dudr) > 1.0e-8) // std::cerr << "Error in BsplineFunction: dudr = " << dudr // << " dudr_FD = " << dudr_FD << std::endl; // if (std::abs(d2udr2_FD-d2udr2) > 1.0e-4) // std::cerr << "Error in BsplineFunction: r = " << r << " d2udr2 = " << dudr // << " d2udr2_FD = " << d2udr2_FD << " rcut = " << 
cutoff_radius << std::endl; - return (SplineCoefs[i + 0] * (A[0] * tp[0] + A[1] * tp[1] + A[2] * tp[2] + A[3] * tp[3]) + - SplineCoefs[i + 1] * (A[4] * tp[0] + A[5] * tp[1] + A[6] * tp[2] + A[7] * tp[3]) + - SplineCoefs[i + 2] * (A[8] * tp[0] + A[9] * tp[1] + A[10] * tp[2] + A[11] * tp[3]) + - SplineCoefs[i + 3] * (A[12] * tp[0] + A[13] * tp[1] + A[14] * tp[2] + A[15] * tp[3])); + return (coefs[i + 0] * (A0 * tp[0] + A1 * tp[1] + A2 * tp[2] + A3 * tp[3]) + + coefs[i + 1] * (A4 * tp[0] + A5 * tp[1] + A6 * tp[2] + A7 * tp[3]) + + coefs[i + 2] * (A8 * tp[0] + A9 * tp[1] + A10 * tp[2] + A11 * tp[3]) + + coefs[i + 3] * (A12 * tp[0] + A13 * tp[1] + A14 * tp[2] + A15 * tp[3])); } @@ -240,21 +247,22 @@ struct BsplineFunctor : public OptimizableFunctorBase tp[1] = t * t; tp[2] = t; tp[3] = 1.0; + auto& coefs = *spline_coefs_; d3udr3 = DeltaRInv * DeltaRInv * DeltaRInv * - (SplineCoefs[i + 0] * (d3A[0] * tp[0] + d3A[1] * tp[1] + d3A[2] * tp[2] + d3A[3] * tp[3]) + - SplineCoefs[i + 1] * (d3A[4] * tp[0] + d3A[5] * tp[1] + d3A[6] * tp[2] + d3A[7] * tp[3]) + - SplineCoefs[i + 2] * (d3A[8] * tp[0] + d3A[9] * tp[1] + d3A[10] * tp[2] + d3A[11] * tp[3]) + - SplineCoefs[i + 3] * (d3A[12] * tp[0] + d3A[13] * tp[1] + d3A[14] * tp[2] + d3A[15] * tp[3])); + (coefs[i + 0] * (d3A0 * tp[0] + d3A1 * tp[1] + d3A2 * tp[2] + d3A3 * tp[3]) + + coefs[i + 1] * (d3A4 * tp[0] + d3A5 * tp[1] + d3A6 * tp[2] + d3A7 * tp[3]) + + coefs[i + 2] * (d3A8 * tp[0] + d3A9 * tp[1] + d3A10 * tp[2] + d3A11 * tp[3]) + + coefs[i + 3] * (d3A12 * tp[0] + d3A13 * tp[1] + d3A14 * tp[2] + d3A15 * tp[3])); d2udr2 = DeltaRInv * DeltaRInv * - (SplineCoefs[i + 0] * (d2A[0] * tp[0] + d2A[1] * tp[1] + d2A[2] * tp[2] + d2A[3] * tp[3]) + - SplineCoefs[i + 1] * (d2A[4] * tp[0] + d2A[5] * tp[1] + d2A[6] * tp[2] + d2A[7] * tp[3]) + - SplineCoefs[i + 2] * (d2A[8] * tp[0] + d2A[9] * tp[1] + d2A[10] * tp[2] + d2A[11] * tp[3]) + - SplineCoefs[i + 3] * (d2A[12] * tp[0] + d2A[13] * tp[1] + d2A[14] * tp[2] + d2A[15] * tp[3])); + (coefs[i 
+ 0] * (d2A0 * tp[0] + d2A1 * tp[1] + d2A2 * tp[2] + d2A3 * tp[3]) + + coefs[i + 1] * (d2A4 * tp[0] + d2A5 * tp[1] + d2A6 * tp[2] + d2A7 * tp[3]) + + coefs[i + 2] * (d2A8 * tp[0] + d2A9 * tp[1] + d2A10 * tp[2] + d2A11 * tp[3]) + + coefs[i + 3] * (d2A12 * tp[0] + d2A13 * tp[1] + d2A14 * tp[2] + d2A15 * tp[3])); dudr = DeltaRInv * - (SplineCoefs[i + 0] * (dA[0] * tp[0] + dA[1] * tp[1] + dA[2] * tp[2] + dA[3] * tp[3]) + - SplineCoefs[i + 1] * (dA[4] * tp[0] + dA[5] * tp[1] + dA[6] * tp[2] + dA[7] * tp[3]) + - SplineCoefs[i + 2] * (dA[8] * tp[0] + dA[9] * tp[1] + dA[10] * tp[2] + dA[11] * tp[3]) + - SplineCoefs[i + 3] * (dA[12] * tp[0] + dA[13] * tp[1] + dA[14] * tp[2] + dA[15] * tp[3])); + (coefs[i + 0] * (dA0 * tp[0] + dA1 * tp[1] + dA2 * tp[2] + dA3 * tp[3]) + + coefs[i + 1] * (dA4 * tp[0] + dA5 * tp[1] + dA6 * tp[2] + dA7 * tp[3]) + + coefs[i + 2] * (dA8 * tp[0] + dA9 * tp[1] + dA10 * tp[2] + dA11 * tp[3]) + + coefs[i + 3] * (dA12 * tp[0] + dA13 * tp[1] + dA14 * tp[2] + dA15 * tp[3])); // if (std::abs(dudr_FD-dudr) > 1.0e-8) // std::cerr << "Error in BsplineFunction: dudr = " << dudr // << " dudr_FD = " << dudr_FD << std::endl; @@ -264,10 +272,10 @@ struct BsplineFunctor : public OptimizableFunctorBase // if (std::abs(d3udr3_FD-d3udr3) > 1.0e-4) // std::cerr << "Error in BsplineFunction: r = " << r << " d3udr3 = " << dudr // << " d3udr3_FD = " << d3udr3_FD << " rcut = " << cutoff_radius << std::endl; - return (SplineCoefs[i + 0] * (A[0] * tp[0] + A[1] * tp[1] + A[2] * tp[2] + A[3] * tp[3]) + - SplineCoefs[i + 1] * (A[4] * tp[0] + A[5] * tp[1] + A[6] * tp[2] + A[7] * tp[3]) + - SplineCoefs[i + 2] * (A[8] * tp[0] + A[9] * tp[1] + A[10] * tp[2] + A[11] * tp[3]) + - SplineCoefs[i + 3] * (A[12] * tp[0] + A[13] * tp[1] + A[14] * tp[2] + A[15] * tp[3])); + return (coefs[i + 0] * (A0 * tp[0] + A1 * tp[1] + A2 * tp[2] + A3 * tp[3]) + + coefs[i + 1] * (A4 * tp[0] + A5 * tp[1] + A6 * tp[2] + A7 * tp[3]) + + coefs[i + 2] * (A8 * tp[0] + A9 * tp[1] + A10 * tp[2] + A11 * tp[3]) 
+ + coefs[i + 3] * (A12 * tp[0] + A13 * tp[1] + A14 * tp[2] + A15 * tp[3])); } @@ -285,22 +293,23 @@ struct BsplineFunctor : public OptimizableFunctorBase tp[2] = t; tp[3] = 1.0; + auto& coefs = *spline_coefs_; SplineDerivs[0] = TinyVector(0.0); // d/dp_i u(r) - SplineDerivs[i + 0][0] = A[0] * tp[0] + A[1] * tp[1] + A[2] * tp[2] + A[3] * tp[3]; - SplineDerivs[i + 1][0] = A[4] * tp[0] + A[5] * tp[1] + A[6] * tp[2] + A[7] * tp[3]; - SplineDerivs[i + 2][0] = A[8] * tp[0] + A[9] * tp[1] + A[10] * tp[2] + A[11] * tp[3]; - SplineDerivs[i + 3][0] = A[12] * tp[0] + A[13] * tp[1] + A[14] * tp[2] + A[15] * tp[3]; + SplineDerivs[i + 0][0] = A0 * tp[0] + A1 * tp[1] + A2 * tp[2] + A3 * tp[3]; + SplineDerivs[i + 1][0] = A4 * tp[0] + A5 * tp[1] + A6 * tp[2] + A7 * tp[3]; + SplineDerivs[i + 2][0] = A8 * tp[0] + A9 * tp[1] + A10 * tp[2] + A11 * tp[3]; + SplineDerivs[i + 3][0] = A12 * tp[0] + A13 * tp[1] + A14 * tp[2] + A15 * tp[3]; // d/dp_i du/dr - SplineDerivs[i + 0][1] = DeltaRInv * (dA[1] * tp[1] + dA[2] * tp[2] + dA[3] * tp[3]); - SplineDerivs[i + 1][1] = DeltaRInv * (dA[5] * tp[1] + dA[6] * tp[2] + dA[7] * tp[3]); - SplineDerivs[i + 2][1] = DeltaRInv * (dA[9] * tp[1] + dA[10] * tp[2] + dA[11] * tp[3]); - SplineDerivs[i + 3][1] = DeltaRInv * (dA[13] * tp[1] + dA[14] * tp[2] + dA[15] * tp[3]); + SplineDerivs[i + 0][1] = DeltaRInv * (dA1 * tp[1] + dA2 * tp[2] + dA3 * tp[3]); + SplineDerivs[i + 1][1] = DeltaRInv * (dA5 * tp[1] + dA6 * tp[2] + dA7 * tp[3]); + SplineDerivs[i + 2][1] = DeltaRInv * (dA9 * tp[1] + dA10 * tp[2] + dA11 * tp[3]); + SplineDerivs[i + 3][1] = DeltaRInv * (dA13 * tp[1] + dA14 * tp[2] + dA15 * tp[3]); // d/dp_i d2u/dr2 - SplineDerivs[i + 0][2] = DeltaRInv * DeltaRInv * (d2A[2] * tp[2] + d2A[3] * tp[3]); - SplineDerivs[i + 1][2] = DeltaRInv * DeltaRInv * (d2A[6] * tp[2] + d2A[7] * tp[3]); - SplineDerivs[i + 2][2] = DeltaRInv * DeltaRInv * (d2A[10] * tp[2] + d2A[11] * tp[3]); - SplineDerivs[i + 3][2] = DeltaRInv * DeltaRInv * (d2A[14] * tp[2] + d2A[15] * 
tp[3]); + SplineDerivs[i + 0][2] = DeltaRInv * DeltaRInv * (d2A2 * tp[2] + d2A3 * tp[3]); + SplineDerivs[i + 1][2] = DeltaRInv * DeltaRInv * (d2A6 * tp[2] + d2A7 * tp[3]); + SplineDerivs[i + 2][2] = DeltaRInv * DeltaRInv * (d2A10 * tp[2] + d2A11 * tp[3]); + SplineDerivs[i + 3][2] = DeltaRInv * DeltaRInv * (d2A14 * tp[2] + d2A15 * tp[3]); int imin = std::max(i, 1); int imax = std::min(i + 4, NumParams + 1); @@ -311,18 +320,18 @@ struct BsplineFunctor : public OptimizableFunctorBase //real_type v[4],dv[4],d2v[4]; //v[0] = A[ 0]*tp[0] + A[ 1]*tp[1] + A[ 2]*tp[2] + A[ 3]*tp[3]; //v[1] = A[ 4]*tp[0] + A[ 5]*tp[1] + A[ 6]*tp[2] + A[ 7]*tp[3]; - //v[2] = A[ 8]*tp[0] + A[ 9]*tp[1] + A[10]*tp[2] + A[11]*tp[3]; - //v[3] = A[12]*tp[0] + A[13]*tp[1] + A[14]*tp[2] + A[15]*tp[3]; + //v[2] = A[ 8]*tp[0] + A[ 9]*tp[1] + A10*tp[2] + A11*tp[3]; + //v[3] = A12*tp[0] + A13*tp[1] + A14*tp[2] + A15*tp[3]; //// d/dp_i du/dr //dv[0] = DeltaRInv * (dA[ 1]*tp[1] + dA[ 2]*tp[2] + dA[ 3]*tp[3]); //dv[1] = DeltaRInv * (dA[ 5]*tp[1] + dA[ 6]*tp[2] + dA[ 7]*tp[3]); - //dv[2] = DeltaRInv * (dA[ 9]*tp[1] + dA[10]*tp[2] + dA[11]*tp[3]); - //dv[3] = DeltaRInv * (dA[13]*tp[1] + dA[14]*tp[2] + dA[15]*tp[3]); + //dv[2] = DeltaRInv * (dA[ 9]*tp[1] + dA10*tp[2] + dA11*tp[3]); + //dv[3] = DeltaRInv * (dA13*tp[1] + dA14*tp[2] + dA15*tp[3]); //// d/dp_i d2u/dr2 //d2v[0] = DeltaRInv * DeltaRInv * (d2A[ 2]*tp[2] + d2A[ 3]*tp[3]); //d2v[1] = DeltaRInv * DeltaRInv * (d2A[ 6]*tp[2] + d2A[ 7]*tp[3]); - //d2v[2] = DeltaRInv * DeltaRInv * (d2A[10]*tp[2] + d2A[11]*tp[3]); - //d2v[3] = DeltaRInv * DeltaRInv * (d2A[14]*tp[2] + d2A[15]*tp[3]); + //d2v[2] = DeltaRInv * DeltaRInv * (d2A10*tp[2] + d2A11*tp[3]); + //d2v[3] = DeltaRInv * DeltaRInv * (d2A14*tp[2] + d2A15*tp[3]); //int imin=std::max(i,1); //int imax=std::min(i+4,NumParams+1)-1; @@ -347,10 +356,10 @@ struct BsplineFunctor : public OptimizableFunctorBase tp[1] = t * t; tp[2] = t; tp[3] = 1.0; - v[0] = A[0] * tp[0] + A[1] * tp[1] + A[2] * tp[2] + A[3] * tp[3]; - 
v[1] = A[4] * tp[0] + A[5] * tp[1] + A[6] * tp[2] + A[7] * tp[3]; - v[2] = A[8] * tp[0] + A[9] * tp[1] + A[10] * tp[2] + A[11] * tp[3]; - v[3] = A[12] * tp[0] + A[13] * tp[1] + A[14] * tp[2] + A[15] * tp[3]; + v[0] = A0 * tp[0] + A1 * tp[1] + A2 * tp[2] + A3 * tp[3]; + v[1] = A4 * tp[0] + A5 * tp[1] + A6 * tp[2] + A7 * tp[3]; + v[2] = A8 * tp[0] + A9 * tp[1] + A10 * tp[2] + A11 * tp[3]; + v[3] = A12 * tp[0] + A13 * tp[1] + A14 * tp[2] + A15 * tp[3]; int i = (int)ipart; int imin = std::max(i, 1); int imax = std::min(i + 4, NumParams + 1) - 1; @@ -593,15 +602,9 @@ struct BsplineFunctor : public OptimizableFunctorBase for (int i = 0; i < Parameters.size(); ++i) { int loc = myVars.where(i); - if (loc >= 0) { - Parameters[i] = std::real( myVars[i] = active[loc] ); - } + if (loc >= 0) + Parameters[i] = std::real(myVars[i] = active[loc]); } - // if (ResetCount++ == 100) - // { - // ResetCount = 0; - // if(ReportLevel) print(); - // } reset(); } @@ -643,6 +646,7 @@ inline T BsplineFunctor::evaluateV(const int iat, } real_type d = 0.0; + auto& coefs = *spline_coefs_; #pragma omp simd reduction(+ : d) for (int jat = 0; jat < iCount; jat++) { @@ -654,10 +658,10 @@ inline T BsplineFunctor::evaluateV(const int iat, real_type tp1 = t * t; real_type tp2 = t; - real_type d1 = SplineCoefs[i + 0] * (A[0] * tp0 + A[1] * tp1 + A[2] * tp2 + A[3]); - real_type d2 = SplineCoefs[i + 1] * (A[4] * tp0 + A[5] * tp1 + A[6] * tp2 + A[7]); - real_type d3 = SplineCoefs[i + 2] * (A[8] * tp0 + A[9] * tp1 + A[10] * tp2 + A[11]); - real_type d4 = SplineCoefs[i + 3] * (A[12] * tp0 + A[13] * tp1 + A[14] * tp2 + A[15]); + real_type d1 = coefs[i + 0] * (A0 * tp0 + A1 * tp1 + A2 * tp2 + A3); + real_type d2 = coefs[i + 1] * (A4 * tp0 + A5 * tp1 + A6 * tp2 + A7); + real_type d3 = coefs[i + 2] * (A8 * tp0 + A9 * tp1 + A10 * tp2 + A11); + real_type d4 = coefs[i + 3] * (A12 * tp0 + A13 * tp1 + A14 * tp2 + A15); d += (d1 + d2 + d3 + d4); } return d; @@ -700,6 +704,7 @@ inline void 
BsplineFunctor::evaluateVGL(const int iat, } } + auto& coefs = *spline_coefs_; #pragma omp simd for (int j = 0; j < iCount; j++) { @@ -713,23 +718,22 @@ inline void BsplineFunctor::evaluateVGL(const int iat, real_type tp1 = t * t; real_type tp2 = t; - real_type sCoef0 = SplineCoefs[iGather + 0]; - real_type sCoef1 = SplineCoefs[iGather + 1]; - real_type sCoef2 = SplineCoefs[iGather + 2]; - real_type sCoef3 = SplineCoefs[iGather + 3]; + real_type sCoef0 = coefs[iGather + 0]; + real_type sCoef1 = coefs[iGather + 1]; + real_type sCoef2 = coefs[iGather + 2]; + real_type sCoef3 = coefs[iGather + 3]; laplArray[iScatter] = dSquareDeltaRinv * - (sCoef0 * (d2A[2] * tp2 + d2A[3]) + sCoef1 * (d2A[6] * tp2 + d2A[7]) + sCoef2 * (d2A[10] * tp2 + d2A[11]) + - sCoef3 * (d2A[14] * tp2 + d2A[15])); + (sCoef0 * (d2A2 * tp2 + d2A3) + sCoef1 * (d2A6 * tp2 + d2A7) + sCoef2 * (d2A10 * tp2 + d2A11) + + sCoef3 * (d2A14 * tp2 + d2A15)); gradArray[iScatter] = DeltaRInv * rinv * - (sCoef0 * (dA[1] * tp1 + dA[2] * tp2 + dA[3]) + sCoef1 * (dA[5] * tp1 + dA[6] * tp2 + dA[7]) + - sCoef2 * (dA[9] * tp1 + dA[10] * tp2 + dA[11]) + sCoef3 * (dA[13] * tp1 + dA[14] * tp2 + dA[15])); + (sCoef0 * (dA1 * tp1 + dA2 * tp2 + dA3) + sCoef1 * (dA5 * tp1 + dA6 * tp2 + dA7) + + sCoef2 * (dA9 * tp1 + dA10 * tp2 + dA11) + sCoef3 * (dA13 * tp1 + dA14 * tp2 + dA15)); - valArray[iScatter] = (sCoef0 * (A[0] * tp0 + A[1] * tp1 + A[2] * tp2 + A[3]) + - sCoef1 * (A[4] * tp0 + A[5] * tp1 + A[6] * tp2 + A[7]) + - sCoef2 * (A[8] * tp0 + A[9] * tp1 + A[10] * tp2 + A[11]) + - sCoef3 * (A[12] * tp0 + A[13] * tp1 + A[14] * tp2 + A[15])); + valArray[iScatter] = + (sCoef0 * (A0 * tp0 + A1 * tp1 + A2 * tp2 + A3) + sCoef1 * (A4 * tp0 + A5 * tp1 + A6 * tp2 + A7) + + sCoef2 * (A8 * tp0 + A9 * tp1 + A10 * tp2 + A11) + sCoef3 * (A12 * tp0 + A13 * tp1 + A14 * tp2 + A15)); } } } // namespace qmcplusplus From 28839a6b4a6d426237e354055b137a8c7e9a9589 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Sat, 24 Jul 2021 17:28:36 -0500 Subject: 
[PATCH 06/13] Add helper functions/accessor. --- src/Particle/DistanceTableData.h | 22 ++++++++++++++++++---- src/Particle/SoaDistanceTableABOMPTarget.h | 17 +++++++++++++---- src/Particle/VirtualParticleSet.h | 8 ++++++++ 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/src/Particle/DistanceTableData.h b/src/Particle/DistanceTableData.h index 5907615bbf..4833e644f2 100644 --- a/src/Particle/DistanceTableData.h +++ b/src/Particle/DistanceTableData.h @@ -125,6 +125,20 @@ class DistanceTableData ///returns the number of source particles inline IndexType sources() const { return N_sources; } + /// return multi_walker full distance table data ptr + virtual const RealType* getMultiWalkerDataPtr() const + { + throw std::runtime_error(name_ + " multi waler data pointer not supported"); + return nullptr; + } + + /// return stride of per target pctl data. full table data = stride * num of target particles + virtual size_t getPerTargetPctlStrideSize() const + { + throw std::runtime_error(name_ + " getPerTargetPctlStrideSize not supported"); + return 0; + } + /** return full table distances */ const std::vector& getDistances() const { return distances_; } @@ -145,7 +159,7 @@ class DistanceTableData */ virtual const DistRow& getOldDists() const { - APP_ABORT("DistanceTableData::getOldDists is used incorrectly! Contact developers on github."); + throw std::runtime_error("DistanceTableData::getOldDists is used incorrectly! Contact developers on github."); return temp_r_; // dummy return to avoid compiler warning. } @@ -153,7 +167,7 @@ class DistanceTableData */ virtual const DisplRow& getOldDispls() const { - APP_ABORT("DistanceTableData::getOldDispls is used incorrectly! Contact developers on github."); + throw std::runtime_error("DistanceTableData::getOldDispls is used incorrectly! Contact developers on github."); return temp_dr_; // dummy return to avoid compiler warning. 
} @@ -293,13 +307,13 @@ class DistanceTableData */ virtual int get_first_neighbor(IndexType iat, RealType& r, PosType& dr, bool newpos) const { - APP_ABORT("DistanceTableData::get_first_neighbor is not implemented in calling base class"); + throw std::runtime_error("DistanceTableData::get_first_neighbor is not implemented in calling base class"); return 0; } inline void print(std::ostream& os) { - APP_ABORT("DistanceTableData::print is not supported") + throw std::runtime_error("DistanceTableData::print is not supported"); //os << "Table " << Origin->getName() << std::endl; //for (int i = 0; i < r_m.size(); i++) // os << r_m[i] << " "; diff --git a/src/Particle/SoaDistanceTableABOMPTarget.h b/src/Particle/SoaDistanceTableABOMPTarget.h index ca8bcb60ba..4cc5eb23f3 100644 --- a/src/Particle/SoaDistanceTableABOMPTarget.h +++ b/src/Particle/SoaDistanceTableABOMPTarget.h @@ -64,7 +64,7 @@ class SoaDistanceTableABOMPTarget : public DTD_BConds, public Distance // initialize memory containers and views const int N_sources_padded = getAlignedSize(N_sources); - const int stride_size = N_sources_padded * (D + 1); + const int stride_size = getPerTargetPctlStrideSize(); r_dr_memorypool_.resize(stride_size * N_targets); distances_.resize(N_targets); @@ -175,7 +175,16 @@ class SoaDistanceTableABOMPTarget : public DTD_BConds, public Distance } } - /** evaluate the full table */ + const T* getMultiWalkerDataPtr() const override + { + if(!mw_mem_) + throw std::runtime_error("SoaDistanceTableABOMPTarget mw_mem_ is nullptr"); + return mw_mem_->mw_r_dr.data(); + } + + size_t getPerTargetPctlStrideSize() const override { return getAlignedSize(N_sources) * (D + 1); } + + /** evaluate the full table */ inline void evaluate(ParticleSet& P) override { resize(); @@ -199,7 +208,7 @@ class SoaDistanceTableABOMPTarget : public DTD_BConds, public Distance // To maximize thread usage, the loop over electrons is chunked. Each chunk is sent to an OpenMP offload thread team. 
const int ChunkSizePerTeam = 256; const int num_teams = (N_sources + ChunkSizePerTeam - 1) / ChunkSizePerTeam; - const size_t stride_size = N_sources_padded * (D + 1); + const size_t stride_size = getPerTargetPctlStrideSize(); { ScopedTimer offload(offload_timer_); @@ -258,7 +267,7 @@ class SoaDistanceTableABOMPTarget : public DTD_BConds, public Distance const int N_sources_padded = getAlignedSize(N_sources); #ifndef NDEBUG - const int stride_size = N_sources_padded * (D + 1); + const int stride_size = getPerTargetPctlStrideSize(); count_targets = 0; for (size_t iw = 0; iw < dt_list.size(); iw++) { diff --git a/src/Particle/VirtualParticleSet.h b/src/Particle/VirtualParticleSet.h index e015d3db67..4d6d0da263 100644 --- a/src/Particle/VirtualParticleSet.h +++ b/src/Particle/VirtualParticleSet.h @@ -79,6 +79,14 @@ class VirtualParticleSet : public ParticleSet ref_list.push_back(vp); return ref_list; } + + static int countVPs(const RefVector& vp_list) + { + int nVPs = 0; + for (const VirtualParticleSet& vp : vp_list) + nVPs += vp.getTotalNum(); + return nVPs; + } }; } // namespace qmcplusplus #endif From 0c82d2f05003e3fd46ffac3f05abc983c9bb349b Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Sat, 24 Jul 2021 18:01:51 -0500 Subject: [PATCH 07/13] J2OMPTarget mw_evaluateRatios --- src/Particle/SoaDistanceTableABOMPTarget.h | 2 +- src/QMCWaveFunctions/Jastrow/BsplineFunctor.h | 29 ++++++++- src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp | 60 +++++++++++++++++++ src/QMCWaveFunctions/Jastrow/J2OMPTarget.h | 17 +++++- src/QMCWaveFunctions/Jastrow/PadeFunctors.h | 26 ++++++++ src/QMCWaveFunctions/Jastrow/UserFunctor.h | 26 ++++++++ 6 files changed, 156 insertions(+), 4 deletions(-) diff --git a/src/Particle/SoaDistanceTableABOMPTarget.h b/src/Particle/SoaDistanceTableABOMPTarget.h index 4cc5eb23f3..09803578d9 100644 --- a/src/Particle/SoaDistanceTableABOMPTarget.h +++ b/src/Particle/SoaDistanceTableABOMPTarget.h @@ -175,7 +175,7 @@ class SoaDistanceTableABOMPTarget : public 
DTD_BConds, public Distance } } - const T* getMultiWalkerDataPtr() const override + const T* getMultiWalkerDataPtr() const override { if(!mw_mem_) throw std::runtime_error("SoaDistanceTableABOMPTarget mw_mem_ is nullptr"); diff --git a/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h b/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h index 6afeb33571..2b61498fc2 100644 --- a/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h +++ b/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h @@ -158,7 +158,33 @@ struct BsplineFunctor : public OptimizableFunctorBase const T* restrict _distArray, T* restrict distArrayCompressed) const; - inline real_type evaluate(real_type r) + /** evaluate sum of the pair potentials FIXME + * @return \f$\sum u(r_j)\f$ for r_j < cutoff_radius + */ + static void mw_evaluateV(const int num_groups, + const BsplineFunctor* const functors[], + const int iStart[], + const int iEnd[], + const int num_pairs, + const int* ref_at, + const T* mw_dist, + const int dist_stride, + T* mw_vals) + { + for(int ip = 0; ip < num_pairs; ip++) + { + mw_vals[ip] = 0; + for(int ig = 0; ig < num_groups; ig++) + { + auto& functor(*functors[ig]); + for (int j = iStart[ig]; j < iEnd[ig]; j++) + if (j != ref_at[ip]) + mw_vals[ip] += functor.evaluate(mw_dist[ip * dist_stride + j]); + } + } + } + + inline real_type evaluate(real_type r) const { if (r >= cutoff_radius) return 0.0; @@ -177,6 +203,7 @@ struct BsplineFunctor : public OptimizableFunctorBase coefs[i + 2] * (A8 * tp[0] + A9 * tp[1] + A10 * tp[2] + A11 * tp[3]) + coefs[i + 3] * (A12 * tp[0] + A13 * tp[1] + A14 * tp[2] + A15 * tp[3])); } + inline real_type evaluate(real_type r, real_type rinv) { return Y = evaluate(r, dY, d2Y); } inline void evaluateAll(real_type r, real_type rinv) { Y = evaluate(r, dY, d2Y); } diff --git a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp index c92919841e..a4aeb9c944 100644 --- a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp +++ 
b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp @@ -18,6 +18,7 @@ #include "BsplineFunctor.h" #include "PadeFunctors.h" #include "UserFunctor.h" +#include "SoaDistanceTableABOMPTarget.h" namespace qmcplusplus { @@ -88,6 +89,45 @@ void J2OMPTarget::evaluateRatios(const VirtualParticleSet& VP, std::vector +void J2OMPTarget::mw_evaluateRatios(const RefVectorWithLeader& wfc_list, + const RefVectorWithLeader& vp_list, + std::vector>& ratios) const +{ + // add early return to prevent from accessing vp_list[0] + if (wfc_list.size() == 0) return; + auto& wfc_leader = wfc_list.getCastedLeader>(); + auto& vp_leader = vp_list.getLeader(); + const int nw = wfc_list.size(); + + const int nVPs = VirtualParticleSet::countVPs(vp_list); + Vector> refPctls(nVPs); + Vector> mw_vals(nVPs); + + int ivp = 0; + for (const VirtualParticleSet& vp : vp_list) + for (int k = 0; k < vp.getTotalNum(); ++k, ivp++) + refPctls[ivp] = vp.refPtcl; + assert(ivp == nVPs); + + // need to access the spin group of refPtcl. vp_leader doesn't necessary be a member of the list. + // for this reason, refPtcl must be access from [0]. 
+ const int igt = vp_leader.refPS.getGroupID(vp_list[0].refPtcl); + const auto& dt_leader(vp_leader.getDistTable(wfc_leader.my_table_ID_)); + + FT::mw_evaluateV(NumGroups, F.data() + igt * NumGroups, g_first.data(), g_last.data(), nVPs, refPctls.data(), dt_leader.getMultiWalkerDataPtr(), dt_leader.getPerTargetPctlStrideSize(), mw_vals.data()); + + ivp = 0; + for (int iw = 0; iw < nw; ++iw) + { + const VirtualParticleSet& vp = vp_list[iw]; + const auto& wfc = wfc_list.getCastedElement>(iw); + for (int k = 0; k < vp.getTotalNum(); ++k, ivp++) + ratios[iw][k] = std::exp(wfc.Uat[refPctls[ivp]] - mw_vals[ivp]); + } + assert(ivp == nVPs); +} + template void J2OMPTarget::registerData(ParticleSet& P, WFBufferType& buf) { @@ -191,6 +231,16 @@ void J2OMPTarget::init(ParticleSet& p) F.resize(NumGroups * NumGroups, nullptr); DistCompressed.resize(N); DistIndice.resize(N); + + g_first.resize(NumGroups); + g_last.resize(NumGroups); + for (int ig = 0; ig < NumGroups; ig++) + { + g_first[ig] = p.first(ig); + g_last[ig] = p.last(ig); + } + g_first.updateTo(); + g_last.updateTo(); } template @@ -444,6 +494,16 @@ void J2OMPTarget::recompute(const ParticleSet& P) } } +template +void J2OMPTarget::mw_completeUpdates(const RefVectorWithLeader& wfc_list) const +{ + for (int iw = 0; iw < wfc_list.size(); iw++) + { + auto& j2 = wfc_list.getCastedElement>(iw); + j2.Uat.updateTo(); + } +} + template typename J2OMPTarget::LogValueType J2OMPTarget::evaluateLog(const ParticleSet& P, ParticleSet::ParticleGradient_t& G, diff --git a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.h b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.h index 057a5890a4..d8b3082f2e 100644 --- a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.h +++ b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.h @@ -23,7 +23,8 @@ #endif #include "Particle/DistanceTableData.h" #include "LongRange/StructFact.h" -#include "CPU/SIMD/aligned_allocator.hpp" +#include "OMPTarget/OMPallocator.hpp" +#include "Platforms/PinnedAllocator.h" #include "J2KECorrection.h" 
namespace qmcplusplus @@ -47,6 +48,8 @@ template class J2OMPTarget : public WaveFunctionComponent { public: + template + using OffloadPinnedAllocator = OMPallocator>; ///alias FuncType using FuncType = FT; ///type of each component U, dU, d2U; @@ -65,12 +68,16 @@ class J2OMPTarget : public WaveFunctionComponent size_t N_padded; ///number of groups of the target particleset size_t NumGroups; + /// the index of the first particle in each group + Vector> g_first; + /// the index + 1 of the last particle in each group + Vector> g_last; ///diff value RealType DiffVal; ///Correction RealType KEcorr; ///\f$Uat[i] = sum_(j) u_{i,j}\f$ - Vector Uat; + Vector> Uat; ///\f$dUat[i] = sum_(j) du_{i,j}\f$ gContainer_type dUat; ///\f$d2Uat[i] = sum_(j) d2u_{i,j}\f$ @@ -132,6 +139,10 @@ class J2OMPTarget : public WaveFunctionComponent void evaluateRatios(const VirtualParticleSet& VP, std::vector& ratios) override; void evaluateRatiosAlltoOne(ParticleSet& P, std::vector& ratios) override; + void mw_evaluateRatios(const RefVectorWithLeader& wfc_list, + const RefVectorWithLeader& vp_list, + std::vector>& ratios) const override; + GradType evalGrad(ParticleSet& P, int iat) override; PsiValueType ratioGrad(ParticleSet& P, int iat, GradType& grad_iat) override; @@ -139,6 +150,8 @@ class J2OMPTarget : public WaveFunctionComponent void acceptMove(ParticleSet& P, int iat, bool safe_to_delay = false) override; inline void restore(int iat) override {} + void mw_completeUpdates(const RefVectorWithLeader& wfc_list) const override; + /** compute G and L after the sweep */ LogValueType evaluateGL(const ParticleSet& P, diff --git a/src/QMCWaveFunctions/Jastrow/PadeFunctors.h b/src/QMCWaveFunctions/Jastrow/PadeFunctors.h index ef9fcb3ae9..9ed1f8977d 100644 --- a/src/QMCWaveFunctions/Jastrow/PadeFunctors.h +++ b/src/QMCWaveFunctions/Jastrow/PadeFunctors.h @@ -113,6 +113,32 @@ struct PadeFunctor : public OptimizableFunctorBase return sum; } + /** evaluate sum of the pair potentials FIXME + * @return 
\f$\sum u(r_j)\f$ for r_j < cutoff_radius + */ + static void mw_evaluateV(const int num_groups, + const PadeFunctor* const functors[], + const int iStart[], + const int iEnd[], + const int num_pairs, + const int* ref_at, + const T* mw_dist, + const int dist_stride, + T* mw_vals) + { + for(int ip = 0; ip < num_pairs; ip++) + { + mw_vals[ip] = 0; + for(int ig = 0; ig < num_groups; ig++) + { + auto& functor(*functors[ig]); + for (int j = iStart[ig]; j < iEnd[ig]; j++) + if (j != ref_at[ip]) + mw_vals[ip] += functor.evaluate(mw_dist[ip * dist_stride + j]); + } + } + } + inline void evaluateVGL(const int iat, const int iStart, const int iEnd, diff --git a/src/QMCWaveFunctions/Jastrow/UserFunctor.h b/src/QMCWaveFunctions/Jastrow/UserFunctor.h index 8181c1091f..8cc877c1cf 100644 --- a/src/QMCWaveFunctions/Jastrow/UserFunctor.h +++ b/src/QMCWaveFunctions/Jastrow/UserFunctor.h @@ -127,6 +127,32 @@ struct UserFunctor : public OptimizableFunctorBase return sum; } + /** evaluate sum of the pair potentials FIXME + * @return \f$\sum u(r_j)\f$ for r_j < cutoff_radius + */ + static void mw_evaluateV(const int num_groups, + const UserFunctor* const functors[], + const int iStart[], + const int iEnd[], + const int num_pairs, + const int* ref_at, + const T* mw_dist, + const int dist_stride, + T* mw_vals) + { + for(int ip = 0; ip < num_pairs; ip++) + { + mw_vals[ip] = 0; + for(int ig = 0; ig < num_groups; ig++) + { + auto& functor(*functors[ig]); + for (int j = iStart[ig]; j < iEnd[ig]; j++) + if (j != ref_at[ip]) + mw_vals[ip] += functor.evaluate(mw_dist[ip * dist_stride + j]); + } + } + } + inline void evaluateVGL(const int iat, const int iStart, const int iEnd, From 60aa967d3671f2fa83107b5517b5703a8af4cedd Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Sat, 24 Jul 2021 21:23:03 -0500 Subject: [PATCH 08/13] BsplineFunctor::mw_evaluateV offload enabled --- src/QMCWaveFunctions/Jastrow/BsplineFunctor.h | 46 +++++++++++++++++-- src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp | 1 + 2 files 
changed, 43 insertions(+), 4 deletions(-) diff --git a/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h b/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h index 2b61498fc2..45fec71c01 100644 --- a/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h +++ b/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h @@ -171,16 +171,54 @@ struct BsplineFunctor : public OptimizableFunctorBase const int dist_stride, T* mw_vals) { + Vector> mw_buffer; + mw_buffer.resize((sizeof(T*) + sizeof(T) * 2)*num_groups); + T** mw_coefs_ptr = reinterpret_cast(mw_buffer.data()); + T* mw_DeltaRInv_ptr = reinterpret_cast(mw_buffer.data() + sizeof(T*) * num_groups); + T* mw_cutoff_radius_ptr = mw_DeltaRInv_ptr + num_groups; + for (int ig = 0; ig < num_groups; ig++) + { + mw_coefs_ptr[ig] = functors[ig]->spline_coefs_->device_data(); + mw_DeltaRInv_ptr[ig] = functors[ig]->DeltaRInv; + mw_cutoff_radius_ptr[ig] = functors[ig]->cutoff_radius; + } + + auto* mw_buffer_ptr = mw_buffer.data(); + + PRAGMA_OFFLOAD("omp target teams distribute map(always, to:mw_buffer_ptr[:mw_buffer.size()]) \ + map(to:iStart[:num_groups], iEnd[:num_groups]) \ + map(to:ref_at[:num_pairs], mw_dist[:dist_stride*num_pairs]) \ + map(always, from:mw_vals[:num_pairs])") for(int ip = 0; ip < num_pairs; ip++) { - mw_vals[ip] = 0; + T sum = 0; + const T* dist = mw_dist + ip * dist_stride; + T** mw_coefs = reinterpret_cast(mw_buffer_ptr); + T* mw_DeltaRInv = reinterpret_cast(mw_buffer_ptr + sizeof(T*) * num_groups); + T* mw_cutoff_radius = mw_DeltaRInv + num_groups; for(int ig = 0; ig < num_groups; ig++) { - auto& functor(*functors[ig]); + const T* coefs = mw_coefs[ig]; + T DeltaRInv = mw_DeltaRInv[ig]; + T cutoff_radius = mw_cutoff_radius[ig]; + PRAGMA_OFFLOAD("omp parallel for reduction(+: sum)") for (int j = iStart[ig]; j < iEnd[ig]; j++) - if (j != ref_at[ip]) - mw_vals[ip] += functor.evaluate(mw_dist[ip * dist_stride + j]); + { + T r = dist[j]; + if (j != ref_at[ip] && r < cutoff_radius) + { + r *= DeltaRInv; + real_type ipart, t; + t = 
std::modf(r, &ipart); + int i = (int)ipart; + sum += coefs[i + 0] * (((A0 * t + A1) * t + A2) * t + A3) + + coefs[i + 1] * (((A4 * t + A5) * t + A6) * t + A7) + + coefs[i + 2] * (((A8 * t + A9) * t + A10) * t + A11) + + coefs[i + 3] * (((A12 * t + A13) * t + A14) * t + A15); + } + } } + mw_vals[ip] = sum; } } diff --git a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp index a4aeb9c944..38a5b30ce9 100644 --- a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp +++ b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp @@ -115,6 +115,7 @@ void J2OMPTarget::mw_evaluateRatios(const RefVectorWithLeader Date: Sun, 25 Jul 2021 00:16:13 -0500 Subject: [PATCH 09/13] Optimize VP resource management --- src/Particle/VirtualParticleSet.cpp | 74 ++++++++++++++++++- src/Particle/VirtualParticleSet.h | 33 ++++++++- .../OMPTarget/OMPAlignedAllocator.hpp | 28 +++++++ src/QMCHamiltonians/NonLocalECPComponent.cpp | 3 +- .../BsplineFactory/SplineC2COMPTarget.h | 8 +- .../BsplineFactory/SplineC2ROMPTarget.h | 8 +- .../SplineOMPTargetMultiWalkerMem.h | 6 +- src/QMCWaveFunctions/Jastrow/BsplineFunctor.h | 26 +++---- src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp | 63 +++++++++++----- src/QMCWaveFunctions/Jastrow/J2OMPTarget.h | 17 ++++- src/QMCWaveFunctions/Jastrow/PadeFunctors.h | 4 +- src/QMCWaveFunctions/Jastrow/UserFunctor.h | 6 +- 12 files changed, 208 insertions(+), 68 deletions(-) create mode 100644 src/Platforms/OMPTarget/OMPAlignedAllocator.hpp diff --git a/src/Particle/VirtualParticleSet.cpp b/src/Particle/VirtualParticleSet.cpp index 6d46fa9830..34887ae450 100644 --- a/src/Particle/VirtualParticleSet.cpp +++ b/src/Particle/VirtualParticleSet.cpp @@ -20,9 +20,23 @@ #include "Particle/DistanceTableData.h" #include "Particle/createDistanceTable.h" #include "QMCHamiltonians/NLPPJob.h" +#include "ResourceCollection.h" namespace qmcplusplus { + +struct VPMultiWalkerMem: public Resource +{ + /// multi walker reference particle + Vector> mw_refPctls; + + 
VPMultiWalkerMem() : Resource("VPMultiWalkerMem") {} + + VPMultiWalkerMem(const VPMultiWalkerMem&) : VPMultiWalkerMem() {} + + Resource* makeClone() const override { return new VPMultiWalkerMem(*this); } +}; + VirtualParticleSet::VirtualParticleSet(const ParticleSet& p, int nptcl) : refPS(p) { setName("virtual"); @@ -38,6 +52,48 @@ VirtualParticleSet::VirtualParticleSet(const ParticleSet& p, int nptcl) : refPS( addTable(refPS.getDistTable(i).origin()); } +VirtualParticleSet::~VirtualParticleSet() = default; + +Vector>& VirtualParticleSet::getMultiWalkerRefPctls() +{ + if(!mw_mem_) + throw std::runtime_error("VirtualParticleSet::getMultiWalkerRefPctls mw_mem_ is nullptr"); + return mw_mem_->mw_refPctls; +} + +const Vector>& VirtualParticleSet::getMultiWalkerRefPctls() const +{ + if(!mw_mem_) + throw std::runtime_error("VirtualParticleSet::getMultiWalkerRefPctls mw_mem_ is nullptr"); + return mw_mem_->mw_refPctls; +} + +void VirtualParticleSet::createResource(ResourceCollection& collection) const +{ + collection.addResource(std::make_unique()); + + ParticleSet::createResource(collection); +} + +void VirtualParticleSet::acquireResource(ResourceCollection& collection, const RefVectorWithLeader& vp_list) +{ + auto& vp_leader = vp_list.getLeader(); + auto res_ptr = dynamic_cast(collection.lendResource().release()); + if (!res_ptr) + throw std::runtime_error("VirtualParticleSet::acquireResource dynamic_cast failed"); + vp_leader.mw_mem_.reset(res_ptr); + + auto p_list = RefVectorWithLeaderParticleSet(vp_list); + ParticleSet::acquireResource(collection, p_list); +} + +void VirtualParticleSet::releaseResource(ResourceCollection& collection, const RefVectorWithLeader& vp_list) +{ + collection.takebackResource(std::move(vp_list.getLeader().mw_mem_)); + auto p_list = RefVectorWithLeaderParticleSet(vp_list); + ParticleSet::releaseResource(collection, p_list); +} + /// move virtual particles to new postions and update distance tables void VirtualParticleSet::makeMoves(int jel, 
const PosType& ref_pos, @@ -46,7 +102,7 @@ void VirtualParticleSet::makeMoves(int jel, int iat) { if (sphere && iat < 0) - APP_ABORT("VirtualParticleSet::makeMoves is invoked incorrectly, the flag sphere=true requires iat specified!"); + throw std::runtime_error("VirtualParticleSet::makeMoves is invoked incorrectly, the flag sphere=true requires iat specified!"); onSphere = sphere; refPtcl = jel; refSourcePtcl = iat; @@ -64,9 +120,14 @@ void VirtualParticleSet::mw_makeMoves(const RefVectorWithLeader p_list(vp_leader); p_list.reserve(vp_list.size()); + size_t ivp = 0; for (int iw = 0; iw < vp_list.size(); iw++) { VirtualParticleSet& vp(vp_list[iw]); @@ -77,12 +138,17 @@ void VirtualParticleSet::mw_makeMoves(const RefVectorWithLeader struct NLPPJob; +struct VPMultiWalkerMem; /** Introduced to handle virtual moves and ratio computations, e.g. for non-local PP evaluations. */ @@ -34,6 +36,10 @@ class VirtualParticleSet : public ParticleSet private: /// true, if virtual particles are on a sphere for NLPP bool onSphere; + /// multi walker resource + std::unique_ptr mw_mem_; + + Vector>& getMultiWalkerRefPctls(); public: /// Reference particle @@ -46,12 +52,27 @@ class VirtualParticleSet : public ParticleSet inline bool isOnSphere() const { return onSphere; } + const Vector>& getMultiWalkerRefPctls() const; + /** constructor * @param p ParticleSet whose virtual moves are handled by this object * @param nptcl number of virtual particles */ VirtualParticleSet(const ParticleSet& p, int nptcl); + ~VirtualParticleSet(); + + /// initialize a shared resource and hand it to a collection + void createResource(ResourceCollection& collection) const; + /** acquire external resource and associate it with the list of ParticleSet + * Note: use RAII ResourceCollectionTeamLock whenever
possible + */ + static void releaseResource(ResourceCollection& collection, const RefVectorWithLeader& vp_list); + /** move virtual particles to new postions and update distance tables * @param jel reference particle that all the VP moves from * @param ref_pos reference particle position @@ -80,9 +101,17 @@ class VirtualParticleSet : public ParticleSet return ref_list; } - static int countVPs(const RefVector& vp_list) + static size_t countVPs(const RefVectorWithLeader& vp_list) + { + size_t nVPs = 0; + for (const VirtualParticleSet& vp : vp_list) + nVPs += vp.getTotalNum(); + return nVPs; + } + + static size_t countVPs(const RefVectorWithLeader& vp_list) { - int nVPs = 0; + size_t nVPs = 0; for (const VirtualParticleSet& vp : vp_list) nVPs += vp.getTotalNum(); return nVPs; diff --git a/src/Platforms/OMPTarget/OMPAlignedAllocator.hpp b/src/Platforms/OMPTarget/OMPAlignedAllocator.hpp new file mode 100644 index 0000000000..61d8b2d3bd --- /dev/null +++ b/src/Platforms/OMPTarget/OMPAlignedAllocator.hpp @@ -0,0 +1,28 @@ +////////////////////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source License. +// See LICENSE file in top directory for details. +// +// Copyright (c) 2021 QMCPACK developers. 
+// +// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory +// +// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory +////////////////////////////////////////////////////////////////////////////////////// +// -*- C++ -*- +/** @file + */ +#ifndef QMCPLUSPLUS_OMPTARGET_ALIGNED_ALLOCATOR_H +#define QMCPLUSPLUS_OMPTARGET_ALIGNED_ALLOCATOR_H + + +#include "OMPallocator.hpp" +#include "PinnedAllocator.h" + +namespace qmcplusplus +{ + template + using OffloadAllocator = OMPallocator>; + template + using OffloadPinnedAllocator = OMPallocator>; +} // namespace qmcplusplus +#endif diff --git a/src/QMCHamiltonians/NonLocalECPComponent.cpp b/src/QMCHamiltonians/NonLocalECPComponent.cpp index b7af5a2c27..abc344ca8a 100644 --- a/src/QMCHamiltonians/NonLocalECPComponent.cpp +++ b/src/QMCHamiltonians/NonLocalECPComponent.cpp @@ -220,8 +220,7 @@ void NonLocalECPComponent::mw_evaluateOne(const RefVectorWithLeader vp_res_lock(collection, vp_to_p_list); + ResourceCollectionTeamLock vp_res_lock(collection, vp_list); VirtualParticleSet::mw_makeMoves(vp_list, deltaV_list, joblist, true); diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h index 1b81d13042..9dfc933d2f 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTarget.h @@ -22,8 +22,7 @@ #include "QMCWaveFunctions/BsplineFactory/BsplineSet.h" #include "OhmmsSoA/VectorSoaContainer.h" #include "spline2/MultiBspline.hpp" -#include "OMPTarget/OMPallocator.hpp" -#include "Platforms/PinnedAllocator.h" +#include "OMPTarget/OMPAlignedAllocator.hpp" #include "Utilities/FairDivide.h" #include "Utilities/TimerManager.h" #include "SplineOMPTargetMultiWalkerMem.h" @@ -40,11 +39,6 @@ template class SplineC2COMPTarget : public BsplineSet { public: - template - using OffloadAllocator = OMPallocator>; - template - using OffloadPinnedAllocator = OMPallocator>; - 
using SplineType = typename bspline_traits::SplineType; using BCType = typename bspline_traits::BCType; using DataType = ST; diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h index fb0b02f2f9..f2968a4493 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTarget.h @@ -22,8 +22,7 @@ #include "QMCWaveFunctions/BsplineFactory/BsplineSet.h" #include "OhmmsSoA/VectorSoaContainer.h" #include "spline2/MultiBspline.hpp" -#include "OMPTarget/OMPallocator.hpp" -#include "Platforms/PinnedAllocator.h" +#include "OMPTarget/OMPAlignedAllocator.hpp" #include "Utilities/FairDivide.h" #include "Utilities/TimerManager.h" #include "SplineOMPTargetMultiWalkerMem.h" @@ -40,11 +39,6 @@ template class SplineC2ROMPTarget : public BsplineSet { public: - template - using OffloadAllocator = OMPallocator>; - template - using OffloadPinnedAllocator = OMPallocator>; - using SplineType = typename bspline_traits::SplineType; using BCType = typename bspline_traits::BCType; using DataType = ST; diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineOMPTargetMultiWalkerMem.h b/src/QMCWaveFunctions/BsplineFactory/SplineOMPTargetMultiWalkerMem.h index cc24f3b96a..2c07463d6a 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineOMPTargetMultiWalkerMem.h +++ b/src/QMCWaveFunctions/BsplineFactory/SplineOMPTargetMultiWalkerMem.h @@ -13,8 +13,7 @@ #ifndef QMCPLUSPLUS_OFFLOADSHAREDMEM_H #define QMCPLUSPLUS_OFFLOADSHAREDMEM_H -#include "OMPTarget/OMPallocator.hpp" -#include "PinnedAllocator.h" +#include "OMPTarget/OMPAlignedAllocator.hpp" #include "ResourceCollection.h" namespace qmcplusplus @@ -23,9 +22,6 @@ namespace qmcplusplus template struct SplineOMPTargetMultiWalkerMem: public Resource { - template - using OffloadPinnedAllocator = OMPallocator>; - ///team private ratios for reduction, numVP x numTeams Matrix> mw_ratios_private; ///team private ratios 
and grads for reduction, numVP x numTeams diff --git a/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h b/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h index 45fec71c01..ba5ae61777 100644 --- a/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h +++ b/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h @@ -28,8 +28,8 @@ #include "OhmmsData/AttributeSet.h" #include "OhmmsPETE/OhmmsVector.h" #include "Numerics/LinearFit.h" -#include "OMPTarget/OMPallocator.hpp" -#include "Platforms/PinnedAllocator.h" +#include "OMPTarget/OMPAlignedAllocator.hpp" + namespace qmcplusplus { @@ -37,10 +37,6 @@ template struct BsplineFunctor : public OptimizableFunctorBase { using value_type = real_type; - template - using OffloadAllocator = OMPallocator>; - template - using OffloadPinnedAllocator = OMPallocator>; static constexpr real_type A0 = -1.0 / 6.0, A1 = 3.0 / 6.0, A2 = -3.0 / 6.0, A3 = 1.0 / 6.0; static constexpr real_type A4 = 3.0 / 6.0, A5 = -6.0 / 6.0, A6 = 0.0 / 6.0, A7 = 4.0 / 6.0; @@ -169,12 +165,12 @@ struct BsplineFunctor : public OptimizableFunctorBase const int* ref_at, const T* mw_dist, const int dist_stride, - T* mw_vals) + T* mw_vals, + Vector>& transfer_buffer) { - Vector> mw_buffer; - mw_buffer.resize((sizeof(T*) + sizeof(T) * 2)*num_groups); - T** mw_coefs_ptr = reinterpret_cast(mw_buffer.data()); - T* mw_DeltaRInv_ptr = reinterpret_cast(mw_buffer.data() + sizeof(T*) * num_groups); + transfer_buffer.resize((sizeof(T*) + sizeof(T) * 2)*num_groups); + T** mw_coefs_ptr = reinterpret_cast(transfer_buffer.data()); + T* mw_DeltaRInv_ptr = reinterpret_cast(transfer_buffer.data() + sizeof(T*) * num_groups); T* mw_cutoff_radius_ptr = mw_DeltaRInv_ptr + num_groups; for (int ig = 0; ig < num_groups; ig++) { @@ -183,9 +179,9 @@ struct BsplineFunctor : public OptimizableFunctorBase mw_cutoff_radius_ptr[ig] = functors[ig]->cutoff_radius; } - auto* mw_buffer_ptr = mw_buffer.data(); + auto* transfer_buffer_ptr = transfer_buffer.data(); - PRAGMA_OFFLOAD("omp target teams distribute map(always, 
to:mw_buffer_ptr[:mw_buffer.size()]) \ + PRAGMA_OFFLOAD("omp target teams distribute map(always, to:transfer_buffer_ptr[:transfer_buffer.size()]) \ map(to:iStart[:num_groups], iEnd[:num_groups]) \ map(to:ref_at[:num_pairs], mw_dist[:dist_stride*num_pairs]) \ map(always, from:mw_vals[:num_pairs])") @@ -193,8 +189,8 @@ struct BsplineFunctor : public OptimizableFunctorBase { T sum = 0; const T* dist = mw_dist + ip * dist_stride; - T** mw_coefs = reinterpret_cast(mw_buffer_ptr); - T* mw_DeltaRInv = reinterpret_cast(mw_buffer_ptr + sizeof(T*) * num_groups); + T** mw_coefs = reinterpret_cast(transfer_buffer_ptr); + T* mw_DeltaRInv = reinterpret_cast(transfer_buffer_ptr + sizeof(T*) * num_groups); T* mw_cutoff_radius = mw_DeltaRInv + num_groups; for(int ig = 0; ig < num_groups; ig++) { diff --git a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp index 38a5b30ce9..a6fc9eda72 100644 --- a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp +++ b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp @@ -19,9 +19,47 @@ #include "PadeFunctors.h" #include "UserFunctor.h" #include "SoaDistanceTableABOMPTarget.h" +#include "ResourceCollection.h" namespace qmcplusplus { + +template +struct J2OMPTargetMultiWalkerMem: public Resource +{ + // fused buffer for fast transfer + Vector> transfer_buffer; + // multi walker result + Vector> mw_vals; + + J2OMPTargetMultiWalkerMem() : Resource("J2OMPTargetMultiWalkerMem") {} + + J2OMPTargetMultiWalkerMem(const J2OMPTargetMultiWalkerMem&) : J2OMPTargetMultiWalkerMem() {} + + Resource* makeClone() const override { return new J2OMPTargetMultiWalkerMem(*this); } +}; + +template +void J2OMPTarget::createResource(ResourceCollection& collection) const +{ + collection.addResource(std::make_unique>()); +} + +template +void J2OMPTarget::acquireResource(ResourceCollection& collection) +{ + auto res_ptr = dynamic_cast*>(collection.lendResource().release()); + if (!res_ptr) + throw 
std::runtime_error("VirtualParticleSet::acquireResource dynamic_cast failed"); + mw_mem_.reset(res_ptr); +} + +template +void J2OMPTarget::releaseResource(ResourceCollection& collection) +{ + collection.takebackResource(std::move(mw_mem_)); +} + template void J2OMPTarget::checkInVariables(opt_variables_type& active) { @@ -98,33 +136,27 @@ void J2OMPTarget::mw_evaluateRatios(const RefVectorWithLeader>(); auto& vp_leader = vp_list.getLeader(); + const auto& mw_refPctls = vp_leader.getMultiWalkerRefPctls(); + auto& mw_vals = wfc_leader.mw_mem_->mw_vals; const int nw = wfc_list.size(); - const int nVPs = VirtualParticleSet::countVPs(vp_list); - Vector> refPctls(nVPs); - Vector> mw_vals(nVPs); - - int ivp = 0; - for (const VirtualParticleSet& vp : vp_list) - for (int k = 0; k < vp.getTotalNum(); ++k, ivp++) - refPctls[ivp] = vp.refPtcl; - assert(ivp == nVPs); + const size_t nVPs = mw_refPctls.size(); + mw_vals.resize(nVPs); // need to access the spin group of refPtcl. vp_leader doesn't necessary be a member of the list. // for this reason, refPtcl must be access from [0]. 
const int igt = vp_leader.refPS.getGroupID(vp_list[0].refPtcl); const auto& dt_leader(vp_leader.getDistTable(wfc_leader.my_table_ID_)); - refPctls.updateTo(); - FT::mw_evaluateV(NumGroups, F.data() + igt * NumGroups, g_first.data(), g_last.data(), nVPs, refPctls.data(), dt_leader.getMultiWalkerDataPtr(), dt_leader.getPerTargetPctlStrideSize(), mw_vals.data()); + FT::mw_evaluateV(NumGroups, F.data() + igt * NumGroups, g_first.data(), g_last.data(), nVPs, mw_refPctls.data(), dt_leader.getMultiWalkerDataPtr(), dt_leader.getPerTargetPctlStrideSize(), mw_vals.data(), wfc_leader.mw_mem_->transfer_buffer); - ivp = 0; + size_t ivp = 0; for (int iw = 0; iw < nw; ++iw) { const VirtualParticleSet& vp = vp_list[iw]; const auto& wfc = wfc_list.getCastedElement>(iw); for (int k = 0; k < vp.getTotalNum(); ++k, ivp++) - ratios[iw][k] = std::exp(wfc.Uat[refPctls[ivp]] - mw_vals[ivp]); + ratios[iw][k] = std::exp(wfc.Uat[mw_refPctls[ivp]] - mw_vals[ivp]); } assert(ivp == nVPs); } @@ -498,11 +530,6 @@ void J2OMPTarget::recompute(const ParticleSet& P) template void J2OMPTarget::mw_completeUpdates(const RefVectorWithLeader& wfc_list) const { - for (int iw = 0; iw < wfc_list.size(); iw++) - { - auto& j2 = wfc_list.getCastedElement>(iw); - j2.Uat.updateTo(); - } } template diff --git a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.h b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.h index d8b3082f2e..3198d8e189 100644 --- a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.h +++ b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.h @@ -23,12 +23,15 @@ #endif #include "Particle/DistanceTableData.h" #include "LongRange/StructFact.h" -#include "OMPTarget/OMPallocator.hpp" -#include "Platforms/PinnedAllocator.h" +#include "OMPTarget/OMPAlignedAllocator.hpp" #include "J2KECorrection.h" namespace qmcplusplus { + +template +struct J2OMPTargetMultiWalkerMem; + /** @ingroup WaveFunctionComponent * @brief Specialization for two-body Jastrow function using multiple functors * @@ -48,8 +51,6 @@ template class J2OMPTarget : 
public WaveFunctionComponent { public: - template - using OffloadPinnedAllocator = OMPallocator>; ///alias FuncType using FuncType = FT; ///type of each component U, dU, d2U; @@ -96,6 +97,8 @@ class J2OMPTarget : public WaveFunctionComponent // helper for compute J2 Chiesa KE correction J2KECorrection j2_ke_corr_helper; + std::unique_ptr> mw_mem_; + public: J2OMPTarget(const std::string& obj_name, ParticleSet& p); J2OMPTarget(const J2OMPTarget& rhs) = delete; @@ -107,6 +110,12 @@ class J2OMPTarget : public WaveFunctionComponent /** add functor for (ia,ib) pair */ void addFunc(int ia, int ib, std::unique_ptr j); + void createResource(ResourceCollection& collection) const override; + + void acquireResource(ResourceCollection& collection) override; + + void releaseResource(ResourceCollection& collection) override; + /** check in an optimizable parameter * @param o a super set of optimizable variables */ diff --git a/src/QMCWaveFunctions/Jastrow/PadeFunctors.h b/src/QMCWaveFunctions/Jastrow/PadeFunctors.h index 9ed1f8977d..54aa386010 100644 --- a/src/QMCWaveFunctions/Jastrow/PadeFunctors.h +++ b/src/QMCWaveFunctions/Jastrow/PadeFunctors.h @@ -25,6 +25,7 @@ #include // #include #include "OhmmsPETE/TinyVector.h" +#include "OMPTarget/OMPAlignedAllocator.hpp" namespace qmcplusplus @@ -124,7 +125,8 @@ struct PadeFunctor : public OptimizableFunctorBase const int* ref_at, const T* mw_dist, const int dist_stride, - T* mw_vals) + T* mw_vals, + Vector>& transfer_buffer) { for(int ip = 0; ip < num_pairs; ip++) { diff --git a/src/QMCWaveFunctions/Jastrow/UserFunctor.h b/src/QMCWaveFunctions/Jastrow/UserFunctor.h index 8cc877c1cf..96a945acd5 100644 --- a/src/QMCWaveFunctions/Jastrow/UserFunctor.h +++ b/src/QMCWaveFunctions/Jastrow/UserFunctor.h @@ -33,6 +33,7 @@ #include // #include #include "OhmmsPETE/TinyVector.h" +#include "OMPTarget/OMPAlignedAllocator.hpp" namespace qmcplusplus @@ -45,8 +46,6 @@ namespace qmcplusplus template struct UserFunctor : public OptimizableFunctorBase 
{ - - /// Is optimizable bool Opt_A; /// Value @@ -138,7 +137,8 @@ struct UserFunctor : public OptimizableFunctorBase const int* ref_at, const T* mw_dist, const int dist_stride, - T* mw_vals) + T* mw_vals, + Vector>& transfer_buffer) { for(int ip = 0; ip < num_pairs; ip++) { From ec0e1795294b6eb243d851c3c6b5c3f2525d9d25 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Sat, 24 Jul 2021 22:36:56 -0500 Subject: [PATCH 10/13] Add DTModes::MW_EVALUATE_RESULT_NO_TRANSFER_TO_HOST --- src/Particle/DTModes.h | 6 +++++- src/Particle/SoaDistanceTableABOMPTarget.h | 10 +++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/Particle/DTModes.h b/src/Particle/DTModes.h index 82631cdf52..d7ea77df66 100644 --- a/src/Particle/DTModes.h +++ b/src/Particle/DTModes.h @@ -30,7 +30,11 @@ enum class DTModes : uint_fast8_t * only request this when data on host is needed for unoptimized code path. * This flag affects three subroutines mw_move, mw_updatePartial, mw_finalizePbyP in DistanceTableData. */ - NEED_TEMP_DATA_ON_HOST = 0x2 + NEED_TEMP_DATA_ON_HOST = 0x2, + /** skip data transfer back to host after mw_evaluate full distance table. + * this optimization can be used for distance table consumed directly on the device without copying back to the host.
+ */ + MW_EVALUATE_RESULT_NO_TRANSFER_TO_HOST = 0x4 }; constexpr bool operator&(DTModes x, DTModes y) diff --git a/src/Particle/SoaDistanceTableABOMPTarget.h b/src/Particle/SoaDistanceTableABOMPTarget.h index 09803578d9..0f9005ebfe 100644 --- a/src/Particle/SoaDistanceTableABOMPTarget.h +++ b/src/Particle/SoaDistanceTableABOMPTarget.h @@ -325,7 +325,7 @@ class SoaDistanceTableABOMPTarget : public DTD_BConds, public Distance ScopedTimer offload(dt_leader.offload_timer_); PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(total_targets*num_teams) \ map(always, to: input_ptr[:offload_input.size()]) \ - map(always, from: r_dr_ptr[:mw_r_dr.size()])") + depend(out:r_dr_ptr[:mw_r_dr.size()]) nowait") for (int iat = 0; iat < total_targets; ++iat) for (int team_id = 0; team_id < num_teams; team_id++) { @@ -348,6 +348,14 @@ class SoaDistanceTableABOMPTarget : public DTD_BConds, public Distance DTD_BConds::computeDistancesOffload(pos, source_pos_ptr, r_iat_ptr, dr_iat_ptr, N_sources_padded, iel); } + + if(!(modes_ & DTModes::MW_EVALUATE_RESULT_NO_TRANSFER_TO_HOST)) + { + PRAGMA_OFFLOAD("omp target update from(r_dr_ptr[:mw_r_dr.size()]) depend(inout:r_dr_ptr[:mw_r_dr.size()]) nowait") + } + // wait for computing and (optional) transfering back to host. + // It can potentially be moved to ParticleSet to fuse multiple similar taskwait + PRAGMA_OFFLOAD("omp taskwait") } } From 4798c3c2690139ef7502db1fb55221632f1df54d Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Mon, 26 Jul 2021 10:30:31 -0500 Subject: [PATCH 11/13] Formatting. 
--- src/Particle/DTModes.h | 2 +- src/Particle/SoaDistanceTableABOMPTarget.h | 11 ++-- src/Particle/VirtualParticleSet.cpp | 27 +++++---- src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp | 56 ++++++++++--------- src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.cpp | 4 +- 5 files changed, 55 insertions(+), 45 deletions(-) diff --git a/src/Particle/DTModes.h b/src/Particle/DTModes.h index d7ea77df66..1fabb64050 100644 --- a/src/Particle/DTModes.h +++ b/src/Particle/DTModes.h @@ -24,7 +24,7 @@ enum class DTModes : uint_fast8_t * Optimization can be implemented during forward PbyP move when the full table is not needed all the time. * DT consumers should know if full table is needed or not and request via addTable. */ - NEED_FULL_TABLE_ANYTIME = 0x1, + NEED_FULL_TABLE_ANYTIME = 0x1, /** whether temporary data set on the host is updated or not when a move is proposed. * Considering transferring data from accelerator to host is relatively expensive, * only request this when data on host is needed for unoptimized code path. 
diff --git a/src/Particle/SoaDistanceTableABOMPTarget.h b/src/Particle/SoaDistanceTableABOMPTarget.h index 0f9005ebfe..01c29375c3 100644 --- a/src/Particle/SoaDistanceTableABOMPTarget.h +++ b/src/Particle/SoaDistanceTableABOMPTarget.h @@ -177,14 +177,14 @@ class SoaDistanceTableABOMPTarget : public DTD_BConds, public Distance const T* getMultiWalkerDataPtr() const override { - if(!mw_mem_) + if (!mw_mem_) throw std::runtime_error("SoaDistanceTableABOMPTarget mw_mem_ is nullptr"); return mw_mem_->mw_r_dr.data(); } size_t getPerTargetPctlStrideSize() const override { return getAlignedSize(N_sources) * (D + 1); } - /** evaluate the full table */ + /** evaluate the full table */ inline void evaluate(ParticleSet& P) override { resize(); @@ -268,7 +268,7 @@ class SoaDistanceTableABOMPTarget : public DTD_BConds, public Distance #ifndef NDEBUG const int stride_size = getPerTargetPctlStrideSize(); - count_targets = 0; + count_targets = 0; for (size_t iw = 0; iw < dt_list.size(); iw++) { auto& dt = dt_list.getCastedElement(iw); @@ -349,9 +349,10 @@ class SoaDistanceTableABOMPTarget : public DTD_BConds, public Distance iel); } - if(!(modes_ & DTModes::MW_EVALUATE_RESULT_NO_TRANSFER_TO_HOST)) + if (!(modes_ & DTModes::MW_EVALUATE_RESULT_NO_TRANSFER_TO_HOST)) { - PRAGMA_OFFLOAD("omp target update from(r_dr_ptr[:mw_r_dr.size()]) depend(inout:r_dr_ptr[:mw_r_dr.size()]) nowait") + PRAGMA_OFFLOAD( + "omp target update from(r_dr_ptr[:mw_r_dr.size()]) depend(inout:r_dr_ptr[:mw_r_dr.size()]) nowait") } // wait for computing and (optional) transfering back to host. 
// It can potentially be moved to ParticleSet to fuse multiple similar taskwait diff --git a/src/Particle/VirtualParticleSet.cpp b/src/Particle/VirtualParticleSet.cpp index 34887ae450..f04ef5eae6 100644 --- a/src/Particle/VirtualParticleSet.cpp +++ b/src/Particle/VirtualParticleSet.cpp @@ -25,7 +25,7 @@ namespace qmcplusplus { -struct VPMultiWalkerMem: public Resource +struct VPMultiWalkerMem : public Resource { /// multi walker reference particle Vector> mw_refPctls; @@ -56,14 +56,14 @@ VirtualParticleSet::~VirtualParticleSet() = default; Vector>& VirtualParticleSet::getMultiWalkerRefPctls() { - if(!mw_mem_) + if (!mw_mem_) throw std::runtime_error("VirtualParticleSet::getMultiWalkerRefPctls mw_mem_ is nullptr"); return mw_mem_->mw_refPctls; } const Vector>& VirtualParticleSet::getMultiWalkerRefPctls() const { - if(!mw_mem_) + if (!mw_mem_) throw std::runtime_error("VirtualParticleSet::getMultiWalkerRefPctls mw_mem_ is nullptr"); return mw_mem_->mw_refPctls; } @@ -75,10 +75,11 @@ void VirtualParticleSet::createResource(ResourceCollection& collection) const ParticleSet::createResource(collection); } -void VirtualParticleSet::acquireResource(ResourceCollection& collection, const RefVectorWithLeader& vp_list) +void VirtualParticleSet::acquireResource(ResourceCollection& collection, + const RefVectorWithLeader& vp_list) { auto& vp_leader = vp_list.getLeader(); - auto res_ptr = dynamic_cast(collection.lendResource().release()); + auto res_ptr = dynamic_cast(collection.lendResource().release()); if (!res_ptr) throw std::runtime_error("VirtualParticleSet::acquireResource dynamic_cast failed"); vp_leader.mw_mem_.reset(res_ptr); @@ -87,7 +88,8 @@ void VirtualParticleSet::acquireResource(ResourceCollection& collection, const R ParticleSet::acquireResource(collection, p_list); } -void VirtualParticleSet::releaseResource(ResourceCollection& collection, const RefVectorWithLeader& vp_list) +void VirtualParticleSet::releaseResource(ResourceCollection& collection, + const 
RefVectorWithLeader& vp_list) { collection.takebackResource(std::move(vp_list.getLeader().mw_mem_)); auto p_list = RefVectorWithLeaderParticleSet(vp_list); @@ -102,7 +104,8 @@ void VirtualParticleSet::makeMoves(int jel, int iat) { if (sphere && iat < 0) - throw std::runtime_error("VirtualParticleSet::makeMoves is invoked incorrectly, the flag sphere=true requires iat specified!"); + throw std::runtime_error( + "VirtualParticleSet::makeMoves is invoked incorrectly, the flag sphere=true requires iat specified!"); onSphere = sphere; refPtcl = jel; refSourcePtcl = iat; @@ -113,11 +116,11 @@ void VirtualParticleSet::makeMoves(int jel, } void VirtualParticleSet::mw_makeMoves(const RefVectorWithLeader& vp_list, - const RefVector>& deltaV_list, - const RefVector>& joblist, - bool sphere) + const RefVector>& deltaV_list, + const RefVector>& joblist, + bool sphere) { - auto& vp_leader = vp_list.getLeader(); + auto& vp_leader = vp_list.getLeader(); vp_leader.onSphere = sphere; const size_t nVPs = countVPs(vp_list); @@ -140,7 +143,7 @@ void VirtualParticleSet::mw_makeMoves(const RefVectorWithLeader -struct J2OMPTargetMultiWalkerMem: public Resource +struct J2OMPTargetMultiWalkerMem : public Resource { // fused buffer for fast transfer Vector> transfer_buffer; @@ -129,16 +129,17 @@ void J2OMPTarget::evaluateRatios(const VirtualParticleSet& VP, std::vector void J2OMPTarget::mw_evaluateRatios(const RefVectorWithLeader& wfc_list, - const RefVectorWithLeader& vp_list, - std::vector>& ratios) const + const RefVectorWithLeader& vp_list, + std::vector>& ratios) const { // add early return to prevent from accessing vp_list[0] - if (wfc_list.size() == 0) return; - auto& wfc_leader = wfc_list.getCastedLeader>(); - auto& vp_leader = vp_list.getLeader(); + if (wfc_list.size() == 0) + return; + auto& wfc_leader = wfc_list.getCastedLeader>(); + auto& vp_leader = vp_list.getLeader(); const auto& mw_refPctls = vp_leader.getMultiWalkerRefPctls(); - auto& mw_vals = wfc_leader.mw_mem_->mw_vals; - 
const int nw = wfc_list.size(); + auto& mw_vals = wfc_leader.mw_mem_->mw_vals; + const int nw = wfc_list.size(); const size_t nVPs = mw_refPctls.size(); mw_vals.resize(nVPs); @@ -148,13 +149,15 @@ void J2OMPTarget::mw_evaluateRatios(const RefVectorWithLeadertransfer_buffer); + FT::mw_evaluateV(NumGroups, F.data() + igt * NumGroups, g_first.data(), g_last.data(), nVPs, mw_refPctls.data(), + dt_leader.getMultiWalkerDataPtr(), dt_leader.getPerTargetPctlStrideSize(), mw_vals.data(), + wfc_leader.mw_mem_->transfer_buffer); size_t ivp = 0; for (int iw = 0; iw < nw; ++iw) { const VirtualParticleSet& vp = vp_list[iw]; - const auto& wfc = wfc_list.getCastedElement>(iw); + const auto& wfc = wfc_list.getCastedElement>(iw); for (int k = 0; k < vp.getTotalNum(); ++k, ivp++) ratios[iw][k] = std::exp(wfc.Uat[mw_refPctls[ivp]] - mw_vals[ivp]); } @@ -192,8 +195,8 @@ void J2OMPTarget::copyFromBuffer(ParticleSet& P, WFBufferType& buf) template typename J2OMPTarget::LogValueType J2OMPTarget::updateBuffer(ParticleSet& P, - WFBufferType& buf, - bool fromscratch) + WFBufferType& buf, + bool fromscratch) { evaluateGL(P, P.G, P.L, false); buf.forward(Bytes_in_WFBuffer); @@ -234,7 +237,9 @@ typename J2OMPTarget::posT J2OMPTarget::accumulateG(const valT* restrict template J2OMPTarget::J2OMPTarget(const std::string& obj_name, ParticleSet& p) - : WaveFunctionComponent("J2OMPTarget", obj_name), my_table_ID_(p.addTable(p, DTModes::NEED_TEMP_DATA_ON_HOST)), j2_ke_corr_helper(p, F) + : WaveFunctionComponent("J2OMPTarget", obj_name), + my_table_ID_(p.addTable(p, DTModes::NEED_TEMP_DATA_ON_HOST)), + j2_ke_corr_helper(p, F) { if (myName.empty()) throw std::runtime_error("J2OMPTarget object name cannot be empty!"); @@ -346,12 +351,12 @@ std::unique_ptr J2OMPTarget::makeClone(ParticleSet& t */ template void J2OMPTarget::computeU3(const ParticleSet& P, - int iat, - const DistRow& dist, - RealType* restrict u, - RealType* restrict du, - RealType* restrict d2u, - bool triangle) + int iat, + const DistRow& 
dist, + RealType* restrict u, + RealType* restrict du, + RealType* restrict d2u, + bool triangle) { const int jelmax = triangle ? iat : N; constexpr valT czero(0); @@ -529,22 +534,21 @@ void J2OMPTarget::recompute(const ParticleSet& P) template void J2OMPTarget::mw_completeUpdates(const RefVectorWithLeader& wfc_list) const -{ -} +{} template typename J2OMPTarget::LogValueType J2OMPTarget::evaluateLog(const ParticleSet& P, - ParticleSet::ParticleGradient_t& G, - ParticleSet::ParticleLaplacian_t& L) + ParticleSet::ParticleGradient_t& G, + ParticleSet::ParticleLaplacian_t& L) { return evaluateGL(P, G, L, true); } template WaveFunctionComponent::LogValueType J2OMPTarget::evaluateGL(const ParticleSet& P, - ParticleSet::ParticleGradient_t& G, - ParticleSet::ParticleLaplacian_t& L, - bool fromscratch) + ParticleSet::ParticleGradient_t& G, + ParticleSet::ParticleLaplacian_t& L, + bool fromscratch) { if (fromscratch) recompute(P); diff --git a/src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.cpp b/src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.cpp index cb7ae195d8..164ac9892f 100644 --- a/src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.cpp +++ b/src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.cpp @@ -161,7 +161,9 @@ typename J2OrbitalSoA::posT J2OrbitalSoA::accumulateG(const valT* restri template J2OrbitalSoA::J2OrbitalSoA(const std::string& obj_name, ParticleSet& p) - : WaveFunctionComponent("J2OrbitalSoA", obj_name), my_table_ID_(p.addTable(p, DTModes::NEED_TEMP_DATA_ON_HOST)), j2_ke_corr_helper(p, F) + : WaveFunctionComponent("J2OrbitalSoA", obj_name), + my_table_ID_(p.addTable(p, DTModes::NEED_TEMP_DATA_ON_HOST)), + j2_ke_corr_helper(p, F) { if (myName.empty()) throw std::runtime_error("J2OrbitalSoA object name cannot be empty!"); From 887828f959543a2ca92a9cd2b9ba8ae53b98b16e Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Mon, 26 Jul 2021 15:23:04 -0500 Subject: [PATCH 12/13] Fix multiple build variants. 
--- src/QMCWaveFunctions/Jastrow/CudaSpline.h | 5 +++-- src/QMCWaveFunctions/TrialWaveFunction.cpp | 2 +- .../tests/benchmark_DiracMatrixComputeCUDA.cpp | 12 ++++++------ .../tests/test_DiracMatrixComputeCUDA.cpp | 8 ++++---- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/QMCWaveFunctions/Jastrow/CudaSpline.h b/src/QMCWaveFunctions/Jastrow/CudaSpline.h index ec10e0da90..9fb2099c9f 100644 --- a/src/QMCWaveFunctions/Jastrow/CudaSpline.h +++ b/src/QMCWaveFunctions/Jastrow/CudaSpline.h @@ -28,11 +28,12 @@ struct CudaSpline template void set(BsplineFunctor& func) { - int num_coefs = func.SplineCoefs.size(); + const auto& spline_coefs = *func.spline_coefs_; + const int num_coefs = spline_coefs.size(); gpu::host_vector coefs_h(num_coefs); for (int i = 0; i < num_coefs; i++) { - coefs_h[i] = func.SplineCoefs[i]; + coefs_h[i] = spline_coefs[i]; // app_log() << "coefs_h[" << i << "] = " << coefs_h[i] << std::endl; } coefs = coefs_h; diff --git a/src/QMCWaveFunctions/TrialWaveFunction.cpp b/src/QMCWaveFunctions/TrialWaveFunction.cpp index 79df11a5ba..b6a7b6d946 100644 --- a/src/QMCWaveFunctions/TrialWaveFunction.cpp +++ b/src/QMCWaveFunctions/TrialWaveFunction.cpp @@ -1009,7 +1009,7 @@ void TrialWaveFunction::mw_evaluateRatios(const RefVectorWithLeader& ratios = ratios_list[iw]; - assert(vp_list[iw].get().getTotalNum() == ratios.size()); + assert(vp_list[iw].getTotalNum() == ratios.size()); std::fill(ratios.begin(), ratios.end(), 1.0); t[iw].resize(ratios.size()); } diff --git a/src/QMCWaveFunctions/tests/benchmark_DiracMatrixComputeCUDA.cpp b/src/QMCWaveFunctions/tests/benchmark_DiracMatrixComputeCUDA.cpp index 5a72ce5ca9..bfbbb4143d 100644 --- a/src/QMCWaveFunctions/tests/benchmark_DiracMatrixComputeCUDA.cpp +++ b/src/QMCWaveFunctions/tests/benchmark_DiracMatrixComputeCUDA.cpp @@ -37,17 +37,17 @@ namespace qmcplusplus { #ifdef ENABLE_OFFLOAD -template -using OffloadPinnedAllocator = OMPallocator>; + template + using DualSpacePinnedAllocator = 
OMPallocator>; #elif ENABLE_CUDA -template -using OffloadPinnedAllocator = DualAllocator, PinnedAlignedAllocator>; + template + using DualSpacePinnedAllocator = DualAllocator, PinnedAlignedAllocator>; #endif template -using OffloadPinnedMatrix = Matrix>; +using OffloadPinnedMatrix = Matrix>; template -using OffloadPinnedVector = Vector>; +using OffloadPinnedVector = Vector>; // Mechanism to pretty print benchmark names. struct DiracComputeBenchmarkParameters; diff --git a/src/QMCWaveFunctions/tests/test_DiracMatrixComputeCUDA.cpp b/src/QMCWaveFunctions/tests/test_DiracMatrixComputeCUDA.cpp index 3c0dd84bc3..63f4cbde77 100644 --- a/src/QMCWaveFunctions/tests/test_DiracMatrixComputeCUDA.cpp +++ b/src/QMCWaveFunctions/tests/test_DiracMatrixComputeCUDA.cpp @@ -32,16 +32,16 @@ namespace qmcplusplus { #ifdef ENABLE_OFFLOAD template - using OffloadPinnedAllocator = OMPallocator>; + using DualSpacePinnedAllocator = OMPallocator>; #elif ENABLE_CUDA template - using OffloadPinnedAllocator = DualAllocator, PinnedAlignedAllocator>; + using DualSpacePinnedAllocator = DualAllocator, PinnedAlignedAllocator>; #endif template -using OffloadPinnedMatrix = Matrix>; +using OffloadPinnedMatrix = Matrix>; template -using OffloadPinnedVector = Vector>; +using OffloadPinnedVector = Vector>; TEST_CASE("DiracMatrixComputeCUDA_cuBLAS_geam_call", "[wavefunction][fermion]") { From c81106d07df87f1ef84d07e15944ad376625a793 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Mon, 26 Jul 2021 18:28:52 -0500 Subject: [PATCH 13/13] Update comments and license header. 
--- src/Particle/VirtualParticleSet.cpp | 8 +++----- src/Particle/VirtualParticleSet.h | 2 +- src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp | 2 +- src/QMCWaveFunctions/Jastrow/J2OMPTarget.h | 4 ++-- src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.cpp | 2 +- src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.h | 2 +- src/QMCWaveFunctions/TrialWaveFunction.h | 3 +++ src/QMCWaveFunctions/WaveFunctionComponent.h | 8 +++++--- 8 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/Particle/VirtualParticleSet.cpp b/src/Particle/VirtualParticleSet.cpp index f04ef5eae6..5b800bb62e 100644 --- a/src/Particle/VirtualParticleSet.cpp +++ b/src/Particle/VirtualParticleSet.cpp @@ -2,7 +2,7 @@ // This file is distributed under the University of Illinois/NCSA Open Source License. // See LICENSE file in top directory for details. // -// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. +// Copyright (c) 2021 QMCPACK developers. // // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory // Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign @@ -56,15 +56,13 @@ VirtualParticleSet::~VirtualParticleSet() = default; Vector>& VirtualParticleSet::getMultiWalkerRefPctls() { - if (!mw_mem_) - throw std::runtime_error("VirtualParticleSet::getMultiWalkerRefPctls mw_mem_ is nullptr"); + assert(mw_mem_ != nullptr); return mw_mem_->mw_refPctls; } const Vector>& VirtualParticleSet::getMultiWalkerRefPctls() const { - if (!mw_mem_) - throw std::runtime_error("VirtualParticleSet::getMultiWalkerRefPctls mw_mem_ is nullptr"); + assert(mw_mem_ != nullptr); return mw_mem_->mw_refPctls; } diff --git a/src/Particle/VirtualParticleSet.h b/src/Particle/VirtualParticleSet.h index 4324cfd419..c629e47527 100644 --- a/src/Particle/VirtualParticleSet.h +++ b/src/Particle/VirtualParticleSet.h @@ -2,7 +2,7 @@ // This file is distributed under the University of Illinois/NCSA Open Source License. // See LICENSE file in top directory for details. 
// -// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. +// Copyright (c) 2021 QMCPACK developers. // // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory // Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign diff --git a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp index e0be459daf..523bdc5034 100644 --- a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp +++ b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.cpp @@ -2,7 +2,7 @@ // This file is distributed under the University of Illinois/NCSA Open Source License. // See LICENSE file in top directory for details. // -// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. +// Copyright (c) 2021 QMCPACK developers. // // File developed by: Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp. // Amrita Mathuriya, amrita.mathuriya@intel.com, Intel Corp. diff --git a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.h b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.h index 3198d8e189..8f630cf775 100644 --- a/src/QMCWaveFunctions/Jastrow/J2OMPTarget.h +++ b/src/QMCWaveFunctions/Jastrow/J2OMPTarget.h @@ -2,13 +2,13 @@ // This file is distributed under the University of Illinois/NCSA Open Source License. // See LICENSE file in top directory for details. // -// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. +// Copyright (c) 2021 QMCPACK developers. // // File developed by: Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp. // Amrita Mathuriya, amrita.mathuriya@intel.com, Intel Corp. // Ye Luo, yeluo@anl.gov, Argonne National Laboratory // -// File created by: Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp. 
+// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory ////////////////////////////////////////////////////////////////////////////////////// // -*- C++ -*- #ifndef QMCPLUSPLUS_TWOBODYJASTROW_OMPTARGET_H diff --git a/src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.cpp b/src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.cpp index 164ac9892f..24d4789a32 100644 --- a/src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.cpp +++ b/src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.cpp @@ -2,7 +2,7 @@ // This file is distributed under the University of Illinois/NCSA Open Source License. // See LICENSE file in top directory for details. // -// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. +// Copyright (c) 2021 QMCPACK developers. // // File developed by: Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp. // Amrita Mathuriya, amrita.mathuriya@intel.com, Intel Corp. diff --git a/src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.h b/src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.h index 74f966e744..87e8487416 100644 --- a/src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.h +++ b/src/QMCWaveFunctions/Jastrow/J2OrbitalSoA.h @@ -2,7 +2,7 @@ // This file is distributed under the University of Illinois/NCSA Open Source License. // See LICENSE file in top directory for details. // -// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. +// Copyright (c) 2021 QMCPACK developers. // // File developed by: Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp. // Amrita Mathuriya, amrita.mathuriya@intel.com, Intel Corp. diff --git a/src/QMCWaveFunctions/TrialWaveFunction.h b/src/QMCWaveFunctions/TrialWaveFunction.h index 142d90055a..52ed3918a1 100644 --- a/src/QMCWaveFunctions/TrialWaveFunction.h +++ b/src/QMCWaveFunctions/TrialWaveFunction.h @@ -376,6 +376,9 @@ class TrialWaveFunction int iat, const std::vector& isAccepted, bool safe_to_delay = false); + + /** complete all the delayed or asynchronous operations before leaving the p-by-p move region. 
+ * See WaveFunctionComponent::completeUpdates for more detail */ void completeUpdates(); /* batched version of completeUpdates. */ static void mw_completeUpdates(const RefVectorWithLeader& wf_list); diff --git a/src/QMCWaveFunctions/WaveFunctionComponent.h b/src/QMCWaveFunctions/WaveFunctionComponent.h index 1ae9075718..873ea2613a 100644 --- a/src/QMCWaveFunctions/WaveFunctionComponent.h +++ b/src/QMCWaveFunctions/WaveFunctionComponent.h @@ -334,12 +334,14 @@ struct WaveFunctionComponent : public QMCTraits const std::vector& isAccepted, bool safe_to_delay = false) const; - /** complete all the delayed updates, must be called after each substep or step during pbyp move + /** complete all the delayed or asynchronous operations before leaving the p-by-p move region. + * Must be called at the end of each substep if p-by-p move is used. + * This function was initially introduced for determinant delayed updates to complete all the delayed operations. + * It has been extended to handle asynchronous operations on accelerators before leaving the p-by-p move region. */ virtual void completeUpdates() {} - /** complete all the delayed updates for all the walkers in a batch - * must be called after each substep or step during pbyp move + /** complete all the delayed or asynchronous operations for all the walkers in a batch before leaving the p-by-p move region. */ virtual void mw_completeUpdates(const RefVectorWithLeader& wfc_list) const;