Merge pull request #4499 from ye-luo/tidy-up-qmc_cuda
Complete remaining QMC_CUDA related tasks
prckent authored Mar 6, 2023
2 parents c2435c6 + 48a0317 commit 58e98ca
Showing 22 changed files with 70 additions and 752 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,10 @@

Notable changes to QMCPACK are documented in this file.

+## [Unreleased]
+
+The legacy CUDA implementation, the version built with QMC_CUDA=1, has been removed from the codebase.
+
## [3.16.0] - 2023-01-31

This release contains important bug fixes as well as feature improvements. It is a recommended release for all users. Thanks to
4 changes: 0 additions & 4 deletions CMake/ctest_script.cmake
@@ -174,10 +174,6 @@ if(DEFINED RMG_BIN)
set(CTEST_OPTIONS "${CTEST_OPTIONS};-DRMG_BIN='${RMG_BIN}'")
endif()

-if(DEFINED QMC_CUDA)
-  set(CTEST_OPTIONS "${CTEST_OPTIONS};-DQMC_CUDA=${QMC_CUDA}")
-endif()
-
if(DEFINED ENABLE_CUDA)
set(CTEST_OPTIONS "${CTEST_OPTIONS};-DENABLE_CUDA=${ENABLE_CUDA}")
endif()
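
Note: this script is driven through CTest's scripting mode, and variables defined with `-D` on the command line are forwarded into `CTEST_OPTIONS` by the blocks above; with QMC_CUDA gone, only the remaining switches are passed through. A minimal sketch of such a run, with placeholder paths and option values (not part of this commit):

```bash
# Hedged example: -D definitions become CMake variables visible to
# ctest_script.cmake, which appends them to CTEST_OPTIONS.
ctest -DENABLE_CUDA=ON -DQMC_COMPLEX=1 \
      -S /path/to/qmcpack/CMake/ctest_script.cmake,release -VV
```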
4 changes: 2 additions & 2 deletions CMake/macros.cmake
@@ -192,9 +192,9 @@ function(
APPEND
PROPERTY LABELS "QMCPACK")

-  if(QMC_CUDA
-     OR ENABLE_CUDA
+  if(ENABLE_CUDA
OR ENABLE_ROCM
OR ENABLE_SYCL
OR ENABLE_OFFLOAD)
set_tests_properties(${TESTNAME} PROPERTIES RESOURCE_LOCK exclusively_owned_gpus)
endif()
4 changes: 2 additions & 2 deletions CMake/test_labels.cmake
@@ -1,12 +1,12 @@
function(ADD_TEST_LABELS TEST_NAME TEST_LABELS)
set(TEST_LABELS_TEMP "")
-  set(TEST_LABELS_UNIQUE_NAME TEST_LABELS_${TEST_NAME}_${QMC_CUDA}_${QMC_COMPLEX}_${QMC_MIXED_PRECISION})
+  set(TEST_LABELS_UNIQUE_NAME TEST_LABELS_${TEST_NAME}_${QMC_COMPLEX}_${QMC_MIXED_PRECISION})
if(DEFINED ${TEST_LABELS_UNIQUE_NAME})
set(TEST_LABELS_TEMP ${${TEST_LABELS_UNIQUE_NAME}})
else()
set(SUCCESS FALSE)
execute_process(
-    COMMAND ${Python3_EXECUTABLE} ${qmcpack_SOURCE_DIR}/tests/scripts/test_labels.py ${TEST_NAME} ${QMC_CUDA}
+    COMMAND ${Python3_EXECUTABLE} ${qmcpack_SOURCE_DIR}/tests/scripts/test_labels.py ${TEST_NAME}
${QMC_COMPLEX} ${QMC_MIXED_PRECISION}
OUTPUT_VARIABLE TEST_LABELS_TEMP
RESULT_VARIABLE SUCCESS)
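
For reference, the labeling script can also be exercised by hand with the same argument order as the `execute_process` call above; the test name and flag values below are placeholders:

```bash
# QMC_CUDA is no longer passed; the remaining arguments are
# TEST_NAME, QMC_COMPLEX, and QMC_MIXED_PRECISION.
python3 tests/scripts/test_labels.py some-test-name 0 0
```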
4 changes: 2 additions & 2 deletions CMake/unit_test.cmake
@@ -21,9 +21,9 @@ function(ADD_UNIT_TEST TESTNAME PROCS THREADS TEST_BINARY)
set_tests_properties(${TESTNAME} PROPERTIES PROCESSORS ${TOT_PROCS} ENVIRONMENT OMP_NUM_THREADS=${THREADS}
PROCESSOR_AFFINITY TRUE)

-  if(QMC_CUDA
-     OR ENABLE_CUDA
+  if(ENABLE_CUDA
OR ENABLE_ROCM
OR ENABLE_SYCL
OR ENABLE_OFFLOAD)
set_tests_properties(${TESTNAME} PROPERTIES RESOURCE_LOCK exclusively_owned_gpus)
endif()
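
The RESOURCE_LOCK property makes CTest serialize every test holding the same lock name, so GPU builds avoid oversubscribing devices even in parallel test sessions. An illustrative invocation (assuming the usual `unit` label; not part of this diff):

```bash
# Tests holding the exclusively_owned_gpus lock run one at a time,
# while unrelated tests still execute in parallel.
ctest -j 16 -L unit --output-on-failure
```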
6 changes: 1 addition & 5 deletions CMakeLists.txt
@@ -52,11 +52,7 @@ include(CMakeDependentOption)
option(QMC_MPI "Enable/disable MPI" ON)
option(QMC_OMP "Enable/disable OpenMP" ON)
option(QMC_COMPLEX "Build for complex binary" OFF)
-option(QMC_CUDA "Build with GPU support through CUDA" OFF)
-option(
-  ENABLE_CUDA
-  "Build with the second generation of GPU support through CUDA (production quality for AFQMC, experimental for real space)"
-  OFF)
+option(ENABLE_CUDA "Build with GPU support through CUDA" OFF)
option(QMC_CUDA2HIP "Map all CUDA kernels and library calls to HIP" OFF)
if(QMC_CUDA2HIP OR ENABLE_HIP)
set(ENABLE_ROCM ON) # option(ENABLE_ROCM) will be no-op
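
With the QMC_CUDA option deleted, ENABLE_CUDA becomes the single switch for NVIDIA builds. A minimal configure sketch, with a placeholder source path and an example architecture value:

```bash
# 80 targets A100-class GPUs; the project default is 70 (V100).
cmake -DENABLE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=80 /path/to/qmcpack
make -j 8
```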
41 changes: 21 additions & 20 deletions README.md
@@ -153,28 +153,26 @@ make -j 8
CMAKE_CXX_FLAGS_RELWITHDEBINFO
```

-* Key QMC build options
-
-```
-QMC_CUDA            Enable legacy CUDA code path for NVIDIA GPU acceleration (1:yes, 0:no)
-QMC_COMPLEX         Build the complex (general twist/k-point) version (1:yes, 0:no)
-QMC_MIXED_PRECISION Build the mixed precision (mixing double/float) version
-                    (1:yes (GPU default), 0:no (CPU default)).
-                    The CPU support is experimental.
-                    Use float and double for base and full precision.
-                    The GPU support is quite mature.
-                    Use always double for host side base and full precision
-                    and use float and double for CUDA base and full precision.
-ENABLE_CUDA         ON/OFF(default). Enable CUDA code path for NVIDIA GPU acceleration.
-                    Production quality for AFQMC. Pre-production quality for real-space.
-                    Use CMAKE_CUDA_ARCHITECTURES, default 70, to set the actual GPU architecture.
-ENABLE_OFFLOAD      ON/OFF(default). Experimental feature. Enable OpenMP target offload for GPU acceleration.
-ENABLE_TIMERS       ON(default)/OFF. Enable fine-grained timers. Timers are on by default but at level coarse
-                    to avoid potential slowdown in tiny systems.
-                    For systems beyond tiny sizes (100+ electrons) there is no risk.
+* Key QMCPACK build options
+
+```
+QMC_COMPLEX         ON/OFF(default). Build the complex (general twist/k-point) version.
+QMC_MIXED_PRECISION ON/OFF(default). Build the mixed precision (mixing double/float) version.
+                    Mixed precision calculations can be significantly faster but should be
+                    carefully validated against full double precision runs,
+                    particularly for large electron counts.
+ENABLE_OFFLOAD      ON/OFF(default). Enable OpenMP target offload for GPU acceleration.
+ENABLE_CUDA         ON/OFF(default). Enable CUDA code path for NVIDIA GPU acceleration.
+                    Production quality for AFQMC and real-space performance portable implementation.
+                    Use CMAKE_CUDA_ARCHITECTURES, default 70, to select the actual GPU architecture.
+QMC_CUDA2HIP        ON/OFF(default). Map all CUDA kernels and library calls to HIP and use ROCm libraries.
+                    Set both ENABLE_CUDA and QMC_CUDA2HIP ON to target AMD GPUs.
+                    Use CMAKE_HIP_ARCHITECTURES, default gfx906, to select the actual GPU architecture.
+ENABLE_SYCL         ON/OFF(default). Enable SYCL code path. Only supports Intel GPUs and oneAPI compilers.
```

-* Additional QMC options
+* Additional QMCPACK options

```
QE_BIN Location of Quantum Espresso binaries including pw2qmcpack.x
@@ -187,6 +185,9 @@ make -j 8
saving default use of symbolic links for test files. Useful
if the build is on a separate filesystem from the source, as
required on some HPC systems.
+ENABLE_TIMERS       ON(default)/OFF. Enable fine-grained timers. Timers are on by default but at level coarse
+                    to avoid potential slowdown in tiny systems.
+                    For systems beyond tiny sizes (100+ electrons) there is no risk.
```
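
Taken together, a typical GPU configure line combining the options documented above might look as follows; all values are examples, not prescriptions from this commit:

```bash
# Complex, mixed-precision, CUDA-accelerated build from a build
# subdirectory; set the architecture to match the target device.
cmake -DQMC_COMPLEX=ON -DQMC_MIXED_PRECISION=ON \
      -DENABLE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=80 ..
```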

* libxml2 related
2 changes: 0 additions & 2 deletions config/build_olcf_andes.sh
@@ -15,8 +15,6 @@ export LAPACK_LIBS="$BLAS_LIBS $OLCF_NETLIB_LAPACK_ROOT/lib64/liblapack.a"

declare -A builds=( ["cpu"]="-DBUILD_PPCONVERT=1" \
["complex_cpu"]="-DQMC_COMPLEX=1" \
# ["legacy_gpu"]="-DQMC_CUDA=1 " \
# ["complex_legacy_gpu"]="-DQMC_CUDA=1 -DQMC_COMPLEX=1 " \
)

mkdir bin_andes
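
The surviving build matrix is keyed by variant name; a hedged sketch of how such an array is typically consumed (the loop and directory layout are assumptions, not shown in this diff):

```bash
# Hypothetical consumer of the builds array; QMCPACK_SRC is a
# placeholder for the path to the source checkout.
for build in "${!builds[@]}"; do
  mkdir -p bin_andes/build_${build}
  (cd bin_andes/build_${build} && cmake ${builds[$build]} "$QMCPACK_SRC" && make -j 8)
done
```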
41 changes: 0 additions & 41 deletions config/build_olcf_summit.sh

This file was deleted.

13 changes: 0 additions & 13 deletions config/load_olcf_summit_modules.sh

This file was deleted.

57 changes: 8 additions & 49 deletions docs/features.rst
@@ -67,8 +67,6 @@ feature that you are interested in, check the remainder of this manual or ask if

- OpenMP-offload-based performance portable GPU implementation, see :ref:`gpufeatures`.

-- Legacy GPU (NVIDIA CUDA) implementation (limited functionality - see :ref:`gpufeatures`).
-
- Analysis tools for minimal environments (Perl only) through to
Python-based environments with graphs produced via matplotlib (included with Nexus).

@@ -117,17 +115,14 @@ description of the available features, see :ref:`afqmc`.
Supported GPU features for real space QMC
-----------------------------------------

-There are two GPU implementations in the code base.
-
-- **Performance portable implementation** (recommended). Implements real space QMC methods
-  using OpenMP offload programming model and accelerated linear algebra libraries.
-  Runs with good performance on NVIDIA and AMD GPUs, and the Intel GPU support is under development.
-  Unlike the "legacy" implementation, it is feature complete
-  and users may mix and match CPU-only and GPU-accelerated features.
-
-- **Legacy implementation**. Fully based on NVIDIA CUDA. Achieves very good speedup on NVIDIA GPUs.
-  However, only a very limited subset of features is available.
+The **Performance portable implementation** implements real space QMC methods
+using the OpenMP offload programming model and accelerated linear algebra libraries.
+It runs with good performance on NVIDIA and AMD GPUs; Intel GPU support is under development.
+Unlike the "legacy" implementation, it is feature complete,
+and users may mix and match CPU-only and GPU-accelerated features.
+Using the batched QMC drivers is required.

+The **Legacy implementation**, fully based on NVIDIA CUDA, has been removed.

QMCPACK supports running on multi-GPU node architectures via MPI.

@@ -150,7 +145,7 @@ Supported GPU features:
+--------------------------------+---------------------------+------------------+
| LCAO orbitals                  | on host now, being ported | not supported    |
+--------------------------------+---------------------------+------------------+
-| One-body Jastrow factors       | on host                   | accelerated      |
+| One-body Jastrow factors       | accelerated               | accelerated      |
+--------------------------------+---------------------------+------------------+
| Two-body Jastrow factors       | accelerated               | accelerated      |
+--------------------------------+---------------------------+------------------+

@@ -167,40 +162,4 @@ Supported GPU features:

| Model periodic Coulomb (MPC)   | on host                   | accelerated      |
+--------------------------------+---------------------------+------------------+

-Additional information:
-
-- Performance portable implementation requires using batched QMC drivers.
-
-- Legacy CUDA implementation only supports T-move v0 or no T-move.
-
-- In most features, the algorithmic and implementation details differ a lot between these two GPU implementations.
-
-Sharing of spline data across multiple GPUs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Sharing of GPU spline data enables distribution of the data across
-multiple GPUs on a given computational node. For example, on a
-two-GPU-per-node system, each GPU would have half of the orbitals. This
-allows use of larger overall spline tables than would fit in the memory
-of individual GPUs and potentially up to the total GPU memory on a node.
-To obtain high performance, large electron counts or a high-performing
-CPU-GPU interconnect is required.
-This feature is only supported in the legacy implementation.
-
-To use this feature, the following needs to be done:
-
-- The CUDA Multi-Process Service (MPS) needs to be used (e.g., on OLCF
-  Summit/SummitDev use "-alloc_flags gpumps" for bsub). If MPS is not
-  detected, sharing will be disabled.
-
-- CUDA_VISIBLE_DEVICES needs to be properly set to control each rank's
-  visible CUDA devices (e.g., on OLCF Summit/SummitDev create a
-  resource set containing all GPUs with the respective number of ranks
-  with "jsrun --task-per-rs Ngpus -g Ngpus").
-
-- In the determinant set definition of the <wavefunction> section, the
-  "gpusharing" parameter needs to be set (i.e., <determinantset
-  gpusharing="yes">). See
-  :ref:`spo-spline`.

.. bibliography:: /bibs/features.bib
9 changes: 1 addition & 8 deletions docs/hamiltonianobservable.rst
@@ -112,8 +112,6 @@ Many pair potentials are supported. Though only the most commonly used pair pot
+------------------+---------+-----------------------------------------------+
|                  | mpc     | Model periodic Coulomb interaction/correction |
+------------------+---------+-----------------------------------------------+
-|                  | cpp     | Core polarization potential                   |
-+------------------+---------+-----------------------------------------------+
|                  | skpot   | *Unknown*                                     |
+------------------+---------+-----------------------------------------------+

@@ -148,16 +146,11 @@ Additional information:
of the classical/quantum ``particleset``.

- Only ``Coulomb, pseudo``, and ``mpc`` are described in detail in the
-  following subsections. The older or less-used types (``cpp, skpot``)
+  following subsections. The older or less-used types (``skpot``)
are not covered.

-- Available only if ``QMC_CUDA`` is not defined: ``skpot``.
-
- Available only if ``OHMMS_DIM==3``: ``mpc, vhxc, pseudo``.

-- Available only if ``OHMMS_DIM==3`` and ``QMC_CUDA`` is not defined:
-  ``cpp``.
-
Coulomb potentials
~~~~~~~~~~~~~~~~~~

13 changes: 6 additions & 7 deletions docs/installation.rst
@@ -282,19 +282,18 @@ the path to the source directory.

::

-QMC_COMPLEX         Build the complex (general twist/k-point) version (1:yes, 0:no)
-QMC_MIXED_PRECISION Build the mixed precision (mixing double/float) version
-                    (1:yes (QMC_CUDA=1 default), 0:no (QMC_CUDA=0 default)).
+QMC_COMPLEX         ON/OFF(default). Build the complex (general twist/k-point) version.
+QMC_MIXED_PRECISION ON/OFF(default). Build the mixed precision (mixing double/float) version.
+                    Mixed precision calculations can be significantly faster but should be
+                    carefully validated against full double precision runs,
+                    particularly for large electron counts.
ENABLE_OFFLOAD      ON/OFF(default). Enable OpenMP target offload for GPU acceleration.
-QMC_CUDA            Enable legacy CUDA code path for NVIDIA GPU acceleration (1:yes, 0:no)
ENABLE_CUDA         ON/OFF(default). Enable CUDA code path for NVIDIA GPU acceleration.
                    Production quality for AFQMC and real-space performance portable implementation.
-                    Use CMAKE_CUDA_ARCHITECTURES, default 70, to set the actual GPU architecture.
-QMC_CUDA2HIP        ON/OFF(default). To be set ON, it requires either QMC_CUDA or ENABLE_CUDA to be ON.
-                    Compile CUDA source code as HIP and use ROCm libraries for AMD GPUs.
+                    Use CMAKE_CUDA_ARCHITECTURES, default 70, to select the actual GPU architecture.
QMC_CUDA2HIP        ON/OFF(default). Map all CUDA kernels and library calls to HIP and use ROCm libraries.
                    Set both ENABLE_CUDA and QMC_CUDA2HIP ON to target AMD GPUs.
                    Use CMAKE_HIP_ARCHITECTURES, default gfx906, to select the actual GPU architecture.
ENABLE_SYCL         ON/OFF(default). Enable SYCL code path. Only supports Intel GPUs and oneAPI compilers.
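
As a concrete illustration of the QMC_CUDA2HIP path described above (the source path and architecture value are examples; gfx90a corresponds to MI200-series GPUs):

```bash
# AMD GPU build: CUDA sources compile as HIP against ROCm libraries.
cmake -DENABLE_CUDA=ON -DQMC_CUDA2HIP=ON \
      -DCMAKE_HIP_ARCHITECTURES=gfx90a /path/to/qmcpack
```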

- General build options
30 changes: 0 additions & 30 deletions docs/intro_wavefunction.rst
@@ -265,10 +265,6 @@ attribute:
+-----------------------------+------------+--------------------------+---------+-------------------------------------------+
| ``gpu``                     | Text       | Yes/no                   |         | GPU switch.                               |
+-----------------------------+------------+--------------------------+---------+-------------------------------------------+
-| ``gpusharing``              | Text       | Yes/no                   | No      | Share B-spline table across GPUs.         |
-+-----------------------------+------------+--------------------------+---------+-------------------------------------------+
-| ``Spline_Size_Limit_MB``    | Integer    |                          |         | Limit B-spline table size on GPU.         |
-+-----------------------------+------------+--------------------------+---------+-------------------------------------------+
| ``check_orb_norm``          | Text       | Yes/no                   | Yes     | Check norms of orbitals from h5 file.     |
+-----------------------------+------------+--------------------------+---------+-------------------------------------------+
| ``save_coefs``              | Text       | Yes/no                   | No      | Save the spline coefficients to h5 file.  |
Expand Down Expand Up @@ -314,32 +310,6 @@ Additional information:
scratch memory on the compute nodes, users can perform this step on
fat nodes and transfer back the h5 file for QMC calculations.

-- gpusharing
-  If enabled, spline data is shared across multiple
-  GPUs on a given computational node. For example, on a
-  two-GPU-per-node system, each GPU would have half of the orbitals.
-  This enables larger overall spline tables than would normally fit in
-  the memory of individual GPUs to be used, potentially up to the total
-  GPU memory on a node. To obtain high performance, large electron
-  counts or a high-performing CPU-GPU interconnect is required. To use
-  this feature, the following needs to be done:
-
-  - The CUDA Multi-Process Service (MPS) needs to be used (e.g., on
-    Summit use "-alloc_flags gpumps" for bsub). If MPS is not
-    detected, sharing will be disabled.
-
-  - CUDA_VISIBLE_DEVICES needs to be properly set to control each rank's
-    visible CUDA devices (e.g., on OLCF Summit one needs to
-    create a resource set containing all GPUs with the respective number
-    of ranks with "jsrun --task-per-rs Ngpus -g Ngpus").
-
-- Spline_Size_Limit_MB
-  Allows distribution of the B-spline
-  coefficient table between the host and GPU memory. The compute kernels
-  access host memory via zero-copy. Although the performance penalty
-  introduced by it is significant, it allows large calculations to go
-  through.
-
- skip_checks
When converting the wave function from convertpw4qmc instead
of pw2qmcpack, there is missing ionic information. This flag bypasses the requirement
2 changes: 1 addition & 1 deletion docs/introduction.rst
@@ -16,7 +16,7 @@ results in some cases.
QMCPACK is written in C++ and is designed with the modularity afforded by object-oriented programming. High parallel and
computational efficiencies are achievable on the largest supercomputers. Because of the modular architecture, the addition of new
wavefunctions, algorithms, and observables is relatively straightforward. For parallelization, QMCPACK uses a fully hybrid
-(OpenMP,CUDA)/MPI approach to optimize memory usage and to take advantage of the growing number of cores per SMP node or graphical
+OpenMP/MPI approach to optimize memory usage and to take advantage of the growing number of cores per SMP node or graphical
processing units (GPUs) and accelerators. Finally, QMCPACK uses standard file formats for input and output in XML and HDF5 to
facilitate data exchange.

Expand Down