
Merge pull request #1101 from atillack/split_splines_gpu
Split splines across multiple GPUs on a given node
prckent authored Dec 7, 2018
2 parents 7183da7 + 7eda7d2 commit a22d633
Showing 22 changed files with 1,817 additions and 338 deletions.
26 changes: 26 additions & 0 deletions manual/features.tex
@@ -103,3 +103,29 @@ \subsection{Auxiliary-Field Quantum Monte Carlo}
\item Complex implementation for PBC calculations with complex integrals.
\item Sparse representation of large matrices for reduced memory usage.
\end{itemize}

\subsection{Sharing of spline data across multiple GPUs}

Sharing of GPU spline data enables the distribution of the B-spline
coefficient table across multiple GPUs on a given computational node.
For example, on a system with two GPUs per node, each GPU holds half
of the orbitals. This allows spline tables that are too large for the
memory of an individual GPU to be used, potentially up to the total
GPU memory available on a node. To obtain high performance, large
electron counts or a high-performing CPU-GPU interconnect is required.

To use this feature, the following is required (an illustrative
device-mapping sketch follows the list):

\begin{itemize}
\item The CUDA Multi-Process Service (MPS) needs to be used
      (e.g., on Summit/SummitDev pass "-alloc\_flags gpumps" to
      bsub). If MPS is not detected, sharing will be disabled.
\item CUDA\_VISIBLE\_DEVICES needs to be set so that each rank sees
      its intended CUDA devices (e.g., on OLCF Summit/SummitDev one
      needs to create a resource set containing all GPUs with the
      respective number of ranks, using "jsrun --task-per-rs Ngpus -g
      Ngpus").
\item In the determinant set definition of the <wavefunction>
section the "gpusharing" parameter needs to be set
(i.e. <determinantset gpusharing="yes">). See Sec. \ref{sec:spo_spline}.
\end{itemize}
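The following is a minimal, hypothetical sketch of the rank-to-device mapping the list above implies: each on-node MPI rank selects one of the GPUs visible to it, so that under MPS several ranks can share a device and each GPU holds only its slice of the spline table. The function name and printout are illustrative and are not taken from the QMCPACK sources.

// Hypothetical illustration only (not QMCPACK code): assign each on-node
// rank to one of the CUDA devices visible through CUDA_VISIBLE_DEVICES.
#include <cuda_runtime.h>
#include <cstdio>

int assign_device_round_robin(int relative_rank)
{
  int ngpus = 0;
  cudaGetDeviceCount(&ngpus);          // devices visible to this rank
  if (ngpus == 0)
    return -1;                         // no visible device
  int device = relative_rank % ngpus;  // round-robin over visible devices
  cudaSetDevice(device);
  std::printf("on-node rank %d -> CUDA device %d of %d\n",
              relative_rank, device, ngpus);
  return device;
}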
22 changes: 20 additions & 2 deletions manual/spo_spline.tex
@@ -57,13 +57,14 @@ \subsection{Spline basis sets}
\multicolumn{2}{l}{attribute :} & \multicolumn{4}{l}{}\\
& \bfseries name & \bfseries datatype & \bfseries values & \bfseries default & \bfseries description \\
& \texttt{type} & text & bspline & & Type of \texttt{sposet}. \\
& \texttt{href} & text & & & Path to the h5 file made by pw2qmcpack.x. \\
& \texttt{href} & text & & & Path to hdf5 file from pw2qmcpack.x. \\
& \texttt{tilematrix} & 9 integers & & & Tiling matrix used to expand supercell. \\
& \texttt{twistnum} & integer & & & Index of the super twist. \\
& \texttt{twist} & 3 floats & & & Super twist. \\
& \texttt{meshfactor} & float & $\le 1.0$ & & Grid spacing ratio. \\
& \texttt{precision} & text & single/double & & Precision of spline coefficients. \\
& \texttt{gpu} & text & yes/no & & GPU switch. \\
& \texttt{gpusharing} & text & yes/no & no & Share B-spline table across GPUs. \\
& \texttt{Spline\_Size\_Limit\_MB} & integer & & & Limit B-spline table size on GPU. \\
& \texttt{check\_orb\_norm} & text & yes/no & yes & Check norms of orbitals from h5 file. \\
& \texttt{source} & text & \textit{any} & ion0 & Particle set with atomic positions. \\
@@ -109,5 +110,22 @@ \subsection{Spline basis sets}
\item \texttt{precision}. Only effective in the CPU version without mixed precision; with mixed precision, `single' is always imposed. Using single precision not only saves memory but also speeds up the B-spline evaluation. Single precision is recommended, since we have seen little risk of compromising the accuracy of the calculation.
\item \texttt{meshfactor}. The ratio of the actual B-spline grid spacing used in the QMC calculation to the original spacing taken from the h5 file. A smaller meshfactor saves memory but reduces accuracy; the effect is similar to reducing the plane-wave cutoff in a DFT calculation. Use with caution!
\item \texttt{twistnum}. If positive, it is the index of the super twist. Using the index directly is not recommended, since the indexing may be ambiguous. If negative, the super twist is specified by \texttt{twist}.
\item \texttt{Spline\_Size\_Limit\_MB}. Allows the B-spline coefficient table to be distributed between the host and GPU memory. The compute kernels access host memory via zero-copy. Although the performance penalty introduced by this is significant, it allows large calculations to proceed.
\item \texttt{gpusharing}. If enabled, spline data is shared across multiple GPUs on a given computational node. For example, on a system with two GPUs per node, each GPU holds half of the orbitals. This allows spline tables that are too large for the memory of an individual GPU to be used, potentially up to the total GPU memory available on a node. To obtain high performance, large electron counts or a high-performing CPU-GPU interconnect is required.
To use this feature, the following is required:
\begin{itemize}
\item The CUDA Multi-Process Service (MPS) needs to be used
(e.g., on Summit/SummitDev pass "-alloc\_flags gpumps" to
bsub). If MPS is not detected, sharing will be disabled.
\item CUDA\_VISIBLE\_DEVICES needs to be set so that each rank sees
its intended CUDA devices (e.g., on OLCF Summit/SummitDev one
needs to create a resource set containing all GPUs with the
respective number of ranks, using "jsrun --task-per-rs Ngpus -g
Ngpus").
\end{itemize}
\item \texttt{Spline\_Size\_Limit\_MB}. Allows the B-spline coefficient table to be distributed between the host and GPU memory. The compute kernels access host memory via zero-copy (see the sketch after this list). Although the performance penalty introduced by this is significant, it allows large calculations to proceed.
\end{itemize}
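As a minimal sketch of the zero-copy access mentioned for \texttt{Spline\_Size\_Limit\_MB} above (illustrative only; the function and variable names are not from QMCPACK), the host-resident part of a coefficient table can be allocated as pinned, mapped memory and read by kernels through a device pointer:

// Illustrative zero-copy allocation pattern (hypothetical names): the part of
// a coefficient table that does not fit on the GPU stays in pinned host
// memory and is addressed directly by device kernels.
#include <cuda_runtime.h>

float* map_host_coefficients(size_t n, float** device_view)
{
  float* host_coefs = nullptr;
  cudaHostAlloc((void**)&host_coefs, n * sizeof(float), cudaHostAllocMapped);
  cudaHostGetDevicePointer((void**)device_view, host_coefs, 0);
  return host_coefs;  // host pointer; *device_view is usable inside kernels
}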
5 changes: 5 additions & 0 deletions src/CUDA/gpu_misc.cpp
@@ -34,6 +34,11 @@ cublasHandle_t cublasHandle;

size_t MaxGPUSpineSizeMB;
int rank;
int relative_rank; // rank number relative to the node this rank runs on, counting starts at zero
int device_group_size; // size of the lists below
bool cudamps; // set to true if the CUDA MPS service is running
std::vector<int> device_group_numbers; // on-node list of GPU device numbers, indexed by relative rank
std::vector<int> device_rank_numbers; // on-node list of absolute MPI rank numbers, indexed by relative rank

void
initCUDAStreams()
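The new globals above record each rank's position on its node and whether MPS is active. A hypothetical sketch of how such values could be obtained (not necessarily how QMCPACK fills them in) uses an MPI-3 shared-memory communicator for the relative rank and the MPS pipe directory as a hint that the service is running:

// Hypothetical sketch (not the QMCPACK implementation) of deriving
// relative_rank, the on-node group size, and cudamps-style information.
#include <mpi.h>
#include <sys/stat.h>
#include <cstdlib>

void query_node_rank_and_mps(int& relative_rank, int& node_size, bool& mps_running)
{
  MPI_Comm node_comm;
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                      MPI_INFO_NULL, &node_comm);
  MPI_Comm_rank(node_comm, &relative_rank);  // counting starts at zero
  MPI_Comm_size(node_comm, &node_size);
  MPI_Comm_free(&node_comm);

  // MPS clients see CUDA_MPS_PIPE_DIRECTORY; fall back to the default path.
  const char* pipe_dir = std::getenv("CUDA_MPS_PIPE_DIRECTORY");
  if (pipe_dir == nullptr)
    pipe_dir = "/tmp/nvidia-mps";
  struct stat st;
  mps_running = (stat(pipe_dir, &st) == 0);
}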
5 changes: 5 additions & 0 deletions src/CUDA/gpu_misc.h
@@ -45,6 +45,11 @@ extern cublasHandle_t cublasHandle;

extern size_t MaxGPUSpineSizeMB;
extern int rank;
extern int relative_rank;
extern int device_group_size;
extern bool cudamps;
extern std::vector<int> device_group_numbers;
extern std::vector<int> device_rank_numbers;

void initCUDAStreams();
void initCUDAEvents();
33 changes: 33 additions & 0 deletions src/CUDA/gpu_vector.cpp
@@ -57,6 +57,39 @@ cuda_memory_manager_type::allocate(size_t bytes, std::string name)
return p;
}

void*
cuda_memory_manager_type::allocate_managed(size_t bytes, std::string name, unsigned int flags)
{
// Make sure size is a multiple of 16
bytes = (((bytes+15)/16)*16);
void *p;
cudaMallocManaged ((void**)&p, bytes, flags);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf (stderr, "Failed to allocate %ld bytes for object %s:\n %s\n",
bytes, name.c_str(), cudaGetErrorString(err));
abort();
}
else
{
gpu_pointer_map[p] = std::pair<std::string,size_t> (name,bytes);
std::map<std::string,gpu_mem_object>::iterator iter = gpu_mem_map.find(name);
if (iter == gpu_mem_map.end())
{
gpu_mem_object obj(bytes);
gpu_mem_map[name] = obj;
}
else
{
gpu_mem_object &obj = iter->second;
obj.num_objects++;
obj.total_bytes += bytes;
}
}
return p;
}

void
cuda_memory_manager_type::deallocate (void *p)
{
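A hypothetical call site for the new allocate_managed() shown above (assuming the global cuda_memory_manager instance referenced elsewhere in this diff; the buffer name and size are illustrative): because the memory comes from cudaMallocManaged, the same pointer is valid on the host and in kernels.

// Hypothetical usage sketch of allocate_managed(); not taken from QMCPACK.
size_t n = 1024;
float* coefs = (float*)cuda_memory_manager.allocate_managed(
    n * sizeof(float), "ManagedCoefs", cudaMemAttachGlobal);
for (size_t i = 0; i < n; ++i)   // host-side initialization, no explicit copy
  coefs[i] = 0.0f;
// ... launch kernels that read coefs through the same pointer ...
cuda_memory_manager.deallocate(coefs);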
83 changes: 54 additions & 29 deletions src/CUDA/gpu_vector.h
@@ -56,6 +56,7 @@ class cuda_memory_manager_type

public:
void *allocate (size_t bytes, std::string name="");
void *allocate_managed (size_t bytes, std::string name="", unsigned int flags=cudaMemAttachGlobal);

void deallocate (void *p);

@@ -76,36 +77,45 @@ class device_vector
// True if the data was allocated by this vector. False if we're
// referencing memory
bool own_data;
// True if managed memory was requested using resize function, starts out false
bool managedmem;
// Flags for managed memory creation (defaults to cudaMemAttachGlobal) that can be set with set_managed_flags function
unsigned int managed_flags;
public:
typedef T* pointer;

void set_name(std::string n)
{
name = n;
}

void set_managed_flags(unsigned int flags)
{
managed_flags = flags;
}

inline
device_vector() : data_pointer(NULL), current_size(0),
alloc_size(0), own_data(true)
alloc_size(0), own_data(true), managedmem(false), managed_flags(cudaMemAttachGlobal)
{ }

inline
device_vector(std::string myName) : name(myName), data_pointer(NULL),
current_size(0), alloc_size(0),
own_data(true)
own_data(true), managedmem(false), managed_flags(cudaMemAttachGlobal)
{ }

inline
device_vector(size_t size) : data_pointer(NULL), current_size(0),
alloc_size(0), own_data(true)
alloc_size(0), own_data(true), managedmem(false), managed_flags(cudaMemAttachGlobal)
{
resize (size);
}

inline
device_vector(std::string myName, size_t size) :
name(myName), data_pointer(NULL), current_size(0),
alloc_size(0), own_data(true)
alloc_size(0), own_data(true), managedmem(false), managed_flags(cudaMemAttachGlobal)
{
resize(size);
}
@@ -130,6 +140,7 @@ class device_vector
current_size = size;
alloc_size = 0;
own_data = false;
managedmem = false;
}


@@ -140,7 +151,7 @@


inline void
resize(size_t size, double reserve_factor=1.0)
resize(size_t size, double reserve_factor=1.0, bool managed=false)
{
if (!size)
{
@@ -149,24 +160,38 @@
}
size_t reserve_size = (size_t)std::ceil(reserve_factor*size);
size_t byte_size = sizeof(T)*reserve_size;
if (alloc_size == 0)
bool error=false;
if (managed!=managedmem)
{
data_pointer = (T*)cuda_memory_manager.allocate(byte_size, name);
current_size = size;
alloc_size = reserve_size;
own_data = true;
if (managedmem)
{
if (alloc_size>0) // Only trigger error message if memory is allocated
{
fprintf(stderr,"device_vector.resize from managed (%p) ",data_pointer);
error=true;
}
}
else
{
if (alloc_size!=0)
fprintf(stderr,"device_vector.resize from non-managed to managed.\n");
}
}
else if (size > alloc_size)
if ((size > alloc_size) || (alloc_size == 0))
{
if (own_data)
if (own_data && (alloc_size>0))
cuda_memory_manager.deallocate (data_pointer);
data_pointer = (T*)cuda_memory_manager.allocate(byte_size, name);
current_size = size;
if (managed)
data_pointer = (T*)cuda_memory_manager.allocate_managed(byte_size, name, managed_flags);
else
data_pointer = (T*)cuda_memory_manager.allocate(byte_size, name);
alloc_size = reserve_size;
own_data = true;
managedmem=managed;
}
else
current_size = size;
current_size = size;
if (error)
fprintf(stderr,"to non-managed (%p).\n",data_pointer);
}

inline void
@@ -200,7 +225,7 @@
name.c_str(), size(), vec.size());
abort();
}
resize(vec.size());
resize(vec.size(),1.0,managedmem);
}
#ifdef QMC_CUDA
cudaMemcpyAsync (data_pointer, &(vec[0]), this->size()*sizeof(T),
@@ -219,10 +244,10 @@
}

device_vector(const device_vector<T> &vec) :
data_pointer(NULL), current_size(0), alloc_size(0), own_data(true),
data_pointer(NULL), current_size(0), alloc_size(0), own_data(true), managedmem(vec.managedmem), managed_flags(vec.managed_flags),
name(vec.name)
{
resize(vec.size());
resize(vec.size(),1.0,managedmem);
// fprintf(stderr, "device_vector copy constructor called, name=%s.\n",
// name.c_str());
#ifdef QMC_CUDA
@@ -254,16 +279,16 @@
// name.c_str(), size(), vec.size());
abort();
}
resize(vec.size());
resize(vec.size(),1.0,managedmem);
}
#ifdef QMC_CUDA
cudaMemcpyAsync (data_pointer, &(vec[0]), this->size()*sizeof(T),
cudaMemcpyHostToDevice);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf (stderr, "CUDA error in device_vector::operator=(vector):\n %s\n",
cudaGetErrorString(err));
fprintf (stderr, "CUDA error in device_vector::operator (%p)=(std::vector %p):\n %s\n",
data_pointer,&(vec[0]),cudaGetErrorString(err));
abort();
}
#endif
@@ -283,7 +308,7 @@
name.c_str(), size(), vec.size());
abort();
}
resize(vec.size());
resize(vec.size(),1.0,managedmem);
}
#ifdef QMC_CUDA
cudaMemcpy (&((*this)[0]), &(vec[0]), vec.size()*sizeof(T),
@@ -316,7 +341,7 @@
name.c_str(), size(), vec.size());
abort();
}
resize(vec.size());
resize(vec.size(),1.0,managedmem);
}
#ifdef QMC_CUDA
cudaMemcpyAsync (&((*this)[0]), &(vec[0]), vec.size()*sizeof(T),
@@ -348,7 +373,7 @@
name.c_str(), size(), vec.size());
abort();
}
resize(vec.size());
resize(vec.size(),1.0,managedmem);
}
#ifdef QMC_CUDA
cudaMemcpyAsync (&((*this)[0]), &(vec[0]), vec.size()*sizeof(T),
@@ -483,7 +508,7 @@ class host_vector
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf (stderr, "CUDA error in host_vector::operator=():\n %s\n",
fprintf (stderr, "CUDA error in host_vector::operator=(host_vector):\n %s\n",
cudaGetErrorString(err));
abort();
}
@@ -502,8 +527,8 @@
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf (stderr, "CUDA error in host_vector::operator=():\n %s\n",
cudaGetErrorString(err));
fprintf (stderr, "CUDA error in host_vector::operator=(device_vector %p):\n %s\n",
&(vec[0]),cudaGetErrorString(err));
abort();
}
#endif
@@ -608,7 +633,7 @@ class host_vector

template<typename T>
device_vector<T>::device_vector(const host_vector<T> &vec) :
data_pointer(NULL), current_size(0), alloc_size(0), own_data(true)
data_pointer(NULL), current_size(0), alloc_size(0), own_data(true), managedmem(false)
{
this->resize(vec.size());
#ifdef QMC_CUDA
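A minimal usage sketch of the extended resize() path above (hypothetical call site; it assumes device_vector lives in the gpu namespace as used elsewhere in QMCPACK and that the include path is correct): passing managed=true routes the allocation through allocate_managed() instead of ordinary device memory.

// Hypothetical usage of the new managed-memory resize path; illustrative only.
#include "gpu_vector.h"   // header modified in this diff (include path assumed)

void build_shared_spline_buffer(size_t num_coefs)
{
  gpu::device_vector<float> spline_coefs("SplineCoefs");
  spline_coefs.set_managed_flags(cudaMemAttachGlobal); // default flag, set explicitly
  spline_coefs.resize(num_coefs, 1.0, true);           // managed=true -> allocate_managed()
}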
