From 5f97b147d153c63386431b30c2e7d7c73b1db0e3 Mon Sep 17 00:00:00 2001 From: Philip Fackler Date: Fri, 27 Jan 2023 12:44:49 -0500 Subject: [PATCH 1/4] Add check for using parallel hdf5 in non-mpi build Added check for when MPI is disabled but the parallel hdf5 library was found. This results in an error due to incompatibility. --- CMakeLists.txt | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a8654f9884..4eb111f2a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -632,13 +632,9 @@ find_package(LibXml2 REQUIRED) # set up HDF5 library #------------------------------------------------------------------- if(HAVE_MPI) - set(HDF5_PREFER_PARALLEL - TRUE - CACHE BOOL "Request parallel/serial HDF5 library") + option(HDF5_PREFER_PARALLEL "Request parallel/serial HDF5 library" ON) else(HAVE_MPI) - set(HDF5_PREFER_PARALLEL - FALSE - CACHE BOOL "Request parallel/serial HDF5 library") + option(HDF5_PREFER_PARALLEL "Request parallel/serial HDF5 library" OFF) if(HDF5_PREFER_PARALLEL) message(FATAL_ERROR "Parallel HDF5 library cannot be selected with QMCPACK non-MPI build. " "Please set HDF5_PREFER_PARALLEL=0.") @@ -657,8 +653,13 @@ find_package(HDF5 1.10 COMPONENTS C) if(HDF5_FOUND) if(HDF5_IS_PARALLEL) - message(STATUS "Parallel HDF5 library found") - option(ENABLE_PHDF5 "Enable code paths using parallel HDF5" ON) + if(HAVE_MPI) + message(STATUS "Parallel HDF5 library found") + option(ENABLE_PHDF5 "Enable code paths using parallel HDF5" ON) + else(HAVE_MPI) + message(FATAL_ERROR "Parallel HDF5 library found but cannot be used with QMCPACK non-MPI build. " + "Please provide serial HDF5 library.") + endif(HAVE_MPI) else(HDF5_IS_PARALLEL) message(STATUS "Serial HDF5 library found") option(ENABLE_PHDF5 "Enable code paths using parallel HDF5" OFF) From b91c16fd5ea4337c41045666aca0d1f7b3e14691 Mon Sep 17 00:00:00 2001 From: pengwang Date: Fri, 27 Jan 2023 10:00:37 -0800 Subject: [PATCH 2/4] fix grid dim bug for eval_multi_multi_UBspline_3d_c_vgl_kernel --- src/einspline/multi_bspline_cuda_c_impl.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/einspline/multi_bspline_cuda_c_impl.h b/src/einspline/multi_bspline_cuda_c_impl.h index ff5f11a291..95aa5eee16 100644 --- a/src/einspline/multi_bspline_cuda_c_impl.h +++ b/src/einspline/multi_bspline_cuda_c_impl.h @@ -453,7 +453,7 @@ eval_multi_multi_UBspline_3d_c_vgl_kernel(float const * __restrict__ pos, __shared__ float ab[96]; __shared__ float a[12], b[12], c[12]; __shared__ float G[3][3], GGt[3][3]; - int ir = blockIdx.y; + int ir = blockIdx.x; float rx = pos[3*ir+0]; float ry = pos[3*ir+1]; float rz = pos[3*ir+2]; @@ -512,7 +512,7 @@ eval_multi_multi_UBspline_3d_c_vgl_kernel(float const * __restrict__ pos, float h00 = 0.0f, h01 = 0.0f, h02 = 0.0f; float h11 = 0.0f, h12 = 0.0f, h22 = 0.0f; float v = 0.0f; - int off = blockIdx.x * blockDim.x + threadIdx.x; + int off = blockIdx.y * blockDim.x + threadIdx.x; if (off < 2*N) { int stride_x = strides.x; @@ -687,7 +687,7 @@ eval_multi_multi_UBspline_3d_c_vgl_cuda (multi_UBspline_3d_c_cuda *spline, int num_splines=spline->num_splines; int threadsPerBlock = max(64,min(32*((2*num_splines+31)/32),256)); dim3 dimBlock(threadsPerBlock); - dim3 dimGrid((2 * num_splines + dimBlock.x - 1) / dimBlock.x, num); + dim3 dimGrid(num, (2 * num_splines + dimBlock.x - 1) / dimBlock.x); eval_multi_multi_UBspline_3d_c_vgl_kernel<<>> (pos_d, (float*)spline->coefs, Linv_d, (float**)vals_d, (float**)grad_lapl_d, spline->gridInv, spline->dim, @@ -712,7 +712,7 @@ eval_multi_multi_UBspline_3d_c_vgl_cudasplit (multi_UBspline_3d_c_cuda *spline, int num_splines=spline->num_split_splines; int threadsPerBlock = max(64,min(32*((2*num_splines+31)/32),256)); dim3 dimBlock(threadsPerBlock); - dim3 dimGrid((2 * num_splines + dimBlock.x - 1) / dimBlock.x, num); + dim3 dimGrid(num, (2 * num_splines + dimBlock.x - 1) / dimBlock.x); eval_multi_multi_UBspline_3d_c_vgl_kernel<<>> (pos_d, coefs, Linv_d, (float**)vals_d, (float**)grad_lapl_d, spline->gridInv, spline->dim, From df21a097b3e4649435679a70a50f01ebfa3e19d6 Mon Sep 17 00:00:00 2001 From: "Paul R. C. Kent" Date: Fri, 27 Jan 2023 13:23:38 -0500 Subject: [PATCH 3/4] Additional explanation --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4eb111f2a5..85f4034660 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -658,7 +658,7 @@ if(HDF5_FOUND) option(ENABLE_PHDF5 "Enable code paths using parallel HDF5" ON) else(HAVE_MPI) message(FATAL_ERROR "Parallel HDF5 library found but cannot be used with QMCPACK non-MPI build. " - "Please provide serial HDF5 library.") + "Please provide a serial HDF5 library or switch to building QMCPACK with MPI.") endif(HAVE_MPI) else(HDF5_IS_PARALLEL) message(STATUS "Serial HDF5 library found") From 0c932581cfec312e386264511fd308a1fcbcbad4 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Fri, 27 Jan 2023 12:38:59 -0600 Subject: [PATCH 4/4] Propagate change to cuda_s/d/z --- src/einspline/multi_bspline_cuda_d_impl.h | 12 ++++-------- src/einspline/multi_bspline_cuda_s_impl.h | 8 +++----- src/einspline/multi_bspline_cuda_z_impl.h | 14 +++++--------- 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/src/einspline/multi_bspline_cuda_d_impl.h b/src/einspline/multi_bspline_cuda_d_impl.h index 8525d1128a..1a7fd24823 100644 --- a/src/einspline/multi_bspline_cuda_d_impl.h +++ b/src/einspline/multi_bspline_cuda_d_impl.h @@ -397,9 +397,9 @@ eval_multi_multi_UBspline_3d_d_vgl_kernel double *vals[], double *grad_lapl[], uint3 dim, uint3 strides, int N, int row_stride, int spline_offset) { - int block = blockIdx.x; + int ir = blockIdx.x; int thr = threadIdx.x; - int ir = blockIdx.y; + int block = blockIdx.y; int off = block*SPLINE_BLOCK_SIZE+threadIdx.x; __shared__ double *myval, *mygrad_lapl; __shared__ double3 r; @@ -546,9 +546,7 @@ eval_multi_multi_UBspline_3d_d_vgl_cuda double *vals_d[], double *grad_lapl_d[], int num, int row_stride) { dim3 dimBlock(SPLINE_BLOCK_SIZE); - dim3 dimGrid(spline->num_splines/SPLINE_BLOCK_SIZE, num); - if (spline->num_splines % SPLINE_BLOCK_SIZE) - dimGrid.x++; + dim3 dimGrid(num, (spline->num_splines + SPLINE_BLOCK_SIZE - 1) / SPLINE_BLOCK_SIZE); eval_multi_multi_UBspline_3d_d_vgl_kernel<<>> (pos_d, spline->gridInv, spline->coefs, Linv_d, vals_d, grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride, 0); @@ -563,9 +561,7 @@ eval_multi_multi_UBspline_3d_d_vgl_cudasplit { int num_splines=spline->num_split_splines; dim3 dimBlock(SPLINE_BLOCK_SIZE); - dim3 dimGrid(num_splines/SPLINE_BLOCK_SIZE, num); - if (num_splines % SPLINE_BLOCK_SIZE) - dimGrid.x++; + dim3 dimGrid(num, (num_splines + SPLINE_BLOCK_SIZE - 1) / SPLINE_BLOCK_SIZE); eval_multi_multi_UBspline_3d_d_vgl_kernel<<>> (pos_d, spline->gridInv, coefs, Linv_d, vals_d, grad_lapl_d, spline->dim, spline->stride, num_splines, row_stride, device_nr*num_splines*num); diff --git a/src/einspline/multi_bspline_cuda_s_impl.h b/src/einspline/multi_bspline_cuda_s_impl.h index 16a8585f5b..9f7b36d6b7 100644 --- a/src/einspline/multi_bspline_cuda_s_impl.h +++ b/src/einspline/multi_bspline_cuda_s_impl.h @@ -523,9 +523,9 @@ eval_multi_multi_UBspline_3d_s_vgl_kernel float *vals[], float *grad_lapl[], uint3 dim, uint3 strides, int N, int row_stride) { - int block = blockIdx.x; + int ir = blockIdx.x; int thr = threadIdx.x; - int ir = blockIdx.y; + int block = blockIdx.y; int off = block*SPLINE_BLOCK_SIZE+threadIdx.x; __shared__ float *myval, *mygrad_lapl; __shared__ float3 r; @@ -680,9 +680,7 @@ eval_multi_multi_UBspline_3d_s_vgl_cuda float *vals_d[], float *grad_lapl_d[], int num, int row_stride) { dim3 dimBlock(SPLINE_BLOCK_SIZE); - dim3 dimGrid(spline->num_splines/SPLINE_BLOCK_SIZE, num); - if (spline->num_splines % SPLINE_BLOCK_SIZE) - dimGrid.x++; + dim3 dimGrid(num, (spline->num_splines + SPLINE_BLOCK_SIZE -1) / SPLINE_BLOCK_SIZE); eval_multi_multi_UBspline_3d_s_vgl_kernel<<>> (pos_d, spline->gridInv, spline->coefs, Linv_d, vals_d, grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride); diff --git a/src/einspline/multi_bspline_cuda_z_impl.h b/src/einspline/multi_bspline_cuda_z_impl.h index 11a7454db3..31d3cfca60 100644 --- a/src/einspline/multi_bspline_cuda_z_impl.h +++ b/src/einspline/multi_bspline_cuda_z_impl.h @@ -296,9 +296,9 @@ eval_multi_multi_UBspline_3d_z_vgl_kernel double *vals[], double *grad_lapl[], uint3 dim, uint3 strides, int N, int row_stride, int spline_offset) { - int block = blockIdx.x; + int ir = blockIdx.x; int thr = threadIdx.x; - int ir = blockIdx.y; + int block = blockIdx.y; int off = block*SPLINE_BLOCK_SIZE+threadIdx.x; __shared__ double *myval, *mygrad_lapl; __shared__ double3 r; @@ -444,9 +444,7 @@ eval_multi_multi_UBspline_3d_z_vgl_cuda double *vals_d[], double *grad_lapl_d[], int num, int row_stride) { dim3 dimBlock(SPLINE_BLOCK_SIZE); - dim3 dimGrid(2*spline->num_splines/SPLINE_BLOCK_SIZE, num); - if (2*spline->num_splines % SPLINE_BLOCK_SIZE) - dimGrid.x++; + dim3 dimGrid(num, (2 * spline->num_splines + SPLINE_BLOCK_SIZE - 1) / SPLINE_BLOCK_SIZE); eval_multi_multi_UBspline_3d_z_vgl_kernel<<>> (pos_d, spline->gridInv, (double*)spline->coefs, Linv_d, (double**)vals_d, (double**)grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride, 0); @@ -459,11 +457,9 @@ eval_multi_multi_UBspline_3d_z_vgl_cudasplit double *vals_d[], double *grad_lapl_d[], int num, int row_stride, double *coefs, int device_nr, cudaStream_t s) { - int num_splines=spline->num_split_splines; + int num_splines = spline->num_split_splines; dim3 dimBlock(SPLINE_BLOCK_SIZE); - dim3 dimGrid(2*num_splines/SPLINE_BLOCK_SIZE, num); - if (2*num_splines % SPLINE_BLOCK_SIZE) - dimGrid.x++; + dim3 dimGrid(num, (2 * num_splines + SPLINE_BLOCK_SIZE - 1) / SPLINE_BLOCK_SIZE); eval_multi_multi_UBspline_3d_z_vgl_kernel<<>> (pos_d, spline->gridInv, coefs, Linv_d, (double**)vals_d, (double**)grad_lapl_d, spline->dim, spline->stride, num_splines, row_stride, 2*device_nr*num_splines*num);