Skip to content

Commit

Permalink
Merge pull request #2883 from prckent/fixcuda
Browse files Browse the repository at this point in the history
Fix race conditions in CUDA kernels causing incorrect energies
  • Loading branch information
ye-luo authored Feb 9, 2021
2 parents 04f7bec + c441d27 commit a53fcc2
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 2 deletions.
18 changes: 16 additions & 2 deletions src/QMCHamiltonians/CudaCoulomb.cu
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,7 @@ __global__ void coulomb_AA_PBC_kernel(TR** R,
for (int i = 0; i < 3; i++)
if ((3 * b + i) * BS + tid < 3 * N)
r1[0][i * BS + tid] = myR[(3 * b + i) * BS + tid];
__syncthreads();
int ptcl1 = b * BS + tid;
if (ptcl1 < N)
{
Expand All @@ -308,12 +309,14 @@ __global__ void coulomb_AA_PBC_kernel(TR** R,
}
// Avoid double-counting on the diagonal blocks
mysum *= 0.5;
__syncthreads();
// Now do off-diagonal blocks
for (int b1 = 0; b1 < NB; b1++)
{
for (int i = 0; i < 3; i++)
if ((3 * b1 + i) * BS + tid < 3 * N)
r1[0][i * BS + tid] = myR[(3 * b1 + i) * BS + tid];
__syncthreads();
int ptcl1 = b1 * BS + tid;
if (ptcl1 < N)
{
Expand All @@ -322,6 +325,7 @@ __global__ void coulomb_AA_PBC_kernel(TR** R,
for (int i = 0; i < 3; i++)
if ((3 * b2 + i) * BS + tid < 3 * N)
r2[0][i * BS + tid] = myR[(3 * b2 + i) * BS + tid];
__syncthreads();
int end = ((b2 + 1) * BS < N) ? BS : (N - b2 * BS);
for (int j = 0; j < end; j++)
{
Expand Down Expand Up @@ -368,6 +372,7 @@ __global__ void coulomb_AA_kernel(T** R, int N, T* sum)
for (int i = 0; i < 3; i++)
if ((3 * b + i) * BS + tid < 3 * N)
r1[0][i * BS + tid] = myR[(3 * b + i) * BS + tid];
__syncthreads();
int ptcl1 = b * BS + tid;
if (ptcl1 < N)
{
Expand All @@ -388,12 +393,14 @@ __global__ void coulomb_AA_kernel(T** R, int N, T* sum)
}
// Avoid double-counting on the diagonal blocks
mysum *= 0.5;
__syncthreads();
// Now do off-diagonal blocks
for (int b1 = 0; b1 < NB; b1++)
{
for (int i = 0; i < 3; i++)
if ((3 * b1 + i) * BS + tid < 3 * N)
r1[0][i * BS + tid] = myR[(3 * b1 + i) * BS + tid];
__syncthreads();
int ptcl1 = b1 * BS + tid;
if (ptcl1 < N)
{
Expand All @@ -402,6 +409,7 @@ __global__ void coulomb_AA_kernel(T** R, int N, T* sum)
for (int i = 0; i < 3; i++)
if ((3 * b2 + i) * BS + tid < 3 * N)
r2[0][i * BS + tid] = myR[(3 * b2 + i) * BS + tid];
__syncthreads();
int end = ((b2 + 1) * BS < N) ? BS : (N - b2 * BS);
for (int j = 0; j < end; j++)
{
Expand Down Expand Up @@ -544,6 +552,7 @@ __global__ void MPC_SR_kernel(T** R, int N, T* lattice, T* latticeInv, T* sum)
}
// Avoid double-counting on the diagonal blocks
mysum *= 0.5;
__syncthreads();
// Now do off-diagonal blocks
for (int b1 = 0; b1 < NB; b1++)
{
Expand All @@ -559,6 +568,7 @@ __global__ void MPC_SR_kernel(T** R, int N, T* lattice, T* latticeInv, T* sum)
for (int i = 0; i < 3; i++)
if ((3 * b2 + i) * BS + tid < 3 * N)
r2[0][i * BS + tid] = myR[(3 * b2 + i) * BS + tid];
__syncthreads();
int end = ((b2 + 1) * BS < N) ? BS : (N - b2 * BS);
for (int j = 0; j < end; j++)
{
Expand Down Expand Up @@ -1074,6 +1084,7 @@ __global__ void eval_rhok_kernel(T** R, int numr, T* kpoints, int numk, T** rhok
if ((i + 3 * rBlock) * BS + tid < 3 * numr)
r[0][BS * i + tid] = myR[(i + 3 * rBlock) * BS + tid];
int end = ((rBlock + 1) * BS < numr) ? BS : (numr - rBlock * BS);
__syncthreads();
for (int j = 0; j < end; j++)
{
T phase = (k[tid][0] * r[j][0] + k[tid][1] * r[j][1] + k[tid][2] * r[j][2]);
Expand Down Expand Up @@ -1122,6 +1133,7 @@ __global__ void eval_rhok_kernel(TR** R, int first, int last, T* kpoints, int nu
for (int i = 0; i < 3; i++)
if ((i + 3 * rBlock) * BS + tid < 3 * numr)
r[0][BS * i + tid] = myR[3 * first + (i + 3 * rBlock) * BS + tid];
__syncthreads();
int end = ((rBlock + 1) * BS < numr) ? BS : (numr - rBlock * BS);
for (int j = 0; j < end; j++)
{
Expand Down Expand Up @@ -1234,18 +1246,19 @@ __global__ void vk_sum_kernel2(T** rhok1, T** rhok2, T* vk, int numk, T* sum)
myrhok1 = rhok1[blockIdx.x];
myrhok2 = rhok2[blockIdx.x];
}
__syncthreads();
// Used to do coalesced global loads
__shared__ T rhok_s1[2 * BS], rhok_s2[2 * BS];
int NB = numk / BS + ((numk % BS) ? 1 : 0);
T mysum = 0.0f;
for (int b = 0; b < NB; b++)
{
__syncthreads();
if (2 * b * BS + tid < 2 * numk)
{
rhok_s1[tid] = myrhok1[2 * b * BS + tid];
rhok_s2[tid] = myrhok2[2 * b * BS + tid];
}
__syncthreads();
if ((2 * b + 1) * BS + tid < 2 * numk)
{
rhok_s1[BS + tid] = myrhok1[(2 * b + 1) * BS + tid];
Expand Down Expand Up @@ -1313,18 +1326,19 @@ __global__ void vk_sum_kernel2(T** rhok1, T* rhok2, T* vk, int numk, T* sum)
__shared__ T* myrhok1;
if (tid == 0)
myrhok1 = rhok1[blockIdx.x];
__syncthreads();
// Used to do coalesced global loads
__shared__ T rhok_s1[2 * BS], rhok_s2[2 * BS];
int NB = numk / BS + ((numk % BS) ? 1 : 0);
T mysum = 0.0f;
for (int b = 0; b < NB; b++)
{
__syncthreads();
if (2 * b * BS + tid < 2 * numk)
{
rhok_s1[tid] = myrhok1[2 * b * BS + tid];
rhok_s2[tid] = rhok2[2 * b * BS + tid];
}
__syncthreads();
if ((2 * b + 1) * BS + tid < 2 * numk)
{
rhok_s1[BS + tid] = myrhok1[(2 * b + 1) * BS + tid];
Expand Down
1 change: 1 addition & 0 deletions src/QMCHamiltonians/NLPP.cu
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,7 @@ __global__ void find_core_electrons_PBC_kernel(T** R,
disp[tid][1] = r[tid][1] - i[ion][1];
disp[tid][2] = r[tid][2] - i[ion][2];
dist[tid] = min_dist<T>(disp[tid][0], disp[tid][1], disp[tid][2], L, Linv);
__syncthreads();
for (int elec = 0; elec < elecEnd; elec++)
{
__syncthreads();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1412,6 +1412,7 @@ __global__ void calc_ratio_grad_lapl(T** Ainv_list,
// ratio to make it w.r.t. new position
if (tid < 4)
ratio_prod[(tid + 1) * BS1] /= ratio_prod[0];
__syncthreads();
if (tid < 5)
ratio_grad_lapl[5 * blockIdx.x + tid] = ratio_prod[tid * BS1];
}
Expand Down

0 comments on commit a53fcc2

Please sign in to comment.