Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix grid dim bug for eval_multi_multi_UBspline_3d_c_vgl_kernel #4421

Merged
merged 3 commits into from
Jan 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions src/einspline/multi_bspline_cuda_c_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ eval_multi_multi_UBspline_3d_c_vgl_kernel(float const * __restrict__ pos,
__shared__ float ab[96];
__shared__ float a[12], b[12], c[12];
__shared__ float G[3][3], GGt[3][3];
int ir = blockIdx.y;
int ir = blockIdx.x;
float rx = pos[3*ir+0];
float ry = pos[3*ir+1];
float rz = pos[3*ir+2];
Expand Down Expand Up @@ -512,7 +512,7 @@ eval_multi_multi_UBspline_3d_c_vgl_kernel(float const * __restrict__ pos,
float h00 = 0.0f, h01 = 0.0f, h02 = 0.0f;
float h11 = 0.0f, h12 = 0.0f, h22 = 0.0f;
float v = 0.0f;
int off = blockIdx.x * blockDim.x + threadIdx.x;
int off = blockIdx.y * blockDim.x + threadIdx.x;
if (off < 2*N)
{
int stride_x = strides.x;
Expand Down Expand Up @@ -687,7 +687,7 @@ eval_multi_multi_UBspline_3d_c_vgl_cuda (multi_UBspline_3d_c_cuda *spline,
int num_splines=spline->num_splines;
int threadsPerBlock = max(64,min(32*((2*num_splines+31)/32),256));
dim3 dimBlock(threadsPerBlock);
dim3 dimGrid((2 * num_splines + dimBlock.x - 1) / dimBlock.x, num);
dim3 dimGrid(num, (2 * num_splines + dimBlock.x - 1) / dimBlock.x);
eval_multi_multi_UBspline_3d_c_vgl_kernel<<<dimGrid,dimBlock, 0, gpu::kernelStream>>>
(pos_d, (float*)spline->coefs, Linv_d, (float**)vals_d,
(float**)grad_lapl_d, spline->gridInv, spline->dim,
Expand All @@ -712,7 +712,7 @@ eval_multi_multi_UBspline_3d_c_vgl_cudasplit (multi_UBspline_3d_c_cuda *spline,
int num_splines=spline->num_split_splines;
int threadsPerBlock = max(64,min(32*((2*num_splines+31)/32),256));
dim3 dimBlock(threadsPerBlock);
dim3 dimGrid((2 * num_splines + dimBlock.x - 1) / dimBlock.x, num);
dim3 dimGrid(num, (2 * num_splines + dimBlock.x - 1) / dimBlock.x);
eval_multi_multi_UBspline_3d_c_vgl_kernel<<<dimGrid,dimBlock,0,s>>>
(pos_d, coefs, Linv_d, (float**)vals_d,
(float**)grad_lapl_d, spline->gridInv, spline->dim,
Expand Down
12 changes: 4 additions & 8 deletions src/einspline/multi_bspline_cuda_d_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -397,9 +397,9 @@ eval_multi_multi_UBspline_3d_d_vgl_kernel
double *vals[], double *grad_lapl[], uint3 dim, uint3 strides,
int N, int row_stride, int spline_offset)
{
int block = blockIdx.x;
int ir = blockIdx.x;
int thr = threadIdx.x;
int ir = blockIdx.y;
int block = blockIdx.y;
int off = block*SPLINE_BLOCK_SIZE+threadIdx.x;
__shared__ double *myval, *mygrad_lapl;
__shared__ double3 r;
Expand Down Expand Up @@ -546,9 +546,7 @@ eval_multi_multi_UBspline_3d_d_vgl_cuda
double *vals_d[], double *grad_lapl_d[], int num, int row_stride)
{
dim3 dimBlock(SPLINE_BLOCK_SIZE);
dim3 dimGrid(spline->num_splines/SPLINE_BLOCK_SIZE, num);
if (spline->num_splines % SPLINE_BLOCK_SIZE)
dimGrid.x++;
dim3 dimGrid(num, (spline->num_splines + SPLINE_BLOCK_SIZE - 1) / SPLINE_BLOCK_SIZE);
eval_multi_multi_UBspline_3d_d_vgl_kernel<<<dimGrid,dimBlock>>>
(pos_d, spline->gridInv, spline->coefs, Linv_d, vals_d,
grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride, 0);
Expand All @@ -563,9 +561,7 @@ eval_multi_multi_UBspline_3d_d_vgl_cudasplit
{
int num_splines=spline->num_split_splines;
dim3 dimBlock(SPLINE_BLOCK_SIZE);
dim3 dimGrid(num_splines/SPLINE_BLOCK_SIZE, num);
if (num_splines % SPLINE_BLOCK_SIZE)
dimGrid.x++;
dim3 dimGrid(num, (num_splines + SPLINE_BLOCK_SIZE - 1) / SPLINE_BLOCK_SIZE);
eval_multi_multi_UBspline_3d_d_vgl_kernel<<<dimGrid,dimBlock,0,s>>>
(pos_d, spline->gridInv, coefs, Linv_d, vals_d,
grad_lapl_d, spline->dim, spline->stride, num_splines, row_stride, device_nr*num_splines*num);
Expand Down
8 changes: 3 additions & 5 deletions src/einspline/multi_bspline_cuda_s_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -523,9 +523,9 @@ eval_multi_multi_UBspline_3d_s_vgl_kernel
float *vals[], float *grad_lapl[], uint3 dim, uint3 strides,
int N, int row_stride)
{
int block = blockIdx.x;
int ir = blockIdx.x;
int thr = threadIdx.x;
int ir = blockIdx.y;
int block = blockIdx.y;
int off = block*SPLINE_BLOCK_SIZE+threadIdx.x;
__shared__ float *myval, *mygrad_lapl;
__shared__ float3 r;
Expand Down Expand Up @@ -680,9 +680,7 @@ eval_multi_multi_UBspline_3d_s_vgl_cuda
float *vals_d[], float *grad_lapl_d[], int num, int row_stride)
{
dim3 dimBlock(SPLINE_BLOCK_SIZE);
dim3 dimGrid(spline->num_splines/SPLINE_BLOCK_SIZE, num);
if (spline->num_splines % SPLINE_BLOCK_SIZE)
dimGrid.x++;
dim3 dimGrid(num, (spline->num_splines + SPLINE_BLOCK_SIZE -1) / SPLINE_BLOCK_SIZE);
eval_multi_multi_UBspline_3d_s_vgl_kernel<<<dimGrid,dimBlock>>>
(pos_d, spline->gridInv, spline->coefs, Linv_d, vals_d,
grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride);
Expand Down
14 changes: 5 additions & 9 deletions src/einspline/multi_bspline_cuda_z_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -296,9 +296,9 @@ eval_multi_multi_UBspline_3d_z_vgl_kernel
double *vals[], double *grad_lapl[], uint3 dim, uint3 strides,
int N, int row_stride, int spline_offset)
{
int block = blockIdx.x;
int ir = blockIdx.x;
int thr = threadIdx.x;
int ir = blockIdx.y;
int block = blockIdx.y;
int off = block*SPLINE_BLOCK_SIZE+threadIdx.x;
__shared__ double *myval, *mygrad_lapl;
__shared__ double3 r;
Expand Down Expand Up @@ -444,9 +444,7 @@ eval_multi_multi_UBspline_3d_z_vgl_cuda
double *vals_d[], double *grad_lapl_d[], int num, int row_stride)
{
dim3 dimBlock(SPLINE_BLOCK_SIZE);
dim3 dimGrid(2*spline->num_splines/SPLINE_BLOCK_SIZE, num);
if (2*spline->num_splines % SPLINE_BLOCK_SIZE)
dimGrid.x++;
dim3 dimGrid(num, (2 * spline->num_splines + SPLINE_BLOCK_SIZE - 1) / SPLINE_BLOCK_SIZE);
eval_multi_multi_UBspline_3d_z_vgl_kernel<<<dimGrid,dimBlock>>>
(pos_d, spline->gridInv, (double*)spline->coefs, Linv_d, (double**)vals_d,
(double**)grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride, 0);
Expand All @@ -459,11 +457,9 @@ eval_multi_multi_UBspline_3d_z_vgl_cudasplit
double *vals_d[], double *grad_lapl_d[], int num, int row_stride,
double *coefs, int device_nr, cudaStream_t s)
{
int num_splines=spline->num_split_splines;
int num_splines = spline->num_split_splines;
dim3 dimBlock(SPLINE_BLOCK_SIZE);
dim3 dimGrid(2*num_splines/SPLINE_BLOCK_SIZE, num);
if (2*num_splines % SPLINE_BLOCK_SIZE)
dimGrid.x++;
dim3 dimGrid(num, (2 * num_splines + SPLINE_BLOCK_SIZE - 1) / SPLINE_BLOCK_SIZE);
eval_multi_multi_UBspline_3d_z_vgl_kernel<<<dimGrid,dimBlock,0,s>>>
(pos_d, spline->gridInv, coefs, Linv_d, (double**)vals_d,
(double**)grad_lapl_d, spline->dim, spline->stride, num_splines, row_stride, 2*device_nr*num_splines*num);
Expand Down