Skip to content

Commit

Permalink
fix memory error in single_stage_multi_reduction_kernel (#235)
Browse files Browse the repository at this point in the history
* refactor

* refactor

* revert

* refactor: clang format

* Update icicle/appUtils/msm/msm.cu
  • Loading branch information
ImmanuelSegol authored Oct 3, 2023
1 parent 97f0079 commit 9114ecb
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 15 deletions.
40 changes: 26 additions & 14 deletions icicle/appUtils/msm/msm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "../../primitives/field.cuh"
#include "../../primitives/projective.cuh"
#include "../../utils/cuda_utils.cuh"
#include "../../utils/error_handler.cuh"
#include "msm.cuh"
#include <cooperative_groups.h>
#include <cub/device/device_radix_sort.cuh>
Expand All @@ -23,20 +24,29 @@

template <typename P>
__global__ void single_stage_multi_reduction_kernel(
P* v, P* v_r, unsigned block_size, unsigned write_stride, unsigned write_phase, unsigned padding)
P* v,
P* v_r,
unsigned block_size,
unsigned write_stride,
unsigned write_phase,
unsigned padding,
unsigned num_of_threads)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int tid_p = padding ? (tid / (2 * padding)) * padding + tid % padding : tid;
if (tid >= num_of_threads) { return; }

int jump = block_size / 2;
int tid_p = padding ? (tid / (2 * padding)) * padding + tid % padding : tid;
int block_id = tid_p / jump;
int block_tid = tid_p % jump;
unsigned read_ind = block_size * block_id + block_tid;
unsigned write_ind = tid;
v_r
[write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride
: write_ind] =
padding ? (tid % (2 * padding) < padding) ? v[read_ind] + v[read_ind + jump] : P::zero()
: v[read_ind] + v[read_ind + jump];
unsigned v_r_key =
write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride : write_ind;
P v_r_value = padding ? (tid % (2 * padding) < padding) ? v[read_ind] + v[read_ind + jump] : P::zero()
: v[read_ind] + v[read_ind + jump];

v_r[v_r_key] = v_r_value;
}

// this kernel performs single scalar multiplication
Expand Down Expand Up @@ -388,7 +398,7 @@ void bucket_method_msm(
NUM_THREADS = min(MAX_TH, s);
NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
ones_results, ones_results, s * 2, 0, 0, 0);
ones_results, ones_results, s * 2, 0, 0, 0, s);
}

unsigned* bucket_indices;
Expand Down Expand Up @@ -554,7 +564,9 @@ void bucket_method_msm(
NUM_THREADS = min(MAX_TH, s);
NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream2>>>(
large_buckets, large_buckets, s * 2, 0, 0, 0);
large_buckets, large_buckets, s * 2, 0, 0, 0, s);

CHECK_LAST_CUDA_ERROR();
}

// distribute
Expand Down Expand Up @@ -631,18 +643,18 @@ void bucket_method_msm(
if (source_bits_count > 0) {
for (unsigned j = 0; j < target_bits_count; j++) {
unsigned last_j = target_bits_count - 1;
NUM_THREADS = min(MAX_TH, (source_buckets_count >> (1 + j)));
NUM_BLOCKS = ((source_buckets_count >> (1 + j)) + NUM_THREADS - 1) / NUM_THREADS;
unsigned nof_threads = (source_buckets_count >> (1 + j));
NUM_THREADS = min(MAX_TH, nof_threads);
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
j == 0 ? source_buckets : temp_buckets1, j == target_bits_count - 1 ? target_buckets : temp_buckets1,
1 << (source_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 0, 0);
1 << (source_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 0, 0, nof_threads);

unsigned nof_threads = (source_buckets_count >> (1 + j));
NUM_THREADS = min(MAX_TH, nof_threads);
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
j == 0 ? source_buckets : temp_buckets2, j == target_bits_count - 1 ? target_buckets : temp_buckets2,
1 << (target_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 1, 0);
1 << (target_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 1, 0, nof_threads);
}
}
if (target_bits_count == 1) {
Expand Down
2 changes: 1 addition & 1 deletion icicle/appUtils/msm/tests/msm_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -224,4 +224,4 @@ int main()
// std::cout<<pr<<std::endl;

return 0;
}
}
32 changes: 32 additions & 0 deletions icicle/utils/error_handler.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#pragma once
#include <iostream>

#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
template <typename T>
void check(T err, const char* const func, const char* const file, const int line)
{
if (err != cudaSuccess) {
std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl;
std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
}
}

#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
void checkLast(const char* const file, const int line)
{
cudaError_t err{cudaGetLastError()};
if (err != cudaSuccess) {
std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl;
std::cerr << cudaGetErrorString(err) << std::endl;
}
}

#define CHECK_SYNC_DEVICE_ERROR() syncDevice(__FILE__, __LINE__)
void syncDevice(const char* const file, const int line)
{
cudaError_t err{cudaDeviceSynchronize()};
if (err != cudaSuccess) {
std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl;
std::cerr << cudaGetErrorString(err) << std::endl;
}
}

0 comments on commit 9114ecb

Please sign in to comment.