From 9114ecb26978a6e29573e5a108b7c708d677749a Mon Sep 17 00:00:00 2001
From: ImmanuelSegol <3ditds@gmail.com>
Date: Tue, 3 Oct 2023 15:22:28 +0300
Subject: [PATCH] fix memory error in single_stage_multi_reduction_kernel
 (#235)

* refactor

* refactor

* revert

* refactor: clang format

* Update icicle/appUtils/msm/msm.cu
---
 icicle/appUtils/msm/msm.cu            | 40 +++++++++++++++++----------
 icicle/appUtils/msm/tests/msm_test.cu |  2 +-
 icicle/utils/error_handler.cuh        | 32 +++++++++++++++++++++
 3 files changed, 59 insertions(+), 15 deletions(-)
 create mode 100644 icicle/utils/error_handler.cuh

diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu
index f09b9fe85..c58381cc0 100644
--- a/icicle/appUtils/msm/msm.cu
+++ b/icicle/appUtils/msm/msm.cu
@@ -5,6 +5,7 @@
 #include "../../primitives/field.cuh"
 #include "../../primitives/projective.cuh"
 #include "../../utils/cuda_utils.cuh"
+#include "../../utils/error_handler.cuh"
 #include "msm.cuh"
 #include <cooperative_groups.h>
 #include <cub/cub.cuh>
@@ -23,20 +24,29 @@
 
 template <typename P>
 __global__ void single_stage_multi_reduction_kernel(
-  P* v, P* v_r, unsigned block_size, unsigned write_stride, unsigned write_phase, unsigned padding)
+  P* v,
+  P* v_r,
+  unsigned block_size,
+  unsigned write_stride,
+  unsigned write_phase,
+  unsigned padding,
+  unsigned num_of_threads)
 {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int tid_p = padding ? (tid / (2 * padding)) * padding + tid % padding : tid;
+  if (tid >= num_of_threads) { return; }
+
   int jump = block_size / 2;
+  int tid_p = padding ? (tid / (2 * padding)) * padding + tid % padding : tid;
   int block_id = tid_p / jump;
   int block_tid = tid_p % jump;
   unsigned read_ind = block_size * block_id + block_tid;
   unsigned write_ind = tid;
-  v_r
-    [write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride
-                  : write_ind] =
-    padding ? (tid % (2 * padding) < padding) ? 
v[read_ind] + v[read_ind + jump] : P::zero()
-              : v[read_ind] + v[read_ind + jump];
+  unsigned v_r_key =
+    write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride : write_ind;
+  P v_r_value = padding ? (tid % (2 * padding) < padding) ? v[read_ind] + v[read_ind + jump] : P::zero()
+                        : v[read_ind] + v[read_ind + jump];
+
+  v_r[v_r_key] = v_r_value;
 }
 
 // this kernel performs single scalar multiplication
@@ -388,7 +398,7 @@ void bucket_method_msm(
     NUM_THREADS = min(MAX_TH, s);
     NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS;
     single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(
-      ones_results, ones_results, s * 2, 0, 0, 0);
+      ones_results, ones_results, s * 2, 0, 0, 0, s);
   }
 
   unsigned* bucket_indices;
@@ -554,7 +564,9 @@ void bucket_method_msm(
     NUM_THREADS = min(MAX_TH, s);
     NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS;
     single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(
-      large_buckets, large_buckets, s * 2, 0, 0, 0);
+      large_buckets, large_buckets, s * 2, 0, 0, 0, s);
+
+    CHECK_LAST_CUDA_ERROR();
   }
 
   // distribute
@@ -631,18 +643,18 @@ void bucket_method_msm(
   if (source_bits_count > 0) {
     for (unsigned j = 0; j < target_bits_count; j++) {
       unsigned last_j = target_bits_count - 1;
-      NUM_THREADS = min(MAX_TH, (source_buckets_count >> (1 + j)));
-      NUM_BLOCKS = ((source_buckets_count >> (1 + j)) + NUM_THREADS - 1) / NUM_THREADS;
+      unsigned nof_threads = (source_buckets_count >> (1 + j));
+      NUM_THREADS = min(MAX_TH, nof_threads);
+      NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
       single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(
         j == 0 ? source_buckets : temp_buckets1, j == target_bits_count - 1 ? target_buckets : temp_buckets1,
-        1 << (source_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 0, 0);
+        1 << (source_bits_count - j), j == target_bits_count - 1 ? 
1 << target_bits_count : 0, 0, 0, nof_threads);
 
-      unsigned nof_threads = (source_buckets_count >> (1 + j));
       NUM_THREADS = min(MAX_TH, nof_threads);
       NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
       single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(
         j == 0 ? source_buckets : temp_buckets2, j == target_bits_count - 1 ? target_buckets : temp_buckets2,
-        1 << (target_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 1, 0);
+        1 << (target_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 1, 0, nof_threads);
     }
   }
   if (target_bits_count == 1) {
diff --git a/icicle/appUtils/msm/tests/msm_test.cu b/icicle/appUtils/msm/tests/msm_test.cu
index 962c88bbb..b69680f38 100644
--- a/icicle/appUtils/msm/tests/msm_test.cu
+++ b/icicle/appUtils/msm/tests/msm_test.cu
@@ -224,4 +224,4 @@ int main()
 
   // std::cout<<std::endl;
   return 0;
-}
+}
\ No newline at end of file
diff --git a/icicle/utils/error_handler.cuh b/icicle/utils/error_handler.cuh
new file mode 100644
index 000000000..000000000
--- /dev/null
+++ b/icicle/utils/error_handler.cuh
@@ -0,0 +1,32 @@
+#pragma once
+#include <iostream>
+
+#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
+template <typename T>
+void check(T err, const char* const func, const char* const file, const int line)
+{
+  if (err != cudaSuccess) {
+    std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl;
+    std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
+  }
+}
+
+#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
+void checkLast(const char* const file, const int line)
+{
+  cudaError_t err{cudaGetLastError()};
+  if (err != cudaSuccess) {
+    std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl;
+    std::cerr << cudaGetErrorString(err) << std::endl;
+  }
+}
+
+#define CHECK_SYNC_DEVICE_ERROR() syncDevice(__FILE__, __LINE__)
+void syncDevice(const char* const file, const int line)
+{
+  cudaError_t err{cudaDeviceSynchronize()};
+  if (err != cudaSuccess) {
+    std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl;
+    std::cerr << cudaGetErrorString(err) << std::endl;
+  }
+}