From 9114ecb26978a6e29573e5a108b7c708d677749a Mon Sep 17 00:00:00 2001
From: ImmanuelSegol <3ditds@gmail.com>
Date: Tue, 3 Oct 2023 15:22:28 +0300
Subject: [PATCH] fix memory error in single_stage_multi_reduction_kernel
 (#235)

* refactor

* refactor

* revert

* refactor: clang format

* Update icicle/appUtils/msm/msm.cu
---
 icicle/appUtils/msm/msm.cu            | 40 +++++++++++++++++----------
 icicle/appUtils/msm/tests/msm_test.cu |  2 +-
 icicle/utils/error_handler.cuh        | 32 +++++++++++++++++++++
 3 files changed, 59 insertions(+), 15 deletions(-)
 create mode 100644 icicle/utils/error_handler.cuh

diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu
index f09b9fe85..c58381cc0 100644
--- a/icicle/appUtils/msm/msm.cu
+++ b/icicle/appUtils/msm/msm.cu
@@ -5,6 +5,7 @@
 #include "../../primitives/field.cuh"
 #include "../../primitives/projective.cuh"
 #include "../../utils/cuda_utils.cuh"
+#include "../../utils/error_handler.cuh"
 #include "msm.cuh"
 #include <cooperative_groups.h>
 #include <cub/cub.cuh>
@@ -23,20 +24,29 @@
 
 template <typename P>
 __global__ void single_stage_multi_reduction_kernel(
-  P* v, P* v_r, unsigned block_size, unsigned write_stride, unsigned write_phase, unsigned padding)
+  P* v,
+  P* v_r,
+  unsigned block_size,
+  unsigned write_stride,
+  unsigned write_phase,
+  unsigned padding,
+  unsigned num_of_threads)
 {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int tid_p = padding ? (tid / (2 * padding)) * padding + tid % padding : tid;
+  if (tid >= num_of_threads) { return; }
+
   int jump = block_size / 2;
+  int tid_p = padding ? (tid / (2 * padding)) * padding + tid % padding : tid;
   int block_id = tid_p / jump;
   int block_tid = tid_p % jump;
   unsigned read_ind = block_size * block_id + block_tid;
   unsigned write_ind = tid;
-  v_r
-    [write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride
-                  : write_ind] =
-    padding ? (tid % (2 * padding) < padding) ? 
v[read_ind] + v[read_ind + jump] : P::zero()
-              : v[read_ind] + v[read_ind + jump];
+  unsigned v_r_key =
+    write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride : write_ind;
+  P v_r_value = padding ? (tid % (2 * padding) < padding) ? v[read_ind] + v[read_ind + jump] : P::zero()
+                        : v[read_ind] + v[read_ind + jump];
+
+  v_r[v_r_key] = v_r_value;
 }
 
 // this kernel performs single scalar multiplication
@@ -388,7 +398,7 @@ void bucket_method_msm(
     NUM_THREADS = min(MAX_TH, s);
     NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS;
     single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(
-      ones_results, ones_results, s * 2, 0, 0, 0);
+      ones_results, ones_results, s * 2, 0, 0, 0, s);
   }
 
   unsigned* bucket_indices;
@@ -554,7 +564,9 @@ void bucket_method_msm(
     NUM_THREADS = min(MAX_TH, s);
     NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS;
     single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(
-      large_buckets, large_buckets, s * 2, 0, 0, 0);
+      large_buckets, large_buckets, s * 2, 0, 0, 0, s);
+
+    CHECK_LAST_CUDA_ERROR();
   }
 
   // distribute
@@ -631,18 +643,18 @@ void bucket_method_msm(
   if (source_bits_count > 0) {
     for (unsigned j = 0; j < target_bits_count; j++) {
       unsigned last_j = target_bits_count - 1;
-      NUM_THREADS = min(MAX_TH, (source_buckets_count >> (1 + j)));
-      NUM_BLOCKS = ((source_buckets_count >> (1 + j)) + NUM_THREADS - 1) / NUM_THREADS;
+      unsigned nof_threads = (source_buckets_count >> (1 + j));
+      NUM_THREADS = min(MAX_TH, nof_threads);
+      NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
       single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(
         j == 0 ? source_buckets : temp_buckets1, j == target_bits_count - 1 ? target_buckets : temp_buckets1,
-        1 << (source_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 0, 0);
+        1 << (source_bits_count - j), j == target_bits_count - 1 ? 
1 << target_bits_count : 0, 0, 0, nof_threads);
 
-      unsigned nof_threads = (source_buckets_count >> (1 + j));
       NUM_THREADS = min(MAX_TH, nof_threads);
       NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
       single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(
         j == 0 ? source_buckets : temp_buckets2, j == target_bits_count - 1 ? target_buckets : temp_buckets2,
-        1 << (target_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 1, 0);
+        1 << (target_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 1, 0, nof_threads);
     }
   }
   if (target_bits_count == 1) {
diff --git a/icicle/appUtils/msm/tests/msm_test.cu b/icicle/appUtils/msm/tests/msm_test.cu
index 962c88bbb..b69680f38 100644
--- a/icicle/appUtils/msm/tests/msm_test.cu
+++ b/icicle/appUtils/msm/tests/msm_test.cu
@@ -224,4 +224,4 @@ int main()
 
   // std::cout<<std::endl;
   return 0;
-}
+}
\ No newline at end of file
diff --git a/icicle/utils/error_handler.cuh b/icicle/utils/error_handler.cuh
new file mode 100644
index 000000000..000000000
--- /dev/null
+++ b/icicle/utils/error_handler.cuh
@@ -0,0 +1,32 @@
+#pragma once
+#include <iostream>
+
+#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
+template <typename T>
+void check(T err, const char* const func, const char* const file, const int line)
+{
+  if (err != cudaSuccess) {
+    std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl;
+    std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
+  }
+}
+
+#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
+void checkLast(const char* const file, const int line)
+{
+  cudaError_t err{cudaGetLastError()};
+  if (err != cudaSuccess) {
+    std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl;
+    std::cerr << cudaGetErrorString(err) << std::endl;
+  }
+}
+
+#define CHECK_SYNC_DEVICE_ERROR() syncDevice(__FILE__, __LINE__)
+void syncDevice(const char* const file, const int line)
+{
+  cudaError_t err{cudaDeviceSynchronize()};
+  if (err != cudaSuccess) {
+    std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl;
+    std::cerr << cudaGetErrorString(err) << std::endl;
+  }
+}