From dcf0671f6feb20dfd746e208d0a7fa23a5084107 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Tue, 25 May 2021 14:13:41 +0300
Subject: [PATCH] Fix block shuffle

---
 cmake/CubCudaConfig.cmake   |   2 +-
 cub/block/block_shuffle.cuh |  35 ++--
 test/test_block_shuffle.cu  | 358 ++++++++++++++++++++++++++++++++++++
 3 files changed, 374 insertions(+), 21 deletions(-)
 create mode 100644 test/test_block_shuffle.cu
diff --git a/cmake/CubCudaConfig.cmake b/cmake/CubCudaConfig.cmake
index d9e77d1df6..398ed3ed76 100644
--- a/cmake/CubCudaConfig.cmake
+++ b/cmake/CubCudaConfig.cmake
@@ -4,7 +4,7 @@ enable_language(CUDA)
 # Architecture options:
 #
 
-set(all_archs 35 37 50 52 53 60 61 62 70 72 75 80)
+set(all_archs 35 37 50 52 53 60 61 62 70 72 75 80 86)
 set(arch_message "CUB: Explicitly enabled compute architectures:")
 
 # Thrust sets up the architecture flags in CMAKE_CUDA_FLAGS already. Just
diff --git a/cub/block/block_shuffle.cuh b/cub/block/block_shuffle.cuh
index ba2e9b59a0..39b0d2c116 100644
--- a/cub/block/block_shuffle.cuh
+++ b/cub/block/block_shuffle.cuh
@@ -87,11 +87,7 @@ private:
      ******************************************************************************/
 
     /// Shared memory storage layout type (last element from each thread's input)
-    struct _TempStorage
-    {
-        T prev[BLOCK_THREADS];
-        T next[BLOCK_THREADS];
-    };
+    typedef T _TempStorage[BLOCK_THREADS];
 
 
 public:
@@ -171,14 +167,14 @@ public:
         T&  output,                 ///< [out] The \p input item from the successor (or predecessor) thread <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub> (may be aliased to \p input).  This value is only updated for for <em>thread<sub>i</sub></em> when 0 <= (<em>i</em> + \p distance) < <tt>BLOCK_THREADS-1</tt>
         int distance = 1)           ///< [in] Offset distance (may be negative)
     {
-        temp_storage[linear_tid].prev = input;
+        temp_storage[linear_tid] = input;
 
         CTA_SYNC();
 
         const int offset_tid = static_cast<int>(linear_tid) + distance;
         if ((offset_tid >= 0) && (offset_tid < BLOCK_THREADS))
         {
-            output = temp_storage[static_cast<size_t>(offset_tid)].prev;
+            output = temp_storage[static_cast<size_t>(offset_tid)];
         }
     }
 
@@ -194,7 +190,7 @@ public:
         T&  output,                 ///< [out] The \p input item from thread <em>thread</em><sub>(<em>i</em>+<tt>distance></tt>)%<tt><BLOCK_THREADS></tt></sub> (may be aliased to \p input).  This value is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>
         unsigned int distance = 1)  ///< [in] Offset distance (0 < \p distance < <tt>BLOCK_THREADS</tt>)
     {
-        temp_storage[linear_tid].prev = input;
+        temp_storage[linear_tid] = input;
 
         CTA_SYNC();
 
@@ -202,7 +198,7 @@ public:
         if (offset >= BLOCK_THREADS)
             offset -= BLOCK_THREADS;
 
-        output = temp_storage[offset].prev;
+        output = temp_storage[offset];
     }
 
 
@@ -219,7 +215,7 @@ public:
         T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
         T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
     {
-        temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1];
+        temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1];
 
         CTA_SYNC();
 
@@ -227,9 +223,8 @@ public:
         for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
             prev[ITEM] = input[ITEM - 1];
 
-
         if (linear_tid > 0)
-            prev[0] = temp_storage[linear_tid - 1].prev;
+            prev[0] = temp_storage[linear_tid - 1];
     }
 
 
@@ -248,7 +243,7 @@ public:
         T &block_suffix)                ///< [out] The item \p input[ITEMS_PER_THREAD-1] from <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>, provided to all threads
     {
         Up(input, prev);
-        block_suffix = temp_storage[BLOCK_THREADS - 1].prev;
+        block_suffix = temp_storage[BLOCK_THREADS - 1];
     }
 
 
@@ -265,16 +260,16 @@ public:
         T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
         T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The value \p prev[0] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
     {
-        temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1];
+        temp_storage[linear_tid] = input[0];
 
         CTA_SYNC();
 
         #pragma unroll
-        for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
-            prev[ITEM] = input[ITEM - 1];
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ITEM++)
+            prev[ITEM] = input[ITEM + 1];
 
-        if (linear_tid > 0)
-            prev[0] = temp_storage[linear_tid - 1].prev;
+        if (linear_tid < BLOCK_THREADS - 1)
+            prev[ITEMS_PER_THREAD - 1] = temp_storage[linear_tid + 1];
     }
 
 
@@ -292,8 +287,8 @@ public:
         T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The value \p prev[0] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
         T &block_prefix)                ///< [out] The item \p input[0] from <em>thread</em><sub><tt>0</tt></sub>, provided to all threads
     {
-        Up(input, prev);
-        block_prefix = temp_storage[BLOCK_THREADS - 1].prev;
+        Down(input, prev);
+        block_prefix = temp_storage[0];
     }
 
     //@}  end member group
diff --git a/test/test_block_shuffle.cu b/test/test_block_shuffle.cu
new file mode 100644
index 0000000000..d7b250b92f
--- /dev/null
+++ b/test/test_block_shuffle.cu
@@ -0,0 +1,358 @@
+/******************************************************************************
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of BlockShuffle utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <limits>
+#include <typeinfo>
+#include <memory>
+
+#include <cub/util_allocator.cuh>
+#include <cub/block/block_shuffle.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+template<typename data_type>
+__global__ void IotaKernel(
+        const unsigned int num_items,
+        data_type *data)
+{
+    const unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
+
+    if (i < num_items)
+    {
+        data[i] = i;
+    }
+}
+
+template<typename data_type>
+void Iota(
+        const unsigned int num_items,
+        data_type *data)
+{
+    const unsigned int threads_per_block = 256;
+    const unsigned int blocks_per_grid = (num_items + threads_per_block - 1) / threads_per_block;
+
+    IotaKernel<<<blocks_per_grid, threads_per_block>>>(num_items, data);
+
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+}
+
+template <
+        typename data_type,
+        unsigned int threads_in_block,
+        unsigned int items_per_thread,
+        typename action_type>
+__global__ void BlockShuffleTestKernel(
+        data_type *data,
+        action_type action)
+{
+    typedef cub::BlockShuffle<data_type, threads_in_block> BlockShuffle;
+
+    __shared__ typename BlockShuffle::TempStorage temp_storage_shuffle;
+
+    data_type thread_data[items_per_thread];
+
+    data += threadIdx.x * items_per_thread;
+    for (unsigned int item = 0; item < items_per_thread; item++)
+    {
+        thread_data[item] = data[item];
+    }
+    __syncthreads();
+
+    BlockShuffle block_shuffle(temp_storage_shuffle);
+    action(block_shuffle, thread_data);
+
+    for (unsigned int item = 0; item < items_per_thread; item++)
+    {
+        data[item] = thread_data[item];
+    }
+}
+
+template<
+        typename data_type,
+        unsigned int items_per_thread,
+        unsigned int threads_in_block,
+        typename action_type>
+void BlockShuffleTest(data_type *data, action_type action)
+{
+    BlockShuffleTestKernel<data_type, threads_in_block, items_per_thread><<<1, threads_in_block>>> (data, action);
+
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+}
+
+template <
+        typename data_type,
+        unsigned int items_per_thread,
+        unsigned int threads_in_block>
+struct UpTest
+{
+    __device__ void operator()(
+            BlockShuffle<data_type, threads_in_block> &block_shuffle,
+            data_type (&thread_data)[items_per_thread]) const
+    {
+        block_shuffle.Up(thread_data, thread_data);
+    }
+
+    static __host__ bool check(const data_type *data, int i)
+    {
+        if (i == 0)
+        {
+            return data[i] == 0;
+        }
+
+        return data[i] == i - 1;
+    }
+};
+
+template <
+        typename data_type,
+        unsigned int items_per_thread,
+        unsigned int threads_in_block>
+struct DownTest
+{
+    __device__ void operator()(
+            BlockShuffle<data_type, threads_in_block> &block_shuffle,
+            data_type (&thread_data)[items_per_thread]) const
+    {
+        block_shuffle.Down(thread_data, thread_data);
+    }
+
+    static __host__ bool check(const data_type *data, int i)
+    {
+        if (i == items_per_thread * threads_in_block - 1)
+        {
+            return data[i] == i;
+        }
+
+        return data[i] == i + 1;
+    }
+};
+
+template<typename data_type,
+         unsigned int threads_in_block,
+         int offset>
+struct OffsetTestBase
+{
+    static constexpr unsigned int items_per_thread = 1;
+
+    __device__ void operator()(
+            BlockShuffle<data_type, threads_in_block> &block_shuffle,
+            data_type (&thread_data)[items_per_thread]) const
+    {
+        block_shuffle.Offset(thread_data[0], thread_data[0], offset);
+    }
+};
+
+template <typename data_type,
+          unsigned int threads_in_block>
+struct OffsetUpTest : public OffsetTestBase<data_type, threads_in_block, -1 /* offset */>
+{
+    static __host__ bool check(const data_type *data, int i)
+    {
+        return UpTest<data_type, 1 /* items_per_thread */, threads_in_block>::check (data, i);
+    }
+};
+
+template<typename data_type,
+         unsigned int threads_in_block>
+struct OffsetDownTest : public OffsetTestBase<data_type, threads_in_block, 1 /* offset */>
+{
+    static __host__ bool check(const data_type *data, int i)
+    {
+        return DownTest<data_type, 1 /* items_per_thread */, threads_in_block>::check (data, i);
+    }
+};
+
+template<typename data_type,
+         unsigned int threads_in_block,
+         unsigned int offset>
+struct RotateTestBase
+{
+    static constexpr unsigned int items_per_thread = 1;
+
+    __device__ void operator()(
+            BlockShuffle<data_type, threads_in_block> &block_shuffle,
+            data_type (&thread_data)[items_per_thread]) const
+    {
+        block_shuffle.Rotate(thread_data[0], thread_data[0], offset);
+    }
+
+    static __host__ bool check(const data_type *data, int i)
+    {
+        return data[i] == static_cast<data_type>((i + offset) % threads_in_block);
+    }
+};
+
+template<typename data_type,
+         unsigned int threads_in_block>
+struct RotateUpTest : public RotateTestBase<data_type, threads_in_block, 1 /* offset */>
+{ };
+
+template<typename data_type,
+         unsigned int threads_in_block>
+struct RotateTest : public RotateTestBase<data_type, threads_in_block, 24 /* offset */>
+{ };
+
+
+template <typename data_type, typename test_type>
+int CheckResult(
+        int num_items,
+        const data_type *d_output,
+        data_type *h_output,
+        const test_type &test)
+{
+    CubDebugExit(cudaMemcpy(h_output, d_output, num_items * sizeof (data_type), cudaMemcpyDeviceToHost));
+
+    for (int i = 0; i < num_items; i++)
+    {
+        if (!test.check (h_output, i))
+        {
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+template <
+        typename data_type,
+        unsigned int items_per_thread,
+        unsigned int threads_in_block,
+        template<typename, unsigned int, unsigned int> class test_type>
+void Test(unsigned int num_items,
+          data_type *d_data,
+          data_type *h_data)
+{
+    test_type<data_type, items_per_thread, threads_in_block> test;
+
+    Iota(num_items, d_data);
+    BlockShuffleTest<data_type, items_per_thread, threads_in_block>(d_data, test);
+    AssertEquals(0, CheckResult(num_items, d_data, h_data, test));
+}
+
+template <
+        typename data_type,
+        unsigned int items_per_thread,
+        unsigned int threads_in_block,
+        template<typename, unsigned int> class test_type>
+struct SingleItemTestHelper
+{
+    static void run(unsigned int /* num_items */,
+                    data_type * /* d_data */,
+                    data_type * /* h_data */)
+    {
+    }
+};
+
+template <
+        typename data_type,
+        unsigned int threads_in_block,
+        template<typename, unsigned int> class test_type>
+struct SingleItemTestHelper<data_type, 1, threads_in_block, test_type>
+{
+    static void run(unsigned int num_items,
+                   data_type *d_data,
+                   data_type *h_data)
+    {
+        test_type<data_type, threads_in_block> test;
+
+        Iota(num_items, d_data);
+        BlockShuffleTest<data_type, 1 /* items_per_thread */, threads_in_block>(d_data, test);
+        AssertEquals(0, CheckResult(num_items, d_data, h_data, test));
+    }
+};
+
+
+template <
+        typename data_type,
+        unsigned int items_per_thread,
+        unsigned int threads_in_block>
+void Test(
+    CachingDeviceAllocator &g_allocator
+)
+{
+    const unsigned int num_items = items_per_thread * threads_in_block;
+
+    data_type *d_data = nullptr;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(data_type) * num_items));
+
+    std::unique_ptr<data_type[]> h_data(new data_type[num_items]);
+
+    Test<data_type, items_per_thread, threads_in_block, UpTest>(num_items, d_data, h_data.get());
+    Test<data_type, items_per_thread, threads_in_block, DownTest>(num_items, d_data, h_data.get());
+
+    SingleItemTestHelper<data_type, items_per_thread, threads_in_block, OffsetUpTest>().run(num_items,
+                                                                                            d_data,
+                                                                                            h_data.get());
+
+    SingleItemTestHelper<data_type, items_per_thread, threads_in_block, OffsetDownTest>().run(num_items,
+                                                                                              d_data,
+                                                                                              h_data.get());
+
+    SingleItemTestHelper<data_type, items_per_thread, threads_in_block, RotateUpTest>().run(num_items,
+                                                                                            d_data,
+                                                                                            h_data.get());
+
+    SingleItemTestHelper<data_type, items_per_thread, threads_in_block, RotateTest>().run(num_items,
+                                                                                          d_data,
+                                                                                          h_data.get());
+
+    if (d_data)
+    {
+        CubDebugExit(g_allocator.DeviceFree(d_data));
+    }
+}
+
+
+int main(int argc, char** argv)
+{
+    CommandLineArgs args(argc, argv);
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    CachingDeviceAllocator g_allocator(true);
+
+    Test<int16_t, 1, 32>(g_allocator);
+    Test<int32_t, 1, 32>(g_allocator);
+    Test<int32_t, 1, 512>(g_allocator);
+    Test<int64_t, 1, 512>(g_allocator);
+    Test<int64_t, 1, 1024>(g_allocator);
+
+    return 0;
+}