From dcf0671f6feb20dfd746e208d0a7fa23a5084107 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Tue, 25 May 2021 14:13:41 +0300 Subject: [PATCH] Fix block shuffle --- cmake/CubCudaConfig.cmake | 2 +- cub/block/block_shuffle.cuh | 35 ++-- test/test_block_shuffle.cu | 358 ++++++++++++++++++++++++++++++++++++ 3 files changed, 374 insertions(+), 21 deletions(-) create mode 100644 test/test_block_shuffle.cu diff --git a/cmake/CubCudaConfig.cmake b/cmake/CubCudaConfig.cmake index d9e77d1df6..398ed3ed76 100644 --- a/cmake/CubCudaConfig.cmake +++ b/cmake/CubCudaConfig.cmake @@ -4,7 +4,7 @@ enable_language(CUDA) # Architecture options: # -set(all_archs 35 37 50 52 53 60 61 62 70 72 75 80) +set(all_archs 35 37 50 52 53 60 61 62 70 72 75 80 86) set(arch_message "CUB: Explicitly enabled compute architectures:") # Thrust sets up the architecture flags in CMAKE_CUDA_FLAGS already. Just diff --git a/cub/block/block_shuffle.cuh b/cub/block/block_shuffle.cuh index ba2e9b59a0..39b0d2c116 100644 --- a/cub/block/block_shuffle.cuh +++ b/cub/block/block_shuffle.cuh @@ -87,11 +87,7 @@ private: ******************************************************************************/ /// Shared memory storage layout type (last element from each thread's input) - struct _TempStorage - { - T prev[BLOCK_THREADS]; - T next[BLOCK_THREADS]; - }; + typedef T _TempStorage[BLOCK_THREADS]; public: @@ -171,14 +167,14 @@ public: T& output, ///< [out] The \p input item from the successor (or predecessor) thread threadi+distance (may be aliased to \p input). This value is only updated for for threadi when 0 <= (i + \p distance) < BLOCK_THREADS-1 int distance = 1) ///< [in] Offset distance (may be negative) { - temp_storage[linear_tid].prev = input; + temp_storage[linear_tid] = input; CTA_SYNC(); const int offset_tid = static_cast(linear_tid) + distance; if ((offset_tid >= 0) && (offset_tid < BLOCK_THREADS)) { - output = temp_storage[static_cast(offset_tid)].prev; + output = temp_storage[static_cast(offset_tid)]; } } @@ -194,7 +190,7 @@ public: T& output, ///< [out] The \p input item from thread thread(i+distance>)% (may be aliased to \p input). This value is not updated for threadBLOCK_THREADS-1 unsigned int distance = 1) ///< [in] Offset distance (0 < \p distance < BLOCK_THREADS) { - temp_storage[linear_tid].prev = input; + temp_storage[linear_tid] = input; CTA_SYNC(); @@ -202,7 +198,7 @@ public: if (offset >= BLOCK_THREADS) offset -= BLOCK_THREADS; - output = temp_storage[offset].prev; + output = temp_storage[offset]; } @@ -219,7 +215,7 @@ public: T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread0. { - temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1]; + temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); @@ -227,9 +223,8 @@ public: for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) prev[ITEM] = input[ITEM - 1]; - if (linear_tid > 0) - prev[0] = temp_storage[linear_tid - 1].prev; + prev[0] = temp_storage[linear_tid - 1]; } @@ -248,7 +243,7 @@ public: T &block_suffix) ///< [out] The item \p input[ITEMS_PER_THREAD-1] from threadBLOCK_THREADS-1, provided to all threads { Up(input, prev); - block_suffix = temp_storage[BLOCK_THREADS - 1].prev; + block_suffix = temp_storage[BLOCK_THREADS - 1]; } @@ -265,16 +260,16 @@ public: T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. { - temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1]; + temp_storage[linear_tid] = input[0]; CTA_SYNC(); #pragma unroll - for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) - prev[ITEM] = input[ITEM - 1]; + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ITEM++) + prev[ITEM] = input[ITEM + 1]; - if (linear_tid > 0) - prev[0] = temp_storage[linear_tid - 1].prev; + if (linear_tid < BLOCK_THREADS - 1) + prev[ITEMS_PER_THREAD - 1] = temp_storage[linear_tid + 1]; } @@ -292,8 +287,8 @@ public: T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. T &block_prefix) ///< [out] The item \p input[0] from thread0, provided to all threads { - Up(input, prev); - block_prefix = temp_storage[BLOCK_THREADS - 1].prev; + Down(input, prev); + block_prefix = temp_storage[0]; } //@} end member group diff --git a/test/test_block_shuffle.cu b/test/test_block_shuffle.cu new file mode 100644 index 0000000000..d7b250b92f --- /dev/null +++ b/test/test_block_shuffle.cu @@ -0,0 +1,358 @@ +/****************************************************************************** + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Test of BlockShuffle utilities + ******************************************************************************/ + +// Ensure printing of CUDA runtime errors to console +#define CUB_STDERR + +#include +#include +#include +#include + +#include +#include + +#include "test_util.h" + +using namespace cub; + +template +__global__ void IotaKernel( + const unsigned int num_items, + data_type *data) +{ + const unsigned int i = threadIdx.x + blockIdx.x * blockDim.x; + + if (i < num_items) + { + data[i] = i; + } +} + +template +void Iota( + const unsigned int num_items, + data_type *data) +{ + const unsigned int threads_per_block = 256; + const unsigned int blocks_per_grid = (num_items + threads_per_block - 1) / threads_per_block; + + IotaKernel<<>>(num_items, data); + + CubDebugExit(cudaPeekAtLastError()); + CubDebugExit(cudaDeviceSynchronize()); +} + +template < + typename data_type, + unsigned int threads_in_block, + unsigned int items_per_thread, + typename action_type> +__global__ void BlockShuffleTestKernel( + data_type *data, + action_type action) +{ + typedef cub::BlockShuffle BlockShuffle; + + __shared__ typename BlockShuffle::TempStorage temp_storage_shuffle; + + data_type thread_data[items_per_thread]; + + data += threadIdx.x * items_per_thread; + for (unsigned int item = 0; item < items_per_thread; item++) + { + thread_data[item] = data[item]; + } + __syncthreads(); + + BlockShuffle block_shuffle(temp_storage_shuffle); + action(block_shuffle, thread_data); + + for (unsigned int item = 0; item < items_per_thread; item++) + { + data[item] = thread_data[item]; + } +} + +template< + typename data_type, + unsigned int items_per_thread, + unsigned int threads_in_block, + typename action_type> +void BlockShuffleTest(data_type *data, action_type action) +{ + BlockShuffleTestKernel<<<1, threads_in_block>>> (data, action); + + CubDebugExit(cudaPeekAtLastError()); + CubDebugExit(cudaDeviceSynchronize()); +} + +template < + typename data_type, + unsigned int items_per_thread, + unsigned int threads_in_block> +struct UpTest +{ + __device__ void operator()( + BlockShuffle &block_shuffle, + data_type (&thread_data)[items_per_thread]) const + { + block_shuffle.Up(thread_data, thread_data); + } + + static __host__ bool check(const data_type *data, int i) + { + if (i == 0) + { + return data[i] == 0; + } + + return data[i] == i - 1; + } +}; + +template < + typename data_type, + unsigned int items_per_thread, + unsigned int threads_in_block> +struct DownTest +{ + __device__ void operator()( + BlockShuffle &block_shuffle, + data_type (&thread_data)[items_per_thread]) const + { + block_shuffle.Down(thread_data, thread_data); + } + + static __host__ bool check(const data_type *data, int i) + { + if (i == items_per_thread * threads_in_block - 1) + { + return data[i] == i; + } + + return data[i] == i + 1; + } +}; + +template +struct OffsetTestBase +{ + static constexpr unsigned int items_per_thread = 1; + + __device__ void operator()( + BlockShuffle &block_shuffle, + data_type (&thread_data)[items_per_thread]) const + { + block_shuffle.Offset(thread_data[0], thread_data[0], offset); + } +}; + +template +struct OffsetUpTest : public OffsetTestBase +{ + static __host__ bool check(const data_type *data, int i) + { + return UpTest::check (data, i); + } +}; + +template +struct OffsetDownTest : public OffsetTestBase +{ + static __host__ bool check(const data_type *data, int i) + { + return DownTest::check (data, i); + } +}; + +template +struct RotateTestBase +{ + static constexpr unsigned int items_per_thread = 1; + + __device__ void operator()( + BlockShuffle &block_shuffle, + data_type (&thread_data)[items_per_thread]) const + { + block_shuffle.Rotate(thread_data[0], thread_data[0], offset); + } + + static __host__ bool check(const data_type *data, int i) + { + return data[i] == static_cast((i + offset) % threads_in_block); + } +}; + +template +struct RotateUpTest : public RotateTestBase +{ }; + +template +struct RotateTest : public RotateTestBase +{ }; + + +template +int CheckResult( + int num_items, + const data_type *d_output, + data_type *h_output, + const test_type &test) +{ + CubDebugExit(cudaMemcpy(h_output, d_output, num_items * sizeof (data_type), cudaMemcpyDeviceToHost)); + + for (int i = 0; i < num_items; i++) + { + if (!test.check (h_output, i)) + { + return 1; + } + } + + return 0; +} + +template < + typename data_type, + unsigned int items_per_thread, + unsigned int threads_in_block, + template class test_type> +void Test(unsigned int num_items, + data_type *d_data, + data_type *h_data) +{ + test_type test; + + Iota(num_items, d_data); + BlockShuffleTest(d_data, test); + AssertEquals(0, CheckResult(num_items, d_data, h_data, test)); +} + +template < + typename data_type, + unsigned int items_per_thread, + unsigned int threads_in_block, + template class test_type> +struct SingleItemTestHelper +{ + static void run(unsigned int /* num_items */, + data_type * /* d_data */, + data_type * /* h_data */) + { + } +}; + +template < + typename data_type, + unsigned int threads_in_block, + template class test_type> +struct SingleItemTestHelper +{ + static void run(unsigned int num_items, + data_type *d_data, + data_type *h_data) + { + test_type test; + + Iota(num_items, d_data); + BlockShuffleTest(d_data, test); + AssertEquals(0, CheckResult(num_items, d_data, h_data, test)); + } +}; + + +template < + typename data_type, + unsigned int items_per_thread, + unsigned int threads_in_block> +void Test( + CachingDeviceAllocator &g_allocator +) +{ + const unsigned int num_items = items_per_thread * threads_in_block; + + data_type *d_data = nullptr; + CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(data_type) * num_items)); + + std::unique_ptr h_data(new data_type[num_items]); + + Test(num_items, d_data, h_data.get()); + Test(num_items, d_data, h_data.get()); + + SingleItemTestHelper().run(num_items, + d_data, + h_data.get()); + + SingleItemTestHelper().run(num_items, + d_data, + h_data.get()); + + SingleItemTestHelper().run(num_items, + d_data, + h_data.get()); + + SingleItemTestHelper().run(num_items, + d_data, + h_data.get()); + + if (d_data) + { + CubDebugExit(g_allocator.DeviceFree(d_data)); + } +} + + +int main(int argc, char** argv) +{ + CommandLineArgs args(argc, argv); + + // Initialize device + CubDebugExit(args.DeviceInit()); + + CachingDeviceAllocator g_allocator(true); + + Test(g_allocator); + Test(g_allocator); + Test(g_allocator); + Test(g_allocator); + Test(g_allocator); + + return 0; +}