diff --git a/cub/agent/agent_select_if.cuh b/cub/agent/agent_select_if.cuh index f88d2f384f..807ea18c6d 100644 --- a/cub/agent/agent_select_if.cuh +++ b/cub/agent/agent_select_if.cuh @@ -93,7 +93,7 @@ template < typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicy tuning policy type typename InputIteratorT, ///< Random-access input iterator type for selection items typename FlagsInputIteratorT, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection) - typename SelectedOutputIteratorT, ///< Random-access input iterator type for selection_flags items + typename SelectedOutputIteratorT, ///< Random-access output iterator type for selection_flags items typename SelectOpT, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection) typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selections is to be used for selection) typename OffsetT, ///< Signed integer type for global offsets @@ -511,7 +511,7 @@ struct AgentSelectIf */ template __device__ __forceinline__ OffsetT ConsumeFirstTile( - int num_tile_items, ///< Number of input items comprising this tile + int num_tile_items, ///< Number of input items comprising this tile OffsetT tile_offset, ///< Tile offset ScanTileStateT& tile_state) ///< Global tile state descriptor { @@ -570,7 +570,7 @@ struct AgentSelectIf */ template __device__ __forceinline__ OffsetT ConsumeSubsequentTile( - int num_tile_items, ///< Number of input items comprising this tile + int num_tile_items, ///< Number of input items comprising this tile int tile_idx, ///< Tile index OffsetT tile_offset, ///< Tile offset ScanTileStateT& tile_state) ///< Global tile state descriptor @@ -632,7 +632,7 @@ struct AgentSelectIf */ template __device__ __forceinline__ OffsetT ConsumeTile( - int num_tile_items, ///< Number of input items comprising this tile + int num_tile_items, ///< Number of input items comprising this tile int tile_idx, ///< Tile index OffsetT tile_offset, ///< Tile offset ScanTileStateT& tile_state) ///< Global tile state descriptor @@ -688,4 +688,3 @@ struct AgentSelectIf CUB_NAMESPACE_END - diff --git a/cub/agent/agent_unique_by_key.cuh b/cub/agent/agent_unique_by_key.cuh new file mode 100644 index 0000000000..d5925c43be --- /dev/null +++ b/cub/agent/agent_unique_by_key.cuh @@ -0,0 +1,571 @@ +/****************************************************************************** + * Copyright (c) NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentUniqueByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide unique-by-key. + */ + +#pragma once + +#include +#include + +#include "../thread/thread_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_scan.cuh" +#include "../agent/single_pass_scan_operators.cuh" +#include "../block/block_discontinuity.cuh" + +CUB_NAMESPACE_BEGIN + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentUniqueByKey + */ +template +struct AgentUniqueByKeyPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, + }; + static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; +}; + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + + +/** + * \brief AgentUniqueByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide unique-by-key + */ +template < + typename AgentUniqueByKeyPolicyT, ///< Parameterized AgentUniqueByKeyPolicy tuning policy type + typename KeyInputIteratorT, ///< Random-access input iterator type for keys + typename ValueInputIteratorT, ///< Random-access input iterator type for values + typename KeyOutputIteratorT, ///< Random-access output iterator type for keys + typename ValueOutputIteratorT, ///< Random-access output iterator type for values + typename EqualityOpT, ///< Equality operator type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentUniqueByKey +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // The input key and value type + using KeyT = typename std::iterator_traits::value_type; + using ValueT = typename std::iterator_traits::value_type; + + // Tile status descriptor interface type + using ScanTileStateT = ScanTileState; + + // Constants + enum + { + BLOCK_THREADS = AgentUniqueByKeyPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentUniqueByKeyPolicyT::ITEMS_PER_THREAD, + ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys + using WrappedKeyInputIteratorT = typename std::conditional::value, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + KeyInputIteratorT>::type; // Directly use the supplied input iterator type + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for values + using WrappedValueInputIteratorT = typename std::conditional::value, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + ValueInputIteratorT>::type; // Directly use the supplied input iterator type + + // Parameterized BlockLoad type for input data + using BlockLoadKeys = BlockLoad< + KeyT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentUniqueByKeyPolicyT::LOAD_ALGORITHM>; + + // Parameterized BlockLoad type for flags + using BlockLoadValues = BlockLoad< + ValueT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentUniqueByKeyPolicyT::LOAD_ALGORITHM>; + + // Parameterized BlockDiscontinuity type for items + using BlockDiscontinuityKeys = cub::BlockDiscontinuity; + + // Parameterized BlockScan type + using BlockScanT = cub::BlockScan; + + // Parameterized BlockDiscontinuity type for items + using TilePrefixCallback = cub::TilePrefixCallbackOp; + + // Key exchange type + using KeyExchangeT = KeyT[ITEMS_PER_TILE]; + + // Value exchange type + using ValueExchangeT = ValueT[ITEMS_PER_TILE]; + + // Shared memory type for this thread block + union _TempStorage + { + struct ScanStorage + { + typename BlockScanT::TempStorage scan; + typename TilePrefixCallback::TempStorage prefix; + typename BlockDiscontinuityKeys::TempStorage discontinuity; + } scan_storage; + + // Smem needed for loading keys + typename BlockLoadKeys::TempStorage load_keys; + + // Smem needed for loading values + typename BlockLoadValues::TempStorage load_values; + + // Smem needed for compacting items (allows non POD items in this union) + Uninitialized shared_keys; + Uninitialized shared_values; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; + WrappedKeyInputIteratorT d_keys_in; + WrappedValueInputIteratorT d_values_in; + KeyOutputIteratorT d_keys_out; + ValueOutputIteratorT d_values_out; + cub::InequalityWrapper inequality_op; + OffsetT num_items; + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentUniqueByKey( + TempStorage &temp_storage_, + WrappedKeyInputIteratorT d_keys_in_, + WrappedValueInputIteratorT d_values_in_, + KeyOutputIteratorT d_keys_out_, + ValueOutputIteratorT d_values_out_, + EqualityOpT equality_op_, + OffsetT num_items_) + : + temp_storage(temp_storage_.Alias()), + d_keys_in(d_keys_in_), + d_values_in(d_values_in_), + d_keys_out(d_keys_out_), + d_values_out(d_values_out_), + inequality_op(equality_op_), + num_items(num_items_) + {} + + + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + struct KeyTagT {}; + struct ValueTagT {}; + + __device__ __forceinline__ + KeyExchangeT &GetShared(KeyTagT) + { + return temp_storage.shared_keys.Alias(); + } + __device__ __forceinline__ + ValueExchangeT &GetShared(ValueTagT) + { + return temp_storage.shared_values.Alias(); + } + + + //--------------------------------------------------------------------- + // Scatter utility methods + //--------------------------------------------------------------------- + template + __device__ __forceinline__ void Scatter( + Tag tag, + OutputIt items_out, + T (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int /*num_tile_items*/, + int num_tile_selections, + OffsetT num_selections_prefix, + OffsetT /*num_selections*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int local_scatter_offset = selection_indices[ITEM] - + num_selections_prefix; + if (selection_flags[ITEM]) + { + GetShared(tag)[local_scatter_offset] = items[ITEM]; + } + } + + CTA_SYNC(); + + for (int item = threadIdx.x; + item < num_tile_selections; + item += BLOCK_THREADS) + { + items_out[num_selections_prefix + item] = GetShared(tag)[item]; + } + + CTA_SYNC(); + } + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + + /** + * Process first tile of input (dynamic chained scan). Returns the running count of selections (including this tile) + */ + template + __device__ __forceinline__ OffsetT ConsumeFirstTile( + int num_tile_items, ///< Number of input items comprising this tile + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + KeyT keys[ITEMS_PER_THREAD]; + OffsetT selection_flags[ITEMS_PER_THREAD]; + OffsetT selection_idx[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + { + // Fill last elements with the first element + // because collectives are not suffix guarded + BlockLoadKeys(temp_storage.load_keys) + .Load(d_keys_in + tile_offset, + keys, + num_tile_items, + *(d_keys_in + tile_offset)); + } + else + { + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); + } + + + CTA_SYNC(); + + ValueT values[ITEMS_PER_THREAD]; + if (IS_LAST_TILE) + { + // Fill last elements with the first element + // because collectives are not suffix guarded + BlockLoadValues(temp_storage.load_values) + .Load(d_values_in + tile_offset, + values, + num_tile_items, + *(d_values_in + tile_offset)); + } + else + { + BlockLoadValues(temp_storage.load_values) + .Load(d_values_in + tile_offset, values); + } + + CTA_SYNC(); + + BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) + .FlagHeads(selection_flags, keys, inequality_op); + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Set selection_flags for out-of-bounds items + if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) + selection_flags[ITEM] = 1; + } + + CTA_SYNC(); + + + OffsetT num_tile_selections = 0; + OffsetT num_selections = 0; + OffsetT num_selections_prefix = 0; + + BlockScanT(temp_storage.scan_storage.scan) + .ExclusiveSum(selection_flags, + selection_idx, + num_tile_selections); + + if (threadIdx.x == 0) + { + // Update tile status if this is not the last tile + if (!IS_LAST_TILE) + tile_state.SetInclusive(0, num_tile_selections); + } + + // Do not count any out-of-bounds selections + if (IS_LAST_TILE) + { + int num_discount = ITEMS_PER_TILE - num_tile_items; + num_tile_selections -= num_discount; + } + num_selections = num_tile_selections; + + CTA_SYNC(); + + Scatter(KeyTagT(), + d_keys_out, + keys, + selection_flags, + selection_idx, + num_tile_items, + num_tile_selections, + num_selections_prefix, + num_selections); + + CTA_SYNC(); + + Scatter(ValueTagT(), + d_values_out, + values, + selection_flags, + selection_idx, + num_tile_items, + num_tile_selections, + num_selections_prefix, + num_selections); + + return num_selections; + } + + /** + * Process subsequent tile of input (dynamic chained scan). Returns the running count of selections (including this tile) + */ + template + __device__ __forceinline__ OffsetT ConsumeSubsequentTile( + int num_tile_items, ///< Number of input items comprising this tile + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + KeyT keys[ITEMS_PER_THREAD]; + OffsetT selection_flags[ITEMS_PER_THREAD]; + OffsetT selection_idx[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + { + // Fill last elements with the first element + // because collectives are not suffix guarded + BlockLoadKeys(temp_storage.load_keys) + .Load(d_keys_in + tile_offset, + keys, + num_tile_items, + *(d_keys_in + tile_offset)); + } + else + { + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); + } + + + CTA_SYNC(); + + ValueT values[ITEMS_PER_THREAD]; + if (IS_LAST_TILE) + { + // Fill last elements with the first element + // because collectives are not suffix guarded + BlockLoadValues(temp_storage.load_values) + .Load(d_values_in + tile_offset, + values, + num_tile_items, + *(d_values_in + tile_offset)); + } + else + { + BlockLoadValues(temp_storage.load_values) + .Load(d_values_in + tile_offset, values); + } + + CTA_SYNC(); + + KeyT tile_predecessor = d_keys_in[tile_offset - 1]; + BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) + .FlagHeads(selection_flags, keys, inequality_op, tile_predecessor); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Set selection_flags for out-of-bounds items + if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) + selection_flags[ITEM] = 1; + } + + CTA_SYNC(); + + + OffsetT num_tile_selections = 0; + OffsetT num_selections = 0; + OffsetT num_selections_prefix = 0; + + TilePrefixCallback prefix_cb(tile_state, + temp_storage.scan_storage.prefix, + cub::Sum(), + tile_idx); + BlockScanT(temp_storage.scan_storage.scan) + .ExclusiveSum(selection_flags, + selection_idx, + prefix_cb); + + num_selections = prefix_cb.GetInclusivePrefix(); + num_tile_selections = prefix_cb.GetBlockAggregate(); + num_selections_prefix = prefix_cb.GetExclusivePrefix(); + + if (IS_LAST_TILE) + { + int num_discount = ITEMS_PER_TILE - num_tile_items; + num_tile_selections -= num_discount; + num_selections -= num_discount; + } + + CTA_SYNC(); + + Scatter(KeyTagT(), + d_keys_out, + keys, + selection_flags, + selection_idx, + num_tile_items, + num_tile_selections, + num_selections_prefix, + num_selections); + + CTA_SYNC(); + + Scatter(ValueTagT(), + d_values_out, + values, + selection_flags, + selection_idx, + num_tile_items, + num_tile_selections, + num_selections_prefix, + num_selections); + + return num_selections; + } + + + /** + * Process a tile of input + */ + template + __device__ __forceinline__ OffsetT ConsumeTile( + int num_tile_items, ///< Number of input items comprising this tile + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + OffsetT num_selections; + if (tile_idx == 0) + { + num_selections = ConsumeFirstTile(num_tile_items, tile_offset, tile_state); + } + else + { + num_selections = ConsumeSubsequentTile(num_tile_items, tile_idx, tile_offset, tile_state); + } + + return num_selections; + } + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + template ///< Output iterator type for recording number of items selection_flags + __device__ __forceinline__ void ConsumeRange( + int num_tiles, ///< Total number of input tiles + ScanTileStateT& tile_state, ///< Global tile state descriptor + NumSelectedIteratorT d_num_selected_out) ///< Output total number selection_flags + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * ITEMS_PER_TILE; // Global offset for the current tile + + if (tile_idx < num_tiles - 1) + { + ConsumeTile(ITEMS_PER_TILE, + tile_idx, + tile_offset, + tile_state); + } + else + { + int num_remaining = static_cast(num_items - tile_offset); + OffsetT num_selections = ConsumeTile(num_remaining, + tile_idx, + tile_offset, + tile_state); + if (threadIdx.x == 0) + { + *d_num_selected_out = num_selections; + } + } + } +}; + + + +CUB_NAMESPACE_END diff --git a/cub/device/device_select.cuh b/cub/device/device_select.cuh index ed896a68d2..f9ed6d0a25 100644 --- a/cub/device/device_select.cuh +++ b/cub/device/device_select.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,6 +38,7 @@ #include #include "dispatch/dispatch_select_if.cuh" +#include "dispatch/dispatch_unique_by_key.cuh" #include "../config.cuh" CUB_NAMESPACE_BEGIN @@ -323,7 +324,7 @@ struct DeviceSelect typename NumSelectedIteratorT> CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Unique( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items @@ -332,7 +333,7 @@ struct DeviceSelect cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. { - typedef int OffsetT; // Signed integer type for global offsets + typedef int OffsetT; // Signed integer type for global offsets typedef NullType* FlagIterator; // FlagT iterator type (not used) typedef NullType SelectOp; // Selection op (not used) typedef Equality EqualityOp; // Default == operator @@ -351,6 +352,88 @@ struct DeviceSelect debug_synchronous); } + + /** + * \brief Given an input sequence \p d_keys_in and \p d_values_in with runs of key-value pairs with consecutive equal-valued keys, only the first key and its value from each run is selectively copied to \p d_keys_out and \p d_values_out. The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png) + * + * \par + * - The == equality operator is used to determine whether keys are equivalent + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + * int *d_keys_out; // e.g., [ , , , , , , , ] + * int *d_values_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items); + * + * // d_keys_out <-- [0, 2, 9, 5, 8] + * // d_values_out <-- [1, 2, 4, 5, 8] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam KeyInputIteratorT [inferred] Random-access input iterator type for reading input keys \iterator + * \tparam ValueInputIteratorT [inferred] Random-access input iterator type for reading input values \iterator + * \tparam KeyOutputIteratorT [inferred] Random-access output iterator type for writing selected keys \iterator + * \tparam ValueOutputIteratorT [inferred] Random-access output iterator type for writing selected values \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename KeyInputIteratorT, + typename ValueInputIteratorT, + typename KeyOutputIteratorT, + typename ValueOutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t UniqueByKey( + void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values + KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected keys + ValueOutputIteratorT d_values_out, ///< [out] Pointer to the output sequence of selected values + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + using OffsetT = int; + using EqualityOp = Equality; + + return DispatchUniqueByKey::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_keys_out, + d_values_out, + d_num_selected_out, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } }; /** diff --git a/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/device/dispatch/dispatch_merge_sort.cuh index 8f4f0be638..0b480899c5 100644 --- a/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/device/dispatch/dispatch_merge_sort.cuh @@ -488,21 +488,6 @@ struct DispatchMergeSort : SelectedPolicy bool debug_synchronous; int ptx_version; - CUB_RUNTIME_FUNCTION __forceinline__ std::size_t - vshmem_size(std::size_t max_shmem, - std::size_t shmem_per_block, - std::size_t num_blocks) - { - if (shmem_per_block > max_shmem) - { - return shmem_per_block * num_blocks; - } - else - { - return 0; - } - } - // Constructor CUB_RUNTIME_FUNCTION __forceinline__ DispatchMergeSort(void *d_temp_storage, @@ -637,9 +622,9 @@ struct DispatchMergeSort : SelectedPolicy static_cast(max_shmem); virtual_shared_memory_size = - vshmem_size(static_cast(max_shmem), - (cub::max)(block_sort_shmem_size, merge_shmem_size), - static_cast(num_tiles)); + detail::VshmemSize(static_cast(max_shmem), + (cub::max)(block_sort_shmem_size, merge_shmem_size), + static_cast(num_tiles)); } diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh index 2aac82a262..c2d04588be 100644 --- a/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/device/dispatch/dispatch_scan.cuh @@ -348,7 +348,7 @@ struct DispatchScan: // Get max x-dimension of grid int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break; // Run grids in epochs (in case number of tiles exceeds max x-dimension int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); diff --git a/cub/device/dispatch/dispatch_select_if.cuh b/cub/device/dispatch/dispatch_select_if.cuh index a052252317..94a17419a8 100644 --- a/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/device/dispatch/dispatch_select_if.cuh @@ -333,7 +333,7 @@ struct DispatchSelectIf // Get max x-dimension of grid int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break; // Get grid size for scanning tiles dim3 scan_grid_size; @@ -446,5 +446,3 @@ struct DispatchSelectIf CUB_NAMESPACE_END - - diff --git a/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/device/dispatch/dispatch_unique_by_key.cuh new file mode 100644 index 0000000000..8597e3c821 --- /dev/null +++ b/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -0,0 +1,430 @@ + +/****************************************************************************** + * Copyright (c) NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSelect::UniqueByKey provides device-wide, parallel operations for selecting unique items by key from sequences of data items residing within device-accessible memory. + */ + +#include "../../agent/agent_unique_by_key.cuh" +#include "../../util_math.cuh" +#include "../../util_macro.cuh" + +#include "dispatch_scan.cuh" + +CUB_NAMESPACE_BEGIN + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Unique by key kernel entry point (multi-block) + */ +template < + typename AgentUniqueByKeyPolicyT, ///< Parameterized AgentUniqueByKeyPolicy tuning policy type + typename KeyInputIteratorT, ///< Random-access input iterator type for keys + typename ValueInputIteratorT, ///< Random-access input iterator type for values + typename KeyOutputIteratorT, ///< Random-access output iterator type for keys + typename ValueOutputIteratorT, ///< Random-access output iterator type for values + typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected + typename ScanTileStateT, ///< Tile status interface type + typename EqualityOpT, ///< Equality operator type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentUniqueByKeyPolicyT::UniqueByKeyPolicyT::BLOCK_THREADS)) +__global__ void DeviceUniqueByKeySweepKernel( + KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values + KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected data items + ValueOutputIteratorT d_values_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) + ScanTileStateT tile_state, ///< [in] Tile status interface + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in) + int num_tiles) ///< [in] Total number of tiles for the entire problem +{ + // Thread block type for selecting data from input tiles + using AgentUniqueByKeyT = AgentUniqueByKey< + typename AgentUniqueByKeyPolicyT::UniqueByKeyPolicyT, + KeyInputIteratorT, + ValueInputIteratorT, + KeyOutputIteratorT, + ValueOutputIteratorT, + EqualityOpT, + OffsetT>; + + // Shared memory for AgentSelectIf + __shared__ typename AgentUniqueByKeyT::TempStorage temp_storage; + + // Process tiles + AgentUniqueByKeyT(temp_storage, d_keys_in, d_values_in, d_keys_out, d_values_out, equality_op, num_items).ConsumeRange( + num_tiles, + tile_state, + d_num_selected_out); +} + + +/****************************************************************************** + * Policy + ******************************************************************************/ + +template +struct DeviceUniqueByKeyPolicy +{ + using KeyT = typename std::iterator_traits::value_type; + + // SM350 + struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { + const static int INPUT_SIZE = sizeof(KeyT); + enum + { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = Nominal4BItemsToItems(NOMINAL_4B_ITEMS_PER_THREAD), + }; + + using UniqueByKeyPolicyT = AgentUniqueByKeyPolicy<128, + ITEMS_PER_THREAD, + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_SCAN_WARP_SCANS>; + }; + + // SM520 + struct Policy520 : ChainedPolicy<520, Policy520, Policy350> + { + const static int INPUT_SIZE = sizeof(KeyT); + enum + { + NOMINAL_4B_ITEMS_PER_THREAD = 11, + ITEMS_PER_THREAD = Nominal4BItemsToItems(NOMINAL_4B_ITEMS_PER_THREAD), + }; + + using UniqueByKeyPolicyT = AgentUniqueByKeyPolicy<64, + ITEMS_PER_THREAD, + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_SCAN_WARP_SCANS>; + }; + + /// MaxPolicy + using MaxPolicy = Policy520; +}; + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect + */ +template < + typename KeyInputIteratorT, ///< Random-access input iterator type for keys + typename ValueInputIteratorT, ///< Random-access input iterator type for values + typename KeyOutputIteratorT, ///< Random-access output iterator type for keys + typename ValueOutputIteratorT, ///< Random-access output iterator type for values + typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected + typename EqualityOpT, ///< Equality operator type + typename OffsetT, ///< Signed integer type for global offsets + typename SelectedPolicy = DeviceUniqueByKeyPolicy> +struct DispatchUniqueByKey: SelectedPolicy +{ + /****************************************************************************** + * Types and constants + ******************************************************************************/ + + enum + { + INIT_KERNEL_THREADS = 128, + }; + + // The input key and value type + using KeyT = typename std::iterator_traits::value_type; + using ValueT = typename std::iterator_traits::value_type; + + // Tile status descriptor interface type + using ScanTileStateT = ScanTileState; + + + void* d_temp_storage; ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeyInputIteratorT d_keys_in; ///< [in] Pointer to the input sequence of keys + ValueInputIteratorT d_values_in; ///< [in] Pointer to the input sequence of values + KeyOutputIteratorT d_keys_out; ///< [out] Pointer to the output sequence of selected data items + ValueOutputIteratorT d_values_out; ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out; ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) + EqualityOpT equality_op; ///< [in] Equality operator + OffsetT num_items; ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in) + cudaStream_t stream; ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; + + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchUniqueByKey( + void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values + KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected data items + ValueOutputIteratorT d_values_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous + ): + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_keys_in(d_keys_in), + d_values_in(d_values_in), + d_keys_out(d_keys_out), + d_values_out(d_values_out), + d_num_selected_out(d_num_selected_out), + equality_op(equality_op), + num_items(num_items), + stream(stream), + debug_synchronous(debug_synchronous) + {} + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + template + CUB_RUNTIME_FUNCTION __host__ __forceinline__ + cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) + { +#ifndef CUB_RUNTIME_ENABLED + + (void)init_kernel; + (void)scan_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + using Policy = typename ActivePolicyT::UniqueByKeyPolicyT; + using UniqueByKeyAgentT = AgentUniqueByKey; + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Number of input tiles + int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD; + int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); + + // Size of virtual shared memory + int max_shmem = 0; + if (CubDebug( + error = cudaDeviceGetAttribute(&max_shmem, + cudaDevAttrMaxSharedMemoryPerBlock, + device_ordinal))) + { + break; + } + std::size_t vshmem_size = detail::VshmemSize(max_shmem, sizeof(typename UniqueByKeyAgentT::TempStorage), num_tiles); + + // Specify temporary storage allocation requirements + size_t allocation_sizes[2] = {0, vshmem_size}; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void *allocations[2] = {NULL, NULL}; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log init_kernel configuration + num_tiles = CUB_MAX(1, num_tiles); + int init_grid_size = cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + init_grid_size, INIT_KERNEL_THREADS, 0, stream + ).doit(init_kernel, tile_state, num_tiles, d_num_selected_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break; + + // Get grid size for scanning tiles + dim3 scan_grid_size; + scan_grid_size.z = 1; + scan_grid_size.y = cub::DivideAndRoundUp(num_tiles, max_dim_x); + scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + + // Log select_if_kernel configuration + if (debug_synchronous) + { + // Get SM occupancy for unique_by_key_kernel + int scan_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy(scan_sm_occupancy, // out + scan_kernel, + Policy::BLOCK_THREADS))) + { + break; + } + + _CubLog("Invoking unique_by_key_kernel<<<{%d,%d,%d}, %d, 0, " + "%lld>>>(), %d items per thread, %d SM occupancy\n", + scan_grid_size.x, + scan_grid_size.y, + scan_grid_size.z, + Policy::BLOCK_THREADS, + (long long)stream, + Policy::ITEMS_PER_THREAD, + scan_sm_occupancy); + } + + // Invoke select_if_kernel + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + scan_grid_size, Policy::BLOCK_THREADS, 0, stream + ).doit(scan_kernel, + d_keys_in, + d_values_in, + d_keys_out, + d_values_out, + d_num_selected_out, + tile_state, + equality_op, + num_items, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while(0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + template + CUB_RUNTIME_FUNCTION __host__ __forceinline__ + cudaError_t Invoke() + { + // Ensure kernels are instantiated. + return Invoke( + DeviceCompactInitKernel, + DeviceUniqueByKeySweepKernel< + ActivePolicyT, + KeyInputIteratorT, + ValueInputIteratorT, + KeyOutputIteratorT, + ValueOutputIteratorT, + NumSelectedIteratorT, + ScanTileStateT, + EqualityOpT, + OffsetT> + ); + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values + KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected data items + ValueOutputIteratorT d_values_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + using MaxPolicyT = typename DispatchUniqueByKey::MaxPolicy; + + cudaError_t error; + do + { + // Get PTX version + int ptx_version = 0; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchUniqueByKey dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_keys_out, + d_values_out, + d_num_selected_out, + equality_op, + num_items, + stream, + debug_synchronous + ); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + } + while (0); + + return error; + } +}; + +CUB_NAMESPACE_END diff --git a/cub/util_math.cuh b/cub/util_math.cuh index 3c5406fa46..d69fc2ee2d 100644 --- a/cub/util_math.cuh +++ b/cub/util_math.cuh @@ -47,6 +47,14 @@ using is_integral_or_enum = std::integral_constant::value || std::is_enum::value>; +__host__ __device__ __forceinline__ constexpr std::size_t +VshmemSize(std::size_t max_shmem, + std::size_t shmem_per_block, + std::size_t num_blocks) +{ + return shmem_per_block > max_shmem ? shmem_per_block * num_blocks : 0; +} + } /** diff --git a/test/test_device_select_unique.cu b/test/test_device_select_unique.cu index d33e4895ac..497b43fc36 100644 --- a/test/test_device_select_unique.cu +++ b/test/test_device_select_unique.cu @@ -78,11 +78,11 @@ cudaError_t Dispatch( size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, - void* d_temp_storage, + void* d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, + OutputIteratorT d_out, + NumSelectedIteratorT d_num_selected_out, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) @@ -147,11 +147,11 @@ cudaError_t Dispatch( size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, - void* d_temp_storage, + void* d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, + OutputIteratorT d_out, + NumSelectedIteratorT d_num_selected_out, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) @@ -275,8 +275,8 @@ void Test( int num_items) { // Allocate device output array and num selected - T *d_out = NULL; - int *d_num_selected_out = NULL; + T *d_out = NULL; + int *d_num_selected_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); diff --git a/test/test_device_select_unique_by_key.cu b/test/test_device_select_unique_by_key.cu new file mode 100644 index 0000000000..d8db2f7935 --- /dev/null +++ b/test/test_device_select_unique_by_key.cu @@ -0,0 +1,602 @@ +/****************************************************************************** + * Copyright (c) NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Test of DeviceSelect::Unique utilities + ******************************************************************************/ + +// Ensure printing of CUDA runtime errors to console +#define CUB_STDERR + +#include +#include + +#include +#include +#include + +#include "test_util.h" + +using namespace cub; + + +//--------------------------------------------------------------------- +// Globals, constants and typedefs +//--------------------------------------------------------------------- + +bool g_verbose = false; +int g_timing_iterations = 0; +int g_repeat = 0; +float g_device_giga_bandwidth; +CachingDeviceAllocator g_allocator(true); + +// Dispatch types +enum Backend +{ + CUB, // CUB method + CDP, // GPU-based (dynamic parallelism) dispatch to CUB method +}; + + +//--------------------------------------------------------------------- +// Dispatch to different CUB DeviceSelect entrypoints +//--------------------------------------------------------------------- + + +/** + * Dispatch to unique entrypoint + */ +template +CUB_RUNTIME_FUNCTION __forceinline__ +cudaError_t Dispatch( + Int2Type /*dispatch_to*/, + int timing_timing_iterations, + size_t */*d_temp_storage_bytes*/, + cudaError_t */*d_cdp_error*/, + + void* d_temp_storage, + size_t &temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) +{ + cudaError_t error = cudaSuccess; + for (int i = 0; i < timing_timing_iterations; ++i) + { + error = DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, stream, debug_synchronous); + } + return error; +} + +//--------------------------------------------------------------------- +// CUDA Nested Parallelism Test Kernel +//--------------------------------------------------------------------- + +/** + * Simple wrapper kernel to invoke DeviceSelect + */ +template +__global__ void CnpDispatchKernel( + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void* d_temp_storage, + size_t temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + OffsetT num_items, + bool debug_synchronous) +{ + +#ifndef CUB_CDP + (void)timing_timing_iterations; + (void)d_temp_storage_bytes; + (void)d_cdp_error; + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_keys_in; + (void)d_values_in; + (void)d_keys_out; + (void)d_values_out; + (void)d_num_selected_out; + (void)num_items; + (void)debug_synchronous; + *d_cdp_error = cudaErrorNotSupported; +#else + *d_cdp_error = Dispatch(Int2Type(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, + d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, 0, debug_synchronous); + *d_temp_storage_bytes = temp_storage_bytes; +#endif +} + + +/** + * Dispatch to CDP kernel + */ +template +cudaError_t Dispatch( + Int2Type dispatch_to, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void* d_temp_storage, + size_t &temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) +{ + // Invoke kernel to invoke device-side dispatch + CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, + d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, debug_synchronous); + + // Copy out temp_storage_bytes + CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); + + // Copy out error + cudaError_t retval; + CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); + return retval; +} + + + +//--------------------------------------------------------------------- +// Test generation +//--------------------------------------------------------------------- + + +/** + * Initialize problem + */ +template +void Initialize( + int entropy_reduction, + T *h_in, + int num_items, + int max_segment) +{ + unsigned int max_int = (unsigned int) -1; + + int key = 0; + int i = 0; + while (i < num_items) + { + // Select number of repeating occurrences for the current run + int repeat; + if (max_segment < 0) + { + repeat = num_items; + } + else if (max_segment < 2) + { + repeat = 1; + } + else + { + RandomBits(repeat, entropy_reduction); + repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int)); + repeat = CUB_MAX(1, repeat); + } + + int j = i; + while (j < CUB_MIN(i + repeat, num_items)) + { + InitValue(INTEGER_SEED, h_in[j], key); + j++; + } + + i = j; + key++; + } + + if (g_verbose) + { + printf("Input:\n"); + DisplayResults(h_in, num_items); + printf("\n\n"); + } +} + + +/** + * Solve unique problem + */ +template < + typename KeyInputIteratorT, + typename ValueInputIteratorT, + typename KeyT, + typename ValueT> +int Solve( + KeyInputIteratorT h_keys_in, + ValueInputIteratorT h_values_in, + KeyT *h_keys_reference, + ValueT *h_values_reference, + int num_items) +{ + int num_selected = 0; + if (num_items > 0) + { + h_keys_reference[num_selected] = h_keys_in[0]; + h_values_reference[num_selected] = h_values_in[0]; + num_selected++; + } + + for (int i = 1; i < num_items; ++i) + { + if (h_keys_in[i] != h_keys_in[i - 1]) + { + h_keys_reference[num_selected] = h_keys_in[i]; + h_values_reference[num_selected] = h_values_in[i]; + num_selected++; + } + } + + return num_selected; +} + + + +/** + * Test DeviceSelect for a given problem input + */ +template < + Backend BACKEND, + typename KeyInputIteratorT, + typename ValueInputIteratorT, + typename KeyT, + typename ValueT> +void Test( + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyT *h_keys_reference, + ValueT *h_values_reference, + int num_selected, + int num_items) +{ + // Allocate device output array and num selected + KeyT *d_keys_out = NULL; + ValueT *d_values_out = NULL; + int *d_num_selected_out = NULL; + CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_out, sizeof(KeyT) * num_items)); + CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_out, sizeof(ValueT) * num_items)); + CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); + + // Allocate CDP device arrays + size_t *d_temp_storage_bytes = NULL; + cudaError_t *d_cdp_error = NULL; + CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); + CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1)); + + // Allocate temporary storage + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + CubDebugExit(Dispatch(Int2Type(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, 0, true)); + CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); + + // Clear device output array + CubDebugExit(cudaMemset(d_keys_out, 0, sizeof(KeyT) * num_items)); + CubDebugExit(cudaMemset(d_values_out, 0, sizeof(ValueT) * num_items)); + CubDebugExit(cudaMemset(d_num_selected_out, 0, sizeof(int))); + + // Run warmup/correctness iteration + CubDebugExit(Dispatch(Int2Type(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, 0, true)); + + // Check for correctness (and display results, if specified) + int compare11 = CompareDeviceResults(h_keys_reference, d_keys_out, num_selected, true, g_verbose); + int compare12 = CompareDeviceResults(h_values_reference, d_values_out, num_selected, true, g_verbose); + int compare1 = compare11 && compare12; + printf("\t Data %s ", compare1 ? "FAIL" : "PASS"); + + int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose); + printf("\t Count %s ", compare2 ? "FAIL" : "PASS"); + + // Flush any stdout/stderr + fflush(stdout); + fflush(stderr); + + // Performance + GpuTimer gpu_timer; + gpu_timer.Start(); + CubDebugExit(Dispatch(Int2Type(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, 0, false)); + gpu_timer.Stop(); + float elapsed_millis = gpu_timer.ElapsedMillis(); + + // Display performance + if (g_timing_iterations > 0) + { + float avg_millis = elapsed_millis / g_timing_iterations; + float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; + float giga_bandwidth = float((num_items + num_selected) * (sizeof(KeyT) + sizeof(ValueT))) / avg_millis / 1000.0f / 1000.0f; + printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0); + } + printf("\n\n"); + + // Flush any stdout/stderr + fflush(stdout); + fflush(stderr); + + // Cleanup + if (d_keys_out) CubDebugExit(g_allocator.DeviceFree(d_keys_out)); + if (d_values_out) CubDebugExit(g_allocator.DeviceFree(d_values_out)); + if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out)); + if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); + if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); + if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); + + // Correctness asserts + AssertEquals(0, compare1 | compare2); +} + + +/** + * Test DeviceSelect on pointer type + */ +template < + Backend BACKEND, + typename KeyT, + typename ValueT> +void TestPointer( + int num_items, + int entropy_reduction, + int max_segment) +{ + // Allocate host arrays + KeyT* h_keys_in = new KeyT[num_items]; + ValueT* h_values_in = new ValueT[num_items]; + KeyT* h_keys_reference = new KeyT[num_items]; + ValueT* h_values_reference = new ValueT[num_items]; + + // Initialize problem and solution + Initialize(entropy_reduction, h_keys_in, num_items, max_segment); + Initialize(entropy_reduction, h_values_in, num_items, max_segment); + int num_selected = Solve(h_keys_in, h_values_in, h_keys_reference, h_values_reference, num_items); + + printf("\nPointer %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements, entropy_reduction %d\n", + (BACKEND == CDP) ? "CDP CUB" : "CUB", + num_items, num_selected, float(num_items) / num_selected, + typeid(KeyT).name(), + (int) sizeof(KeyT), + entropy_reduction); + fflush(stdout); + + // Allocate problem device arrays + KeyT *d_keys_in = NULL; + ValueT *d_values_in = NULL; + CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items)); + CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_in, sizeof(ValueT) * num_items)); + + // Initialize device input + CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice)); + CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice)); + + // Run Test + Test(d_keys_in, d_values_in, h_keys_reference, h_values_reference, num_selected, num_items); + + // Cleanup + if (h_keys_in) delete[] h_keys_in; + if (h_values_in) delete[] h_values_in; + if (h_keys_reference) delete[] h_keys_reference; + if (h_values_reference) delete[] h_values_reference; + if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in)); + if (d_values_in) CubDebugExit(g_allocator.DeviceFree(d_values_in)); +} + + +/** + * Test DeviceSelect on iterator type + */ +template < + Backend BACKEND, + typename KeyT, + typename ValueT> +void TestIterator( + int num_items) +{ + // Use a counting iterator as the input + CountingInputIterator h_keys_in(0); + CountingInputIterator h_values_in(0); + + // Allocate host arrays + KeyT* h_keys_reference = new KeyT[num_items]; + ValueT* h_values_reference = new ValueT[num_items]; + + // Initialize problem and solution + int num_selected = Solve(h_keys_in, h_values_in, h_keys_reference, h_values_reference, num_items); + + printf("\nIterator %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements\n", + (BACKEND == CDP) ? "CDP CUB" : "CUB", + num_items, num_selected, float(num_items) / num_selected, + typeid(KeyT).name(), + (int) sizeof(ValueT)); + fflush(stdout); + + // Run Test + Test(h_keys_in, h_values_in, h_keys_reference, h_values_reference, num_selected, num_items); + + // Cleanup + if (h_keys_reference) delete[] h_keys_reference; + if (h_values_reference) delete[] h_values_reference; +} + + +/** + * Test different gen modes + */ +template < + Backend BACKEND, + typename KeyT, + typename ValueT> +void Test( + int num_items) +{ + for (int max_segment = 1; ((max_segment > 0) && (max_segment < num_items)); max_segment *= 11) + { + TestPointer(num_items, 0, max_segment); + TestPointer(num_items, 2, max_segment); + TestPointer(num_items, 7, max_segment); + } +} + + +/** + * Test different dispatch + */ +template < + typename KeyT, + typename ValueT> +void TestOp( + int num_items) +{ + Test(num_items); +#ifdef CUB_CDP + Test(num_items); +#endif +} + + +/** + * Test different input sizes + */ +template < + typename KeyT, + typename ValueT> +void Test( + int num_items) +{ + if (num_items < 0) + { + TestOp(0); + TestOp(1); + TestOp(100); + TestOp(10000); + TestOp(1000000); + } + else + { + TestOp(num_items); + } +} + + + +//--------------------------------------------------------------------- +// Main +//--------------------------------------------------------------------- + +/** + * Main + */ +int main(int argc, char** argv) +{ + int num_items = -1; + int entropy_reduction = 0; + int maxseg = 1000; + + // Initialize command line + CommandLineArgs args(argc, argv); + g_verbose = args.CheckCmdLineFlag("v"); + args.GetCmdLineArgument("n", num_items); + args.GetCmdLineArgument("i", g_timing_iterations); + args.GetCmdLineArgument("repeat", g_repeat); + args.GetCmdLineArgument("maxseg", maxseg); + args.GetCmdLineArgument("entropy", entropy_reduction); + + // Print usage + if (args.CheckCmdLineFlag("help")) + { + printf("%s " + "[--n= " + "[--i= " + "[--device=] " + "[--maxseg=]" + "[--entropy=]" + "[--v] " + "[--cdp]" + "\n", argv[0]); + exit(0); + } + + // Initialize device + CubDebugExit(args.DeviceInit()); + g_device_giga_bandwidth = args.device_giga_bandwidth; + printf("\n"); + + // Test different input types + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + + return 0; +} + + +