From f42070e314021a29efeb1550b54930b19bb8932c Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Wed, 23 Mar 2022 15:02:47 -0400 Subject: [PATCH 1/7] Add libcu++ dependency. --- cub/cmake/cub-config.cmake | 67 +++++++++++++++++++++++++++++++++----- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/cub/cmake/cub-config.cmake b/cub/cmake/cub-config.cmake index 3d7f64bd4d..ebe40efcd2 100644 --- a/cub/cmake/cub-config.cmake +++ b/cub/cmake/cub-config.cmake @@ -8,6 +8,9 @@ if (TARGET CUB::CUB) return() endif() +# Minimum supported libcudacxx version: +set(cub_libcudacxx_version 1.8.0) + function(_cub_declare_interface_alias alias_name ugly_name) # 1) Only IMPORTED and ALIAS targets can be placed in a namespace. # 2) When an IMPORTED library is linked to another target, its include @@ -27,16 +30,64 @@ function(_cub_declare_interface_alias alias_name ugly_name) endfunction() # -# Setup targets +# Setup some internal cache variables # -_cub_declare_interface_alias(CUB::CUB _CUB_CUB) # Pull in the include dir detected by cub-config-version.cmake set(_CUB_INCLUDE_DIR "${_CUB_VERSION_INCLUDE_DIR}" CACHE INTERNAL "Location of CUB headers." + FORCE ) unset(_CUB_VERSION_INCLUDE_DIR CACHE) # Clear tmp variable from cache + +if (${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) + set(_CUB_QUIET ON CACHE INTERNAL "Quiet mode enabled for CUB find_package calls." FORCE) + set(_CUB_QUIET_FLAG "QUIET" CACHE INTERNAL "" FORCE) +else() + set(_CUB_QUIET OFF CACHE INTERNAL "Quiet mode enabled for CUB find_package calls." FORCE) + set(_CUB_QUIET_FLAG "" CACHE INTERNAL "" FORCE) +endif() + +# +# Setup dependencies +# + +if (NOT TARGET CUB::libcudacxx) + if (TARGET Thrust::libcudacxx) + # Prefer the same libcudacxx as Thrust, if available: + _cub_declare_interface_alias(CUB::libcudacxx _CUB_libcudacxx) + target_link_libraries(_CUB_libcudacxx INTERFACE Thrust::libcudacxx) + else() + if (NOT TARGET libcudacxx::libcudacxx) + # First do a non-required search for any co-packaged versions. + # These are preferred. + find_package(libcudacxx ${cub_libcudacxx_version} CONFIG + ${_CUB_QUIET_FLAG} + NO_DEFAULT_PATH # Only check the explicit HINTS below: + HINTS + "${_CUB_INCLUDE_DIR}/../libcudacxx" # Source layout + "${_CUB_CMAKE_DIR}/.." # Install layout + ) + + # A second required search allows externally packaged to be used and fails if + # no suitable package exists. 
+ find_package(libcudacxx ${cub_libcudacxx_version} CONFIG + REQUIRED + ${_CUB_QUIET_FLAG} + ) + endif() + _cub_declare_interface_alias(CUB::libcudacxx _CUB_libcudacxx) + target_link_libraries(_CUB_libcudacxx INTERFACE libcudacxx::libcudacxx) + endif() +endif() + +# +# Setup targets +# + +_cub_declare_interface_alias(CUB::CUB _CUB_CUB) target_include_directories(_CUB_CUB INTERFACE "${_CUB_INCLUDE_DIR}") +target_link_libraries(_CUB_CUB INTERFACE CUB::libcudacxx) if (CUB_IGNORE_DEPRECATED_API OR THRUST_IGNORE_DEPRECATED_API) target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_API") @@ -61,12 +112,12 @@ endif() # Standardize version info # -set(CUB_VERSION ${${CMAKE_FIND_PACKAGE_NAME}_VERSION} CACHE INTERNAL "") -set(CUB_VERSION_MAJOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MAJOR} CACHE INTERNAL "") -set(CUB_VERSION_MINOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MINOR} CACHE INTERNAL "") -set(CUB_VERSION_PATCH ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_PATCH} CACHE INTERNAL "") -set(CUB_VERSION_TWEAK ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_TWEAK} CACHE INTERNAL "") -set(CUB_VERSION_COUNT ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_COUNT} CACHE INTERNAL "") +set(CUB_VERSION ${${CMAKE_FIND_PACKAGE_NAME}_VERSION} CACHE INTERNAL "" FORCE) +set(CUB_VERSION_MAJOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MAJOR} CACHE INTERNAL "" FORCE) +set(CUB_VERSION_MINOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MINOR} CACHE INTERNAL "" FORCE) +set(CUB_VERSION_PATCH ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_PATCH} CACHE INTERNAL "" FORCE) +set(CUB_VERSION_TWEAK ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_TWEAK} CACHE INTERNAL "" FORCE) +set(CUB_VERSION_COUNT ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_COUNT} CACHE INTERNAL "" FORCE) include(FindPackageHandleStandardArgs) if (NOT CUB_CONFIG) From f9beaa51dbf1f708880e695073c6eb009a673a74 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Fri, 26 Mar 2021 14:46:35 -0400 Subject: [PATCH 2/7] Remove checks for obsolete architectures. 
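
The removed branches only differed on architectures that CUB no longer targets, so each guarded #if CUB_PTX_ARCH block collapses to its modern path (for example, LOAD_LDG now always maps to the read-only ld.global.nc load instead of falling back to a plain global load). A minimal usage sketch of the cache-modified loads touched here (the kernel and variable names below are illustrative only, not part of the change):

    #include <cub/thread/thread_load.cuh>

    __global__ void copy_ldg(const int *__restrict__ d_in, int *d_out, int n)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)
      {
        // With the architecture guards gone, this is unconditionally the
        // read-only (ld.global.nc) path on all supported architectures.
        d_out[i] = cub::ThreadLoad<cub::LOAD_LDG>(d_in + i);
      }
    }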
--- cub/agent/agent_histogram.cuh | 5 ---- cub/agent/agent_segment_fixup.cuh | 3 +-- cub/block/block_histogram.cuh | 13 +--------- cub/block/block_radix_rank.cuh | 2 +- cub/block/block_radix_sort.cuh | 2 +- .../specializations/block_histogram_sort.cuh | 4 +-- cub/thread/thread_load.cuh | 23 ++++------------ cub/thread/thread_store.cuh | 16 +++--------- cub/util_arch.cuh | 2 +- cub/util_type.cuh | 15 ----------- cub/warp/specializations/warp_reduce_smem.cuh | 2 +- cub/warp/warp_reduce.cuh | 2 +- .../block/example_block_reduce_dyn_smem.cu | 26 +++---------------- test/test_iterator.cu | 12 +++------ 14 files changed, 25 insertions(+), 102 deletions(-) diff --git a/cub/agent/agent_histogram.cuh b/cub/agent/agent_histogram.cuh index c94e7354d7..0f0eb1e622 100644 --- a/cub/agent/agent_histogram.cuh +++ b/cub/agent/agent_histogram.cuh @@ -562,15 +562,10 @@ struct AgentHistogram is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples); // Accumulate samples -#if CUB_PTX_ARCH >= 120 if (prefer_smem) AccumulateSmemPixels(samples, is_valid); else AccumulateGmemPixels(samples, is_valid); -#else - AccumulateGmemPixels(samples, is_valid); -#endif - } diff --git a/cub/agent/agent_segment_fixup.cuh b/cub/agent/agent_segment_fixup.cuh index 78361ef354..bd2837ce06 100644 --- a/cub/agent/agent_segment_fixup.cuh +++ b/cub/agent/agent_segment_fixup.cuh @@ -111,8 +111,7 @@ struct AgentSegmentFixup TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, // Whether or not do fixup using RLE + global atomics - USE_ATOMIC_FIXUP = (CUB_PTX_ARCH >= 350) && - (std::is_same::value || + USE_ATOMIC_FIXUP = (std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value), diff --git a/cub/block/block_histogram.cuh b/cub/block/block_histogram.cuh index ef15332eaf..7f0e982c25 100644 --- a/cub/block/block_histogram.cuh +++ b/cub/block/block_histogram.cuh @@ -176,20 +176,9 @@ private: BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; - /** - * Ensure the template parameterization meets the requirements of the - * targeted device architecture. BLOCK_HISTO_ATOMIC can only be used - * on version SM120 or later. Otherwise BLOCK_HISTO_SORT is used - * regardless. - */ - static const BlockHistogramAlgorithm SAFE_ALGORITHM = - ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (PTX_ARCH < 120)) ? - BLOCK_HISTO_SORT : - ALGORITHM; - /// Internal specialization. using InternalBlockHistogram = - cub::detail::conditional_t= 350) ? true : false, + bool MEMOIZE_OUTER_SCAN = true, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, diff --git a/cub/block/block_radix_sort.cuh b/cub/block/block_radix_sort.cuh index 73a75636a9..4663f3b6f1 100644 --- a/cub/block/block_radix_sort.cuh +++ b/cub/block/block_radix_sort.cuh @@ -165,7 +165,7 @@ template < int ITEMS_PER_THREAD, typename ValueT = NullType, int RADIX_BITS = 4, - bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? 
true : false, + bool MEMOIZE_OUTER_SCAN = true, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, diff --git a/cub/block/specializations/block_histogram_sort.cuh b/cub/block/specializations/block_histogram_sort.cuh index 5bd2a80b33..e60325d48d 100644 --- a/cub/block/specializations/block_histogram_sort.cuh +++ b/cub/block/specializations/block_histogram_sort.cuh @@ -52,7 +52,7 @@ template < int BINS, ///< The number of bins into which histogram samples may fall int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective (unused) struct BlockHistogramSort { /// Constants @@ -69,7 +69,7 @@ struct BlockHistogramSort ITEMS_PER_THREAD, NullType, 4, - (PTX_ARCH >= 350) ? true : false, + true, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, BLOCK_DIM_Y, diff --git a/cub/thread/thread_load.cuh b/cub/thread/thread_load.cuh index 97ee37d81c..73d251c331 100644 --- a/cub/thread/thread_load.cuh +++ b/cub/thread/thread_load.cuh @@ -268,24 +268,11 @@ struct IterateThreadLoad /** * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers */ -#if CUB_PTX_ARCH >= 200 - _CUB_LOAD_ALL(LOAD_CA, ca) - _CUB_LOAD_ALL(LOAD_CG, cg) - _CUB_LOAD_ALL(LOAD_CS, cs) - _CUB_LOAD_ALL(LOAD_CV, cv) -#else - _CUB_LOAD_ALL(LOAD_CA, global) - // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1 - _CUB_LOAD_ALL(LOAD_CG, volatile.global) - _CUB_LOAD_ALL(LOAD_CS, global) - _CUB_LOAD_ALL(LOAD_CV, volatile.global) -#endif - -#if CUB_PTX_ARCH >= 350 - _CUB_LOAD_ALL(LOAD_LDG, global.nc) -#else - _CUB_LOAD_ALL(LOAD_LDG, global) -#endif +_CUB_LOAD_ALL(LOAD_CA, ca) +_CUB_LOAD_ALL(LOAD_CG, cg) +_CUB_LOAD_ALL(LOAD_CS, cs) +_CUB_LOAD_ALL(LOAD_CV, cv) +_CUB_LOAD_ALL(LOAD_LDG, global.nc) // Macro cleanup diff --git a/cub/thread/thread_store.cuh b/cub/thread/thread_store.cuh index 00944ec927..3af2613d4b 100644 --- a/cub/thread/thread_store.cuh +++ b/cub/thread/thread_store.cuh @@ -257,18 +257,10 @@ struct IterateThreadStore /** * Define ThreadStore specializations for the various Cache load modifiers */ -#if CUB_PTX_ARCH >= 200 - _CUB_STORE_ALL(STORE_WB, wb) - _CUB_STORE_ALL(STORE_CG, cg) - _CUB_STORE_ALL(STORE_CS, cs) - _CUB_STORE_ALL(STORE_WT, wt) -#else - _CUB_STORE_ALL(STORE_WB, global) - _CUB_STORE_ALL(STORE_CG, global) - _CUB_STORE_ALL(STORE_CS, global) - _CUB_STORE_ALL(STORE_WT, volatile.global) -#endif - +_CUB_STORE_ALL(STORE_WB, wb) +_CUB_STORE_ALL(STORE_CG, cg) +_CUB_STORE_ALL(STORE_CS, cs) +_CUB_STORE_ALL(STORE_WT, wt) // Macro cleanup #undef _CUB_STORE_ALL diff --git a/cub/util_arch.cuh b/cub/util_arch.cuh index a4474afe16..f1bd8b6b99 100644 --- a/cub/util_arch.cuh +++ b/cub/util_arch.cuh @@ -92,7 +92,7 @@ CUB_NAMESPACE_BEGIN /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. 
#ifndef CUB_RUNTIME_FUNCTION - #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) + #if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__) #define CUB_RUNTIME_ENABLED #define CUB_RUNTIME_FUNCTION __host__ __device__ #else diff --git a/cub/util_type.cuh b/cub/util_type.cuh index d859bb16cb..d15444b488 100644 --- a/cub/util_type.cuh +++ b/cub/util_type.cuh @@ -445,13 +445,8 @@ template <> struct UnitWord { typedef int ShuffleWord; -#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) - typedef float VolatileWord; - typedef uint2 DeviceWord; -#else typedef unsigned long long VolatileWord; typedef unsigned long long DeviceWord; -#endif typedef float2 TextureWord; }; @@ -460,13 +455,8 @@ template <> struct UnitWord { typedef int ShuffleWord; -#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) - typedef float VolatileWord; - typedef uint4 DeviceWord; -#else typedef unsigned long long VolatileWord; typedef ulonglong2 DeviceWord; -#endif typedef float4 TextureWord; }; @@ -476,13 +466,8 @@ template <> struct UnitWord { typedef unsigned short ShuffleWord; -#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) - typedef unsigned short VolatileWord; - typedef short DeviceWord; -#else typedef unsigned short VolatileWord; typedef unsigned short DeviceWord; -#endif typedef unsigned short TextureWord; }; diff --git a/cub/warp/specializations/warp_reduce_smem.cuh b/cub/warp/specializations/warp_reduce_smem.cuh index 80c7c62cc5..2b419cc217 100644 --- a/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/warp/specializations/warp_reduce_smem.cuh @@ -351,7 +351,7 @@ struct WarpReduceSmem FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail ReductionOp reduction_op) ///< [in] Reduction operator { - return SegmentedReduce(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>()); + return SegmentedReduce(input, flag, reduction_op, Int2Type()); } diff --git a/cub/warp/warp_reduce.cuh b/cub/warp/warp_reduce.cuh index 24a2cd4f9d..d905703d75 100644 --- a/cub/warp/warp_reduce.cuh +++ b/cub/warp/warp_reduce.cuh @@ -152,7 +152,7 @@ private: public: - #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /// Internal specialization. /// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two diff --git a/examples/block/example_block_reduce_dyn_smem.cu b/examples/block/example_block_reduce_dyn_smem.cu index c7f571e31e..2c2ce24c32 100644 --- a/examples/block/example_block_reduce_dyn_smem.cu +++ b/examples/block/example_block_reduce_dyn_smem.cu @@ -61,26 +61,6 @@ bool g_verbose = false; /// Default grid size int g_grid_size = 1; -// The following templated variables are helpers to get the shared memory -// This requires C++ 14 (template variables and constexpr max) -// but it's possible to work around these constructs for older C++ versions. 
-// we use default arguments from cub's block reduce for most parameters -template -constexpr std::size_t arch_bytes_size = sizeof( - typename cub::BlockReduce< - T, - BLOCK_THREADS, - BLOCK_REDUCE_WARP_REDUCTIONS /* ALGORITHM */, - 1 /* BLOCK_DIM_Y */, - 1 /* BLOCK_DIM_Z */, - ARCH>::TempStorage -); -template -constexpr auto archs_max_bytes = (std::max)( - {arch_bytes_size...,}); - - - //--------------------------------------------------------------------- // Kernels //--------------------------------------------------------------------- @@ -181,9 +161,9 @@ void Test() // Copy problem to device cudaMemcpy(d_in, h_in, sizeof(int) * BLOCK_THREADS, cudaMemcpyHostToDevice); - // determine necessary storage for a few architectures - auto block_reduce_temp_bytes = archs_max_bytes< - int, BLOCK_THREADS, 600, 700, 800>; + // determine necessary storage size: + auto block_reduce_temp_bytes = + sizeof(typename cub::BlockReduce::TempStorage); // finally, we need to make sure that we can hold at least one integer // needed in the kernel to exchange data after reduction auto smem_size = (std::max)(1 * sizeof(int), block_reduce_temp_bytes); diff --git a/test/test_iterator.cu b/test/test_iterator.cu index 1b5a6c6d03..f042816a37 100644 --- a/test/test_iterator.cu +++ b/test/test_iterator.cu @@ -506,8 +506,7 @@ int main(int argc, char** argv) Test(); Test(); Test(); - if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted - Test(); + Test(); Test(); Test(); @@ -515,8 +514,7 @@ int main(int argc, char** argv) Test(); Test(); Test(); - if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted - Test(); + Test(); Test(); Test(); @@ -524,8 +522,7 @@ int main(int argc, char** argv) Test(); Test(); Test(); - if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted - Test(); + Test(); Test(); Test(); @@ -533,8 +530,7 @@ int main(int argc, char** argv) Test(); Test(); Test(); - if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted - Test(); + Test(); Test(); Test(); From 5799e96b75fa3173397018ff804ea92398b8a81f Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Fri, 26 Mar 2021 16:04:08 -0400 Subject: [PATCH 3/7] Remove PTX arguments from compile-time contexts. nvc++ will stop defining __NVCOMPILER_CUDA_ARCH__ soon, removing the ability to determine the PTX arch at compile time. This updates agents and collective algorithms to no longer require the PTX_ARCH template parameter, and changes the CUB_WARP_SIZE(PTX_ARCH), etc helpers to ignore their argument. These macros only differed on obsolete arches and have no effect on currently supported architectures. 
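
Code that previously passed an explicit PTX arch still compiles, since the parameter survives as LEGACY_PTX_ARCH (defaulted to 0), but its value is now ignored. A minimal sketch of how a collective is instantiated after this change (kernel and variable names are illustrative; assumes a 128-thread block launch):

    #include <cub/block/block_reduce.cuh>

    __global__ void block_sum(const int *d_in, int *d_out)
    {
      // No PTX arch template argument is needed; the trailing
      // LEGACY_PTX_ARCH parameter defaults to 0 and is unused.
      using BlockReduceT = cub::BlockReduce<int, 128>;
      __shared__ typename BlockReduceT::TempStorage temp_storage;

      int thread_val = d_in[blockIdx.x * blockDim.x + threadIdx.x];
      int block_total = BlockReduceT(temp_storage).Sum(thread_val);

      if (threadIdx.x == 0)
      {
        d_out[blockIdx.x] = block_total;
      }
    }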
--- cub/agent/agent_histogram.cuh | 2 +- cub/agent/agent_rle.cuh | 2 +- cub/agent/agent_spmv_orig.cuh | 2 +- cub/agent/single_pass_scan_operators.cuh | 4 +- cub/block/block_adjacent_difference.cuh | 7 ++- cub/block/block_discontinuity.cuh | 4 +- cub/block/block_exchange.cuh | 9 ++-- cub/block/block_histogram.cuh | 7 ++- cub/block/block_load.cuh | 14 +++--- cub/block/block_radix_rank.cuh | 16 +++---- cub/block/block_radix_sort.cuh | 14 +++--- cub/block/block_raking_layout.cuh | 10 ++-- cub/block/block_reduce.cuh | 10 ++-- cub/block/block_scan.cuh | 10 ++-- cub/block/block_shuffle.cuh | 6 +-- cub/block/block_store.cuh | 14 +++--- .../specializations/block_histogram_sort.cuh | 6 +-- .../specializations/block_reduce_raking.cuh | 6 +-- .../block_reduce_raking_commutative_only.cuh | 10 ++-- .../block_reduce_warp_reductions.cuh | 6 +-- .../specializations/block_scan_raking.cuh | 6 +-- .../specializations/block_scan_warp_scans.cuh | 8 ++-- .../block_scan_warp_scans2.cuh | 8 ++-- .../block_scan_warp_scans3.cuh | 8 ++-- cub/device/dispatch/dispatch_radix_sort.cuh | 4 +- cub/device/dispatch/dispatch_reduce.cuh | 2 +- cub/util_arch.cuh | 46 ++++++------------- cub/util_ptx.cuh | 10 ++-- cub/warp/specializations/warp_reduce_shfl.cuh | 10 ++-- cub/warp/specializations/warp_reduce_smem.cuh | 7 ++- cub/warp/specializations/warp_scan_shfl.cuh | 8 ++-- cub/warp/specializations/warp_scan_smem.cuh | 7 ++- cub/warp/warp_exchange.cuh | 11 ++--- cub/warp/warp_load.cuh | 11 ++--- cub/warp/warp_merge_sort.cuh | 14 +++--- cub/warp/warp_reduce.cuh | 10 ++-- cub/warp/warp_scan.cuh | 10 ++-- cub/warp/warp_store.cuh | 11 ++--- test/test_warp_mask.cu | 3 +- 39 files changed, 160 insertions(+), 193 deletions(-) diff --git a/cub/agent/agent_histogram.cuh b/cub/agent/agent_histogram.cuh index 0f0eb1e622..8dc2be96b3 100644 --- a/cub/agent/agent_histogram.cuh +++ b/cub/agent/agent_histogram.cuh @@ -103,7 +103,7 @@ template < typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel typename OffsetT, ///< Signed integer type for global offsets - int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability + int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused) struct AgentHistogram { //--------------------------------------------------------------------- diff --git a/cub/agent/agent_rle.cuh b/cub/agent/agent_rle.cuh index 37e5d37333..3c9109a13c 100644 --- a/cub/agent/agent_rle.cuh +++ b/cub/agent/agent_rle.cuh @@ -117,7 +117,7 @@ struct AgentRle // Constants enum { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + WARP_THREADS = CUB_WARP_THREADS(0), BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD, WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD, diff --git a/cub/agent/agent_spmv_orig.cuh b/cub/agent/agent_spmv_orig.cuh index 3e1cf3d0cf..c89bc4a3cd 100644 --- a/cub/agent/agent_spmv_orig.cuh +++ b/cub/agent/agent_spmv_orig.cuh @@ -115,7 +115,7 @@ template < typename OffsetT, ///< Signed integer type for sequence offsets bool HAS_ALPHA, ///< Whether the input parameter \p alpha is 1 bool HAS_BETA, ///< Whether the input parameter \p beta is 0 - int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability + int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused) struct AgentSpmv { //--------------------------------------------------------------------- 
diff --git a/cub/agent/single_pass_scan_operators.cuh b/cub/agent/single_pass_scan_operators.cuh index 08d99e8f99..4f21863a7f 100644 --- a/cub/agent/single_pass_scan_operators.cuh +++ b/cub/agent/single_pass_scan_operators.cuh @@ -666,11 +666,11 @@ template < typename T, typename ScanOpT, typename ScanTileStateT, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> struct TilePrefixCallbackOp { // Parameterized warp reduce - typedef WarpReduce WarpReduceT; + typedef WarpReduce WarpReduceT; // Temporary storage type struct _TempStorage diff --git a/cub/block/block_adjacent_difference.cuh b/cub/block/block_adjacent_difference.cuh index 14dab12881..6740149e19 100644 --- a/cub/block/block_adjacent_difference.cuh +++ b/cub/block/block_adjacent_difference.cuh @@ -41,7 +41,6 @@ CUB_NAMESPACE_BEGIN - /** * @brief BlockAdjacentDifference provides * [collective](index.html#sec0) methods for computing the @@ -125,9 +124,9 @@ CUB_NAMESPACE_BEGIN */ template + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int LEGACY_PTX_ARCH = 0> class BlockAdjacentDifference { private: diff --git a/cub/block/block_discontinuity.cuh b/cub/block/block_discontinuity.cuh index 08f89faae0..a3bf17f319 100644 --- a/cub/block/block_discontinuity.cuh +++ b/cub/block/block_discontinuity.cuh @@ -47,7 +47,7 @@ CUB_NAMESPACE_BEGIN * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items @@ -107,7 +107,7 @@ template < int BLOCK_DIM_X, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockDiscontinuity { private: diff --git a/cub/block/block_exchange.cuh b/cub/block/block_exchange.cuh index 8f254c2c3f..98ebfe7e82 100644 --- a/cub/block/block_exchange.cuh +++ b/cub/block/block_exchange.cuh @@ -50,7 +50,7 @@ CUB_NAMESPACE_BEGIN * \tparam WARP_TIME_SLICING [optional] When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. 
* * \par Overview * - It is commonplace for blocks of threads to rearrange data items between @@ -114,7 +114,7 @@ template < bool WARP_TIME_SLICING = false, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockExchange { private: @@ -129,11 +129,11 @@ private: /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), WARP_THREADS = 1 << LOG_WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH), + LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0), SMEM_BANKS = 1 << LOG_SMEM_BANKS, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, @@ -1126,4 +1126,3 @@ public: CUB_NAMESPACE_END - diff --git a/cub/block/block_histogram.cuh b/cub/block/block_histogram.cuh index 7f0e982c25..b9ab759607 100644 --- a/cub/block/block_histogram.cuh +++ b/cub/block/block_histogram.cuh @@ -94,7 +94,7 @@ enum BlockHistogramAlgorithm * \tparam ALGORITHM [optional] cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - A histogram @@ -160,7 +160,7 @@ template < BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockHistogram { private: @@ -184,8 +184,7 @@ private: ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH>, + BLOCK_DIM_Z>, BlockHistogramAtomic>; /// Shared memory storage layout type for BlockHistogram diff --git a/cub/block/block_load.cuh b/cub/block/block_load.cuh index 3e78ce958b..b419ab6e72 100644 --- a/cub/block/block_load.cuh +++ b/cub/block/block_load.cuh @@ -568,7 +568,7 @@ enum BlockLoadAlgorithm * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. 
* * \par Overview * - The BlockLoad class provides a single data movement abstraction that can be specialized @@ -638,7 +638,7 @@ template < BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockLoad { private: @@ -860,7 +860,7 @@ private: struct LoadInternal { // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; + typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage @@ -928,14 +928,14 @@ private: { enum { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + WARP_THREADS = CUB_WARP_THREADS(0) }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; + typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage @@ -1003,14 +1003,14 @@ private: { enum { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + WARP_THREADS = CUB_WARP_THREADS(0) }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; + typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage diff --git a/cub/block/block_radix_rank.cuh b/cub/block/block_radix_rank.cuh index ad3588afb9..b6675d02f9 100644 --- a/cub/block/block_radix_rank.cuh +++ b/cub/block/block_radix_rank.cuh @@ -104,7 +104,7 @@ struct BlockRadixRankEmptyCallback * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * Blah... 
@@ -143,7 +143,7 @@ template < cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockRadixRank { private: @@ -168,7 +168,7 @@ private: RADIX_DIGITS = 1 << RADIX_BITS, - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), WARP_THREADS = 1 << LOG_WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, @@ -203,8 +203,7 @@ private: BLOCK_DIM_X, INNER_SCAN_ALGORITHM, BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> + BLOCK_DIM_Z> BlockScan; @@ -508,7 +507,7 @@ template < BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockRadixRankMatch { private: @@ -527,7 +526,7 @@ private: RADIX_DIGITS = 1 << RADIX_BITS, - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), WARP_THREADS = 1 << LOG_WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, @@ -558,8 +557,7 @@ private: BLOCK_THREADS, INNER_SCAN_ALGORITHM, BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> + BLOCK_DIM_Z> BlockScanT; diff --git a/cub/block/block_radix_sort.cuh b/cub/block/block_radix_sort.cuh index 4663f3b6f1..7dbf3e6a13 100644 --- a/cub/block/block_radix_sort.cuh +++ b/cub/block/block_radix_sort.cuh @@ -57,7 +57,7 @@ CUB_NAMESPACE_BEGIN * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges @@ -170,7 +170,7 @@ template < cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockRadixSort { private: @@ -201,8 +201,7 @@ private: INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> + BLOCK_DIM_Z> AscendingBlockRadixRank; /// Descending BlockRadixRank utility type @@ -214,18 +213,17 @@ private: INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> + BLOCK_DIM_Z> DescendingBlockRadixRank; /// Digit extractor type typedef BFEDigitExtractor DigitExtractorT; /// BlockExchange utility type for keys - typedef BlockExchange BlockExchangeKeys; + typedef BlockExchange BlockExchangeKeys; /// BlockExchange utility type for values - typedef BlockExchange BlockExchangeValues; + typedef BlockExchange BlockExchangeValues; /// Shared memory storage layout type union _TempStorage diff --git a/cub/block/block_raking_layout.cuh b/cub/block/block_raking_layout.cuh index 8bd1be13ad..4d49f54f2c 100644 --- a/cub/block/block_raking_layout.cuh +++ b/cub/block/block_raking_layout.cuh @@ -52,12 +52,12 @@ CUB_NAMESPACE_BEGIN * * \tparam T The data type to be exchanged. * \tparam BLOCK_THREADS The thread block size in threads. - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. 
*/ template < typename T, int BLOCK_THREADS, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> struct BlockRakingLayout { //--------------------------------------------------------------------- @@ -70,7 +70,7 @@ struct BlockRakingLayout SHARED_ELEMENTS = BLOCK_THREADS, /// Maximum number of warp-synchronous raking threads - MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)), + MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(0)), /// Number of raking elements per warp-synchronous raking thread (rounded up) SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, @@ -79,11 +79,11 @@ struct BlockRakingLayout RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) - HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0), + HAS_CONFLICTS = (CUB_SMEM_BANKS(0) % SEGMENT_LENGTH == 0), /// Degree of bank conflicts (e.g., 4-way) CONFLICT_DEGREE = (HAS_CONFLICTS) ? - (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) : + (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(0) : 1, /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load diff --git a/cub/block/block_reduce.cuh b/cub/block/block_reduce.cuh index eabcc285e5..dcaf2ee0db 100644 --- a/cub/block/block_reduce.cuh +++ b/cub/block/block_reduce.cuh @@ -158,7 +158,7 @@ enum BlockReduceAlgorithm * \tparam ALGORITHM [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - A reduction (or fold) @@ -218,7 +218,7 @@ template < BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockReduce { private: @@ -234,9 +234,9 @@ private: BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; - typedef BlockReduceWarpReductions WarpReductions; - typedef BlockReduceRakingCommutativeOnly RakingCommutativeOnly; - typedef BlockReduceRaking Raking; + typedef BlockReduceWarpReductions WarpReductions; + typedef BlockReduceRakingCommutativeOnly RakingCommutativeOnly; + typedef BlockReduceRaking Raking; /// Internal specialization type using InternalBlockReduce = cub::detail::conditional_t< diff --git a/cub/block/block_scan.cuh b/cub/block/block_scan.cuh index 0e2561ec15..d1b6a66e98 100644 --- a/cub/block/block_scan.cuh +++ b/cub/block/block_scan.cuh @@ -117,7 +117,7 @@ enum BlockScanAlgorithm * \tparam ALGORITHM [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. 
* * \par Overview * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) @@ -191,7 +191,7 @@ template < BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockScan { private: @@ -214,12 +214,12 @@ private: * architectural warp size. */ static const BlockScanAlgorithm SAFE_ALGORITHM = - ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ? + ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(0) != 0)) ? BLOCK_SCAN_RAKING : ALGORITHM; - typedef BlockScanWarpScans WarpScans; - typedef BlockScanRaking Raking; + typedef BlockScanWarpScans WarpScans; + typedef BlockScanRaking Raking; /// Define the delegate type for the desired algorithm using InternalBlockScan = diff --git a/cub/block/block_shuffle.cuh b/cub/block/block_shuffle.cuh index bae7206847..e4ebc7ff1e 100644 --- a/cub/block/block_shuffle.cuh +++ b/cub/block/block_shuffle.cuh @@ -47,7 +47,7 @@ CUB_NAMESPACE_BEGIN * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * It is commonplace for blocks of threads to rearrange data items between @@ -60,7 +60,7 @@ template < int BLOCK_DIM_X, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockShuffle { private: @@ -73,7 +73,7 @@ private: { BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), WARP_THREADS = 1 << LOG_WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, }; diff --git a/cub/block/block_store.cuh b/cub/block/block_store.cuh index d363f3479b..2cb6bee433 100644 --- a/cub/block/block_store.cuh +++ b/cub/block/block_store.cuh @@ -454,7 +454,7 @@ enum BlockStoreAlgorithm * \tparam ALGORITHM [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT. * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. 
* * \par Overview * - The BlockStore class provides a single data movement abstraction that can be specialized @@ -528,7 +528,7 @@ template < BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockStore { private: @@ -691,7 +691,7 @@ private: struct StoreInternal { // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; + typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage @@ -752,14 +752,14 @@ private: { enum { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + WARP_THREADS = CUB_WARP_THREADS(0) }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; + typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage @@ -820,14 +820,14 @@ private: { enum { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + WARP_THREADS = CUB_WARP_THREADS(0) }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; + typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage diff --git a/cub/block/specializations/block_histogram_sort.cuh b/cub/block/specializations/block_histogram_sort.cuh index e60325d48d..79659ae106 100644 --- a/cub/block/specializations/block_histogram_sort.cuh +++ b/cub/block/specializations/block_histogram_sort.cuh @@ -73,8 +73,7 @@ struct BlockHistogramSort BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> + BLOCK_DIM_Z> BlockRadixSortT; // Parameterize BlockDiscontinuity type for our thread block @@ -82,8 +81,7 @@ struct BlockHistogramSort T, BLOCK_DIM_X, BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> + BLOCK_DIM_Z> BlockDiscontinuityT; /// Shared memory diff --git a/cub/block/specializations/block_reduce_raking.cuh b/cub/block/specializations/block_reduce_raking.cuh index d828484915..0a9009d2e2 100644 --- a/cub/block/specializations/block_reduce_raking.cuh +++ b/cub/block/specializations/block_reduce_raking.cuh @@ -60,7 +60,7 @@ template < int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockReduceRaking { /// Constants @@ -71,10 +71,10 @@ struct BlockReduceRaking }; /// Layout type for padded thread block raking grid - typedef BlockRakingLayout BlockRakingLayout; + typedef BlockRakingLayout BlockRakingLayout; /// WarpReduce utility type - typedef typename WarpReduce::InternalWarpReduce WarpReduce; + typedef typename WarpReduce::InternalWarpReduce WarpReduce; /// Constants enum diff --git a/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/cub/block/specializations/block_reduce_raking_commutative_only.cuh index 4dd8a13050..529a170923 100644 
--- a/cub/block/specializations/block_reduce_raking_commutative_only.cuh +++ b/cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -50,7 +50,7 @@ template < int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockReduceRakingCommutativeOnly { /// Constants @@ -61,13 +61,13 @@ struct BlockReduceRakingCommutativeOnly }; // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values - typedef BlockReduceRaking FallBack; + typedef BlockReduceRaking FallBack; /// Constants enum { /// Number of warp threads - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + WARP_THREADS = CUB_WARP_THREADS(0), /// Whether or not to use fall-back USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)), @@ -83,10 +83,10 @@ struct BlockReduceRakingCommutativeOnly }; /// WarpReduce utility type - typedef WarpReduce WarpReduce; + typedef WarpReduce WarpReduce; /// Layout type for padded thread block raking grid - typedef BlockRakingLayout BlockRakingLayout; + typedef BlockRakingLayout BlockRakingLayout; /// Shared memory storage layout type union _TempStorage diff --git a/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/block/specializations/block_reduce_warp_reductions.cuh index 6b440e5a9c..c341c8ba55 100644 --- a/cub/block/specializations/block_reduce_warp_reductions.cuh +++ b/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -48,7 +48,7 @@ template < int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockReduceWarpReductions { /// Constants @@ -58,7 +58,7 @@ struct BlockReduceWarpReductions BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, /// Number of warp threads - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + WARP_THREADS = CUB_WARP_THREADS(0), /// Number of active warps WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, @@ -72,7 +72,7 @@ struct BlockReduceWarpReductions /// WarpReduce utility type - typedef typename WarpReduce::InternalWarpReduce WarpReduce; + typedef typename WarpReduce::InternalWarpReduce WarpReduce; /// Shared memory storage layout type diff --git a/cub/block/specializations/block_scan_raking.cuh b/cub/block/specializations/block_scan_raking.cuh index 863939e1fe..6327e48110 100644 --- a/cub/block/specializations/block_scan_raking.cuh +++ b/cub/block/specializations/block_scan_raking.cuh @@ -53,7 +53,7 @@ template < int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension bool MEMOIZE, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure - int PTX_ARCH> ///< The PTX compute capability for which to to 
specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockScanRaking { //--------------------------------------------------------------------- @@ -68,7 +68,7 @@ struct BlockScanRaking }; /// Layout type for padded thread block raking grid - typedef BlockRakingLayout BlockRakingLayout; + typedef BlockRakingLayout BlockRakingLayout; /// Constants enum @@ -84,7 +84,7 @@ struct BlockScanRaking }; /// WarpScan utility type - typedef WarpScan WarpScan; + typedef WarpScan WarpScan; /// Shared memory storage layout type struct _TempStorage diff --git a/cub/block/specializations/block_scan_warp_scans.cuh b/cub/block/specializations/block_scan_warp_scans.cuh index 8273e43bf4..ed550162e2 100644 --- a/cub/block/specializations/block_scan_warp_scans.cuh +++ b/cub/block/specializations/block_scan_warp_scans.cuh @@ -47,7 +47,7 @@ template < int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockScanWarpScans { //--------------------------------------------------------------------- @@ -58,7 +58,7 @@ struct BlockScanWarpScans enum { /// Number of warp threads - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + WARP_THREADS = CUB_WARP_THREADS(0), /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, @@ -68,10 +68,10 @@ struct BlockScanWarpScans }; /// WarpScan utility type - typedef WarpScan WarpScanT; + typedef WarpScan WarpScanT; /// WarpScan utility type - typedef WarpScan WarpAggregateScan; + typedef WarpScan WarpAggregateScan; /// Shared memory storage layout type diff --git a/cub/block/specializations/block_scan_warp_scans2.cuh b/cub/block/specializations/block_scan_warp_scans2.cuh index a485356bc3..b634569d20 100644 --- a/cub/block/specializations/block_scan_warp_scans2.cuh +++ b/cub/block/specializations/block_scan_warp_scans2.cuh @@ -47,7 +47,7 @@ template < int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockScanWarpScans { //--------------------------------------------------------------------- @@ -58,7 +58,7 @@ struct BlockScanWarpScans enum { /// Number of warp threads - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + WARP_THREADS = CUB_WARP_THREADS(0), /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, @@ -68,10 +68,10 @@ struct BlockScanWarpScans }; /// WarpScan utility type - typedef WarpScan WarpScanT; + typedef WarpScan WarpScanT; /// WarpScan utility type - typedef WarpScan WarpAggregateScanT; + typedef WarpScan WarpAggregateScanT; /// Shared memory storage layout type struct _TempStorage diff --git a/cub/block/specializations/block_scan_warp_scans3.cuh b/cub/block/specializations/block_scan_warp_scans3.cuh index dad06fd298..f7eed0be8f 100644 --- 
a/cub/block/specializations/block_scan_warp_scans3.cuh +++ b/cub/block/specializations/block_scan_warp_scans3.cuh @@ -47,7 +47,7 @@ template < int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockScanWarpScans { //--------------------------------------------------------------------- @@ -61,7 +61,7 @@ struct BlockScanWarpScans BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, /// Number of warp threads - INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + INNER_WARP_THREADS = CUB_WARP_THREADS(0), OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS, /// Number of outer scan warps @@ -69,10 +69,10 @@ struct BlockScanWarpScans }; /// Outer WarpScan utility type - typedef WarpScan OuterWarpScanT; + typedef WarpScan OuterWarpScanT; /// Inner WarpScan utility type - typedef WarpScan InnerWarpScanT; + typedef WarpScan InnerWarpScanT; typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS]; diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh index 13c43948ef..bc4fa3dcd9 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh @@ -1227,7 +1227,7 @@ struct DispatchRadixSort : UpsweepKernelT upsweep_kernel, ScanKernelT scan_kernel, DownsweepKernelT downsweep_kernel, - int ptx_version, + int /*ptx_version*/, int sm_count, OffsetT num_items) { @@ -1244,7 +1244,7 @@ struct DispatchRadixSort : if (CubDebug(error = scan_config.Init(scan_kernel))) break; if (CubDebug(error = downsweep_config.Init(downsweep_kernel))) break; - max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(ptx_version); + max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(0); even_share.DispatchInit( num_items, diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh index e0470ccb1e..0ac3b7b3c0 100644 --- a/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/device/dispatch/dispatch_reduce.cuh @@ -472,7 +472,7 @@ struct DispatchReduce : int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count; // Even-share work distribution - int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version); + int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(0); GridEvenShare even_share; even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size); diff --git a/cub/util_arch.cuh b/cub/util_arch.cuh index f1bd8b6b99..00a21516b6 100644 --- a/cub/util_arch.cuh +++ b/cub/util_arch.cuh @@ -83,12 +83,10 @@ CUB_NAMESPACE_BEGIN /// Maximum number of devices supported. #ifndef CUB_MAX_DEVICES - #define CUB_MAX_DEVICES 128 + #define CUB_MAX_DEVICES (128) #endif -#if CUB_CPP_DIALECT >= 2011 - static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0."); -#endif +static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0."); /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. 
#ifndef CUB_RUNTIME_FUNCTION @@ -103,49 +101,35 @@ CUB_NAMESPACE_BEGIN /// Number of threads per warp #ifndef CUB_LOG_WARP_THREADS - #define CUB_LOG_WARP_THREADS(arch) \ - (5) - #define CUB_WARP_THREADS(arch) \ - (1 << CUB_LOG_WARP_THREADS(arch)) + #define CUB_LOG_WARP_THREADS(unused) (5) + #define CUB_WARP_THREADS(unused) (1 << CUB_LOG_WARP_THREADS(0)) - #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) - #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) + #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(0) + #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(0) #endif /// Number of smem banks #ifndef CUB_LOG_SMEM_BANKS - #define CUB_LOG_SMEM_BANKS(arch) \ - ((arch >= 200) ? \ - (5) : \ - (4)) - #define CUB_SMEM_BANKS(arch) \ - (1 << CUB_LOG_SMEM_BANKS(arch)) - - #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) - #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) + #define CUB_LOG_SMEM_BANKS(unused) (5) + #define CUB_SMEM_BANKS(unused) (1 << CUB_LOG_SMEM_BANKS(0)) + + #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(0) + #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS #endif /// Oversubscription factor #ifndef CUB_SUBSCRIPTION_FACTOR - #define CUB_SUBSCRIPTION_FACTOR(arch) \ - ((arch >= 300) ? \ - (5) : \ - ((arch >= 200) ? \ - (3) : \ - (10))) - #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH) + #define CUB_SUBSCRIPTION_FACTOR(unused) (5) + #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(0) #endif /// Prefer padding overhead vs X-way conflicts greater than this threshold #ifndef CUB_PREFER_CONFLICT_OVER_PADDING - #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ - ((arch >= 300) ? \ - (1) : \ - (4)) - #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) + #define CUB_PREFER_CONFLICT_OVER_PADDING(unused) (1) + #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(0) #endif diff --git a/cub/util_ptx.cuh b/cub/util_ptx.cuh index 087fbcb733..5b2a20486c 100644 --- a/cub/util_ptx.cuh +++ b/cub/util_ptx.cuh @@ -442,17 +442,15 @@ __device__ __forceinline__ unsigned int WarpId() * hardware warp threads). 
* @param warp_id Id of virtual warp within architectural warp */ -template +template __host__ __device__ __forceinline__ unsigned int WarpMask(unsigned int warp_id) { constexpr bool is_pow_of_two = PowerOfTwo::VALUE; - constexpr bool is_arch_warp = LOGICAL_WARP_THREADS == - CUB_WARP_THREADS(PTX_ARCH); + constexpr bool is_arch_warp = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); - unsigned int member_mask = - 0xFFFFFFFFu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS); + unsigned int member_mask = 0xFFFFFFFFu >> + (CUB_WARP_THREADS(0) - LOGICAL_WARP_THREADS); if (is_pow_of_two && !is_arch_warp) { diff --git a/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/warp/specializations/warp_reduce_shfl.cuh index faa5cfcd25..bb71f395fd 100644 --- a/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/warp/specializations/warp_reduce_shfl.cuh @@ -51,7 +51,7 @@ CUB_NAMESPACE_BEGIN template < typename T, ///< Data type being reduced int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct WarpReduceShfl { static_assert(PowerOfTwo::VALUE, @@ -64,16 +64,16 @@ struct WarpReduceShfl enum { /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// The number of warp reduction steps STEPS = Log2::VALUE, /// Number of logical warps in a PTX warp - LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS, + LOGICAL_WARPS = CUB_WARP_THREADS(0) / LOGICAL_WARP_THREADS, /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up - SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8 + SHFL_C = (CUB_WARP_THREADS(0) - LOGICAL_WARP_THREADS) << 8 }; @@ -114,7 +114,7 @@ struct WarpReduceShfl TempStorage &/*temp_storage*/) : lane_id(static_cast(LaneId())) , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) - , member_mask(WarpMask(warp_id)) + , member_mask(WarpMask(warp_id)) { if (!IS_ARCH_WARP) { diff --git a/cub/warp/specializations/warp_reduce_smem.cuh b/cub/warp/specializations/warp_reduce_smem.cuh index 2b419cc217..79c7a0ccdd 100644 --- a/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/warp/specializations/warp_reduce_smem.cuh @@ -47,7 +47,7 @@ CUB_NAMESPACE_BEGIN template < typename T, ///< Data type being reduced int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct WarpReduceSmem { /****************************************************************************** @@ -57,7 +57,7 @@ struct WarpReduceSmem enum { /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// Whether the logical warp size is a power-of-two IS_POW_OF_TWO = PowerOfTwo::VALUE, @@ -109,8 +109,7 @@ struct WarpReduceSmem : temp_storage(temp_storage.Alias()) , lane_id(IS_ARCH_WARP ? 
LaneId() : LaneId() % LOGICAL_WARP_THREADS) , member_mask( - WarpMask( - LaneId() / LOGICAL_WARP_THREADS)) + WarpMask(LaneId() / LOGICAL_WARP_THREADS)) {} /****************************************************************************** diff --git a/cub/warp/specializations/warp_scan_shfl.cuh b/cub/warp/specializations/warp_scan_shfl.cuh index f6051c6ea2..2f2207c1ac 100644 --- a/cub/warp/specializations/warp_scan_shfl.cuh +++ b/cub/warp/specializations/warp_scan_shfl.cuh @@ -48,7 +48,7 @@ CUB_NAMESPACE_BEGIN template < typename T, ///< Data type being scanned int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct WarpScanShfl { //--------------------------------------------------------------------- @@ -58,13 +58,13 @@ struct WarpScanShfl enum { /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// The number of warp scan steps STEPS = Log2::VALUE, /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up - SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8 + SHFL_C = (CUB_WARP_THREADS(0) - LOGICAL_WARP_THREADS) << 8 }; template @@ -102,7 +102,7 @@ struct WarpScanShfl WarpScanShfl(TempStorage & /*temp_storage*/) : lane_id(LaneId()) , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) - , member_mask(WarpMask(warp_id)) + , member_mask(WarpMask(warp_id)) { if (!IS_ARCH_WARP) { diff --git a/cub/warp/specializations/warp_scan_smem.cuh b/cub/warp/specializations/warp_scan_smem.cuh index 1fecf1c7b5..bd17118e99 100644 --- a/cub/warp/specializations/warp_scan_smem.cuh +++ b/cub/warp/specializations/warp_scan_smem.cuh @@ -47,7 +47,7 @@ CUB_NAMESPACE_BEGIN template < typename T, ///< Data type being scanned int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct WarpScanSmem { /****************************************************************************** @@ -57,7 +57,7 @@ struct WarpScanSmem enum { /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// The number of warp scan steps STEPS = Log2::VALUE, @@ -103,8 +103,7 @@ struct WarpScanSmem LaneId() % LOGICAL_WARP_THREADS), member_mask( - WarpMask( - LaneId() / LOGICAL_WARP_THREADS)) + WarpMask(LaneId() / LOGICAL_WARP_THREADS)) {} diff --git a/cub/warp/warp_exchange.cuh b/cub/warp/warp_exchange.cuh index c0fbef1cc0..c1d20f5233 100644 --- a/cub/warp/warp_exchange.cuh +++ b/cub/warp/warp_exchange.cuh @@ -58,8 +58,8 @@ CUB_NAMESPACE_BEGIN * targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a * power of two. * - * @tparam PTX_ARCH - * [optional] \ptxversion + * @tparam LEGACY_PTX_ARCH + * Unused. 
* * @par Overview * - It is commonplace for a warp of threads to rearrange data items between @@ -114,7 +114,7 @@ CUB_NAMESPACE_BEGIN template + int LEGACY_PTX_ARCH = 0> class WarpExchange { static_assert(PowerOfTwo::VALUE, @@ -123,10 +123,9 @@ class WarpExchange constexpr static int ITEMS_PER_TILE = ITEMS_PER_THREAD * LOGICAL_WARP_THREADS + 1; - constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == - CUB_WARP_THREADS(PTX_ARCH); + constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); - constexpr static int LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH); + constexpr static int LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0); // Insert padding if the number of items per thread is a power of two // and > 4 (otherwise we can typically use 128b loads) diff --git a/cub/warp/warp_load.cuh b/cub/warp/warp_load.cuh index 28a31f407a..664bf1185b 100644 --- a/cub/warp/warp_load.cuh +++ b/cub/warp/warp_load.cuh @@ -141,8 +141,8 @@ enum WarpLoadAlgorithm * targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a * power of two. * - * @tparam PTX_ARCH - * [optional] \ptxversion + * @tparam LEGACY_PTX_ARCH + * Unused. * * @par Overview * - The WarpLoad class provides a single data movement abstraction that can be @@ -206,11 +206,10 @@ template + int LEGACY_PTX_ARCH = 0> class WarpLoad { - constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == - CUB_WARP_THREADS(PTX_ARCH); + constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); static_assert(PowerOfTwo::VALUE, "LOGICAL_WARP_THREADS must be a power of two"); @@ -397,7 +396,7 @@ private: struct LoadInternal { using WarpExchangeT = - WarpExchange; + WarpExchange; struct _TempStorage : WarpExchangeT::TempStorage {}; diff --git a/cub/warp/warp_merge_sort.cuh b/cub/warp/warp_merge_sort.cuh index 6617084ae1..fbcd44f578 100644 --- a/cub/warp/warp_merge_sort.cuh +++ b/cub/warp/warp_merge_sort.cuh @@ -57,8 +57,8 @@ CUB_NAMESPACE_BEGIN * [optional] Value type (default: cub::NullType, which indicates a * keys-only sort) * - * @tparam PTX_ARCH - * [optional] \ptxversion + * @tparam LEGACY_PTX_ARCH + * Unused. * * @par Overview * WarpMergeSort arranges items into ascending order using a comparison @@ -115,19 +115,19 @@ CUB_NAMESPACE_BEGIN template < typename KeyT, int ITEMS_PER_THREAD, - int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, + int LOGICAL_WARP_THREADS = CUB_WARP_THREADS(0), typename ValueT = NullType, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class WarpMergeSort : public BlockMergeSortStrategy< KeyT, ValueT, LOGICAL_WARP_THREADS, ITEMS_PER_THREAD, - WarpMergeSort> + WarpMergeSort> { private: - constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH); + constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); constexpr static bool KEYS_ONLY = std::is_same::value; constexpr static int TILE_SIZE = ITEMS_PER_THREAD * LOGICAL_WARP_THREADS; @@ -150,7 +150,7 @@ public: ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) , warp_id(IS_ARCH_WARP ? 
0 : (LaneId() / LOGICAL_WARP_THREADS)) - , member_mask(WarpMask(warp_id)) + , member_mask(WarpMask(warp_id)) { } diff --git a/cub/warp/warp_reduce.cuh b/cub/warp/warp_reduce.cuh index d905703d75..7c8592df71 100644 --- a/cub/warp/warp_reduce.cuh +++ b/cub/warp/warp_reduce.cuh @@ -52,7 +52,7 @@ CUB_NAMESPACE_BEGIN * * \tparam T The reduction input/output element type * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20). - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - A reduction (or fold) @@ -132,7 +132,7 @@ CUB_NAMESPACE_BEGIN template < typename T, int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class WarpReduce { private: @@ -144,7 +144,7 @@ private: enum { /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// Whether the logical warp size is a power-of-two IS_POW_OF_TWO = PowerOfTwo::VALUE, @@ -158,8 +158,8 @@ public: /// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two using InternalWarpReduce = cub::detail::conditional_t< IS_POW_OF_TWO, - WarpReduceShfl, - WarpReduceSmem>; + WarpReduceShfl, + WarpReduceSmem>; #endif // DOXYGEN_SHOULD_SKIP_THIS diff --git a/cub/warp/warp_scan.cuh b/cub/warp/warp_scan.cuh index 4e646957ec..9cb1491f1b 100644 --- a/cub/warp/warp_scan.cuh +++ b/cub/warp/warp_scan.cuh @@ -51,7 +51,7 @@ CUB_NAMESPACE_BEGIN * * \tparam T The scan input/output element type * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20). - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) @@ -137,7 +137,7 @@ CUB_NAMESPACE_BEGIN template < typename T, int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class WarpScan { private: @@ -149,7 +149,7 @@ private: enum { /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// Whether the logical warp size is a power-of-two IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), @@ -162,8 +162,8 @@ private: /// Use SHFL-based scan if LOGICAL_WARP_THREADS is a power-of-two using InternalWarpScan = cub::detail::conditional_t< IS_POW_OF_TWO, - WarpScanShfl, - WarpScanSmem>; + WarpScanShfl, + WarpScanSmem>; /// Shared memory storage layout type for WarpScan typedef typename InternalWarpScan::TempStorage _TempStorage; diff --git a/cub/warp/warp_store.cuh b/cub/warp/warp_store.cuh index 18e8e2261d..33346e826d 100644 --- a/cub/warp/warp_store.cuh +++ b/cub/warp/warp_store.cuh @@ -136,8 +136,8 @@ enum WarpStoreAlgorithm * targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a * power of two. 
* - * @tparam PTX_ARCH - * [optional] \ptxversion + * @tparam LEGACY_PTX_ARCH + * Unused. * * @par Overview * - The WarpStore class provides a single data movement abstraction that can be @@ -204,14 +204,13 @@ template + int LEGACY_PTX_ARCH = 0> class WarpStore { static_assert(PowerOfTwo::VALUE, "LOGICAL_WARP_THREADS must be a power of two"); - constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == - CUB_WARP_THREADS(PTX_ARCH); + constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); private: @@ -319,7 +318,7 @@ private: struct StoreInternal { using WarpExchangeT = - WarpExchange; + WarpExchange; struct _TempStorage : WarpExchangeT::TempStorage {}; diff --git a/test/test_warp_mask.cu b/test/test_warp_mask.cu index a96e4fef46..037c74b609 100644 --- a/test/test_warp_mask.cu +++ b/test/test_warp_mask.cu @@ -45,8 +45,7 @@ void Test() for (unsigned int warp_id = 0; warp_id < warps; warp_id++) { - const unsigned int warp_mask = - cub::WarpMask(warp_id); + const unsigned int warp_mask = cub::WarpMask(warp_id); const unsigned int warp_begin = LOGICAL_WARP_THREADS * warp_id; const unsigned int warp_end = warp_begin + LOGICAL_WARP_THREADS; From c4299c46fdf88654f23a7c01dca3bc4c2976bb7a Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Tue, 30 Nov 2021 11:35:13 -0500 Subject: [PATCH 4/7] Use NV_IF_TARGET to select between host/device/sm implementations. --- cub/agent/agent_sub_warp_merge_sort.cuh | 33 ++- cub/detail/device_synchronize.cuh | 31 ++- cub/device/dispatch/dispatch_histogram.cuh | 71 ++--- cub/device/dispatch/dispatch_radix_sort.cuh | 37 ++- .../dispatch/dispatch_reduce_by_key.cuh | 51 ++-- cub/device/dispatch/dispatch_rle.cuh | 51 ++-- .../dispatch/dispatch_segmented_sort.cuh | 172 ++++++------ cub/device/dispatch/dispatch_select_if.cuh | 52 ++-- cub/device/dispatch/dispatch_spmv_orig.cuh | 89 +++---- .../dispatch/dispatch_three_way_partition.cuh | 33 +-- cub/grid/grid_queue.cuh | 98 ++++--- cub/iterator/tex_obj_input_iterator.cuh | 68 +++-- cub/util_arch.cuh | 42 +-- cub/util_debug.cuh | 163 +++++++----- cub/util_device.cuh | 247 ++++++++---------- test/test_device_segmented_sort.cu | 108 ++++---- test/test_util.h | 238 ++++++++--------- test/test_warp_reduce.cu | 12 +- 18 files changed, 794 insertions(+), 802 deletions(-) diff --git a/cub/agent/agent_sub_warp_merge_sort.cuh b/cub/agent/agent_sub_warp_merge_sort.cuh index ad65f2a3cc..81447afdbb 100644 --- a/cub/agent/agent_sub_warp_merge_sort.cuh +++ b/cub/agent/agent_sub_warp_merge_sort.cuh @@ -33,6 +33,8 @@ #include #include +#include + #include @@ -108,6 +110,23 @@ class AgentSubWarpSort { template __device__ bool operator()(T lhs, T rhs) + { + return this->impl(lhs, rhs); + } + +#if defined(__CUDA_FP16_TYPES_EXIST__) + __device__ bool operator()(__half lhs, __half rhs) + { + // Need to explicitly cast to float for SM <= 52. 
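      // (Illustrative sketch, not part of this patch: NV_IF_TARGET comes from
      // libcu++'s <nv/target> header and keeps exactly one branch for the
      // target being compiled, replacing #if __CUDA_ARCH__ style guards. A
      // standalone analogue of this comparator, using the hypothetical name
      // half_less, could look like:
      //
      //   __device__ bool half_less(__half lhs, __half rhs)
      //   {
      //     NV_IF_TARGET(NV_PROVIDES_SM_53,
      //                  (return lhs < rhs;),                              // sm_53+: native __half compare
      //                  (return __half2float(lhs) < __half2float(rhs);)); // older targets: compare as float
      //   }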
+ NV_IF_TARGET(NV_PROVIDES_SM_53, + (return this->impl(lhs, rhs);), + (return this->impl(__half2float(lhs), __half2float(rhs));)); + } +#endif + + private: + template + __device__ bool impl(T lhs, T rhs) { if (IS_DESCENDING) { @@ -118,19 +137,15 @@ class AgentSubWarpSort return lhs < rhs; } } - -#if defined(__CUDA_FP16_TYPES_EXIST__) && (CUB_PTX_ARCH < 530) - __device__ bool operator()(__half lhs, __half rhs) - { - return (*this)(__half2float(lhs), __half2float(rhs)); - } -#endif }; -#if defined(__CUDA_FP16_TYPES_EXIST__) && (CUB_PTX_ARCH < 530) +#if defined(__CUDA_FP16_TYPES_EXIST__) __device__ static bool equal(__half lhs, __half rhs) { - return __half2float(lhs) == __half2float(rhs); + // Need to explicitly cast to float for SM <= 52. + NV_IF_TARGET(NV_PROVIDES_SM_53, + (return lhs == rhs;), + (return __half2float(lhs) == __half2float(rhs);)); } #endif diff --git a/cub/detail/device_synchronize.cuh b/cub/detail/device_synchronize.cuh index 075c0aa61c..52c5a10663 100644 --- a/cub/detail/device_synchronize.cuh +++ b/cub/detail/device_synchronize.cuh @@ -20,6 +20,8 @@ #include #include +#include + #include CUB_NAMESPACE_BEGIN @@ -36,31 +38,28 @@ CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize() { cudaError_t result = cudaErrorUnknown; - if (CUB_IS_HOST_CODE) - { -#if CUB_INCLUDE_HOST_CODE - result = cudaDeviceSynchronize(); -#endif - } - else - { - // Device code with the CUDA runtime. -#if defined(CUB_INCLUDE_DEVICE_CODE) && defined(CUB_RUNTIME_ENABLED) +#ifdef CUB_RUNTIME_ENABLED #if defined(__CUDACC__) && \ ((__CUDACC_VER_MAJOR__ > 11) || \ ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6))) - // CUDA >= 11.6 - result = __cudaDeviceSynchronizeDeprecationAvoidance(); + // CUDA >= 11.6 +#define CUB_TMP_DEVICE_SYNC_IMPL \ + result = __cudaDeviceSynchronizeDeprecationAvoidance(); #else // CUDA < 11.6 - result = cudaDeviceSynchronize(); +#define CUB_TMP_DEVICE_SYNC_IMPL result = cudaDeviceSynchronize(); #endif #else // Device code without the CUDA runtime. - // Device side CUDA API calls are not supported in this configuration. - result = cudaErrorInvalidConfiguration; + // Device side CUDA API calls are not supported in this configuration. 
+#define CUB_TMP_DEVICE_SYNC_IMPL result = cudaErrorInvalidConfiguration; #endif - } + + NV_IF_TARGET(NV_IS_HOST, + (result = cudaDeviceSynchronize();), + (CUB_TMP_DEVICE_SYNC_IMPL)); + +#undef CUB_TMP_DEVICE_SYNC_IMPL return result; } diff --git a/cub/device/dispatch/dispatch_histogram.cuh b/cub/device/dispatch/dispatch_histogram.cuh index 035f3f2b5d..4609036d4c 100644 --- a/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/device/dispatch/dispatch_histogram.cuh @@ -34,20 +34,22 @@ #pragma once -#include -#include -#include - -#include "../../agent/agent_histogram.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" -#include "../../util_math.cuh" -#include "../../thread/thread_search.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../config.cuh" +#include +#include +#include +#include +#include +#include +#include #include +#include + +#include +#include +#include + CUB_NAMESPACE_BEGIN @@ -401,32 +403,31 @@ struct DispatchHistogram int ptx_version, KernelConfig &histogram_sweep_config) { - cudaError_t result = cudaErrorNotSupported; - if (CUB_IS_DEVICE_CODE) - { - #if CUB_INCLUDE_DEVICE_CODE - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - result = histogram_sweep_config.template Init(); - #endif - } - else - { - #if CUB_INCLUDE_HOST_CODE - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 500) - { - result = histogram_sweep_config.template Init(); - } - else - { - result = histogram_sweep_config.template Init(); - } - #endif - } - return result; + cudaError_t result = cudaErrorNotSupported; + NV_IF_TARGET( + NV_IS_DEVICE, + ( + // We're on the device, so initialize the kernel dispatch + // configurations with the current PTX policy + result = histogram_sweep_config.template Init(); + ), + ( // NV_IS_HOST: + // We're on the host, so lookup and initialize the kernel dispatch + // configurations with the policies that match the device's PTX + // version + if (ptx_version >= 500) + { + result = histogram_sweep_config.template Init(); + } + else + { + result = histogram_sweep_config.template Init(); + } + )); + + return result; } - /** * Kernel kernel dispatch configuration */ diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh index bc4fa3dcd9..6265232fad 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh @@ -1332,9 +1332,15 @@ struct DispatchRadixSort : MaxPolicyT, IS_DESCENDING, KeyT, OffsetT>; if (CubDebug(error = cudaOccupancyMaxActiveBlocksPerMultiprocessor( &histo_blocks_per_sm, histogram_kernel, HISTO_BLOCK_THREADS, 0))) break; - histogram_kernel<<>> - (d_bins, d_keys.Current(), num_items, begin_bit, end_bit); - if (CubDebug(error = cudaPeekAtLastError())) break; + + error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + histo_blocks_per_sm * num_sms, HISTO_BLOCK_THREADS, 0, stream + ).doit(histogram_kernel, + d_bins, d_keys.Current(), num_items, begin_bit, end_bit); + if (CubDebug(error)) + { + break; + } // exclusive sums to determine starts const int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS; @@ -1368,17 +1374,22 @@ struct DispatchRadixSort : stream))) break; auto onesweep_kernel = DeviceRadixSortOnesweepKernel< MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT, PortionOffsetT>; - onesweep_kernel<<>> - (d_lookback, d_ctrs + portion * num_passes + 
pass,
-                 portion < num_portions - 1 ?
+      error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+        num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream
+      ).doit(onesweep_kernel,
+             d_lookback, d_ctrs + portion * num_passes + pass,
+             portion < num_portions - 1 ?
                 d_bins + ((portion + 1) * num_passes + pass) * RADIX_DIGITS : NULL,
-                d_bins + (portion * num_passes + pass) * RADIX_DIGITS,
-                d_keys.Alternate(),
-                d_keys.Current() + portion * PORTION_SIZE,
-                d_values.Alternate(),
-                d_values.Current() + portion * PORTION_SIZE,
-                portion_num_items, current_bit, num_bits);
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+             d_bins + (portion * num_passes + pass) * RADIX_DIGITS,
+             d_keys.Alternate(),
+             d_keys.Current() + portion * PORTION_SIZE,
+             d_values.Alternate(),
+             d_values.Current() + portion * PORTION_SIZE,
+             portion_num_items, current_bit, num_bits);
+      if (CubDebug(error))
+      {
+        break;
+      }
         }

         // use the temporary buffers if no overwrite is allowed
diff --git a/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/device/dispatch/dispatch_reduce_by_key.cuh
index 00d4be5ff8..952ace4240 100644
--- a/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -1,4 +1,3 @@
-
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill. All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
@@ -34,19 +33,21 @@
 
 #pragma once
 
-#include
-#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
-#include "dispatch_scan.cuh"
-#include "../../config.cuh"
-#include "../../agent/agent_reduce_by_key.cuh"
-#include "../../thread/thread_operators.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_device.cuh"
-#include "../../util_math.cuh"
+#include
 
 #include
 
+#include
+#include
+
 CUB_NAMESPACE_BEGIN
 
 /******************************************************************************
@@ -193,27 +194,19 @@ struct DispatchReduceByKey
     template
     CUB_RUNTIME_FUNCTION __forceinline__
     static void InitConfigs(
-        int ptx_version,
+        int /*ptx_version*/,
         KernelConfig &reduce_by_key_config)
     {
-        if (CUB_IS_DEVICE_CODE)
-        {
-            #if CUB_INCLUDE_DEVICE_CODE
-                (void)ptx_version;
-                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-                reduce_by_key_config.template Init();
-            #endif
-        }
-        else
-        {
-            #if CUB_INCLUDE_HOST_CODE
-                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-
-                // (There's only one policy right now)
-                (void)ptx_version;
-                reduce_by_key_config.template Init();
-            #endif
-        }
+        NV_IF_TARGET(NV_IS_DEVICE,
+            (
+              // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+              reduce_by_key_config.template Init();
+            ), (
+              // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+
+              // (There's only one policy right now)
+              reduce_by_key_config.template Init();
+            ));
     }

diff --git a/cub/device/dispatch/dispatch_rle.cuh b/cub/device/dispatch/dispatch_rle.cuh
index 1ebb795355..692a55d4e9 100644
--- a/cub/device/dispatch/dispatch_rle.cuh
+++ b/cub/device/dispatch/dispatch_rle.cuh
@@ -1,4 +1,3 @@
-
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill. All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
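All of the InitConfigs rewrites in this patch share the same shape: the old CUB_IS_DEVICE_CODE branch with its #if CUB_INCLUDE_DEVICE_CODE / CUB_INCLUDE_HOST_CODE guards collapses into one NV_IF_TARGET call whose two parenthesized arguments are the device and host bodies. A minimal self-contained sketch of that shape (ExampleConfig, InitConfigExample, and the policy numbers are invented for illustration; they are not CUB code):

#include <nv/target>

struct ExampleConfig
{
  int block_threads;
  int items_per_thread;
};

// Exactly one of the two parenthesized blocks survives compilation,
// depending on whether this pass targets the host or the device.
__host__ __device__ inline void InitConfigExample(int ptx_version,
                                                  ExampleConfig &config)
{
  NV_IF_TARGET(NV_IS_DEVICE,
               ((void)ptx_version;
                // Device pass: the active policy already matches this target.
                config.block_threads    = 256;
                config.items_per_thread = 16;),
               (// Host pass: pick the policy matching the device's PTX version.
                if (ptx_version >= 500)
                {
                  config.block_threads    = 256;
                  config.items_per_thread = 16;
                }
                else
                {
                  config.block_threads    = 128;
                  config.items_per_thread = 12;
                }));
}
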
@@ -34,19 +33,21 @@ #pragma once -#include -#include - -#include "dispatch_scan.cuh" -#include "../../config.cuh" -#include "../../agent/agent_rle.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_device.cuh" -#include "../../util_math.cuh" +#include +#include +#include +#include +#include +#include +#include #include +#include + +#include +#include + CUB_NAMESPACE_BEGIN @@ -181,25 +182,19 @@ struct DeviceRleDispatch template CUB_RUNTIME_FUNCTION __forceinline__ static void InitConfigs( - int ptx_version, + int /*ptx_version*/, KernelConfig& device_rle_config) { - if (CUB_IS_DEVICE_CODE) { - #if CUB_INCLUDE_DEVICE_CODE - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - device_rle_config.template Init(); - #endif - } - else - { - #if CUB_INCLUDE_HOST_CODE - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - - // (There's only one policy right now) - (void)ptx_version; - device_rle_config.template Init(); - #endif - } + NV_IF_TARGET(NV_IS_DEVICE, + ( + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + device_rle_config.template Init(); + ), ( + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + + // (There's only one policy right now) + device_rle_config.template Init(); + )); } diff --git a/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/device/dispatch/dispatch_segmented_sort.cuh index 6101695337..6fb68f9002 100644 --- a/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -28,23 +28,25 @@ #pragma once #include -#include -#include #include #include #include #include #include +#include #include +#include #include #include #include #include #include -#include #include #include +#include + +#include #include @@ -1591,92 +1593,94 @@ private: return error; } - if (CUB_IS_HOST_CODE) - { - #if CUB_INCLUDE_HOST_CODE - unsigned int h_group_sizes[num_selected_groups]; - - if (CubDebug( - error = cudaMemcpyAsync(h_group_sizes, - group_sizes.get(), - num_selected_groups * sizeof(unsigned int), - cudaMemcpyDeviceToHost, - stream))) - { - return error; - } - - if (CubDebug(error = SyncStream(stream))) - { - return error; - } - - error = DeviceSegmentedSortContinuation( - large_kernel, - small_kernel, - num_segments, - d_keys.Current(), - GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), - d_keys_double_buffer, - d_values.Current(), - GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), - d_values_double_buffer, - d_begin_offsets, - d_end_offsets, - h_group_sizes, - large_and_medium_segments_indices.get(), - small_segments_indices.get(), - stream, - debug_synchronous); - #endif - } - else - { - #if CUB_INCLUDE_DEVICE_CODE - #ifdef CUB_RUNTIME_ENABLED - using MaxPolicyT = typename DispatchSegmentedSort::MaxPolicy; - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) - .doit(DeviceSegmentedSortContinuationKernel, - large_kernel, - small_kernel, - num_segments, - d_keys.Current(), - GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), - d_keys_double_buffer, - d_values.Current(), - GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), - d_values_double_buffer, - d_begin_offsets, - d_end_offsets, - group_sizes.get(), - large_and_medium_segments_indices.get(), - 
small_segments_indices.get(), - debug_synchronous); - - if (CubDebug(error = cudaPeekAtLastError())) - { - return error; - } +#ifdef CUB_RUNTIME_ENABLED +#define CUB_TMP_DEVICE_CODE \ + using MaxPolicyT = typename DispatchSegmentedSort::MaxPolicy; \ + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) \ + .doit(DeviceSegmentedSortContinuationKernel, \ + large_kernel, \ + small_kernel, \ + num_segments, \ + d_keys.Current(), \ + GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), \ + d_keys_double_buffer, \ + d_values.Current(), \ + GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), \ + d_values_double_buffer, \ + d_begin_offsets, \ + d_end_offsets, \ + group_sizes.get(), \ + large_and_medium_segments_indices.get(), \ + small_segments_indices.get(), \ + debug_synchronous); \ + \ + if (CubDebug(error = cudaPeekAtLastError())) \ + { \ + return error; \ + } \ + \ + if (debug_synchronous) \ + { \ + if (CubDebug(error = SyncStream(stream))) \ + { \ + return error; \ + } \ + } +#else +#define CUB_TMP_DEVICE_CODE error = CubDebug(cudaErrorNotSupported); +#endif + + // Clang format mangles some of this NV_IF_TARGET block + // clang-format off + NV_IF_TARGET( + NV_IS_HOST, + ( + unsigned int h_group_sizes[num_selected_groups]; + + if (CubDebug(error = cudaMemcpyAsync(h_group_sizes, + group_sizes.get(), + num_selected_groups * + sizeof(unsigned int), + cudaMemcpyDeviceToHost, + stream))) + { + return error; + } - if (debug_synchronous) - { if (CubDebug(error = SyncStream(stream))) { return error; } - } - #else - error = CubDebug(cudaErrorNotSupported); - #endif - #endif - } + + error = DeviceSegmentedSortContinuation( + large_kernel, + small_kernel, + num_segments, + d_keys.Current(), + GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), + d_keys_double_buffer, + d_values.Current(), + GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), + d_values_double_buffer, + d_begin_offsets, + d_end_offsets, + h_group_sizes, + large_and_medium_segments_indices.get(), + small_segments_indices.get(), + stream, + debug_synchronous);), + // NV_IS_DEVICE: + (CUB_TMP_DEVICE_CODE)); + // clang-format on + +#undef CUB_TMP_DEVICE_CODE return error; } diff --git a/cub/device/dispatch/dispatch_select_if.cuh b/cub/device/dispatch/dispatch_select_if.cuh index 5654ba29a3..bc19b1281b 100644 --- a/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/device/dispatch/dispatch_select_if.cuh @@ -1,4 +1,3 @@ - /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
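The CUB_TMP_DEVICE_CODE block above (like CUB_TMP_DEVICE_SYNC_IMPL in the device_synchronize change) exists because preprocessor directives such as #ifdef CUB_RUNTIME_ENABLED cannot appear inside NV_IF_TARGET's argument list, so the conditional body is hoisted into a temporary macro and #undef'd after use. A rough sketch of that pattern (sync_example and TMP_SYNC_IMPL are invented names, not CUB code):

#include <cuda_runtime.h>
#include <nv/target>

// Hoist the #ifdef-dependent device body into a helper macro so that
// NV_IF_TARGET only ever sees ordinary code in its arguments.
#ifdef CUB_RUNTIME_ENABLED
#define TMP_SYNC_IMPL result = cudaDeviceSynchronize();
#else
#define TMP_SYNC_IMPL result = cudaErrorInvalidConfiguration;
#endif

__host__ __device__ inline cudaError_t sync_example()
{
  cudaError_t result = cudaErrorUnknown;

  NV_IF_TARGET(NV_IS_HOST,
               (result = cudaDeviceSynchronize();),
               (TMP_SYNC_IMPL));

  return result;
}

#undef TMP_SYNC_IMPL
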
@@ -34,19 +33,21 @@ #pragma once -#include -#include - -#include "dispatch_scan.cuh" -#include "../../config.cuh" -#include "../../agent/agent_select_if.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_device.cuh" -#include "../../util_math.cuh" +#include +#include +#include +#include +#include +#include +#include #include +#include + +#include +#include + CUB_NAMESPACE_BEGIN /****************************************************************************** @@ -188,23 +189,18 @@ struct DispatchSelectIf int ptx_version, KernelConfig &select_if_config) { - if (CUB_IS_DEVICE_CODE) { - #if CUB_INCLUDE_DEVICE_CODE - (void)ptx_version; - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - select_if_config.template Init(); - #endif - } - else - { - #if CUB_INCLUDE_HOST_CODE - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - - // (There's only one policy right now) - (void)ptx_version; - select_if_config.template Init(); - #endif - } + NV_IF_TARGET(NV_IS_DEVICE, + ( + (void)ptx_version; + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + select_if_config.template Init(); + ), ( + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + + // (There's only one policy right now) + (void)ptx_version; + select_if_config.template Init(); + )); } diff --git a/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/device/dispatch/dispatch_spmv_orig.cuh index d0de86404c..7e2d74b26d 100644 --- a/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -34,22 +34,24 @@ #pragma once -#include -#include - -#include "../../agent/single_pass_scan_operators.cuh" -#include "../../agent/agent_segment_fixup.cuh" -#include "../../agent/agent_spmv_orig.cuh" -#include "../../util_type.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" -#include "../../util_math.cuh" -#include "../../thread/thread_search.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../config.cuh" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include #include +#include +#include + CUB_NAMESPACE_BEGIN @@ -401,40 +403,33 @@ struct DispatchSpmv KernelConfig &spmv_config, KernelConfig &segment_fixup_config) { - if (CUB_IS_DEVICE_CODE) - { - #if CUB_INCLUDE_DEVICE_CODE - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - spmv_config.template Init(); - segment_fixup_config.template Init(); - #endif - } - else - { - #if CUB_INCLUDE_HOST_CODE - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 600) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else if (ptx_version >= 500) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else if (ptx_version >= 370) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - #endif - } + NV_IF_TARGET( + NV_IS_DEVICE, + ( // We're on the device, so initialize the kernel dispatch + // configurations with the current PTX policy + spmv_config.template Init(); 
+ segment_fixup_config.template Init();), + ( + // We're on the host, so lookup and initialize the kernel dispatch + // configurations with the policies that match the device's PTX + // version + if (ptx_version >= 600) { + spmv_config.template Init(); + segment_fixup_config + .template Init(); + } else if (ptx_version >= 500) { + spmv_config.template Init(); + segment_fixup_config + .template Init(); + } else if (ptx_version >= 370) { + spmv_config.template Init(); + segment_fixup_config + .template Init(); + } else { + spmv_config.template Init(); + segment_fixup_config + .template Init(); + })); } diff --git a/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/device/dispatch/dispatch_three_way_partition.cuh index cb6d3c7a1b..cb06438063 100644 --- a/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -37,6 +37,8 @@ #include #include +#include + #include CUB_NAMESPACE_BEGIN @@ -204,25 +206,18 @@ struct DispatchThreeWayPartitionIf int ptx_version, KernelConfig &select_if_config) { - if (CUB_IS_DEVICE_CODE) - { -#if CUB_INCLUDE_DEVICE_CODE - (void)ptx_version; - // We're on the device, so initialize the kernel dispatch configurations - // with the current PTX policy - select_if_config.template Init(); -#endif - } - else - { -#if CUB_INCLUDE_HOST_CODE - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - - // (There's only one policy right now) - (void)ptx_version; - select_if_config.template Init(); -#endif - } + NV_IF_TARGET( + NV_IS_DEVICE, + ((void)ptx_version; + // We're on the device, so initialize the kernel dispatch configurations + // with the current PTX policy + select_if_config.template Init();), + (// We're on the host, so lookup and initialize the kernel dispatch + // configurations with the policies that match the device's PTX version + // (There's only one policy right now) + (void)ptx_version; + select_if_config + .template Init();)); } diff --git a/cub/grid/grid_queue.cuh b/cub/grid/grid_queue.cuh index ebb82e4dc0..e1933e3d38 100644 --- a/cub/grid/grid_queue.cuh +++ b/cub/grid/grid_queue.cuh @@ -33,8 +33,10 @@ #pragma once -#include "../config.cuh" -#include "../util_debug.cuh" +#include +#include + +#include CUB_NAMESPACE_BEGIN @@ -120,21 +122,20 @@ public: cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; - if (CUB_IS_DEVICE_CODE) { - #if CUB_INCLUDE_DEVICE_CODE - (void)stream; - d_counters[FILL] = fill_size; - d_counters[DRAIN] = 0; - result = cudaSuccess; - #endif - } else { - #if CUB_INCLUDE_HOST_CODE - OffsetT counters[2]; - counters[FILL] = fill_size; - counters[DRAIN] = 0; - result = CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream)); - #endif - } + + NV_IF_TARGET(NV_IS_DEVICE, + ( + (void)stream; + d_counters[FILL] = fill_size; + d_counters[DRAIN] = 0; + result = cudaSuccess; + ), ( + OffsetT counters[2]; + counters[FILL] = fill_size; + counters[DRAIN] = 0; + result = CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream)); + )); + return result; } @@ -143,17 +144,16 @@ public: __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; - if (CUB_IS_DEVICE_CODE) { - #if CUB_INCLUDE_DEVICE_CODE - (void)stream; - d_counters[DRAIN] = 0; - result = cudaSuccess; - #endif - } else { - #if CUB_INCLUDE_HOST_CODE - result = 
CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream)); - #endif - } + + NV_IF_TARGET(NV_IS_DEVICE, + ( + (void)stream; + d_counters[DRAIN] = 0; + result = cudaSuccess; + ), ( + result = CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream)); + )); + return result; } @@ -162,17 +162,16 @@ public: __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; - if (CUB_IS_DEVICE_CODE) { - #if CUB_INCLUDE_DEVICE_CODE - (void)stream; - d_counters[FILL] = 0; - result = cudaSuccess; - #endif - } else { - #if CUB_INCLUDE_HOST_CODE - result = CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream)); - #endif - } + + NV_IF_TARGET(NV_IS_DEVICE, + ( + (void)stream; + d_counters[FILL] = 0; + result = cudaSuccess; + ), ( + result = CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream)); + )); + return result; } @@ -183,17 +182,16 @@ public: cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; - if (CUB_IS_DEVICE_CODE) { - #if CUB_INCLUDE_DEVICE_CODE - (void)stream; - fill_size = d_counters[FILL]; - result = cudaSuccess; - #endif - } else { - #if CUB_INCLUDE_HOST_CODE - result = CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream)); - #endif - } + + NV_IF_TARGET(NV_IS_DEVICE, + ( + (void)stream; + fill_size = d_counters[FILL]; + result = cudaSuccess; + ), ( + result = CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream)); + )); + return result; } diff --git a/cub/iterator/tex_obj_input_iterator.cuh b/cub/iterator/tex_obj_input_iterator.cuh index e7659d43c9..1611f0ee36 100644 --- a/cub/iterator/tex_obj_input_iterator.cuh +++ b/cub/iterator/tex_obj_input_iterator.cuh @@ -33,15 +33,17 @@ #pragma once +#include +#include +#include +#include +#include + +#include + #include #include -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../util_device.cuh" -#include "../util_debug.cuh" -#include "../config.cuh" - #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer #include @@ -200,37 +202,9 @@ public: /// Indirection __host__ __device__ __forceinline__ reference operator*() const { - if (CUB_IS_HOST_CODE) { - #if CUB_INCLUDE_HOST_CODE - // Simply dereference the pointer on the host - return ptr[tex_offset]; - #else - // Never executed, just need a return value for this codepath. - // The `reference` type is actually just T, so we can fake this - // easily. - return reference{}; - #endif - } else { - #if CUB_INCLUDE_DEVICE_CODE - // Move array of uninitialized words, then alias and assign to return value - TextureWord words[TEXTURE_MULTIPLE]; - - #pragma unroll - for (int i = 0; i < TEXTURE_MULTIPLE; ++i) - { - words[i] = tex1Dfetch( - tex_obj, - (tex_offset * TEXTURE_MULTIPLE) + i); - } - - // Load from words - return *reinterpret_cast(words); - #else - // This is dead code which will never be executed. It is here - // only to avoid warnings about missing return statements. 
- return ptr[tex_offset]; - #endif - } + NV_IF_TARGET(NV_IS_HOST, + (return ptr[tex_offset];), + (return this->device_deref();)); } /// Addition @@ -312,6 +286,26 @@ public: return os; } +private: + // This is hoisted out of operator* because #pragma can't be used inside of + // NV_IF_TARGET + __device__ __forceinline__ reference device_deref() const + { + // Move array of uninitialized words, then alias and assign to return + // value + TextureWord words[TEXTURE_MULTIPLE]; + + const auto tex_idx_base = tex_offset * TEXTURE_MULTIPLE; + + #pragma unroll + for (int i = 0; i < TEXTURE_MULTIPLE; ++i) + { + words[i] = tex1Dfetch(tex_obj, tex_idx_base + i); + } + + // Load from words + return *reinterpret_cast(words); + } }; diff --git a/cub/util_arch.cuh b/cub/util_arch.cuh index 00a21516b6..2c4b36bee4 100644 --- a/cub/util_arch.cuh +++ b/cub/util_arch.cuh @@ -62,24 +62,32 @@ CUB_NAMESPACE_BEGIN #endif #endif -#ifndef CUB_IS_DEVICE_CODE - #if defined(_NVHPC_CUDA) - #define CUB_IS_DEVICE_CODE __builtin_is_device_code() - #define CUB_IS_HOST_CODE (!__builtin_is_device_code()) - #define CUB_INCLUDE_DEVICE_CODE 1 - #define CUB_INCLUDE_HOST_CODE 1 - #elif CUB_PTX_ARCH > 0 - #define CUB_IS_DEVICE_CODE 1 - #define CUB_IS_HOST_CODE 0 - #define CUB_INCLUDE_DEVICE_CODE 1 - #define CUB_INCLUDE_HOST_CODE 0 - #else - #define CUB_IS_DEVICE_CODE 0 - #define CUB_IS_HOST_CODE 1 - #define CUB_INCLUDE_DEVICE_CODE 0 - #define CUB_INCLUDE_HOST_CODE 1 +// These definitions were intended for internal use only and are now obsolete. +// If you relied on them, consider porting your code to use the functionality +// in libcu++'s header. +// For a temporary workaround, define CUB_PROVIDE_LEGACY_ARCH_MACROS to make +// them available again. These should be considered deprecated and will be +// fully removed in a future version. +#ifdef CUB_PROVIDE_LEGACY_ARCH_MACROS + #ifndef CUB_IS_DEVICE_CODE + #if defined(_NVHPC_CUDA) + #define CUB_IS_DEVICE_CODE __builtin_is_device_code() + #define CUB_IS_HOST_CODE (!__builtin_is_device_code()) + #define CUB_INCLUDE_DEVICE_CODE 1 + #define CUB_INCLUDE_HOST_CODE 1 + #elif CUB_PTX_ARCH > 0 + #define CUB_IS_DEVICE_CODE 1 + #define CUB_IS_HOST_CODE 0 + #define CUB_INCLUDE_DEVICE_CODE 1 + #define CUB_INCLUDE_HOST_CODE 0 + #else + #define CUB_IS_DEVICE_CODE 0 + #define CUB_IS_HOST_CODE 1 + #define CUB_INCLUDE_DEVICE_CODE 0 + #define CUB_INCLUDE_HOST_CODE 1 + #endif #endif -#endif +#endif // CUB_PROVIDE_LEGACY_ARCH_MACROS /// Maximum number of devices supported. #ifndef CUB_MAX_DEVICES diff --git a/cub/util_debug.cuh b/cub/util_debug.cuh index d017b82149..c38b9f8c06 100644 --- a/cub/util_debug.cuh +++ b/cub/util_debug.cuh @@ -36,12 +36,14 @@ #pragma once -#include -#include "util_namespace.cuh" -#include "util_arch.cuh" +#include +#include -CUB_NAMESPACE_BEGIN +#include + +#include +CUB_NAMESPACE_BEGIN /** * \addtogroup UtilMgmt @@ -54,45 +56,57 @@ CUB_NAMESPACE_BEGIN #define CUB_STDERR #endif - - /** - * \brief If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. + * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the + * corresponding error message is printed to \p stderr (or \p stdout in device + * code) along with the supplied source context. * * \return The CUDA error. 
*/ -__host__ __device__ __forceinline__ cudaError_t Debug( - cudaError_t error, - const char* filename, - int line) +__host__ __device__ +__forceinline__ +cudaError_t Debug(cudaError_t error, const char *filename, int line) { - (void)filename; - (void)line; + (void)filename; + (void)line; #ifdef CUB_RUNTIME_ENABLED - // Clear the global CUDA error state which may have been set by the last - // call. Otherwise, errors may "leak" to unrelated kernel launches. - cudaGetLastError(); + // Clear the global CUDA error state which may have been set by the last + // call. Otherwise, errors may "leak" to unrelated kernel launches. + cudaGetLastError(); #endif #ifdef CUB_STDERR - if (error) - { - if (CUB_IS_HOST_CODE) { - #if CUB_INCLUDE_HOST_CODE - fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); - fflush(stderr); - #endif - } else { - #if CUB_INCLUDE_DEVICE_CODE - printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); - #endif - } - } + if (error) + { + NV_IF_TARGET( + NV_IS_HOST, ( + fprintf(stderr, + "CUDA error %d [%s, %d]: %s\n", + error, + filename, + line, + cudaGetErrorString(error)); + fflush(stderr); + ), + ( + printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", + error, + blockIdx.z, + blockIdx.y, + blockIdx.x, + threadIdx.z, + threadIdx.y, + threadIdx.x, + filename, + line); + ) + ); + } #endif - return error; -} + return error; +} /** * \brief Debug macro @@ -114,43 +128,58 @@ __host__ __device__ __forceinline__ cudaError_t Debug( * \brief Log macro for printf statements. */ #if !defined(_CubLog) - #if defined(_NVHPC_CUDA) - #define _CubLog(format, ...) (__builtin_is_device_code() \ - ? printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ - blockIdx.z, blockIdx.y, blockIdx.x, \ - threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__) \ - : printf(format, __VA_ARGS__)); - #elif !(defined(__clang__) && defined(__CUDA__)) - #if (CUB_PTX_ARCH == 0) - #define _CubLog(format, ...) printf(format,__VA_ARGS__); - #elif (CUB_PTX_ARCH >= 200) - #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__); - #endif - #else - // XXX shameless hack for clang around variadic printf... - // Compilies w/o supplying -std=c++11 but shows warning, - // so we sielence them :) - #pragma clang diagnostic ignored "-Wc++11-extensions" - #pragma clang diagnostic ignored "-Wunnamed-type-template-args" - template - inline __host__ __device__ void va_printf(char const* format, Args const&... args) - { - #ifdef __CUDA_ARCH__ - printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...); - #else - printf(format, args...); - #endif - } - #ifndef __CUDA_ARCH__ - #define _CubLog(format, ...) CUB_NS_QUALIFIER::va_printf(format,__VA_ARGS__); - #else - #define _CubLog(format, ...) CUB_NS_QUALIFIER::va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__); - #endif - #endif +#if defined(_NVHPC_CUDA) || !(defined(__clang__) && defined(__CUDA__)) + +// NVCC / NVC++ +#define _CubLog(format, ...) 
\ + do \ + { \ + NV_IF_TARGET(NV_IS_HOST, \ + (printf(format, __VA_ARGS__);), \ + (printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ + blockIdx.z, \ + blockIdx.y, \ + blockIdx.x, \ + threadIdx.z, \ + threadIdx.y, \ + threadIdx.x, \ + __VA_ARGS__);)); \ + } while (false) + +#else // Clang: + +// XXX shameless hack for clang around variadic printf... +// Compilies w/o supplying -std=c++11 but shows warning, +// so we silence them :) +#pragma clang diagnostic ignored "-Wc++11-extensions" +#pragma clang diagnostic ignored "-Wunnamed-type-template-args" +template +inline __host__ __device__ void va_printf(char const *format, + Args const &...args) +{ +#ifdef __CUDA_ARCH__ + printf(format, + blockIdx.z, + blockIdx.y, + blockIdx.x, + threadIdx.z, + threadIdx.y, + threadIdx.x, + args...); +#else + printf(format, args...); +#endif +} +#ifndef __CUDA_ARCH__ +#define _CubLog(format, ...) CUB_NS_QUALIFIER::va_printf(format, __VA_ARGS__); +#else +#define _CubLog(format, ...) \ + CUB_NS_QUALIFIER::va_printf("[block (%d,%d,%d), thread " \ + "(%d,%d,%d)]: " format, \ + __VA_ARGS__); +#endif +#endif #endif - - - /** @} */ // end group UtilMgmt diff --git a/cub/util_device.cuh b/cub/util_device.cuh index 403c2f0d40..9a8ffa1d06 100644 --- a/cub/util_device.cuh +++ b/cub/util_device.cuh @@ -33,14 +33,16 @@ #pragma once -#include "detail/device_synchronize.cuh" +#include -#include "util_type.cuh" -#include "util_arch.cuh" -#include "util_debug.cuh" -#include "util_cpp_dialect.cuh" -#include "util_namespace.cuh" -#include "util_macro.cuh" +#include +#include +#include +#include +#include +#include + +#include #include #include @@ -180,8 +182,6 @@ CUB_RUNTIME_FUNCTION inline int DeviceCountUncached() #endif } -#if CUB_CPP_DIALECT >= 2011 // C++11 and later. - /** * \brief Cache for an arbitrary value produced by a nullary function. */ @@ -197,9 +197,6 @@ struct ValueCache __host__ inline ValueCache() : value(Function()) {} }; -#endif - -#if CUB_CPP_DIALECT >= 2011 // Host code, only safely usable in C++11 or newer, where thread-safe // initialization of static locals is guaranteed. This is a separate function // to avoid defining a local static in a host/device function. @@ -208,7 +205,6 @@ __host__ inline int DeviceCountCachedValue() static ValueCache cache; return cache.value; } -#endif /** * \brief Returns the number of CUDA devices available. @@ -220,27 +216,14 @@ __host__ inline int DeviceCountCachedValue() CUB_RUNTIME_FUNCTION inline int DeviceCount() { int result = -1; - if (CUB_IS_HOST_CODE) { - #if CUB_INCLUDE_HOST_CODE - #if CUB_CPP_DIALECT >= 2011 - // Host code and C++11. - result = DeviceCountCachedValue(); - #else - // Host code and C++98. - result = DeviceCountUncached(); - #endif - #endif - } else { - #if CUB_INCLUDE_DEVICE_CODE - // Device code. - result = DeviceCountUncached(); - #endif - } + + NV_IF_TARGET(NV_IS_HOST, + (result = DeviceCountCachedValue();), + (result = DeviceCountUncached();)); + return result; } -#if CUB_CPP_DIALECT >= 2011 // C++11 and later. - /** * \brief Per-device cache for a CUDA attribute value; the attribute is queried * and stored for each device upon construction. @@ -350,8 +333,6 @@ public: } }; -#endif - /** * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10). */ @@ -366,27 +347,46 @@ CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version) // usual syntax of (void)empty_kernel; was not sufficient on MSVC2015. 
(void)reinterpret_cast(empty_kernel); + // Define a temporary macro that expands to the current target ptx version + // in device code. + // may provide an abstraction for this eventually. For now, + // we have to keep this usage of __CUDA_ARCH__. +#if defined(_NVHPC_CUDA) +#define CUB_TEMP_GET_PTX __builtin_current_device_sm() +#else +#define CUB_TEMP_GET_PTX __CUDA_ARCH__ +#endif + cudaError_t result = cudaSuccess; - if (CUB_IS_HOST_CODE) { - #if CUB_INCLUDE_HOST_CODE - cudaFuncAttributes empty_kernel_attrs; - - result = cudaFuncGetAttributes(&empty_kernel_attrs, - reinterpret_cast(empty_kernel)); - CubDebug(result); - - ptx_version = empty_kernel_attrs.ptxVersion * 10; - #endif - } else { - #if CUB_INCLUDE_DEVICE_CODE - // This is necessary to ensure instantiation of EmptyKernel in device code. - // The `reinterpret_cast` is necessary to suppress a set-but-unused warnings. - // This is a meme now: https://twitter.com/blelbach/status/1222391615576100864 - (void)reinterpret_cast(empty_kernel); - - ptx_version = CUB_PTX_ARCH; - #endif - } + NV_IF_TARGET( + NV_IS_HOST, + ( + cudaFuncAttributes empty_kernel_attrs; + + do + { + if (CubDebug(result = cudaFuncGetAttributes(&empty_kernel_attrs, + empty_kernel))) + { + break; + } + } while (0); + + ptx_version = empty_kernel_attrs.ptxVersion * 10; + ), + // NV_IS_DEVICE + ( + // This is necessary to ensure instantiation of EmptyKernel in device + // code. The `reinterpret_cast` is necessary to suppress a + // set-but-unused warnings. This is a meme now: + // https://twitter.com/blelbach/status/1222391615576100864 + (void)reinterpret_cast(empty_kernel); + + ptx_version = CUB_TEMP_GET_PTX; + )); + +#undef CUB_TEMP_GET_PTX + return result; } @@ -400,7 +400,6 @@ __host__ inline cudaError_t PtxVersionUncached(int& ptx_version, int device) return PtxVersionUncached(ptx_version); } -#if CUB_CPP_DIALECT >= 2011 // C++11 and later. template __host__ inline PerDeviceAttributeCache& GetPerDeviceAttributeCache() { @@ -411,7 +410,6 @@ __host__ inline PerDeviceAttributeCache& GetPerDeviceAttributeCache() struct PtxVersionCacheTag {}; struct SmVersionCacheTag {}; -#endif /** * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10). @@ -422,8 +420,6 @@ struct SmVersionCacheTag {}; */ __host__ inline cudaError_t PtxVersion(int& ptx_version, int device) { -#if CUB_CPP_DIALECT >= 2011 // C++11 and later. - auto const payload = GetPerDeviceAttributeCache()( // If this call fails, then we get the error code back in the payload, // which we check with `CubDebug` below. @@ -434,12 +430,6 @@ __host__ inline cudaError_t PtxVersion(int& ptx_version, int device) ptx_version = payload.attribute; return payload.error; - -#else // Pre C++11. - - return PtxVersionUncached(ptx_version, device); - -#endif } /** @@ -449,37 +439,31 @@ __host__ inline cudaError_t PtxVersion(int& ptx_version, int device) * * \note This function is thread safe. */ -CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int& ptx_version) +CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int &ptx_version) { - cudaError_t result = cudaErrorUnknown; - if (CUB_IS_HOST_CODE) { - #if CUB_INCLUDE_HOST_CODE - #if CUB_CPP_DIALECT >= 2011 - // Host code and C++11. - auto const device = CurrentDevice(); - - auto const payload = GetPerDeviceAttributeCache()( - // If this call fails, then we get the error code back in the payload, - // which we check with `CubDebug` below. 
- [=] (int& pv) { return PtxVersionUncached(pv, device); }, - device); - - if (!CubDebug(payload.error)) - ptx_version = payload.attribute; - - result = payload.error; - #else - // Host code and C++98. - result = PtxVersionUncached(ptx_version); - #endif - #endif - } else { - #if CUB_INCLUDE_DEVICE_CODE - // Device code. - result = PtxVersionUncached(ptx_version); - #endif - } - return result; + cudaError_t result = cudaErrorUnknown; + NV_IF_TARGET( + NV_IS_HOST, + ( + auto const device = CurrentDevice(); + auto const payload = GetPerDeviceAttributeCache()( + // If this call fails, then we get the error code back in the payload, + // which we check with `CubDebug` below. + [=](int &pv) { return PtxVersionUncached(pv, device); }, + device); + + if (!CubDebug(payload.error)) + { + ptx_version = payload.attribute; + } + + result = payload.error; + ), + ( // NV_IS_DEVICE: + result = PtxVersionUncached(ptx_version); + )); + + return result; } /** @@ -519,34 +503,32 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int d * * \note This function is thread safe. */ -CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int& sm_version, int device = CurrentDevice()) +CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int &sm_version, + int device = CurrentDevice()) { - cudaError_t result = cudaErrorUnknown; - if (CUB_IS_HOST_CODE) { - #if CUB_INCLUDE_HOST_CODE - #if CUB_CPP_DIALECT >= 2011 - // Host code and C++11 - auto const payload = GetPerDeviceAttributeCache()( - // If this call fails, then we get the error code back in the payload, - // which we check with `CubDebug` below. - [=] (int& pv) { return SmVersionUncached(pv, device); }, - device); - - if (!CubDebug(payload.error)) - sm_version = payload.attribute; - - result = payload.error; - #else - // Host code and C++98 - result = SmVersionUncached(sm_version, device); - #endif - #endif - } else { - #if CUB_INCLUDE_DEVICE_CODE - result = SmVersionUncached(sm_version, device); - #endif - } - return result; + cudaError_t result = cudaErrorUnknown; + + NV_IF_TARGET( + NV_IS_HOST, + ( + auto const payload = GetPerDeviceAttributeCache()( + // If this call fails, then we get the error code back in + // the payload, which we check with `CubDebug` below. + [=](int &pv) { return SmVersionUncached(pv, device); }, + device); + + if (!CubDebug(payload.error)) + { + sm_version = payload.attribute; + }; + + result = payload.error; + ), + ( // NV_IS_DEVICE + result = SmVersionUncached(sm_version, device); + )); + + return result; } /** @@ -554,24 +536,13 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int& sm_version, int device = */ CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream) { - cudaError_t result = cudaErrorUnknown; - if (CUB_IS_HOST_CODE) { - #if CUB_INCLUDE_HOST_CODE - result = CubDebug(cudaStreamSynchronize(stream)); - #endif - } else { - #if CUB_INCLUDE_DEVICE_CODE - #if defined(CUB_RUNTIME_ENABLED) // Device code with the CUDA runtime. - (void)stream; - // Device can't yet sync on a specific stream - result = CubDebug(cub::detail::device_synchronize()); - #else // Device code without the CUDA runtime. - (void)stream; - // CUDA API calls are not supported from this device. 
- result = CubDebug(cudaErrorInvalidConfiguration); - #endif - #endif - } + cudaError_t result = cudaErrorUnknown; + + NV_IF_TARGET(NV_IS_HOST, + (result = CubDebug(cudaStreamSynchronize(stream));), + ((void)stream; + result = CubDebug(cub::detail::device_synchronize());)); + return result; } diff --git a/test/test_device_segmented_sort.cu b/test/test_device_segmented_sort.cu index 5360279c81..32a764f864 100644 --- a/test/test_device_segmented_sort.cu +++ b/test/test_device_segmented_sort.cu @@ -29,6 +29,9 @@ #define CUB_STDERR #include + +#include + #include #include @@ -1500,61 +1503,56 @@ struct EdgeTestDispatch template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() { - if (CUB_IS_HOST_CODE) - { - #if CUB_INCLUDE_HOST_CODE - using SmallAndMediumPolicyT = - typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT; - using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; - - const int small_segment_max_segment_size = - SmallAndMediumPolicyT::SmallPolicyT::ITEMS_PER_TILE; - - const int items_per_small_segment = - SmallAndMediumPolicyT::SmallPolicyT::ITEMS_PER_THREAD; - - const int medium_segment_max_segment_size = - SmallAndMediumPolicyT::MediumPolicyT::ITEMS_PER_TILE; - - const int single_thread_segment_size = items_per_small_segment; - - const int large_cached_segment_max_segment_size = - LargeSegmentPolicyT::BLOCK_THREADS * - LargeSegmentPolicyT::ITEMS_PER_THREAD; - - for (bool sort_descending : {ascending, descending}) - { - Input edge_cases = - InputDescription() - .add({a_lot_of, empty_short_circuit_segment_size}) - .add({a_lot_of, copy_short_circuit_segment_size}) - .add({a_lot_of, swap_short_circuit_segment_size}) - .add({a_lot_of, swap_short_circuit_segment_size + 1}) - .add({a_lot_of, swap_short_circuit_segment_size + 1}) - .add({a_lot_of, single_thread_segment_size - 1}) - .add({a_lot_of, single_thread_segment_size }) - .add({a_lot_of, single_thread_segment_size + 1 }) - .add({a_lot_of, single_thread_segment_size * 2 - 1 }) - .add({a_lot_of, single_thread_segment_size * 2 }) - .add({a_lot_of, single_thread_segment_size * 2 + 1 }) - .add({a_bunch_of, small_segment_max_segment_size - 1}) - .add({a_bunch_of, small_segment_max_segment_size}) - .add({a_bunch_of, small_segment_max_segment_size + 1}) - .add({a_bunch_of, medium_segment_max_segment_size - 1}) - .add({a_bunch_of, medium_segment_max_segment_size}) - .add({a_bunch_of, medium_segment_max_segment_size + 1}) - .add({a_bunch_of, large_cached_segment_max_segment_size - 1}) - .add({a_bunch_of, large_cached_segment_max_segment_size}) - .add({a_bunch_of, large_cached_segment_max_segment_size + 1}) - .add({a_few, large_cached_segment_max_segment_size * 2}) - .add({a_few, large_cached_segment_max_segment_size * 3}) - .add({a_few, large_cached_segment_max_segment_size * 5}) - .template gen(sort_descending); - - InputTest(sort_descending, edge_cases); - } - #endif - } + NV_IF_TARGET(NV_IS_HOST, + (using SmallAndMediumPolicyT = + typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT; + using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; + + const int small_segment_max_segment_size = + SmallAndMediumPolicyT::SmallPolicyT::ITEMS_PER_TILE; + + const int items_per_small_segment = + SmallAndMediumPolicyT::SmallPolicyT::ITEMS_PER_THREAD; + + const int medium_segment_max_segment_size = + SmallAndMediumPolicyT::MediumPolicyT::ITEMS_PER_TILE; + + const int single_thread_segment_size = items_per_small_segment; + + const int large_cached_segment_max_segment_size = + 
LargeSegmentPolicyT::BLOCK_THREADS * + LargeSegmentPolicyT::ITEMS_PER_THREAD; + + for (bool sort_descending : {ascending, descending}) { + Input edge_cases = + InputDescription() + .add({a_lot_of, empty_short_circuit_segment_size}) + .add({a_lot_of, copy_short_circuit_segment_size}) + .add({a_lot_of, swap_short_circuit_segment_size}) + .add({a_lot_of, swap_short_circuit_segment_size + 1}) + .add({a_lot_of, swap_short_circuit_segment_size + 1}) + .add({a_lot_of, single_thread_segment_size - 1}) + .add({a_lot_of, single_thread_segment_size}) + .add({a_lot_of, single_thread_segment_size + 1}) + .add({a_lot_of, single_thread_segment_size * 2 - 1}) + .add({a_lot_of, single_thread_segment_size * 2}) + .add({a_lot_of, single_thread_segment_size * 2 + 1}) + .add({a_bunch_of, small_segment_max_segment_size - 1}) + .add({a_bunch_of, small_segment_max_segment_size}) + .add({a_bunch_of, small_segment_max_segment_size + 1}) + .add({a_bunch_of, medium_segment_max_segment_size - 1}) + .add({a_bunch_of, medium_segment_max_segment_size}) + .add({a_bunch_of, medium_segment_max_segment_size + 1}) + .add({a_bunch_of, large_cached_segment_max_segment_size - 1}) + .add({a_bunch_of, large_cached_segment_max_segment_size}) + .add({a_bunch_of, large_cached_segment_max_segment_size + 1}) + .add({a_few, large_cached_segment_max_segment_size * 2}) + .add({a_few, large_cached_segment_max_segment_size * 3}) + .add({a_few, large_cached_segment_max_segment_size * 5}) + .template gen(sort_descending); + + InputTest(sort_descending, edge_cases); + })); return cudaSuccess; } diff --git a/test/test_util.h b/test/test_util.h index f00c21bb9f..b1f3373aa1 100644 --- a/test/test_util.h +++ b/test/test_util.h @@ -51,13 +51,16 @@ #include "half.h" #include "bfloat16.h" -#include "cub/util_debug.cuh" -#include "cub/util_device.cuh" -#include "cub/util_type.cuh" -#include "cub/util_macro.cuh" -#include "cub/util_math.cuh" -#include "cub/util_ptx.cuh" -#include "cub/iterator/discard_output_iterator.cuh" +#include +#include +#include +#include +#include +#include +#include +#include + +#include /****************************************************************************** * Type conversion macros @@ -569,134 +572,121 @@ enum GenMode */ #pragma nv_exec_check_disable template -__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, std::size_t index = 0) +__host__ __device__ __forceinline__ +void InitValue(GenMode gen_mode, T &value, std::size_t index = 0) { - // RandomBits is host-only. - if (CUB_IS_DEVICE_CODE) - { - #if CUB_INCLUDE_DEVICE_CODE - switch (gen_mode) - { - case RANDOM: - case RANDOM_BIT: - case RANDOM_MINUS_PLUS_ZERO: - _CubLog("%s\n", - "cub::InitValue cannot generate random numbers on device."); - CUB_NS_QUALIFIER::ThreadTrap(); - break; - case UNIFORM: - value = 2; - break; - case INTEGER_SEED: - default: - value = (T) index; - break; - } - #endif // CUB_INCLUDE_DEVICE_CODE - } - else - { - #if CUB_INCLUDE_HOST_CODE - switch (gen_mode) - { - case RANDOM: - RandomBits(value); - break; - case RANDOM_BIT: - { - char c; - RandomBits(c, 0, 0, 1); - value = (c > 0) ? 
(T) 1 : (T) -1; - break; - } - case RANDOM_MINUS_PLUS_ZERO: - { - // Replace roughly 1/128 of values with -0.0 or +0.0, and generate the rest randomly - typedef typename CUB_NS_QUALIFIER::Traits::UnsignedBits UnsignedBits; - char c; - RandomBits(c); - if (c == 0) - { - // Replace 1/256 of values with +0.0 bit pattern - value = SafeBitCast(UnsignedBits(0)); - } - else if (c == 1) - { - // Replace 1/256 of values with -0.0 bit pattern - value = SafeBitCast(UnsignedBits(UnsignedBits(1) << - (sizeof(UnsignedBits) * 8) - 1)); - } - else - { - // 127/128 of values are random - RandomBits(value); - } - break; - } - case UNIFORM: - value = 2; - break; - case INTEGER_SEED: - default: - value = (T) index; - break; - } - #endif // CUB_INCLUDE_HOST_CODE - } + // RandomBits is host-only. + NV_IF_TARGET( + NV_IS_HOST, + ( + switch (gen_mode) { + case RANDOM: + RandomBits(value); + break; + case RANDOM_BIT: { + char c; + RandomBits(c, 0, 0, 1); + value = static_cast((c > 0) ? 1 : -1); + break; + } + case RANDOM_MINUS_PLUS_ZERO: { + // Replace roughly 1/128 of values with -0.0 or +0.0, and + // generate the rest randomly + using UnsignedBits = typename CUB_NS_QUALIFIER::Traits::UnsignedBits; + char c; + RandomBits(c); + if (c == 0) + { + // Replace 1/256 of values with +0.0 bit pattern + value = SafeBitCast(UnsignedBits(0)); + } + else if (c == 1) + { + // Replace 1/256 of values with -0.0 bit pattern + value = SafeBitCast( + UnsignedBits(UnsignedBits(1) << (sizeof(UnsignedBits) * 8) - 1)); + } + else + { + // 127/128 of values are random + RandomBits(value); + } + break; + } + case UNIFORM: + value = 2; + break; + case INTEGER_SEED: + default: + value = static_cast(index); + break; + }), + ( // NV_IS_DEVICE: + switch (gen_mode) { + case RANDOM: + case RANDOM_BIT: + case RANDOM_MINUS_PLUS_ZERO: + _CubLog("%s\n", + "cub::InitValue cannot generate random numbers on device."); + CUB_NS_QUALIFIER::ThreadTrap(); + break; + case UNIFORM: + value = 2; + break; + case INTEGER_SEED: + default: + value = static_cast(index); + break; + } + )); } - /** * Initialize value (bool) */ #pragma nv_exec_check_disable __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, bool &value, std::size_t index = 0) { - // RandomBits is host-only. - if (CUB_IS_DEVICE_CODE) - { - #if CUB_INCLUDE_DEVICE_CODE - switch (gen_mode) - { - case RANDOM: - case RANDOM_BIT: - case RANDOM_MINUS_PLUS_ZERO: - _CubLog("%s\n", - "cub::InitValue cannot generate random numbers on device."); - CUB_NS_QUALIFIER::ThreadTrap(); - break; - case UNIFORM: - value = true; - break; - case INTEGER_SEED: - default: - value = (index > 0); - break; - } - #endif // CUB_INCLUDE_DEVICE_CODE - } - else + // RandomBits is host-only. 
+ NV_IF_TARGET( + NV_IS_HOST, + ( + switch (gen_mode) + { + case RANDOM: + case RANDOM_BIT: + char c; + RandomBits(c, 0, 0, 1); + value = (c > 0); + break; + case UNIFORM: + value = true; + break; + case INTEGER_SEED: + default: + value = (index > 0); + break; + } + ), + ( // NV_IS_DEVICE, + switch (gen_mode) { - #if CUB_INCLUDE_HOST_CODE - switch (gen_mode) - { - case RANDOM: - case RANDOM_BIT: - case RANDOM_MINUS_PLUS_ZERO: - char c; - RandomBits(c, 0, 0, 1); - value = (c > 0); - break; - case UNIFORM: - value = true; - break; - case INTEGER_SEED: - default: - value = (index > 0); - break; - } - #endif // CUB_INCLUDE_HOST_CODE + case RANDOM: + case RANDOM_BIT: + case RANDOM_MINUS_PLUS_ZERO: + _CubLog("%s\n", + "cub::InitValue cannot generate random numbers on device."); + CUB_NS_QUALIFIER::ThreadTrap(); + break; + case UNIFORM: + value = true; + break; + case INTEGER_SEED: + default: + value = (index > 0); + break; } + )); } diff --git a/test/test_warp_reduce.cu b/test/test_warp_reduce.cu index 18308c1dba..d1569585c7 100644 --- a/test/test_warp_reduce.cu +++ b/test/test_warp_reduce.cu @@ -36,8 +36,10 @@ #include #include -#include #include +#include + +#include #include "test_util.h" @@ -67,16 +69,14 @@ struct WrapperFunctor template inline __host__ __device__ T operator()(const T &a, const T &b) const { - if (CUB_IS_DEVICE_CODE) - { - #if CUB_INCLUDE_DEVICE_CODE != 0 + NV_IF_TARGET(NV_IS_DEVICE, + ( if ((cub::LaneId() % LOGICAL_WARP_THREADS) >= num_valid) { _CubLog("%s\n", "Invalid lane ID in cub::WrapperFunctor::operator()"); cub::ThreadTrap(); } - #endif - } + )); return op(a, b); } From 476c1b8849b5fb3236240a8c59738ef59be69285 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Tue, 11 May 2021 17:05:46 -0400 Subject: [PATCH 5/7] Don't use host-only functions in host-device contexts. This fixes the issue reported in NVIDIA/CUB#299. There's no clear reason why this should use `RandomBits` unconditionally. --- test/test_util.h | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/test/test_util.h b/test/test_util.h index b1f3373aa1..d7b5d28f7c 100644 --- a/test/test_util.h +++ b/test/test_util.h @@ -714,22 +714,18 @@ __host__ __device__ __forceinline__ void InitValue( // This specialization only appears to be used by test_warp_scan. // It initializes with uniform values and random keys, so we need to // protect the call to the host-only RandomBits. - if (CUB_IS_DEVICE_CODE) - { - #if CUB_INCLUDE_DEVICE_CODE - _CubLog("%s\n", - "cub::InitValue cannot generate random numbers on device."); - CUB_NS_QUALIFIER::ThreadTrap(); - #endif // CUB_INCLUDE_DEVICE_CODE - } - else - { - #if CUB_INCLUDE_HOST_CODE - // Assign corresponding flag with a likelihood of the last bit being set with entropy-reduction level 3 - RandomBits(value.key, 3); - value.key = (value.key & 0x1); - #endif // CUB_INCLUDE_HOST_CODE - } + // clang-format off + NV_IF_TARGET(NV_IS_HOST, ( + // Assign corresponding flag with a likelihood of the last bit + // being set with entropy-reduction level 3 + RandomBits(value.key, 3); + value.key = (value.key & 0x1); + ), ( // NV_IS_DEVICE + _CubLog("%s\n", + "cub::InitValue cannot generate random numbers on device."); + CUB_NS_QUALIFIER::ThreadTrap(); + )); + // clang-format on } From f4d61fbb26c0c3b2ffd392bfbe6edbc60c0d5be3 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Wed, 23 Mar 2022 15:06:39 -0400 Subject: [PATCH 6/7] Use Thrust's kernel launch helper in DispatchRadixSort. 
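
Instead of the <<<grid, block, smem, stream>>> launch syntax, the
exclusive-sum and onesweep kernels are now launched through Thrust's
triple_chevron helper, which hands back the launch status directly
rather than requiring a separate cudaPeekAtLastError() call. Roughly
(a sketch only; kernel, args, grid, block, smem and stream stand in for
the actual values used at each call site, which are shown in the diff
below):

    // Before:
    //   kernel<<<grid, block, smem, stream>>>(args...);
    //   error = CubDebug(cudaPeekAtLastError());
    // After:
    error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
      grid, block, smem, stream
    ).doit(kernel, args...);
    if (CubDebug(error))
    {
      break;
    }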
--- cub/device/dispatch/dispatch_radix_sort.cuh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh index 6265232fad..39a6441a0c 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh @@ -1344,9 +1344,14 @@ struct DispatchRadixSort : // exclusive sums to determine starts const int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS; - DeviceRadixSortExclusiveSumKernel - <<>>(d_bins); - if (CubDebug(error = cudaPeekAtLastError())) break; + error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + num_passes, SCAN_BLOCK_THREADS, 0, stream + ).doit(DeviceRadixSortExclusiveSumKernel, + d_bins); + if (CubDebug(error)) + { + break; + } // use the other buffer if no overwrite is allowed KeyT* d_keys_tmp = d_keys.Alternate(); @@ -1374,7 +1379,8 @@ struct DispatchRadixSort : stream))) break; auto onesweep_kernel = DeviceRadixSortOnesweepKernel< MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT, PortionOffsetT>; - errror = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + + error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream ).doit(onesweep_kernel, d_lookback, d_ctrs + portion * num_passes + pass, From 4de961aee49c894e9c380d7c2f7e750016976f00 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Thu, 24 Mar 2022 12:50:41 -0400 Subject: [PATCH 7/7] Skip large allocation tests that exceed device memory. The merge sort test with pow2 >20 fails on GTX 1650. Detect bad_alloc failures and skip those tests. Tests for smaller problem sizes will still fail if there's a bad_alloc. --- test/test_device_merge_sort.cu | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/test/test_device_merge_sort.cu b/test/test_device_merge_sort.cu index 1a639027e1..52154ac177 100644 --- a/test/test_device_merge_sort.cu +++ b/test/test_device_merge_sort.cu @@ -49,6 +49,7 @@ #include #include +#include // for std::bad_alloc #include #include @@ -322,12 +323,29 @@ void Test(thrust::default_random_engine &rng) { for (unsigned int pow2 = 9; pow2 < 22; pow2 += 2) { - const unsigned int num_items = 1 << pow2; - AllocateAndTestIterators(num_items); - - - TestHelper::AllocateAndTest(rng, num_items); - Test(rng, num_items); + try + { + const unsigned int num_items = 1 << pow2; + AllocateAndTestIterators(num_items); + + TestHelper::AllocateAndTest(rng, num_items); + Test(rng, num_items); + } + catch (std::bad_alloc &e) + { + if (pow2 > 20) + { // Some cards don't have enough memory for large allocations, these + // can be skipped. + printf("Skipping large memory test. (num_items=2^%u): %s\n", + pow2, + e.what()); + } + else + { // For smaller problem sizes, treat as an error: + printf("Error (num_items=2^%u): %s", pow2, e.what()); + throw; + } + } } }
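
A note on the design choice in the last patch: it reacts to the failure
after the fact by catching std::bad_alloc, rather than estimating the
test's footprint and checking free device memory up front. For
comparison, a pre-check could look roughly like the sketch below (not
part of this change; bytes_needed is a hypothetical per-test estimate,
and the snippet is meant to sit inside the pow2 loop above):

    std::size_t free_bytes = 0;
    std::size_t total_bytes = 0;
    // cudaMemGetInfo reports the free and total device memory in bytes.
    if (cudaMemGetInfo(&free_bytes, &total_bytes) == cudaSuccess &&
        free_bytes < bytes_needed)
    {
      printf("Skipping large memory test (num_items=2^%u)\n", pow2);
      continue;
    }

Catching the exception keeps the test independent of any estimate of its
own memory use, at the cost of only noticing the problem once the
allocation has already been attempted.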