From f42070e314021a29efeb1550b54930b19bb8932c Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Wed, 23 Mar 2022 15:02:47 -0400 Subject: [PATCH 1/7] Add libcu++ dependency. --- cub/cmake/cub-config.cmake | 67 +++++++++++++++++++++++++++++++++----- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/cub/cmake/cub-config.cmake b/cub/cmake/cub-config.cmake index 3d7f64bd4d..ebe40efcd2 100644 --- a/cub/cmake/cub-config.cmake +++ b/cub/cmake/cub-config.cmake @@ -8,6 +8,9 @@ if (TARGET CUB::CUB) return() endif() +# Minimum supported libcudacxx version: +set(cub_libcudacxx_version 1.8.0) + function(_cub_declare_interface_alias alias_name ugly_name) # 1) Only IMPORTED and ALIAS targets can be placed in a namespace. # 2) When an IMPORTED library is linked to another target, its include @@ -27,16 +30,64 @@ function(_cub_declare_interface_alias alias_name ugly_name) endfunction() # -# Setup targets +# Setup some internal cache variables # -_cub_declare_interface_alias(CUB::CUB _CUB_CUB) # Pull in the include dir detected by cub-config-version.cmake set(_CUB_INCLUDE_DIR "${_CUB_VERSION_INCLUDE_DIR}" CACHE INTERNAL "Location of CUB headers." + FORCE ) unset(_CUB_VERSION_INCLUDE_DIR CACHE) # Clear tmp variable from cache + +if (${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) + set(_CUB_QUIET ON CACHE INTERNAL "Quiet mode enabled for CUB find_package calls." FORCE) + set(_CUB_QUIET_FLAG "QUIET" CACHE INTERNAL "" FORCE) +else() + set(_CUB_QUIET OFF CACHE INTERNAL "Quiet mode enabled for CUB find_package calls." FORCE) + set(_CUB_QUIET_FLAG "" CACHE INTERNAL "" FORCE) +endif() + +# +# Setup dependencies +# + +if (NOT TARGET CUB::libcudacxx) + if (TARGET Thrust::libcudacxx) + # Prefer the same libcudacxx as Thrust, if available: + _cub_declare_interface_alias(CUB::libcudacxx _CUB_libcudacxx) + target_link_libraries(_CUB_libcudacxx INTERFACE Thrust::libcudacxx) + else() + if (NOT TARGET libcudacxx::libcudacxx) + # First do a non-required search for any co-packaged versions. + # These are preferred. + find_package(libcudacxx ${cub_libcudacxx_version} CONFIG + ${_CUB_QUIET_FLAG} + NO_DEFAULT_PATH # Only check the explicit HINTS below: + HINTS + "${_CUB_INCLUDE_DIR}/../libcudacxx" # Source layout + "${_CUB_CMAKE_DIR}/.." # Install layout + ) + + # A second required search allows externally packaged to be used and fails if + # no suitable package exists. 
+ find_package(libcudacxx ${cub_libcudacxx_version} CONFIG + REQUIRED + ${_CUB_QUIET_FLAG} + ) + endif() + _cub_declare_interface_alias(CUB::libcudacxx _CUB_libcudacxx) + target_link_libraries(_CUB_libcudacxx INTERFACE libcudacxx::libcudacxx) + endif() +endif() + +# +# Setup targets +# + +_cub_declare_interface_alias(CUB::CUB _CUB_CUB) target_include_directories(_CUB_CUB INTERFACE "${_CUB_INCLUDE_DIR}") +target_link_libraries(_CUB_CUB INTERFACE CUB::libcudacxx) if (CUB_IGNORE_DEPRECATED_API OR THRUST_IGNORE_DEPRECATED_API) target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_API") @@ -61,12 +112,12 @@ endif() # Standardize version info # -set(CUB_VERSION ${${CMAKE_FIND_PACKAGE_NAME}_VERSION} CACHE INTERNAL "") -set(CUB_VERSION_MAJOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MAJOR} CACHE INTERNAL "") -set(CUB_VERSION_MINOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MINOR} CACHE INTERNAL "") -set(CUB_VERSION_PATCH ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_PATCH} CACHE INTERNAL "") -set(CUB_VERSION_TWEAK ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_TWEAK} CACHE INTERNAL "") -set(CUB_VERSION_COUNT ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_COUNT} CACHE INTERNAL "") +set(CUB_VERSION ${${CMAKE_FIND_PACKAGE_NAME}_VERSION} CACHE INTERNAL "" FORCE) +set(CUB_VERSION_MAJOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MAJOR} CACHE INTERNAL "" FORCE) +set(CUB_VERSION_MINOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MINOR} CACHE INTERNAL "" FORCE) +set(CUB_VERSION_PATCH ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_PATCH} CACHE INTERNAL "" FORCE) +set(CUB_VERSION_TWEAK ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_TWEAK} CACHE INTERNAL "" FORCE) +set(CUB_VERSION_COUNT ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_COUNT} CACHE INTERNAL "" FORCE) include(FindPackageHandleStandardArgs) if (NOT CUB_CONFIG) From f9beaa51dbf1f708880e695073c6eb009a673a74 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Fri, 26 Mar 2021 14:46:35 -0400 Subject: [PATCH 2/7] Remove checks for obsolete architectures. 
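
The removed branches only differed on architectures that CUB no longer targets, so each guarded #if CUB_PTX_ARCH block collapses to its modern path (for example, LOAD_LDG now always maps to the read-only ld.global.nc load instead of falling back to a plain global load). A minimal usage sketch of the cache-modified loads touched here (the kernel and variable names below are illustrative only, not part of the change):

    #include <cub/thread/thread_load.cuh>

    __global__ void copy_ldg(const int *__restrict__ d_in, int *d_out, int n)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)
      {
        // With the architecture guards gone, this is unconditionally the
        // read-only (ld.global.nc) path on all supported architectures.
        d_out[i] = cub::ThreadLoad<cub::LOAD_LDG>(d_in + i);
      }
    }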
--- cub/agent/agent_histogram.cuh | 5 ---- cub/agent/agent_segment_fixup.cuh | 3 +-- cub/block/block_histogram.cuh | 13 +--------- cub/block/block_radix_rank.cuh | 2 +- cub/block/block_radix_sort.cuh | 2 +- .../specializations/block_histogram_sort.cuh | 4 +-- cub/thread/thread_load.cuh | 23 ++++------------ cub/thread/thread_store.cuh | 16 +++--------- cub/util_arch.cuh | 2 +- cub/util_type.cuh | 15 ----------- cub/warp/specializations/warp_reduce_smem.cuh | 2 +- cub/warp/warp_reduce.cuh | 2 +- .../block/example_block_reduce_dyn_smem.cu | 26 +++---------------- test/test_iterator.cu | 12 +++------ 14 files changed, 25 insertions(+), 102 deletions(-) diff --git a/cub/agent/agent_histogram.cuh b/cub/agent/agent_histogram.cuh index c94e7354d7..0f0eb1e622 100644 --- a/cub/agent/agent_histogram.cuh +++ b/cub/agent/agent_histogram.cuh @@ -562,15 +562,10 @@ struct AgentHistogram is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples); // Accumulate samples -#if CUB_PTX_ARCH >= 120 if (prefer_smem) AccumulateSmemPixels(samples, is_valid); else AccumulateGmemPixels(samples, is_valid); -#else - AccumulateGmemPixels(samples, is_valid); -#endif - } diff --git a/cub/agent/agent_segment_fixup.cuh b/cub/agent/agent_segment_fixup.cuh index 78361ef354..bd2837ce06 100644 --- a/cub/agent/agent_segment_fixup.cuh +++ b/cub/agent/agent_segment_fixup.cuh @@ -111,8 +111,7 @@ struct AgentSegmentFixup TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, // Whether or not do fixup using RLE + global atomics - USE_ATOMIC_FIXUP = (CUB_PTX_ARCH >= 350) && - (std::is_same::value || + USE_ATOMIC_FIXUP = (std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value), diff --git a/cub/block/block_histogram.cuh b/cub/block/block_histogram.cuh index ef15332eaf..7f0e982c25 100644 --- a/cub/block/block_histogram.cuh +++ b/cub/block/block_histogram.cuh @@ -176,20 +176,9 @@ private: BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; - /** - * Ensure the template parameterization meets the requirements of the - * targeted device architecture. BLOCK_HISTO_ATOMIC can only be used - * on version SM120 or later. Otherwise BLOCK_HISTO_SORT is used - * regardless. - */ - static const BlockHistogramAlgorithm SAFE_ALGORITHM = - ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (PTX_ARCH < 120)) ? - BLOCK_HISTO_SORT : - ALGORITHM; - /// Internal specialization. using InternalBlockHistogram = - cub::detail::conditional_t= 350) ? true : false, + bool MEMOIZE_OUTER_SCAN = true, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, diff --git a/cub/block/block_radix_sort.cuh b/cub/block/block_radix_sort.cuh index 73a75636a9..4663f3b6f1 100644 --- a/cub/block/block_radix_sort.cuh +++ b/cub/block/block_radix_sort.cuh @@ -165,7 +165,7 @@ template < int ITEMS_PER_THREAD, typename ValueT = NullType, int RADIX_BITS = 4, - bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? 
true : false, + bool MEMOIZE_OUTER_SCAN = true, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, diff --git a/cub/block/specializations/block_histogram_sort.cuh b/cub/block/specializations/block_histogram_sort.cuh index 5bd2a80b33..e60325d48d 100644 --- a/cub/block/specializations/block_histogram_sort.cuh +++ b/cub/block/specializations/block_histogram_sort.cuh @@ -52,7 +52,7 @@ template < int BINS, ///< The number of bins into which histogram samples may fall int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective (unused) struct BlockHistogramSort { /// Constants @@ -69,7 +69,7 @@ struct BlockHistogramSort ITEMS_PER_THREAD, NullType, 4, - (PTX_ARCH >= 350) ? true : false, + true, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, BLOCK_DIM_Y, diff --git a/cub/thread/thread_load.cuh b/cub/thread/thread_load.cuh index 97ee37d81c..73d251c331 100644 --- a/cub/thread/thread_load.cuh +++ b/cub/thread/thread_load.cuh @@ -268,24 +268,11 @@ struct IterateThreadLoad /** * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers */ -#if CUB_PTX_ARCH >= 200 - _CUB_LOAD_ALL(LOAD_CA, ca) - _CUB_LOAD_ALL(LOAD_CG, cg) - _CUB_LOAD_ALL(LOAD_CS, cs) - _CUB_LOAD_ALL(LOAD_CV, cv) -#else - _CUB_LOAD_ALL(LOAD_CA, global) - // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1 - _CUB_LOAD_ALL(LOAD_CG, volatile.global) - _CUB_LOAD_ALL(LOAD_CS, global) - _CUB_LOAD_ALL(LOAD_CV, volatile.global) -#endif - -#if CUB_PTX_ARCH >= 350 - _CUB_LOAD_ALL(LOAD_LDG, global.nc) -#else - _CUB_LOAD_ALL(LOAD_LDG, global) -#endif +_CUB_LOAD_ALL(LOAD_CA, ca) +_CUB_LOAD_ALL(LOAD_CG, cg) +_CUB_LOAD_ALL(LOAD_CS, cs) +_CUB_LOAD_ALL(LOAD_CV, cv) +_CUB_LOAD_ALL(LOAD_LDG, global.nc) // Macro cleanup diff --git a/cub/thread/thread_store.cuh b/cub/thread/thread_store.cuh index 00944ec927..3af2613d4b 100644 --- a/cub/thread/thread_store.cuh +++ b/cub/thread/thread_store.cuh @@ -257,18 +257,10 @@ struct IterateThreadStore /** * Define ThreadStore specializations for the various Cache load modifiers */ -#if CUB_PTX_ARCH >= 200 - _CUB_STORE_ALL(STORE_WB, wb) - _CUB_STORE_ALL(STORE_CG, cg) - _CUB_STORE_ALL(STORE_CS, cs) - _CUB_STORE_ALL(STORE_WT, wt) -#else - _CUB_STORE_ALL(STORE_WB, global) - _CUB_STORE_ALL(STORE_CG, global) - _CUB_STORE_ALL(STORE_CS, global) - _CUB_STORE_ALL(STORE_WT, volatile.global) -#endif - +_CUB_STORE_ALL(STORE_WB, wb) +_CUB_STORE_ALL(STORE_CG, cg) +_CUB_STORE_ALL(STORE_CS, cs) +_CUB_STORE_ALL(STORE_WT, wt) // Macro cleanup #undef _CUB_STORE_ALL diff --git a/cub/util_arch.cuh b/cub/util_arch.cuh index a4474afe16..f1bd8b6b99 100644 --- a/cub/util_arch.cuh +++ b/cub/util_arch.cuh @@ -92,7 +92,7 @@ CUB_NAMESPACE_BEGIN /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. 
#ifndef CUB_RUNTIME_FUNCTION - #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) + #if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__) #define CUB_RUNTIME_ENABLED #define CUB_RUNTIME_FUNCTION __host__ __device__ #else diff --git a/cub/util_type.cuh b/cub/util_type.cuh index d859bb16cb..d15444b488 100644 --- a/cub/util_type.cuh +++ b/cub/util_type.cuh @@ -445,13 +445,8 @@ template <> struct UnitWord { typedef int ShuffleWord; -#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) - typedef float VolatileWord; - typedef uint2 DeviceWord; -#else typedef unsigned long long VolatileWord; typedef unsigned long long DeviceWord; -#endif typedef float2 TextureWord; }; @@ -460,13 +455,8 @@ template <> struct UnitWord { typedef int ShuffleWord; -#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) - typedef float VolatileWord; - typedef uint4 DeviceWord; -#else typedef unsigned long long VolatileWord; typedef ulonglong2 DeviceWord; -#endif typedef float4 TextureWord; }; @@ -476,13 +466,8 @@ template <> struct UnitWord { typedef unsigned short ShuffleWord; -#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) - typedef unsigned short VolatileWord; - typedef short DeviceWord; -#else typedef unsigned short VolatileWord; typedef unsigned short DeviceWord; -#endif typedef unsigned short TextureWord; }; diff --git a/cub/warp/specializations/warp_reduce_smem.cuh b/cub/warp/specializations/warp_reduce_smem.cuh index 80c7c62cc5..2b419cc217 100644 --- a/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/warp/specializations/warp_reduce_smem.cuh @@ -351,7 +351,7 @@ struct WarpReduceSmem FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail ReductionOp reduction_op) ///< [in] Reduction operator { - return SegmentedReduce(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>()); + return SegmentedReduce(input, flag, reduction_op, Int2Type()); } diff --git a/cub/warp/warp_reduce.cuh b/cub/warp/warp_reduce.cuh index 24a2cd4f9d..d905703d75 100644 --- a/cub/warp/warp_reduce.cuh +++ b/cub/warp/warp_reduce.cuh @@ -152,7 +152,7 @@ private: public: - #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /// Internal specialization. /// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two diff --git a/examples/block/example_block_reduce_dyn_smem.cu b/examples/block/example_block_reduce_dyn_smem.cu index c7f571e31e..2c2ce24c32 100644 --- a/examples/block/example_block_reduce_dyn_smem.cu +++ b/examples/block/example_block_reduce_dyn_smem.cu @@ -61,26 +61,6 @@ bool g_verbose = false; /// Default grid size int g_grid_size = 1; -// The following templated variables are helpers to get the shared memory -// This requires C++ 14 (template variables and constexpr max) -// but it's possible to work around these constructs for older C++ versions. 
-// we use default arguments from cub's block reduce for most parameters -template -constexpr std::size_t arch_bytes_size = sizeof( - typename cub::BlockReduce< - T, - BLOCK_THREADS, - BLOCK_REDUCE_WARP_REDUCTIONS /* ALGORITHM */, - 1 /* BLOCK_DIM_Y */, - 1 /* BLOCK_DIM_Z */, - ARCH>::TempStorage -); -template -constexpr auto archs_max_bytes = (std::max)( - {arch_bytes_size...,}); - - - //--------------------------------------------------------------------- // Kernels //--------------------------------------------------------------------- @@ -181,9 +161,9 @@ void Test() // Copy problem to device cudaMemcpy(d_in, h_in, sizeof(int) * BLOCK_THREADS, cudaMemcpyHostToDevice); - // determine necessary storage for a few architectures - auto block_reduce_temp_bytes = archs_max_bytes< - int, BLOCK_THREADS, 600, 700, 800>; + // determine necessary storage size: + auto block_reduce_temp_bytes = + sizeof(typename cub::BlockReduce::TempStorage); // finally, we need to make sure that we can hold at least one integer // needed in the kernel to exchange data after reduction auto smem_size = (std::max)(1 * sizeof(int), block_reduce_temp_bytes); diff --git a/test/test_iterator.cu b/test/test_iterator.cu index 1b5a6c6d03..f042816a37 100644 --- a/test/test_iterator.cu +++ b/test/test_iterator.cu @@ -506,8 +506,7 @@ int main(int argc, char** argv) Test(); Test(); Test(); - if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted - Test(); + Test(); Test(); Test(); @@ -515,8 +514,7 @@ int main(int argc, char** argv) Test(); Test(); Test(); - if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted - Test(); + Test(); Test(); Test(); @@ -524,8 +522,7 @@ int main(int argc, char** argv) Test(); Test(); Test(); - if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted - Test(); + Test(); Test(); Test(); @@ -533,8 +530,7 @@ int main(int argc, char** argv) Test(); Test(); Test(); - if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted - Test(); + Test(); Test(); Test(); From 5799e96b75fa3173397018ff804ea92398b8a81f Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Fri, 26 Mar 2021 16:04:08 -0400 Subject: [PATCH 3/7] Remove PTX arguments from compile-time contexts. nvc++ will stop defining __NVCOMPILER_CUDA_ARCH__ soon, removing the ability to determine the PTX arch at compile time. This updates agents and collective algorithms to no longer require the PTX_ARCH template parameter, and changes the CUB_WARP_SIZE(PTX_ARCH), etc helpers to ignore their argument. These macros only differed on obsolete arches and have no effect on currently supported architectures. 
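
Code that previously passed an explicit PTX arch still compiles, since the parameter survives as LEGACY_PTX_ARCH (defaulted to 0), but its value is now ignored. A minimal sketch of how a collective is instantiated after this change (kernel and variable names are illustrative; assumes a 128-thread block launch):

    #include <cub/block/block_reduce.cuh>

    __global__ void block_sum(const int *d_in, int *d_out)
    {
      // No PTX arch template argument is needed; the trailing
      // LEGACY_PTX_ARCH parameter defaults to 0 and is unused.
      using BlockReduceT = cub::BlockReduce<int, 128>;
      __shared__ typename BlockReduceT::TempStorage temp_storage;

      int thread_val = d_in[blockIdx.x * blockDim.x + threadIdx.x];
      int block_total = BlockReduceT(temp_storage).Sum(thread_val);

      if (threadIdx.x == 0)
      {
        d_out[blockIdx.x] = block_total;
      }
    }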
--- cub/agent/agent_histogram.cuh | 2 +- cub/agent/agent_rle.cuh | 2 +- cub/agent/agent_spmv_orig.cuh | 2 +- cub/agent/single_pass_scan_operators.cuh | 4 +- cub/block/block_adjacent_difference.cuh | 7 ++- cub/block/block_discontinuity.cuh | 4 +- cub/block/block_exchange.cuh | 9 ++-- cub/block/block_histogram.cuh | 7 ++- cub/block/block_load.cuh | 14 +++--- cub/block/block_radix_rank.cuh | 16 +++---- cub/block/block_radix_sort.cuh | 14 +++--- cub/block/block_raking_layout.cuh | 10 ++-- cub/block/block_reduce.cuh | 10 ++-- cub/block/block_scan.cuh | 10 ++-- cub/block/block_shuffle.cuh | 6 +-- cub/block/block_store.cuh | 14 +++--- .../specializations/block_histogram_sort.cuh | 6 +-- .../specializations/block_reduce_raking.cuh | 6 +-- .../block_reduce_raking_commutative_only.cuh | 10 ++-- .../block_reduce_warp_reductions.cuh | 6 +-- .../specializations/block_scan_raking.cuh | 6 +-- .../specializations/block_scan_warp_scans.cuh | 8 ++-- .../block_scan_warp_scans2.cuh | 8 ++-- .../block_scan_warp_scans3.cuh | 8 ++-- cub/device/dispatch/dispatch_radix_sort.cuh | 4 +- cub/device/dispatch/dispatch_reduce.cuh | 2 +- cub/util_arch.cuh | 46 ++++++------------- cub/util_ptx.cuh | 10 ++-- cub/warp/specializations/warp_reduce_shfl.cuh | 10 ++-- cub/warp/specializations/warp_reduce_smem.cuh | 7 ++- cub/warp/specializations/warp_scan_shfl.cuh | 8 ++-- cub/warp/specializations/warp_scan_smem.cuh | 7 ++- cub/warp/warp_exchange.cuh | 11 ++--- cub/warp/warp_load.cuh | 11 ++--- cub/warp/warp_merge_sort.cuh | 14 +++--- cub/warp/warp_reduce.cuh | 10 ++-- cub/warp/warp_scan.cuh | 10 ++-- cub/warp/warp_store.cuh | 11 ++--- test/test_warp_mask.cu | 3 +- 39 files changed, 160 insertions(+), 193 deletions(-) diff --git a/cub/agent/agent_histogram.cuh b/cub/agent/agent_histogram.cuh index 0f0eb1e622..8dc2be96b3 100644 --- a/cub/agent/agent_histogram.cuh +++ b/cub/agent/agent_histogram.cuh @@ -103,7 +103,7 @@ template < typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel typename OffsetT, ///< Signed integer type for global offsets - int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability + int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused) struct AgentHistogram { //--------------------------------------------------------------------- diff --git a/cub/agent/agent_rle.cuh b/cub/agent/agent_rle.cuh index 37e5d37333..3c9109a13c 100644 --- a/cub/agent/agent_rle.cuh +++ b/cub/agent/agent_rle.cuh @@ -117,7 +117,7 @@ struct AgentRle // Constants enum { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + WARP_THREADS = CUB_WARP_THREADS(0), BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD, WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD, diff --git a/cub/agent/agent_spmv_orig.cuh b/cub/agent/agent_spmv_orig.cuh index 3e1cf3d0cf..c89bc4a3cd 100644 --- a/cub/agent/agent_spmv_orig.cuh +++ b/cub/agent/agent_spmv_orig.cuh @@ -115,7 +115,7 @@ template < typename OffsetT, ///< Signed integer type for sequence offsets bool HAS_ALPHA, ///< Whether the input parameter \p alpha is 1 bool HAS_BETA, ///< Whether the input parameter \p beta is 0 - int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability + int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused) struct AgentSpmv { //--------------------------------------------------------------------- 
diff --git a/cub/agent/single_pass_scan_operators.cuh b/cub/agent/single_pass_scan_operators.cuh index 08d99e8f99..4f21863a7f 100644 --- a/cub/agent/single_pass_scan_operators.cuh +++ b/cub/agent/single_pass_scan_operators.cuh @@ -666,11 +666,11 @@ template < typename T, typename ScanOpT, typename ScanTileStateT, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> struct TilePrefixCallbackOp { // Parameterized warp reduce - typedef WarpReduce WarpReduceT; + typedef WarpReduce WarpReduceT; // Temporary storage type struct _TempStorage diff --git a/cub/block/block_adjacent_difference.cuh b/cub/block/block_adjacent_difference.cuh index 14dab12881..6740149e19 100644 --- a/cub/block/block_adjacent_difference.cuh +++ b/cub/block/block_adjacent_difference.cuh @@ -41,7 +41,6 @@ CUB_NAMESPACE_BEGIN - /** * @brief BlockAdjacentDifference provides * [collective](index.html#sec0) methods for computing the @@ -125,9 +124,9 @@ CUB_NAMESPACE_BEGIN */ template + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int LEGACY_PTX_ARCH = 0> class BlockAdjacentDifference { private: diff --git a/cub/block/block_discontinuity.cuh b/cub/block/block_discontinuity.cuh index 08f89faae0..a3bf17f319 100644 --- a/cub/block/block_discontinuity.cuh +++ b/cub/block/block_discontinuity.cuh @@ -47,7 +47,7 @@ CUB_NAMESPACE_BEGIN * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items @@ -107,7 +107,7 @@ template < int BLOCK_DIM_X, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockDiscontinuity { private: diff --git a/cub/block/block_exchange.cuh b/cub/block/block_exchange.cuh index 8f254c2c3f..98ebfe7e82 100644 --- a/cub/block/block_exchange.cuh +++ b/cub/block/block_exchange.cuh @@ -50,7 +50,7 @@ CUB_NAMESPACE_BEGIN * \tparam WARP_TIME_SLICING [optional] When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. 
* * \par Overview * - It is commonplace for blocks of threads to rearrange data items between @@ -114,7 +114,7 @@ template < bool WARP_TIME_SLICING = false, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockExchange { private: @@ -129,11 +129,11 @@ private: /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), WARP_THREADS = 1 << LOG_WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH), + LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0), SMEM_BANKS = 1 << LOG_SMEM_BANKS, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, @@ -1126,4 +1126,3 @@ public: CUB_NAMESPACE_END - diff --git a/cub/block/block_histogram.cuh b/cub/block/block_histogram.cuh index 7f0e982c25..b9ab759607 100644 --- a/cub/block/block_histogram.cuh +++ b/cub/block/block_histogram.cuh @@ -94,7 +94,7 @@ enum BlockHistogramAlgorithm * \tparam ALGORITHM [optional] cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - A histogram @@ -160,7 +160,7 @@ template < BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockHistogram { private: @@ -184,8 +184,7 @@ private: ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH>, + BLOCK_DIM_Z>, BlockHistogramAtomic>; /// Shared memory storage layout type for BlockHistogram diff --git a/cub/block/block_load.cuh b/cub/block/block_load.cuh index 3e78ce958b..b419ab6e72 100644 --- a/cub/block/block_load.cuh +++ b/cub/block/block_load.cuh @@ -568,7 +568,7 @@ enum BlockLoadAlgorithm * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. 
* * \par Overview * - The BlockLoad class provides a single data movement abstraction that can be specialized @@ -638,7 +638,7 @@ template < BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockLoad { private: @@ -860,7 +860,7 @@ private: struct LoadInternal { // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; + typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage @@ -928,14 +928,14 @@ private: { enum { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + WARP_THREADS = CUB_WARP_THREADS(0) }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; + typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage @@ -1003,14 +1003,14 @@ private: { enum { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + WARP_THREADS = CUB_WARP_THREADS(0) }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; + typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage diff --git a/cub/block/block_radix_rank.cuh b/cub/block/block_radix_rank.cuh index ad3588afb9..b6675d02f9 100644 --- a/cub/block/block_radix_rank.cuh +++ b/cub/block/block_radix_rank.cuh @@ -104,7 +104,7 @@ struct BlockRadixRankEmptyCallback * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * Blah... 
@@ -143,7 +143,7 @@ template < cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockRadixRank { private: @@ -168,7 +168,7 @@ private: RADIX_DIGITS = 1 << RADIX_BITS, - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), WARP_THREADS = 1 << LOG_WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, @@ -203,8 +203,7 @@ private: BLOCK_DIM_X, INNER_SCAN_ALGORITHM, BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> + BLOCK_DIM_Z> BlockScan; @@ -508,7 +507,7 @@ template < BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockRadixRankMatch { private: @@ -527,7 +526,7 @@ private: RADIX_DIGITS = 1 << RADIX_BITS, - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), WARP_THREADS = 1 << LOG_WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, @@ -558,8 +557,7 @@ private: BLOCK_THREADS, INNER_SCAN_ALGORITHM, BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> + BLOCK_DIM_Z> BlockScanT; diff --git a/cub/block/block_radix_sort.cuh b/cub/block/block_radix_sort.cuh index 4663f3b6f1..7dbf3e6a13 100644 --- a/cub/block/block_radix_sort.cuh +++ b/cub/block/block_radix_sort.cuh @@ -57,7 +57,7 @@ CUB_NAMESPACE_BEGIN * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges @@ -170,7 +170,7 @@ template < cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockRadixSort { private: @@ -201,8 +201,7 @@ private: INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> + BLOCK_DIM_Z> AscendingBlockRadixRank; /// Descending BlockRadixRank utility type @@ -214,18 +213,17 @@ private: INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> + BLOCK_DIM_Z> DescendingBlockRadixRank; /// Digit extractor type typedef BFEDigitExtractor DigitExtractorT; /// BlockExchange utility type for keys - typedef BlockExchange BlockExchangeKeys; + typedef BlockExchange BlockExchangeKeys; /// BlockExchange utility type for values - typedef BlockExchange BlockExchangeValues; + typedef BlockExchange BlockExchangeValues; /// Shared memory storage layout type union _TempStorage diff --git a/cub/block/block_raking_layout.cuh b/cub/block/block_raking_layout.cuh index 8bd1be13ad..4d49f54f2c 100644 --- a/cub/block/block_raking_layout.cuh +++ b/cub/block/block_raking_layout.cuh @@ -52,12 +52,12 @@ CUB_NAMESPACE_BEGIN * * \tparam T The data type to be exchanged. * \tparam BLOCK_THREADS The thread block size in threads. - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. 
*/ template < typename T, int BLOCK_THREADS, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> struct BlockRakingLayout { //--------------------------------------------------------------------- @@ -70,7 +70,7 @@ struct BlockRakingLayout SHARED_ELEMENTS = BLOCK_THREADS, /// Maximum number of warp-synchronous raking threads - MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)), + MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(0)), /// Number of raking elements per warp-synchronous raking thread (rounded up) SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, @@ -79,11 +79,11 @@ struct BlockRakingLayout RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) - HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0), + HAS_CONFLICTS = (CUB_SMEM_BANKS(0) % SEGMENT_LENGTH == 0), /// Degree of bank conflicts (e.g., 4-way) CONFLICT_DEGREE = (HAS_CONFLICTS) ? - (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) : + (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(0) : 1, /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load diff --git a/cub/block/block_reduce.cuh b/cub/block/block_reduce.cuh index eabcc285e5..dcaf2ee0db 100644 --- a/cub/block/block_reduce.cuh +++ b/cub/block/block_reduce.cuh @@ -158,7 +158,7 @@ enum BlockReduceAlgorithm * \tparam ALGORITHM [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - A reduction (or fold) @@ -218,7 +218,7 @@ template < BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockReduce { private: @@ -234,9 +234,9 @@ private: BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; - typedef BlockReduceWarpReductions WarpReductions; - typedef BlockReduceRakingCommutativeOnly RakingCommutativeOnly; - typedef BlockReduceRaking Raking; + typedef BlockReduceWarpReductions WarpReductions; + typedef BlockReduceRakingCommutativeOnly RakingCommutativeOnly; + typedef BlockReduceRaking Raking; /// Internal specialization type using InternalBlockReduce = cub::detail::conditional_t< diff --git a/cub/block/block_scan.cuh b/cub/block/block_scan.cuh index 0e2561ec15..d1b6a66e98 100644 --- a/cub/block/block_scan.cuh +++ b/cub/block/block_scan.cuh @@ -117,7 +117,7 @@ enum BlockScanAlgorithm * \tparam ALGORITHM [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. 
* * \par Overview * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) @@ -191,7 +191,7 @@ template < BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockScan { private: @@ -214,12 +214,12 @@ private: * architectural warp size. */ static const BlockScanAlgorithm SAFE_ALGORITHM = - ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ? + ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(0) != 0)) ? BLOCK_SCAN_RAKING : ALGORITHM; - typedef BlockScanWarpScans WarpScans; - typedef BlockScanRaking Raking; + typedef BlockScanWarpScans WarpScans; + typedef BlockScanRaking Raking; /// Define the delegate type for the desired algorithm using InternalBlockScan = diff --git a/cub/block/block_shuffle.cuh b/cub/block/block_shuffle.cuh index bae7206847..e4ebc7ff1e 100644 --- a/cub/block/block_shuffle.cuh +++ b/cub/block/block_shuffle.cuh @@ -47,7 +47,7 @@ CUB_NAMESPACE_BEGIN * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * It is commonplace for blocks of threads to rearrange data items between @@ -60,7 +60,7 @@ template < int BLOCK_DIM_X, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockShuffle { private: @@ -73,7 +73,7 @@ private: { BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), WARP_THREADS = 1 << LOG_WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, }; diff --git a/cub/block/block_store.cuh b/cub/block/block_store.cuh index d363f3479b..2cb6bee433 100644 --- a/cub/block/block_store.cuh +++ b/cub/block/block_store.cuh @@ -454,7 +454,7 @@ enum BlockStoreAlgorithm * \tparam ALGORITHM [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT. * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. 
* * \par Overview * - The BlockStore class provides a single data movement abstraction that can be specialized @@ -528,7 +528,7 @@ template < BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class BlockStore { private: @@ -691,7 +691,7 @@ private: struct StoreInternal { // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; + typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage @@ -752,14 +752,14 @@ private: { enum { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + WARP_THREADS = CUB_WARP_THREADS(0) }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; + typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage @@ -820,14 +820,14 @@ private: { enum { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + WARP_THREADS = CUB_WARP_THREADS(0) }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; + typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage diff --git a/cub/block/specializations/block_histogram_sort.cuh b/cub/block/specializations/block_histogram_sort.cuh index e60325d48d..79659ae106 100644 --- a/cub/block/specializations/block_histogram_sort.cuh +++ b/cub/block/specializations/block_histogram_sort.cuh @@ -73,8 +73,7 @@ struct BlockHistogramSort BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> + BLOCK_DIM_Z> BlockRadixSortT; // Parameterize BlockDiscontinuity type for our thread block @@ -82,8 +81,7 @@ struct BlockHistogramSort T, BLOCK_DIM_X, BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> + BLOCK_DIM_Z> BlockDiscontinuityT; /// Shared memory diff --git a/cub/block/specializations/block_reduce_raking.cuh b/cub/block/specializations/block_reduce_raking.cuh index d828484915..0a9009d2e2 100644 --- a/cub/block/specializations/block_reduce_raking.cuh +++ b/cub/block/specializations/block_reduce_raking.cuh @@ -60,7 +60,7 @@ template < int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockReduceRaking { /// Constants @@ -71,10 +71,10 @@ struct BlockReduceRaking }; /// Layout type for padded thread block raking grid - typedef BlockRakingLayout BlockRakingLayout; + typedef BlockRakingLayout BlockRakingLayout; /// WarpReduce utility type - typedef typename WarpReduce::InternalWarpReduce WarpReduce; + typedef typename WarpReduce::InternalWarpReduce WarpReduce; /// Constants enum diff --git a/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/cub/block/specializations/block_reduce_raking_commutative_only.cuh index 4dd8a13050..529a170923 100644 
--- a/cub/block/specializations/block_reduce_raking_commutative_only.cuh +++ b/cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -50,7 +50,7 @@ template < int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockReduceRakingCommutativeOnly { /// Constants @@ -61,13 +61,13 @@ struct BlockReduceRakingCommutativeOnly }; // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values - typedef BlockReduceRaking FallBack; + typedef BlockReduceRaking FallBack; /// Constants enum { /// Number of warp threads - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + WARP_THREADS = CUB_WARP_THREADS(0), /// Whether or not to use fall-back USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)), @@ -83,10 +83,10 @@ struct BlockReduceRakingCommutativeOnly }; /// WarpReduce utility type - typedef WarpReduce WarpReduce; + typedef WarpReduce WarpReduce; /// Layout type for padded thread block raking grid - typedef BlockRakingLayout BlockRakingLayout; + typedef BlockRakingLayout BlockRakingLayout; /// Shared memory storage layout type union _TempStorage diff --git a/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/block/specializations/block_reduce_warp_reductions.cuh index 6b440e5a9c..c341c8ba55 100644 --- a/cub/block/specializations/block_reduce_warp_reductions.cuh +++ b/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -48,7 +48,7 @@ template < int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockReduceWarpReductions { /// Constants @@ -58,7 +58,7 @@ struct BlockReduceWarpReductions BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, /// Number of warp threads - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + WARP_THREADS = CUB_WARP_THREADS(0), /// Number of active warps WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, @@ -72,7 +72,7 @@ struct BlockReduceWarpReductions /// WarpReduce utility type - typedef typename WarpReduce::InternalWarpReduce WarpReduce; + typedef typename WarpReduce::InternalWarpReduce WarpReduce; /// Shared memory storage layout type diff --git a/cub/block/specializations/block_scan_raking.cuh b/cub/block/specializations/block_scan_raking.cuh index 863939e1fe..6327e48110 100644 --- a/cub/block/specializations/block_scan_raking.cuh +++ b/cub/block/specializations/block_scan_raking.cuh @@ -53,7 +53,7 @@ template < int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension bool MEMOIZE, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure - int PTX_ARCH> ///< The PTX compute capability for which to to 
specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockScanRaking { //--------------------------------------------------------------------- @@ -68,7 +68,7 @@ struct BlockScanRaking }; /// Layout type for padded thread block raking grid - typedef BlockRakingLayout BlockRakingLayout; + typedef BlockRakingLayout BlockRakingLayout; /// Constants enum @@ -84,7 +84,7 @@ struct BlockScanRaking }; /// WarpScan utility type - typedef WarpScan WarpScan; + typedef WarpScan WarpScan; /// Shared memory storage layout type struct _TempStorage diff --git a/cub/block/specializations/block_scan_warp_scans.cuh b/cub/block/specializations/block_scan_warp_scans.cuh index 8273e43bf4..ed550162e2 100644 --- a/cub/block/specializations/block_scan_warp_scans.cuh +++ b/cub/block/specializations/block_scan_warp_scans.cuh @@ -47,7 +47,7 @@ template < int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockScanWarpScans { //--------------------------------------------------------------------- @@ -58,7 +58,7 @@ struct BlockScanWarpScans enum { /// Number of warp threads - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + WARP_THREADS = CUB_WARP_THREADS(0), /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, @@ -68,10 +68,10 @@ struct BlockScanWarpScans }; /// WarpScan utility type - typedef WarpScan WarpScanT; + typedef WarpScan WarpScanT; /// WarpScan utility type - typedef WarpScan WarpAggregateScan; + typedef WarpScan WarpAggregateScan; /// Shared memory storage layout type diff --git a/cub/block/specializations/block_scan_warp_scans2.cuh b/cub/block/specializations/block_scan_warp_scans2.cuh index a485356bc3..b634569d20 100644 --- a/cub/block/specializations/block_scan_warp_scans2.cuh +++ b/cub/block/specializations/block_scan_warp_scans2.cuh @@ -47,7 +47,7 @@ template < int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockScanWarpScans { //--------------------------------------------------------------------- @@ -58,7 +58,7 @@ struct BlockScanWarpScans enum { /// Number of warp threads - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + WARP_THREADS = CUB_WARP_THREADS(0), /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, @@ -68,10 +68,10 @@ struct BlockScanWarpScans }; /// WarpScan utility type - typedef WarpScan WarpScanT; + typedef WarpScan WarpScanT; /// WarpScan utility type - typedef WarpScan WarpAggregateScanT; + typedef WarpScan WarpAggregateScanT; /// Shared memory storage layout type struct _TempStorage diff --git a/cub/block/specializations/block_scan_warp_scans3.cuh b/cub/block/specializations/block_scan_warp_scans3.cuh index dad06fd298..f7eed0be8f 100644 --- 
a/cub/block/specializations/block_scan_warp_scans3.cuh +++ b/cub/block/specializations/block_scan_warp_scans3.cuh @@ -47,7 +47,7 @@ template < int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct BlockScanWarpScans { //--------------------------------------------------------------------- @@ -61,7 +61,7 @@ struct BlockScanWarpScans BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, /// Number of warp threads - INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + INNER_WARP_THREADS = CUB_WARP_THREADS(0), OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS, /// Number of outer scan warps @@ -69,10 +69,10 @@ struct BlockScanWarpScans }; /// Outer WarpScan utility type - typedef WarpScan OuterWarpScanT; + typedef WarpScan OuterWarpScanT; /// Inner WarpScan utility type - typedef WarpScan InnerWarpScanT; + typedef WarpScan InnerWarpScanT; typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS]; diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh index 13c43948ef..bc4fa3dcd9 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh @@ -1227,7 +1227,7 @@ struct DispatchRadixSort : UpsweepKernelT upsweep_kernel, ScanKernelT scan_kernel, DownsweepKernelT downsweep_kernel, - int ptx_version, + int /*ptx_version*/, int sm_count, OffsetT num_items) { @@ -1244,7 +1244,7 @@ struct DispatchRadixSort : if (CubDebug(error = scan_config.Init(scan_kernel))) break; if (CubDebug(error = downsweep_config.Init(downsweep_kernel))) break; - max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(ptx_version); + max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(0); even_share.DispatchInit( num_items, diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh index e0470ccb1e..0ac3b7b3c0 100644 --- a/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/device/dispatch/dispatch_reduce.cuh @@ -472,7 +472,7 @@ struct DispatchReduce : int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count; // Even-share work distribution - int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version); + int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(0); GridEvenShare even_share; even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size); diff --git a/cub/util_arch.cuh b/cub/util_arch.cuh index f1bd8b6b99..00a21516b6 100644 --- a/cub/util_arch.cuh +++ b/cub/util_arch.cuh @@ -83,12 +83,10 @@ CUB_NAMESPACE_BEGIN /// Maximum number of devices supported. #ifndef CUB_MAX_DEVICES - #define CUB_MAX_DEVICES 128 + #define CUB_MAX_DEVICES (128) #endif -#if CUB_CPP_DIALECT >= 2011 - static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0."); -#endif +static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0."); /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. 
#ifndef CUB_RUNTIME_FUNCTION @@ -103,49 +101,35 @@ CUB_NAMESPACE_BEGIN /// Number of threads per warp #ifndef CUB_LOG_WARP_THREADS - #define CUB_LOG_WARP_THREADS(arch) \ - (5) - #define CUB_WARP_THREADS(arch) \ - (1 << CUB_LOG_WARP_THREADS(arch)) + #define CUB_LOG_WARP_THREADS(unused) (5) + #define CUB_WARP_THREADS(unused) (1 << CUB_LOG_WARP_THREADS(0)) - #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) - #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) + #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(0) + #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(0) #endif /// Number of smem banks #ifndef CUB_LOG_SMEM_BANKS - #define CUB_LOG_SMEM_BANKS(arch) \ - ((arch >= 200) ? \ - (5) : \ - (4)) - #define CUB_SMEM_BANKS(arch) \ - (1 << CUB_LOG_SMEM_BANKS(arch)) - - #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) - #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) + #define CUB_LOG_SMEM_BANKS(unused) (5) + #define CUB_SMEM_BANKS(unused) (1 << CUB_LOG_SMEM_BANKS(0)) + + #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(0) + #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS #endif /// Oversubscription factor #ifndef CUB_SUBSCRIPTION_FACTOR - #define CUB_SUBSCRIPTION_FACTOR(arch) \ - ((arch >= 300) ? \ - (5) : \ - ((arch >= 200) ? \ - (3) : \ - (10))) - #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH) + #define CUB_SUBSCRIPTION_FACTOR(unused) (5) + #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(0) #endif /// Prefer padding overhead vs X-way conflicts greater than this threshold #ifndef CUB_PREFER_CONFLICT_OVER_PADDING - #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ - ((arch >= 300) ? \ - (1) : \ - (4)) - #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) + #define CUB_PREFER_CONFLICT_OVER_PADDING(unused) (1) + #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(0) #endif diff --git a/cub/util_ptx.cuh b/cub/util_ptx.cuh index 087fbcb733..5b2a20486c 100644 --- a/cub/util_ptx.cuh +++ b/cub/util_ptx.cuh @@ -442,17 +442,15 @@ __device__ __forceinline__ unsigned int WarpId() * hardware warp threads). 
* @param warp_id Id of virtual warp within architectural warp */ -template +template __host__ __device__ __forceinline__ unsigned int WarpMask(unsigned int warp_id) { constexpr bool is_pow_of_two = PowerOfTwo::VALUE; - constexpr bool is_arch_warp = LOGICAL_WARP_THREADS == - CUB_WARP_THREADS(PTX_ARCH); + constexpr bool is_arch_warp = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); - unsigned int member_mask = - 0xFFFFFFFFu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS); + unsigned int member_mask = 0xFFFFFFFFu >> + (CUB_WARP_THREADS(0) - LOGICAL_WARP_THREADS); if (is_pow_of_two && !is_arch_warp) { diff --git a/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/warp/specializations/warp_reduce_shfl.cuh index faa5cfcd25..bb71f395fd 100644 --- a/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/warp/specializations/warp_reduce_shfl.cuh @@ -51,7 +51,7 @@ CUB_NAMESPACE_BEGIN template < typename T, ///< Data type being reduced int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct WarpReduceShfl { static_assert(PowerOfTwo::VALUE, @@ -64,16 +64,16 @@ struct WarpReduceShfl enum { /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// The number of warp reduction steps STEPS = Log2::VALUE, /// Number of logical warps in a PTX warp - LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS, + LOGICAL_WARPS = CUB_WARP_THREADS(0) / LOGICAL_WARP_THREADS, /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up - SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8 + SHFL_C = (CUB_WARP_THREADS(0) - LOGICAL_WARP_THREADS) << 8 }; @@ -114,7 +114,7 @@ struct WarpReduceShfl TempStorage &/*temp_storage*/) : lane_id(static_cast(LaneId())) , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) - , member_mask(WarpMask(warp_id)) + , member_mask(WarpMask(warp_id)) { if (!IS_ARCH_WARP) { diff --git a/cub/warp/specializations/warp_reduce_smem.cuh b/cub/warp/specializations/warp_reduce_smem.cuh index 2b419cc217..79c7a0ccdd 100644 --- a/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/warp/specializations/warp_reduce_smem.cuh @@ -47,7 +47,7 @@ CUB_NAMESPACE_BEGIN template < typename T, ///< Data type being reduced int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct WarpReduceSmem { /****************************************************************************** @@ -57,7 +57,7 @@ struct WarpReduceSmem enum { /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// Whether the logical warp size is a power-of-two IS_POW_OF_TWO = PowerOfTwo::VALUE, @@ -109,8 +109,7 @@ struct WarpReduceSmem : temp_storage(temp_storage.Alias()) , lane_id(IS_ARCH_WARP ? 
LaneId() : LaneId() % LOGICAL_WARP_THREADS) , member_mask( - WarpMask( - LaneId() / LOGICAL_WARP_THREADS)) + WarpMask(LaneId() / LOGICAL_WARP_THREADS)) {} /****************************************************************************** diff --git a/cub/warp/specializations/warp_scan_shfl.cuh b/cub/warp/specializations/warp_scan_shfl.cuh index f6051c6ea2..2f2207c1ac 100644 --- a/cub/warp/specializations/warp_scan_shfl.cuh +++ b/cub/warp/specializations/warp_scan_shfl.cuh @@ -48,7 +48,7 @@ CUB_NAMESPACE_BEGIN template < typename T, ///< Data type being scanned int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct WarpScanShfl { //--------------------------------------------------------------------- @@ -58,13 +58,13 @@ struct WarpScanShfl enum { /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// The number of warp scan steps STEPS = Log2::VALUE, /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up - SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8 + SHFL_C = (CUB_WARP_THREADS(0) - LOGICAL_WARP_THREADS) << 8 }; template @@ -102,7 +102,7 @@ struct WarpScanShfl WarpScanShfl(TempStorage & /*temp_storage*/) : lane_id(LaneId()) , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) - , member_mask(WarpMask(warp_id)) + , member_mask(WarpMask(warp_id)) { if (!IS_ARCH_WARP) { diff --git a/cub/warp/specializations/warp_scan_smem.cuh b/cub/warp/specializations/warp_scan_smem.cuh index 1fecf1c7b5..bd17118e99 100644 --- a/cub/warp/specializations/warp_scan_smem.cuh +++ b/cub/warp/specializations/warp_scan_smem.cuh @@ -47,7 +47,7 @@ CUB_NAMESPACE_BEGIN template < typename T, ///< Data type being scanned int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective + int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective struct WarpScanSmem { /****************************************************************************** @@ -57,7 +57,7 @@ struct WarpScanSmem enum { /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// The number of warp scan steps STEPS = Log2::VALUE, @@ -103,8 +103,7 @@ struct WarpScanSmem LaneId() % LOGICAL_WARP_THREADS), member_mask( - WarpMask( - LaneId() / LOGICAL_WARP_THREADS)) + WarpMask(LaneId() / LOGICAL_WARP_THREADS)) {} diff --git a/cub/warp/warp_exchange.cuh b/cub/warp/warp_exchange.cuh index c0fbef1cc0..c1d20f5233 100644 --- a/cub/warp/warp_exchange.cuh +++ b/cub/warp/warp_exchange.cuh @@ -58,8 +58,8 @@ CUB_NAMESPACE_BEGIN * targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a * power of two. * - * @tparam PTX_ARCH - * [optional] \ptxversion + * @tparam LEGACY_PTX_ARCH + * Unused. 
* * @par Overview * - It is commonplace for a warp of threads to rearrange data items between @@ -114,7 +114,7 @@ CUB_NAMESPACE_BEGIN template + int LEGACY_PTX_ARCH = 0> class WarpExchange { static_assert(PowerOfTwo::VALUE, @@ -123,10 +123,9 @@ class WarpExchange constexpr static int ITEMS_PER_TILE = ITEMS_PER_THREAD * LOGICAL_WARP_THREADS + 1; - constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == - CUB_WARP_THREADS(PTX_ARCH); + constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); - constexpr static int LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH); + constexpr static int LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0); // Insert padding if the number of items per thread is a power of two // and > 4 (otherwise we can typically use 128b loads) diff --git a/cub/warp/warp_load.cuh b/cub/warp/warp_load.cuh index 28a31f407a..664bf1185b 100644 --- a/cub/warp/warp_load.cuh +++ b/cub/warp/warp_load.cuh @@ -141,8 +141,8 @@ enum WarpLoadAlgorithm * targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a * power of two. * - * @tparam PTX_ARCH - * [optional] \ptxversion + * @tparam LEGACY_PTX_ARCH + * Unused. * * @par Overview * - The WarpLoad class provides a single data movement abstraction that can be @@ -206,11 +206,10 @@ template + int LEGACY_PTX_ARCH = 0> class WarpLoad { - constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == - CUB_WARP_THREADS(PTX_ARCH); + constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); static_assert(PowerOfTwo::VALUE, "LOGICAL_WARP_THREADS must be a power of two"); @@ -397,7 +396,7 @@ private: struct LoadInternal { using WarpExchangeT = - WarpExchange; + WarpExchange; struct _TempStorage : WarpExchangeT::TempStorage {}; diff --git a/cub/warp/warp_merge_sort.cuh b/cub/warp/warp_merge_sort.cuh index 6617084ae1..fbcd44f578 100644 --- a/cub/warp/warp_merge_sort.cuh +++ b/cub/warp/warp_merge_sort.cuh @@ -57,8 +57,8 @@ CUB_NAMESPACE_BEGIN * [optional] Value type (default: cub::NullType, which indicates a * keys-only sort) * - * @tparam PTX_ARCH - * [optional] \ptxversion + * @tparam LEGACY_PTX_ARCH + * Unused. * * @par Overview * WarpMergeSort arranges items into ascending order using a comparison @@ -115,19 +115,19 @@ CUB_NAMESPACE_BEGIN template < typename KeyT, int ITEMS_PER_THREAD, - int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, + int LOGICAL_WARP_THREADS = CUB_WARP_THREADS(0), typename ValueT = NullType, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class WarpMergeSort : public BlockMergeSortStrategy< KeyT, ValueT, LOGICAL_WARP_THREADS, ITEMS_PER_THREAD, - WarpMergeSort> + WarpMergeSort> { private: - constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH); + constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); constexpr static bool KEYS_ONLY = std::is_same::value; constexpr static int TILE_SIZE = ITEMS_PER_THREAD * LOGICAL_WARP_THREADS; @@ -150,7 +150,7 @@ public: ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) , warp_id(IS_ARCH_WARP ? 
0 : (LaneId() / LOGICAL_WARP_THREADS)) - , member_mask(WarpMask(warp_id)) + , member_mask(WarpMask(warp_id)) { } diff --git a/cub/warp/warp_reduce.cuh b/cub/warp/warp_reduce.cuh index d905703d75..7c8592df71 100644 --- a/cub/warp/warp_reduce.cuh +++ b/cub/warp/warp_reduce.cuh @@ -52,7 +52,7 @@ CUB_NAMESPACE_BEGIN * * \tparam T The reduction input/output element type * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20). - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - A reduction (or fold) @@ -132,7 +132,7 @@ CUB_NAMESPACE_BEGIN template < typename T, int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class WarpReduce { private: @@ -144,7 +144,7 @@ private: enum { /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// Whether the logical warp size is a power-of-two IS_POW_OF_TWO = PowerOfTwo::VALUE, @@ -158,8 +158,8 @@ public: /// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two using InternalWarpReduce = cub::detail::conditional_t< IS_POW_OF_TWO, - WarpReduceShfl, - WarpReduceSmem>; + WarpReduceShfl, + WarpReduceSmem>; #endif // DOXYGEN_SHOULD_SKIP_THIS diff --git a/cub/warp/warp_scan.cuh b/cub/warp/warp_scan.cuh index 4e646957ec..9cb1491f1b 100644 --- a/cub/warp/warp_scan.cuh +++ b/cub/warp/warp_scan.cuh @@ -51,7 +51,7 @@ CUB_NAMESPACE_BEGIN * * \tparam T The scan input/output element type * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20). - * \tparam PTX_ARCH [optional] \ptxversion + * \tparam LEGACY_PTX_ARCH [optional] Unused. * * \par Overview * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) @@ -137,7 +137,7 @@ CUB_NAMESPACE_BEGIN template < typename T, int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, - int PTX_ARCH = CUB_PTX_ARCH> + int LEGACY_PTX_ARCH = 0> class WarpScan { private: @@ -149,7 +149,7 @@ private: enum { /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// Whether the logical warp size is a power-of-two IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), @@ -162,8 +162,8 @@ private: /// Use SHFL-based scan if LOGICAL_WARP_THREADS is a power-of-two using InternalWarpScan = cub::detail::conditional_t< IS_POW_OF_TWO, - WarpScanShfl, - WarpScanSmem>; + WarpScanShfl, + WarpScanSmem>; /// Shared memory storage layout type for WarpScan typedef typename InternalWarpScan::TempStorage _TempStorage; diff --git a/cub/warp/warp_store.cuh b/cub/warp/warp_store.cuh index 18e8e2261d..33346e826d 100644 --- a/cub/warp/warp_store.cuh +++ b/cub/warp/warp_store.cuh @@ -136,8 +136,8 @@ enum WarpStoreAlgorithm * targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a * power of two. 
* - * @tparam PTX_ARCH - * [optional] \ptxversion + * @tparam LEGACY_PTX_ARCH + * Unused. * * @par Overview * - The WarpStore class provides a single data movement abstraction that can be @@ -204,14 +204,13 @@ template + int LEGACY_PTX_ARCH = 0> class WarpStore { static_assert(PowerOfTwo::VALUE, "LOGICAL_WARP_THREADS must be a power of two"); - constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == - CUB_WARP_THREADS(PTX_ARCH); + constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); private: @@ -319,7 +318,7 @@ private: struct StoreInternal { using WarpExchangeT = - WarpExchange; + WarpExchange; struct _TempStorage : WarpExchangeT::TempStorage {}; diff --git a/test/test_warp_mask.cu b/test/test_warp_mask.cu index a96e4fef46..037c74b609 100644 --- a/test/test_warp_mask.cu +++ b/test/test_warp_mask.cu @@ -45,8 +45,7 @@ void Test() for (unsigned int warp_id = 0; warp_id < warps; warp_id++) { - const unsigned int warp_mask = - cub::WarpMask(warp_id); + const unsigned int warp_mask = cub::WarpMask(warp_id); const unsigned int warp_begin = LOGICAL_WARP_THREADS * warp_id; const unsigned int warp_end = warp_begin + LOGICAL_WARP_THREADS; From c4299c46fdf88654f23a7c01dca3bc4c2976bb7a Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Tue, 30 Nov 2021 11:35:13 -0500 Subject: [PATCH 4/7] Use NV_IF_TARGET to select between host/device/sm implementations. --- cub/agent/agent_sub_warp_merge_sort.cuh | 33 ++- cub/detail/device_synchronize.cuh | 31 ++- cub/device/dispatch/dispatch_histogram.cuh | 71 ++--- cub/device/dispatch/dispatch_radix_sort.cuh | 37 ++- .../dispatch/dispatch_reduce_by_key.cuh | 51 ++-- cub/device/dispatch/dispatch_rle.cuh | 51 ++-- .../dispatch/dispatch_segmented_sort.cuh | 172 ++++++------ cub/device/dispatch/dispatch_select_if.cuh | 52 ++-- cub/device/dispatch/dispatch_spmv_orig.cuh | 89 +++---- .../dispatch/dispatch_three_way_partition.cuh | 33 +-- cub/grid/grid_queue.cuh | 98 ++++--- cub/iterator/tex_obj_input_iterator.cuh | 68 +++-- cub/util_arch.cuh | 42 +-- cub/util_debug.cuh | 163 +++++++----- cub/util_device.cuh | 247 ++++++++---------- test/test_device_segmented_sort.cu | 108 ++++---- test/test_util.h | 238 ++++++++--------- test/test_warp_reduce.cu | 12 +- 18 files changed, 794 insertions(+), 802 deletions(-) diff --git a/cub/agent/agent_sub_warp_merge_sort.cuh b/cub/agent/agent_sub_warp_merge_sort.cuh index ad65f2a3cc..81447afdbb 100644 --- a/cub/agent/agent_sub_warp_merge_sort.cuh +++ b/cub/agent/agent_sub_warp_merge_sort.cuh @@ -33,6 +33,8 @@ #include #include +#include + #include @@ -108,6 +110,23 @@ class AgentSubWarpSort { template __device__ bool operator()(T lhs, T rhs) + { + return this->impl(lhs, rhs); + } + +#if defined(__CUDA_FP16_TYPES_EXIST__) + __device__ bool operator()(__half lhs, __half rhs) + { + // Need to explicitly cast to float for SM <= 52. 
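      // (Illustrative sketch, not part of this patch: NV_IF_TARGET comes from
      // libcu++'s <nv/target> header and keeps exactly one branch for the
      // target being compiled, replacing #if __CUDA_ARCH__ style guards. A
      // standalone analogue of this comparator, using the hypothetical name
      // half_less, could look like:
      //
      //   __device__ bool half_less(__half lhs, __half rhs)
      //   {
      //     NV_IF_TARGET(NV_PROVIDES_SM_53,
      //                  (return lhs < rhs;),                              // sm_53+: native __half compare
      //                  (return __half2float(lhs) < __half2float(rhs);)); // older targets: compare as float
      //   }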
+ NV_IF_TARGET(NV_PROVIDES_SM_53, + (return this->impl(lhs, rhs);), + (return this->impl(__half2float(lhs), __half2float(rhs));)); + } +#endif + + private: + template + __device__ bool impl(T lhs, T rhs) { if (IS_DESCENDING) { @@ -118,19 +137,15 @@ class AgentSubWarpSort return lhs < rhs; } } - -#if defined(__CUDA_FP16_TYPES_EXIST__) && (CUB_PTX_ARCH < 530) - __device__ bool operator()(__half lhs, __half rhs) - { - return (*this)(__half2float(lhs), __half2float(rhs)); - } -#endif }; -#if defined(__CUDA_FP16_TYPES_EXIST__) && (CUB_PTX_ARCH < 530) +#if defined(__CUDA_FP16_TYPES_EXIST__) __device__ static bool equal(__half lhs, __half rhs) { - return __half2float(lhs) == __half2float(rhs); + // Need to explicitly cast to float for SM <= 52. + NV_IF_TARGET(NV_PROVIDES_SM_53, + (return lhs == rhs;), + (return __half2float(lhs) == __half2float(rhs);)); } #endif diff --git a/cub/detail/device_synchronize.cuh b/cub/detail/device_synchronize.cuh index 075c0aa61c..52c5a10663 100644 --- a/cub/detail/device_synchronize.cuh +++ b/cub/detail/device_synchronize.cuh @@ -20,6 +20,8 @@ #include #include +#include + #include CUB_NAMESPACE_BEGIN @@ -36,31 +38,28 @@ CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize() { cudaError_t result = cudaErrorUnknown; - if (CUB_IS_HOST_CODE) - { -#if CUB_INCLUDE_HOST_CODE - result = cudaDeviceSynchronize(); -#endif - } - else - { - // Device code with the CUDA runtime. -#if defined(CUB_INCLUDE_DEVICE_CODE) && defined(CUB_RUNTIME_ENABLED) +#ifdef CUB_RUNTIME_ENABLED #if defined(__CUDACC__) && \ ((__CUDACC_VER_MAJOR__ > 11) || \ ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6))) - // CUDA >= 11.6 - result = __cudaDeviceSynchronizeDeprecationAvoidance(); + // CUDA >= 11.6 +#define CUB_TMP_DEVICE_SYNC_IMPL \ + result = __cudaDeviceSynchronizeDeprecationAvoidance(); #else // CUDA < 11.6 - result = cudaDeviceSynchronize(); +#define CUB_TMP_DEVICE_SYNC_IMPL result = cudaDeviceSynchronize(); #endif #else // Device code without the CUDA runtime. - // Device side CUDA API calls are not supported in this configuration. - result = cudaErrorInvalidConfiguration; + // Device side CUDA API calls are not supported in this configuration. 
+#define CUB_TMP_DEVICE_SYNC_IMPL result = cudaErrorInvalidConfiguration; #endif - } + + NV_IF_TARGET(NV_IS_HOST, + (result = cudaDeviceSynchronize();), + (CUB_TMP_DEVICE_SYNC_IMPL)); + +#undef CUB_TMP_DEVICE_SYNC_IMPL return result; } diff --git a/cub/device/dispatch/dispatch_histogram.cuh b/cub/device/dispatch/dispatch_histogram.cuh index 035f3f2b5d..4609036d4c 100644 --- a/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/device/dispatch/dispatch_histogram.cuh @@ -34,20 +34,22 @@ #pragma once -#include -#include -#include - -#include "../../agent/agent_histogram.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" -#include "../../util_math.cuh" -#include "../../thread/thread_search.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../config.cuh" +#include +#include +#include +#include +#include +#include +#include #include +#include + +#include +#include +#include + CUB_NAMESPACE_BEGIN @@ -401,32 +403,31 @@ struct DispatchHistogram int ptx_version, KernelConfig &histogram_sweep_config) { - cudaError_t result = cudaErrorNotSupported; - if (CUB_IS_DEVICE_CODE) - { - #if CUB_INCLUDE_DEVICE_CODE - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - result = histogram_sweep_config.template Init(); - #endif - } - else - { - #if CUB_INCLUDE_HOST_CODE - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 500) - { - result = histogram_sweep_config.template Init(); - } - else - { - result = histogram_sweep_config.template Init(); - } - #endif - } - return result; + cudaError_t result = cudaErrorNotSupported; + NV_IF_TARGET( + NV_IS_DEVICE, + ( + // We're on the device, so initialize the kernel dispatch + // configurations with the current PTX policy + result = histogram_sweep_config.template Init(); + ), + ( // NV_IS_HOST: + // We're on the host, so lookup and initialize the kernel dispatch + // configurations with the policies that match the device's PTX + // version + if (ptx_version >= 500) + { + result = histogram_sweep_config.template Init(); + } + else + { + result = histogram_sweep_config.template Init(); + } + )); + + return result; } - /** * Kernel kernel dispatch configuration */ diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh index bc4fa3dcd9..6265232fad 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh @@ -1332,9 +1332,15 @@ struct DispatchRadixSort : MaxPolicyT, IS_DESCENDING, KeyT, OffsetT>; if (CubDebug(error = cudaOccupancyMaxActiveBlocksPerMultiprocessor( &histo_blocks_per_sm, histogram_kernel, HISTO_BLOCK_THREADS, 0))) break; - histogram_kernel<<>> - (d_bins, d_keys.Current(), num_items, begin_bit, end_bit); - if (CubDebug(error = cudaPeekAtLastError())) break; + + error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + histo_blocks_per_sm * num_sms, HISTO_BLOCK_THREADS, 0, stream + ).doit(histogram_kernel, + d_bins, d_keys.Current(), num_items, begin_bit, end_bit); + if (CubDebug(error)) + { + break; + } // exclusive sums to determine starts const int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS; @@ -1368,17 +1374,22 @@ struct DispatchRadixSort : stream))) break; auto onesweep_kernel = DeviceRadixSortOnesweepKernel< MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT, PortionOffsetT>; - onesweep_kernel<<>> - (d_lookback, d_ctrs + portion * num_passes + 
pass,
-                 portion < num_portions - 1 ?
+      error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+        num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream
+      ).doit(onesweep_kernel,
+             d_lookback, d_ctrs + portion * num_passes + pass,
+             portion < num_portions - 1 ?
                 d_bins + ((portion + 1) * num_passes + pass) * RADIX_DIGITS : NULL,
-                d_bins + (portion * num_passes + pass) * RADIX_DIGITS,
-                d_keys.Alternate(),
-                d_keys.Current() + portion * PORTION_SIZE,
-                d_values.Alternate(),
-                d_values.Current() + portion * PORTION_SIZE,
-                portion_num_items, current_bit, num_bits);
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+             d_bins + (portion * num_passes + pass) * RADIX_DIGITS,
+             d_keys.Alternate(),
+             d_keys.Current() + portion * PORTION_SIZE,
+             d_values.Alternate(),
+             d_values.Current() + portion * PORTION_SIZE,
+             portion_num_items, current_bit, num_bits);
+      if (CubDebug(error))
+      {
+        break;
+      }
         }

         // use the temporary buffers if no overwrite is allowed
diff --git a/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/device/dispatch/dispatch_reduce_by_key.cuh
index 00d4be5ff8..952ace4240 100644
--- a/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -1,4 +1,3 @@
-
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill. All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
@@ -34,19 +33,21 @@
 
 #pragma once
 
-#include
-#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
-#include "dispatch_scan.cuh"
-#include "../../config.cuh"
-#include "../../agent/agent_reduce_by_key.cuh"
-#include "../../thread/thread_operators.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_device.cuh"
-#include "../../util_math.cuh"
+#include
 
 #include
 
+#include
+#include
+
 CUB_NAMESPACE_BEGIN
 
 /******************************************************************************
@@ -193,27 +194,19 @@ struct DispatchReduceByKey
     template
     CUB_RUNTIME_FUNCTION __forceinline__
     static void InitConfigs(
-        int ptx_version,
+        int /*ptx_version*/,
         KernelConfig &reduce_by_key_config)
     {
-        if (CUB_IS_DEVICE_CODE)
-        {
-            #if CUB_INCLUDE_DEVICE_CODE
-                (void)ptx_version;
-                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-                reduce_by_key_config.template Init();
-            #endif
-        }
-        else
-        {
-            #if CUB_INCLUDE_HOST_CODE
-                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-
-                // (There's only one policy right now)
-                (void)ptx_version;
-                reduce_by_key_config.template Init();
-            #endif
-        }
+        NV_IF_TARGET(NV_IS_DEVICE,
+            (
+              // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+              reduce_by_key_config.template Init();
+            ), (
+              // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+
+              // (There's only one policy right now)
+              reduce_by_key_config.template Init();
+            ));
     }

diff --git a/cub/device/dispatch/dispatch_rle.cuh b/cub/device/dispatch/dispatch_rle.cuh
index 1ebb795355..692a55d4e9 100644
--- a/cub/device/dispatch/dispatch_rle.cuh
+++ b/cub/device/dispatch/dispatch_rle.cuh
@@ -1,4 +1,3 @@
-
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill. All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
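All of the InitConfigs rewrites in this patch share the same shape: the old CUB_IS_DEVICE_CODE branch with its #if CUB_INCLUDE_DEVICE_CODE / CUB_INCLUDE_HOST_CODE guards collapses into one NV_IF_TARGET call whose two parenthesized arguments are the device and host bodies. A minimal self-contained sketch of that shape (ExampleConfig, InitConfigExample, and the policy numbers are invented for illustration; they are not CUB code):

#include <nv/target>

struct ExampleConfig
{
  int block_threads;
  int items_per_thread;
};

// Exactly one of the two parenthesized blocks survives compilation,
// depending on whether this pass targets the host or the device.
__host__ __device__ inline void InitConfigExample(int ptx_version,
                                                  ExampleConfig &config)
{
  NV_IF_TARGET(NV_IS_DEVICE,
               ((void)ptx_version;
                // Device pass: the active policy already matches this target.
                config.block_threads    = 256;
                config.items_per_thread = 16;),
               (// Host pass: pick the policy matching the device's PTX version.
                if (ptx_version >= 500)
                {
                  config.block_threads    = 256;
                  config.items_per_thread = 16;
                }
                else
                {
                  config.block_threads    = 128;
                  config.items_per_thread = 12;
                }));
}
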
@@ -34,19 +33,21 @@ #pragma once -#include -#include - -#include "dispatch_scan.cuh" -#include "../../config.cuh" -#include "../../agent/agent_rle.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_device.cuh" -#include "../../util_math.cuh" +#include +#include +#include +#include +#include +#include +#include #include +#include + +#include +#include + CUB_NAMESPACE_BEGIN @@ -181,25 +182,19 @@ struct DeviceRleDispatch template CUB_RUNTIME_FUNCTION __forceinline__ static void InitConfigs( - int ptx_version, + int /*ptx_version*/, KernelConfig& device_rle_config) { - if (CUB_IS_DEVICE_CODE) { - #if CUB_INCLUDE_DEVICE_CODE - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - device_rle_config.template Init(); - #endif - } - else - { - #if CUB_INCLUDE_HOST_CODE - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - - // (There's only one policy right now) - (void)ptx_version; - device_rle_config.template Init(); - #endif - } + NV_IF_TARGET(NV_IS_DEVICE, + ( + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + device_rle_config.template Init(); + ), ( + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + + // (There's only one policy right now) + device_rle_config.template Init(); + )); } diff --git a/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/device/dispatch/dispatch_segmented_sort.cuh index 6101695337..6fb68f9002 100644 --- a/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -28,23 +28,25 @@ #pragma once #include -#include -#include #include #include #include #include #include +#include #include +#include #include #include #include #include #include -#include #include #include +#include + +#include #include @@ -1591,92 +1593,94 @@ private: return error; } - if (CUB_IS_HOST_CODE) - { - #if CUB_INCLUDE_HOST_CODE - unsigned int h_group_sizes[num_selected_groups]; - - if (CubDebug( - error = cudaMemcpyAsync(h_group_sizes, - group_sizes.get(), - num_selected_groups * sizeof(unsigned int), - cudaMemcpyDeviceToHost, - stream))) - { - return error; - } - - if (CubDebug(error = SyncStream(stream))) - { - return error; - } - - error = DeviceSegmentedSortContinuation( - large_kernel, - small_kernel, - num_segments, - d_keys.Current(), - GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), - d_keys_double_buffer, - d_values.Current(), - GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), - d_values_double_buffer, - d_begin_offsets, - d_end_offsets, - h_group_sizes, - large_and_medium_segments_indices.get(), - small_segments_indices.get(), - stream, - debug_synchronous); - #endif - } - else - { - #if CUB_INCLUDE_DEVICE_CODE - #ifdef CUB_RUNTIME_ENABLED - using MaxPolicyT = typename DispatchSegmentedSort::MaxPolicy; - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) - .doit(DeviceSegmentedSortContinuationKernel, - large_kernel, - small_kernel, - num_segments, - d_keys.Current(), - GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), - d_keys_double_buffer, - d_values.Current(), - GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), - d_values_double_buffer, - d_begin_offsets, - d_end_offsets, - group_sizes.get(), - large_and_medium_segments_indices.get(), - 
small_segments_indices.get(), - debug_synchronous); - - if (CubDebug(error = cudaPeekAtLastError())) - { - return error; - } +#ifdef CUB_RUNTIME_ENABLED +#define CUB_TMP_DEVICE_CODE \ + using MaxPolicyT = typename DispatchSegmentedSort::MaxPolicy; \ + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) \ + .doit(DeviceSegmentedSortContinuationKernel, \ + large_kernel, \ + small_kernel, \ + num_segments, \ + d_keys.Current(), \ + GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), \ + d_keys_double_buffer, \ + d_values.Current(), \ + GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), \ + d_values_double_buffer, \ + d_begin_offsets, \ + d_end_offsets, \ + group_sizes.get(), \ + large_and_medium_segments_indices.get(), \ + small_segments_indices.get(), \ + debug_synchronous); \ + \ + if (CubDebug(error = cudaPeekAtLastError())) \ + { \ + return error; \ + } \ + \ + if (debug_synchronous) \ + { \ + if (CubDebug(error = SyncStream(stream))) \ + { \ + return error; \ + } \ + } +#else +#define CUB_TMP_DEVICE_CODE error = CubDebug(cudaErrorNotSupported); +#endif + + // Clang format mangles some of this NV_IF_TARGET block + // clang-format off + NV_IF_TARGET( + NV_IS_HOST, + ( + unsigned int h_group_sizes[num_selected_groups]; + + if (CubDebug(error = cudaMemcpyAsync(h_group_sizes, + group_sizes.get(), + num_selected_groups * + sizeof(unsigned int), + cudaMemcpyDeviceToHost, + stream))) + { + return error; + } - if (debug_synchronous) - { if (CubDebug(error = SyncStream(stream))) { return error; } - } - #else - error = CubDebug(cudaErrorNotSupported); - #endif - #endif - } + + error = DeviceSegmentedSortContinuation( + large_kernel, + small_kernel, + num_segments, + d_keys.Current(), + GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), + d_keys_double_buffer, + d_values.Current(), + GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), + d_values_double_buffer, + d_begin_offsets, + d_end_offsets, + h_group_sizes, + large_and_medium_segments_indices.get(), + small_segments_indices.get(), + stream, + debug_synchronous);), + // NV_IS_DEVICE: + (CUB_TMP_DEVICE_CODE)); + // clang-format on + +#undef CUB_TMP_DEVICE_CODE return error; } diff --git a/cub/device/dispatch/dispatch_select_if.cuh b/cub/device/dispatch/dispatch_select_if.cuh index 5654ba29a3..bc19b1281b 100644 --- a/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/device/dispatch/dispatch_select_if.cuh @@ -1,4 +1,3 @@ - /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
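The CUB_TMP_DEVICE_CODE block above (like CUB_TMP_DEVICE_SYNC_IMPL in the device_synchronize change) exists because preprocessor directives such as #ifdef CUB_RUNTIME_ENABLED cannot appear inside NV_IF_TARGET's argument list, so the conditional body is hoisted into a temporary macro and #undef'd after use. A rough sketch of that pattern (sync_example and TMP_SYNC_IMPL are invented names, not CUB code):

#include <cuda_runtime.h>
#include <nv/target>

// Hoist the #ifdef-dependent device body into a helper macro so that
// NV_IF_TARGET only ever sees ordinary code in its arguments.
#ifdef CUB_RUNTIME_ENABLED
#define TMP_SYNC_IMPL result = cudaDeviceSynchronize();
#else
#define TMP_SYNC_IMPL result = cudaErrorInvalidConfiguration;
#endif

__host__ __device__ inline cudaError_t sync_example()
{
  cudaError_t result = cudaErrorUnknown;

  NV_IF_TARGET(NV_IS_HOST,
               (result = cudaDeviceSynchronize();),
               (TMP_SYNC_IMPL));

  return result;
}

#undef TMP_SYNC_IMPL
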
@@ -34,19 +33,21 @@ #pragma once -#include -#include - -#include "dispatch_scan.cuh" -#include "../../config.cuh" -#include "../../agent/agent_select_if.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_device.cuh" -#include "../../util_math.cuh" +#include +#include +#include +#include +#include +#include +#include #include +#include + +#include +#include + CUB_NAMESPACE_BEGIN /****************************************************************************** @@ -188,23 +189,18 @@ struct DispatchSelectIf int ptx_version, KernelConfig &select_if_config) { - if (CUB_IS_DEVICE_CODE) { - #if CUB_INCLUDE_DEVICE_CODE - (void)ptx_version; - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - select_if_config.template Init(); - #endif - } - else - { - #if CUB_INCLUDE_HOST_CODE - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - - // (There's only one policy right now) - (void)ptx_version; - select_if_config.template Init(); - #endif - } + NV_IF_TARGET(NV_IS_DEVICE, + ( + (void)ptx_version; + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + select_if_config.template Init(); + ), ( + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + + // (There's only one policy right now) + (void)ptx_version; + select_if_config.template Init(); + )); } diff --git a/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/device/dispatch/dispatch_spmv_orig.cuh index d0de86404c..7e2d74b26d 100644 --- a/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -34,22 +34,24 @@ #pragma once -#include -#include - -#include "../../agent/single_pass_scan_operators.cuh" -#include "../../agent/agent_segment_fixup.cuh" -#include "../../agent/agent_spmv_orig.cuh" -#include "../../util_type.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" -#include "../../util_math.cuh" -#include "../../thread/thread_search.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../config.cuh" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include #include +#include +#include + CUB_NAMESPACE_BEGIN @@ -401,40 +403,33 @@ struct DispatchSpmv KernelConfig &spmv_config, KernelConfig &segment_fixup_config) { - if (CUB_IS_DEVICE_CODE) - { - #if CUB_INCLUDE_DEVICE_CODE - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - spmv_config.template Init(); - segment_fixup_config.template Init(); - #endif - } - else - { - #if CUB_INCLUDE_HOST_CODE - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 600) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else if (ptx_version >= 500) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else if (ptx_version >= 370) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - #endif - } + NV_IF_TARGET( + NV_IS_DEVICE, + ( // We're on the device, so initialize the kernel dispatch + // configurations with the current PTX policy + spmv_config.template Init(); 
+ segment_fixup_config.template Init();), + ( + // We're on the host, so lookup and initialize the kernel dispatch + // configurations with the policies that match the device's PTX + // version + if (ptx_version >= 600) { + spmv_config.template Init(); + segment_fixup_config + .template Init(); + } else if (ptx_version >= 500) { + spmv_config.template Init(); + segment_fixup_config + .template Init(); + } else if (ptx_version >= 370) { + spmv_config.template Init(); + segment_fixup_config + .template Init(); + } else { + spmv_config.template Init(); + segment_fixup_config + .template Init(); + })); } diff --git a/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/device/dispatch/dispatch_three_way_partition.cuh index cb6d3c7a1b..cb06438063 100644 --- a/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -37,6 +37,8 @@ #include #include +#include + #include CUB_NAMESPACE_BEGIN @@ -204,25 +206,18 @@ struct DispatchThreeWayPartitionIf int ptx_version, KernelConfig &select_if_config) { - if (CUB_IS_DEVICE_CODE) - { -#if CUB_INCLUDE_DEVICE_CODE - (void)ptx_version; - // We're on the device, so initialize the kernel dispatch configurations - // with the current PTX policy - select_if_config.template Init(); -#endif - } - else - { -#if CUB_INCLUDE_HOST_CODE - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - - // (There's only one policy right now) - (void)ptx_version; - select_if_config.template Init(); -#endif - } + NV_IF_TARGET( + NV_IS_DEVICE, + ((void)ptx_version; + // We're on the device, so initialize the kernel dispatch configurations + // with the current PTX policy + select_if_config.template Init();), + (// We're on the host, so lookup and initialize the kernel dispatch + // configurations with the policies that match the device's PTX version + // (There's only one policy right now) + (void)ptx_version; + select_if_config + .template Init();)); } diff --git a/cub/grid/grid_queue.cuh b/cub/grid/grid_queue.cuh index ebb82e4dc0..e1933e3d38 100644 --- a/cub/grid/grid_queue.cuh +++ b/cub/grid/grid_queue.cuh @@ -33,8 +33,10 @@ #pragma once -#include "../config.cuh" -#include "../util_debug.cuh" +#include +#include + +#include CUB_NAMESPACE_BEGIN @@ -120,21 +122,20 @@ public: cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; - if (CUB_IS_DEVICE_CODE) { - #if CUB_INCLUDE_DEVICE_CODE - (void)stream; - d_counters[FILL] = fill_size; - d_counters[DRAIN] = 0; - result = cudaSuccess; - #endif - } else { - #if CUB_INCLUDE_HOST_CODE - OffsetT counters[2]; - counters[FILL] = fill_size; - counters[DRAIN] = 0; - result = CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream)); - #endif - } + + NV_IF_TARGET(NV_IS_DEVICE, + ( + (void)stream; + d_counters[FILL] = fill_size; + d_counters[DRAIN] = 0; + result = cudaSuccess; + ), ( + OffsetT counters[2]; + counters[FILL] = fill_size; + counters[DRAIN] = 0; + result = CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream)); + )); + return result; } @@ -143,17 +144,16 @@ public: __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; - if (CUB_IS_DEVICE_CODE) { - #if CUB_INCLUDE_DEVICE_CODE - (void)stream; - d_counters[DRAIN] = 0; - result = cudaSuccess; - #endif - } else { - #if CUB_INCLUDE_HOST_CODE - result = 
CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream)); - #endif - } + + NV_IF_TARGET(NV_IS_DEVICE, + ( + (void)stream; + d_counters[DRAIN] = 0; + result = cudaSuccess; + ), ( + result = CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream)); + )); + return result; } @@ -162,17 +162,16 @@ public: __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; - if (CUB_IS_DEVICE_CODE) { - #if CUB_INCLUDE_DEVICE_CODE - (void)stream; - d_counters[FILL] = 0; - result = cudaSuccess; - #endif - } else { - #if CUB_INCLUDE_HOST_CODE - result = CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream)); - #endif - } + + NV_IF_TARGET(NV_IS_DEVICE, + ( + (void)stream; + d_counters[FILL] = 0; + result = cudaSuccess; + ), ( + result = CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream)); + )); + return result; } @@ -183,17 +182,16 @@ public: cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; - if (CUB_IS_DEVICE_CODE) { - #if CUB_INCLUDE_DEVICE_CODE - (void)stream; - fill_size = d_counters[FILL]; - result = cudaSuccess; - #endif - } else { - #if CUB_INCLUDE_HOST_CODE - result = CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream)); - #endif - } + + NV_IF_TARGET(NV_IS_DEVICE, + ( + (void)stream; + fill_size = d_counters[FILL]; + result = cudaSuccess; + ), ( + result = CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream)); + )); + return result; } diff --git a/cub/iterator/tex_obj_input_iterator.cuh b/cub/iterator/tex_obj_input_iterator.cuh index e7659d43c9..1611f0ee36 100644 --- a/cub/iterator/tex_obj_input_iterator.cuh +++ b/cub/iterator/tex_obj_input_iterator.cuh @@ -33,15 +33,17 @@ #pragma once +#include +#include +#include +#include +#include + +#include + #include #include -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../util_device.cuh" -#include "../util_debug.cuh" -#include "../config.cuh" - #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer #include @@ -200,37 +202,9 @@ public: /// Indirection __host__ __device__ __forceinline__ reference operator*() const { - if (CUB_IS_HOST_CODE) { - #if CUB_INCLUDE_HOST_CODE - // Simply dereference the pointer on the host - return ptr[tex_offset]; - #else - // Never executed, just need a return value for this codepath. - // The `reference` type is actually just T, so we can fake this - // easily. - return reference{}; - #endif - } else { - #if CUB_INCLUDE_DEVICE_CODE - // Move array of uninitialized words, then alias and assign to return value - TextureWord words[TEXTURE_MULTIPLE]; - - #pragma unroll - for (int i = 0; i < TEXTURE_MULTIPLE; ++i) - { - words[i] = tex1Dfetch( - tex_obj, - (tex_offset * TEXTURE_MULTIPLE) + i); - } - - // Load from words - return *reinterpret_cast(words); - #else - // This is dead code which will never be executed. It is here - // only to avoid warnings about missing return statements. 
- return ptr[tex_offset]; - #endif - } + NV_IF_TARGET(NV_IS_HOST, + (return ptr[tex_offset];), + (return this->device_deref();)); } /// Addition @@ -312,6 +286,26 @@ public: return os; } +private: + // This is hoisted out of operator* because #pragma can't be used inside of + // NV_IF_TARGET + __device__ __forceinline__ reference device_deref() const + { + // Move array of uninitialized words, then alias and assign to return + // value + TextureWord words[TEXTURE_MULTIPLE]; + + const auto tex_idx_base = tex_offset * TEXTURE_MULTIPLE; + + #pragma unroll + for (int i = 0; i < TEXTURE_MULTIPLE; ++i) + { + words[i] = tex1Dfetch(tex_obj, tex_idx_base + i); + } + + // Load from words + return *reinterpret_cast(words); + } }; diff --git a/cub/util_arch.cuh b/cub/util_arch.cuh index 00a21516b6..2c4b36bee4 100644 --- a/cub/util_arch.cuh +++ b/cub/util_arch.cuh @@ -62,24 +62,32 @@ CUB_NAMESPACE_BEGIN #endif #endif -#ifndef CUB_IS_DEVICE_CODE - #if defined(_NVHPC_CUDA) - #define CUB_IS_DEVICE_CODE __builtin_is_device_code() - #define CUB_IS_HOST_CODE (!__builtin_is_device_code()) - #define CUB_INCLUDE_DEVICE_CODE 1 - #define CUB_INCLUDE_HOST_CODE 1 - #elif CUB_PTX_ARCH > 0 - #define CUB_IS_DEVICE_CODE 1 - #define CUB_IS_HOST_CODE 0 - #define CUB_INCLUDE_DEVICE_CODE 1 - #define CUB_INCLUDE_HOST_CODE 0 - #else - #define CUB_IS_DEVICE_CODE 0 - #define CUB_IS_HOST_CODE 1 - #define CUB_INCLUDE_DEVICE_CODE 0 - #define CUB_INCLUDE_HOST_CODE 1 +// These definitions were intended for internal use only and are now obsolete. +// If you relied on them, consider porting your code to use the functionality +// in libcu++'s header. +// For a temporary workaround, define CUB_PROVIDE_LEGACY_ARCH_MACROS to make +// them available again. These should be considered deprecated and will be +// fully removed in a future version. +#ifdef CUB_PROVIDE_LEGACY_ARCH_MACROS + #ifndef CUB_IS_DEVICE_CODE + #if defined(_NVHPC_CUDA) + #define CUB_IS_DEVICE_CODE __builtin_is_device_code() + #define CUB_IS_HOST_CODE (!__builtin_is_device_code()) + #define CUB_INCLUDE_DEVICE_CODE 1 + #define CUB_INCLUDE_HOST_CODE 1 + #elif CUB_PTX_ARCH > 0 + #define CUB_IS_DEVICE_CODE 1 + #define CUB_IS_HOST_CODE 0 + #define CUB_INCLUDE_DEVICE_CODE 1 + #define CUB_INCLUDE_HOST_CODE 0 + #else + #define CUB_IS_DEVICE_CODE 0 + #define CUB_IS_HOST_CODE 1 + #define CUB_INCLUDE_DEVICE_CODE 0 + #define CUB_INCLUDE_HOST_CODE 1 + #endif #endif -#endif +#endif // CUB_PROVIDE_LEGACY_ARCH_MACROS /// Maximum number of devices supported. #ifndef CUB_MAX_DEVICES diff --git a/cub/util_debug.cuh b/cub/util_debug.cuh index d017b82149..c38b9f8c06 100644 --- a/cub/util_debug.cuh +++ b/cub/util_debug.cuh @@ -36,12 +36,14 @@ #pragma once -#include -#include "util_namespace.cuh" -#include "util_arch.cuh" +#include +#include -CUB_NAMESPACE_BEGIN +#include + +#include +CUB_NAMESPACE_BEGIN /** * \addtogroup UtilMgmt @@ -54,45 +56,57 @@ CUB_NAMESPACE_BEGIN #define CUB_STDERR #endif - - /** - * \brief If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. + * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the + * corresponding error message is printed to \p stderr (or \p stdout in device + * code) along with the supplied source context. * * \return The CUDA error. 
*/ -__host__ __device__ __forceinline__ cudaError_t Debug( - cudaError_t error, - const char* filename, - int line) +__host__ __device__ +__forceinline__ +cudaError_t Debug(cudaError_t error, const char *filename, int line) { - (void)filename; - (void)line; + (void)filename; + (void)line; #ifdef CUB_RUNTIME_ENABLED - // Clear the global CUDA error state which may have been set by the last - // call. Otherwise, errors may "leak" to unrelated kernel launches. - cudaGetLastError(); + // Clear the global CUDA error state which may have been set by the last + // call. Otherwise, errors may "leak" to unrelated kernel launches. + cudaGetLastError(); #endif #ifdef CUB_STDERR - if (error) - { - if (CUB_IS_HOST_CODE) { - #if CUB_INCLUDE_HOST_CODE - fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); - fflush(stderr); - #endif - } else { - #if CUB_INCLUDE_DEVICE_CODE - printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); - #endif - } - } + if (error) + { + NV_IF_TARGET( + NV_IS_HOST, ( + fprintf(stderr, + "CUDA error %d [%s, %d]: %s\n", + error, + filename, + line, + cudaGetErrorString(error)); + fflush(stderr); + ), + ( + printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", + error, + blockIdx.z, + blockIdx.y, + blockIdx.x, + threadIdx.z, + threadIdx.y, + threadIdx.x, + filename, + line); + ) + ); + } #endif - return error; -} + return error; +} /** * \brief Debug macro @@ -114,43 +128,58 @@ __host__ __device__ __forceinline__ cudaError_t Debug( * \brief Log macro for printf statements. */ #if !defined(_CubLog) - #if defined(_NVHPC_CUDA) - #define _CubLog(format, ...) (__builtin_is_device_code() \ - ? printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ - blockIdx.z, blockIdx.y, blockIdx.x, \ - threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__) \ - : printf(format, __VA_ARGS__)); - #elif !(defined(__clang__) && defined(__CUDA__)) - #if (CUB_PTX_ARCH == 0) - #define _CubLog(format, ...) printf(format,__VA_ARGS__); - #elif (CUB_PTX_ARCH >= 200) - #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__); - #endif - #else - // XXX shameless hack for clang around variadic printf... - // Compilies w/o supplying -std=c++11 but shows warning, - // so we sielence them :) - #pragma clang diagnostic ignored "-Wc++11-extensions" - #pragma clang diagnostic ignored "-Wunnamed-type-template-args" - template - inline __host__ __device__ void va_printf(char const* format, Args const&... args) - { - #ifdef __CUDA_ARCH__ - printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...); - #else - printf(format, args...); - #endif - } - #ifndef __CUDA_ARCH__ - #define _CubLog(format, ...) CUB_NS_QUALIFIER::va_printf(format,__VA_ARGS__); - #else - #define _CubLog(format, ...) CUB_NS_QUALIFIER::va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__); - #endif - #endif +#if defined(_NVHPC_CUDA) || !(defined(__clang__) && defined(__CUDA__)) + +// NVCC / NVC++ +#define _CubLog(format, ...) 
\ + do \ + { \ + NV_IF_TARGET(NV_IS_HOST, \ + (printf(format, __VA_ARGS__);), \ + (printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ + blockIdx.z, \ + blockIdx.y, \ + blockIdx.x, \ + threadIdx.z, \ + threadIdx.y, \ + threadIdx.x, \ + __VA_ARGS__);)); \ + } while (false) + +#else // Clang: + +// XXX shameless hack for clang around variadic printf... +// Compilies w/o supplying -std=c++11 but shows warning, +// so we silence them :) +#pragma clang diagnostic ignored "-Wc++11-extensions" +#pragma clang diagnostic ignored "-Wunnamed-type-template-args" +template +inline __host__ __device__ void va_printf(char const *format, + Args const &...args) +{ +#ifdef __CUDA_ARCH__ + printf(format, + blockIdx.z, + blockIdx.y, + blockIdx.x, + threadIdx.z, + threadIdx.y, + threadIdx.x, + args...); +#else + printf(format, args...); +#endif +} +#ifndef __CUDA_ARCH__ +#define _CubLog(format, ...) CUB_NS_QUALIFIER::va_printf(format, __VA_ARGS__); +#else +#define _CubLog(format, ...) \ + CUB_NS_QUALIFIER::va_printf("[block (%d,%d,%d), thread " \ + "(%d,%d,%d)]: " format, \ + __VA_ARGS__); +#endif +#endif #endif - - - /** @} */ // end group UtilMgmt diff --git a/cub/util_device.cuh b/cub/util_device.cuh index 403c2f0d40..9a8ffa1d06 100644 --- a/cub/util_device.cuh +++ b/cub/util_device.cuh @@ -33,14 +33,16 @@ #pragma once -#include "detail/device_synchronize.cuh" +#include -#include "util_type.cuh" -#include "util_arch.cuh" -#include "util_debug.cuh" -#include "util_cpp_dialect.cuh" -#include "util_namespace.cuh" -#include "util_macro.cuh" +#include +#include +#include +#include +#include +#include + +#include #include #include @@ -180,8 +182,6 @@ CUB_RUNTIME_FUNCTION inline int DeviceCountUncached() #endif } -#if CUB_CPP_DIALECT >= 2011 // C++11 and later. - /** * \brief Cache for an arbitrary value produced by a nullary function. */ @@ -197,9 +197,6 @@ struct ValueCache __host__ inline ValueCache() : value(Function()) {} }; -#endif - -#if CUB_CPP_DIALECT >= 2011 // Host code, only safely usable in C++11 or newer, where thread-safe // initialization of static locals is guaranteed. This is a separate function // to avoid defining a local static in a host/device function. @@ -208,7 +205,6 @@ __host__ inline int DeviceCountCachedValue() static ValueCache cache; return cache.value; } -#endif /** * \brief Returns the number of CUDA devices available. @@ -220,27 +216,14 @@ __host__ inline int DeviceCountCachedValue() CUB_RUNTIME_FUNCTION inline int DeviceCount() { int result = -1; - if (CUB_IS_HOST_CODE) { - #if CUB_INCLUDE_HOST_CODE - #if CUB_CPP_DIALECT >= 2011 - // Host code and C++11. - result = DeviceCountCachedValue(); - #else - // Host code and C++98. - result = DeviceCountUncached(); - #endif - #endif - } else { - #if CUB_INCLUDE_DEVICE_CODE - // Device code. - result = DeviceCountUncached(); - #endif - } + + NV_IF_TARGET(NV_IS_HOST, + (result = DeviceCountCachedValue();), + (result = DeviceCountUncached();)); + return result; } -#if CUB_CPP_DIALECT >= 2011 // C++11 and later. - /** * \brief Per-device cache for a CUDA attribute value; the attribute is queried * and stored for each device upon construction. @@ -350,8 +333,6 @@ public: } }; -#endif - /** * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10). */ @@ -366,27 +347,46 @@ CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version) // usual syntax of (void)empty_kernel; was not sufficient on MSVC2015. 
(void)reinterpret_cast(empty_kernel); + // Define a temporary macro that expands to the current target ptx version + // in device code. + // may provide an abstraction for this eventually. For now, + // we have to keep this usage of __CUDA_ARCH__. +#if defined(_NVHPC_CUDA) +#define CUB_TEMP_GET_PTX __builtin_current_device_sm() +#else +#define CUB_TEMP_GET_PTX __CUDA_ARCH__ +#endif + cudaError_t result = cudaSuccess; - if (CUB_IS_HOST_CODE) { - #if CUB_INCLUDE_HOST_CODE - cudaFuncAttributes empty_kernel_attrs; - - result = cudaFuncGetAttributes(&empty_kernel_attrs, - reinterpret_cast(empty_kernel)); - CubDebug(result); - - ptx_version = empty_kernel_attrs.ptxVersion * 10; - #endif - } else { - #if CUB_INCLUDE_DEVICE_CODE - // This is necessary to ensure instantiation of EmptyKernel in device code. - // The `reinterpret_cast` is necessary to suppress a set-but-unused warnings. - // This is a meme now: https://twitter.com/blelbach/status/1222391615576100864 - (void)reinterpret_cast(empty_kernel); - - ptx_version = CUB_PTX_ARCH; - #endif - } + NV_IF_TARGET( + NV_IS_HOST, + ( + cudaFuncAttributes empty_kernel_attrs; + + do + { + if (CubDebug(result = cudaFuncGetAttributes(&empty_kernel_attrs, + empty_kernel))) + { + break; + } + } while (0); + + ptx_version = empty_kernel_attrs.ptxVersion * 10; + ), + // NV_IS_DEVICE + ( + // This is necessary to ensure instantiation of EmptyKernel in device + // code. The `reinterpret_cast` is necessary to suppress a + // set-but-unused warnings. This is a meme now: + // https://twitter.com/blelbach/status/1222391615576100864 + (void)reinterpret_cast(empty_kernel); + + ptx_version = CUB_TEMP_GET_PTX; + )); + +#undef CUB_TEMP_GET_PTX + return result; } @@ -400,7 +400,6 @@ __host__ inline cudaError_t PtxVersionUncached(int& ptx_version, int device) return PtxVersionUncached(ptx_version); } -#if CUB_CPP_DIALECT >= 2011 // C++11 and later. template __host__ inline PerDeviceAttributeCache& GetPerDeviceAttributeCache() { @@ -411,7 +410,6 @@ __host__ inline PerDeviceAttributeCache& GetPerDeviceAttributeCache() struct PtxVersionCacheTag {}; struct SmVersionCacheTag {}; -#endif /** * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10). @@ -422,8 +420,6 @@ struct SmVersionCacheTag {}; */ __host__ inline cudaError_t PtxVersion(int& ptx_version, int device) { -#if CUB_CPP_DIALECT >= 2011 // C++11 and later. - auto const payload = GetPerDeviceAttributeCache()( // If this call fails, then we get the error code back in the payload, // which we check with `CubDebug` below. @@ -434,12 +430,6 @@ __host__ inline cudaError_t PtxVersion(int& ptx_version, int device) ptx_version = payload.attribute; return payload.error; - -#else // Pre C++11. - - return PtxVersionUncached(ptx_version, device); - -#endif } /** @@ -449,37 +439,31 @@ __host__ inline cudaError_t PtxVersion(int& ptx_version, int device) * * \note This function is thread safe. */ -CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int& ptx_version) +CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int &ptx_version) { - cudaError_t result = cudaErrorUnknown; - if (CUB_IS_HOST_CODE) { - #if CUB_INCLUDE_HOST_CODE - #if CUB_CPP_DIALECT >= 2011 - // Host code and C++11. - auto const device = CurrentDevice(); - - auto const payload = GetPerDeviceAttributeCache()( - // If this call fails, then we get the error code back in the payload, - // which we check with `CubDebug` below. 
- [=] (int& pv) { return PtxVersionUncached(pv, device); }, - device); - - if (!CubDebug(payload.error)) - ptx_version = payload.attribute; - - result = payload.error; - #else - // Host code and C++98. - result = PtxVersionUncached(ptx_version); - #endif - #endif - } else { - #if CUB_INCLUDE_DEVICE_CODE - // Device code. - result = PtxVersionUncached(ptx_version); - #endif - } - return result; + cudaError_t result = cudaErrorUnknown; + NV_IF_TARGET( + NV_IS_HOST, + ( + auto const device = CurrentDevice(); + auto const payload = GetPerDeviceAttributeCache()( + // If this call fails, then we get the error code back in the payload, + // which we check with `CubDebug` below. + [=](int &pv) { return PtxVersionUncached(pv, device); }, + device); + + if (!CubDebug(payload.error)) + { + ptx_version = payload.attribute; + } + + result = payload.error; + ), + ( // NV_IS_DEVICE: + result = PtxVersionUncached(ptx_version); + )); + + return result; } /** @@ -519,34 +503,32 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int d * * \note This function is thread safe. */ -CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int& sm_version, int device = CurrentDevice()) +CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int &sm_version, + int device = CurrentDevice()) { - cudaError_t result = cudaErrorUnknown; - if (CUB_IS_HOST_CODE) { - #if CUB_INCLUDE_HOST_CODE - #if CUB_CPP_DIALECT >= 2011 - // Host code and C++11 - auto const payload = GetPerDeviceAttributeCache()( - // If this call fails, then we get the error code back in the payload, - // which we check with `CubDebug` below. - [=] (int& pv) { return SmVersionUncached(pv, device); }, - device); - - if (!CubDebug(payload.error)) - sm_version = payload.attribute; - - result = payload.error; - #else - // Host code and C++98 - result = SmVersionUncached(sm_version, device); - #endif - #endif - } else { - #if CUB_INCLUDE_DEVICE_CODE - result = SmVersionUncached(sm_version, device); - #endif - } - return result; + cudaError_t result = cudaErrorUnknown; + + NV_IF_TARGET( + NV_IS_HOST, + ( + auto const payload = GetPerDeviceAttributeCache()( + // If this call fails, then we get the error code back in + // the payload, which we check with `CubDebug` below. + [=](int &pv) { return SmVersionUncached(pv, device); }, + device); + + if (!CubDebug(payload.error)) + { + sm_version = payload.attribute; + }; + + result = payload.error; + ), + ( // NV_IS_DEVICE + result = SmVersionUncached(sm_version, device); + )); + + return result; } /** @@ -554,24 +536,13 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int& sm_version, int device = */ CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream) { - cudaError_t result = cudaErrorUnknown; - if (CUB_IS_HOST_CODE) { - #if CUB_INCLUDE_HOST_CODE - result = CubDebug(cudaStreamSynchronize(stream)); - #endif - } else { - #if CUB_INCLUDE_DEVICE_CODE - #if defined(CUB_RUNTIME_ENABLED) // Device code with the CUDA runtime. - (void)stream; - // Device can't yet sync on a specific stream - result = CubDebug(cub::detail::device_synchronize()); - #else // Device code without the CUDA runtime. - (void)stream; - // CUDA API calls are not supported from this device. 
- result = CubDebug(cudaErrorInvalidConfiguration); - #endif - #endif - } + cudaError_t result = cudaErrorUnknown; + + NV_IF_TARGET(NV_IS_HOST, + (result = CubDebug(cudaStreamSynchronize(stream));), + ((void)stream; + result = CubDebug(cub::detail::device_synchronize());)); + return result; } diff --git a/test/test_device_segmented_sort.cu b/test/test_device_segmented_sort.cu index 5360279c81..32a764f864 100644 --- a/test/test_device_segmented_sort.cu +++ b/test/test_device_segmented_sort.cu @@ -29,6 +29,9 @@ #define CUB_STDERR #include + +#include + #include #include @@ -1500,61 +1503,56 @@ struct EdgeTestDispatch template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() { - if (CUB_IS_HOST_CODE) - { - #if CUB_INCLUDE_HOST_CODE - using SmallAndMediumPolicyT = - typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT; - using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; - - const int small_segment_max_segment_size = - SmallAndMediumPolicyT::SmallPolicyT::ITEMS_PER_TILE; - - const int items_per_small_segment = - SmallAndMediumPolicyT::SmallPolicyT::ITEMS_PER_THREAD; - - const int medium_segment_max_segment_size = - SmallAndMediumPolicyT::MediumPolicyT::ITEMS_PER_TILE; - - const int single_thread_segment_size = items_per_small_segment; - - const int large_cached_segment_max_segment_size = - LargeSegmentPolicyT::BLOCK_THREADS * - LargeSegmentPolicyT::ITEMS_PER_THREAD; - - for (bool sort_descending : {ascending, descending}) - { - Input edge_cases = - InputDescription() - .add({a_lot_of, empty_short_circuit_segment_size}) - .add({a_lot_of, copy_short_circuit_segment_size}) - .add({a_lot_of, swap_short_circuit_segment_size}) - .add({a_lot_of, swap_short_circuit_segment_size + 1}) - .add({a_lot_of, swap_short_circuit_segment_size + 1}) - .add({a_lot_of, single_thread_segment_size - 1}) - .add({a_lot_of, single_thread_segment_size }) - .add({a_lot_of, single_thread_segment_size + 1 }) - .add({a_lot_of, single_thread_segment_size * 2 - 1 }) - .add({a_lot_of, single_thread_segment_size * 2 }) - .add({a_lot_of, single_thread_segment_size * 2 + 1 }) - .add({a_bunch_of, small_segment_max_segment_size - 1}) - .add({a_bunch_of, small_segment_max_segment_size}) - .add({a_bunch_of, small_segment_max_segment_size + 1}) - .add({a_bunch_of, medium_segment_max_segment_size - 1}) - .add({a_bunch_of, medium_segment_max_segment_size}) - .add({a_bunch_of, medium_segment_max_segment_size + 1}) - .add({a_bunch_of, large_cached_segment_max_segment_size - 1}) - .add({a_bunch_of, large_cached_segment_max_segment_size}) - .add({a_bunch_of, large_cached_segment_max_segment_size + 1}) - .add({a_few, large_cached_segment_max_segment_size * 2}) - .add({a_few, large_cached_segment_max_segment_size * 3}) - .add({a_few, large_cached_segment_max_segment_size * 5}) - .template gen(sort_descending); - - InputTest(sort_descending, edge_cases); - } - #endif - } + NV_IF_TARGET(NV_IS_HOST, + (using SmallAndMediumPolicyT = + typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT; + using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; + + const int small_segment_max_segment_size = + SmallAndMediumPolicyT::SmallPolicyT::ITEMS_PER_TILE; + + const int items_per_small_segment = + SmallAndMediumPolicyT::SmallPolicyT::ITEMS_PER_THREAD; + + const int medium_segment_max_segment_size = + SmallAndMediumPolicyT::MediumPolicyT::ITEMS_PER_TILE; + + const int single_thread_segment_size = items_per_small_segment; + + const int large_cached_segment_max_segment_size = + 
LargeSegmentPolicyT::BLOCK_THREADS * + LargeSegmentPolicyT::ITEMS_PER_THREAD; + + for (bool sort_descending : {ascending, descending}) { + Input edge_cases = + InputDescription() + .add({a_lot_of, empty_short_circuit_segment_size}) + .add({a_lot_of, copy_short_circuit_segment_size}) + .add({a_lot_of, swap_short_circuit_segment_size}) + .add({a_lot_of, swap_short_circuit_segment_size + 1}) + .add({a_lot_of, swap_short_circuit_segment_size + 1}) + .add({a_lot_of, single_thread_segment_size - 1}) + .add({a_lot_of, single_thread_segment_size}) + .add({a_lot_of, single_thread_segment_size + 1}) + .add({a_lot_of, single_thread_segment_size * 2 - 1}) + .add({a_lot_of, single_thread_segment_size * 2}) + .add({a_lot_of, single_thread_segment_size * 2 + 1}) + .add({a_bunch_of, small_segment_max_segment_size - 1}) + .add({a_bunch_of, small_segment_max_segment_size}) + .add({a_bunch_of, small_segment_max_segment_size + 1}) + .add({a_bunch_of, medium_segment_max_segment_size - 1}) + .add({a_bunch_of, medium_segment_max_segment_size}) + .add({a_bunch_of, medium_segment_max_segment_size + 1}) + .add({a_bunch_of, large_cached_segment_max_segment_size - 1}) + .add({a_bunch_of, large_cached_segment_max_segment_size}) + .add({a_bunch_of, large_cached_segment_max_segment_size + 1}) + .add({a_few, large_cached_segment_max_segment_size * 2}) + .add({a_few, large_cached_segment_max_segment_size * 3}) + .add({a_few, large_cached_segment_max_segment_size * 5}) + .template gen(sort_descending); + + InputTest(sort_descending, edge_cases); + })); return cudaSuccess; } diff --git a/test/test_util.h b/test/test_util.h index f00c21bb9f..b1f3373aa1 100644 --- a/test/test_util.h +++ b/test/test_util.h @@ -51,13 +51,16 @@ #include "half.h" #include "bfloat16.h" -#include "cub/util_debug.cuh" -#include "cub/util_device.cuh" -#include "cub/util_type.cuh" -#include "cub/util_macro.cuh" -#include "cub/util_math.cuh" -#include "cub/util_ptx.cuh" -#include "cub/iterator/discard_output_iterator.cuh" +#include +#include +#include +#include +#include +#include +#include +#include + +#include /****************************************************************************** * Type conversion macros @@ -569,134 +572,121 @@ enum GenMode */ #pragma nv_exec_check_disable template -__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, std::size_t index = 0) +__host__ __device__ __forceinline__ +void InitValue(GenMode gen_mode, T &value, std::size_t index = 0) { - // RandomBits is host-only. - if (CUB_IS_DEVICE_CODE) - { - #if CUB_INCLUDE_DEVICE_CODE - switch (gen_mode) - { - case RANDOM: - case RANDOM_BIT: - case RANDOM_MINUS_PLUS_ZERO: - _CubLog("%s\n", - "cub::InitValue cannot generate random numbers on device."); - CUB_NS_QUALIFIER::ThreadTrap(); - break; - case UNIFORM: - value = 2; - break; - case INTEGER_SEED: - default: - value = (T) index; - break; - } - #endif // CUB_INCLUDE_DEVICE_CODE - } - else - { - #if CUB_INCLUDE_HOST_CODE - switch (gen_mode) - { - case RANDOM: - RandomBits(value); - break; - case RANDOM_BIT: - { - char c; - RandomBits(c, 0, 0, 1); - value = (c > 0) ? 
(T) 1 : (T) -1; - break; - } - case RANDOM_MINUS_PLUS_ZERO: - { - // Replace roughly 1/128 of values with -0.0 or +0.0, and generate the rest randomly - typedef typename CUB_NS_QUALIFIER::Traits::UnsignedBits UnsignedBits; - char c; - RandomBits(c); - if (c == 0) - { - // Replace 1/256 of values with +0.0 bit pattern - value = SafeBitCast(UnsignedBits(0)); - } - else if (c == 1) - { - // Replace 1/256 of values with -0.0 bit pattern - value = SafeBitCast(UnsignedBits(UnsignedBits(1) << - (sizeof(UnsignedBits) * 8) - 1)); - } - else - { - // 127/128 of values are random - RandomBits(value); - } - break; - } - case UNIFORM: - value = 2; - break; - case INTEGER_SEED: - default: - value = (T) index; - break; - } - #endif // CUB_INCLUDE_HOST_CODE - } + // RandomBits is host-only. + NV_IF_TARGET( + NV_IS_HOST, + ( + switch (gen_mode) { + case RANDOM: + RandomBits(value); + break; + case RANDOM_BIT: { + char c; + RandomBits(c, 0, 0, 1); + value = static_cast((c > 0) ? 1 : -1); + break; + } + case RANDOM_MINUS_PLUS_ZERO: { + // Replace roughly 1/128 of values with -0.0 or +0.0, and + // generate the rest randomly + using UnsignedBits = typename CUB_NS_QUALIFIER::Traits::UnsignedBits; + char c; + RandomBits(c); + if (c == 0) + { + // Replace 1/256 of values with +0.0 bit pattern + value = SafeBitCast(UnsignedBits(0)); + } + else if (c == 1) + { + // Replace 1/256 of values with -0.0 bit pattern + value = SafeBitCast( + UnsignedBits(UnsignedBits(1) << (sizeof(UnsignedBits) * 8) - 1)); + } + else + { + // 127/128 of values are random + RandomBits(value); + } + break; + } + case UNIFORM: + value = 2; + break; + case INTEGER_SEED: + default: + value = static_cast(index); + break; + }), + ( // NV_IS_DEVICE: + switch (gen_mode) { + case RANDOM: + case RANDOM_BIT: + case RANDOM_MINUS_PLUS_ZERO: + _CubLog("%s\n", + "cub::InitValue cannot generate random numbers on device."); + CUB_NS_QUALIFIER::ThreadTrap(); + break; + case UNIFORM: + value = 2; + break; + case INTEGER_SEED: + default: + value = static_cast(index); + break; + } + )); } - /** * Initialize value (bool) */ #pragma nv_exec_check_disable __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, bool &value, std::size_t index = 0) { - // RandomBits is host-only. - if (CUB_IS_DEVICE_CODE) - { - #if CUB_INCLUDE_DEVICE_CODE - switch (gen_mode) - { - case RANDOM: - case RANDOM_BIT: - case RANDOM_MINUS_PLUS_ZERO: - _CubLog("%s\n", - "cub::InitValue cannot generate random numbers on device."); - CUB_NS_QUALIFIER::ThreadTrap(); - break; - case UNIFORM: - value = true; - break; - case INTEGER_SEED: - default: - value = (index > 0); - break; - } - #endif // CUB_INCLUDE_DEVICE_CODE - } - else + // RandomBits is host-only. 
+ NV_IF_TARGET( + NV_IS_HOST, + ( + switch (gen_mode) + { + case RANDOM: + case RANDOM_BIT: + char c; + RandomBits(c, 0, 0, 1); + value = (c > 0); + break; + case UNIFORM: + value = true; + break; + case INTEGER_SEED: + default: + value = (index > 0); + break; + } + ), + ( // NV_IS_DEVICE, + switch (gen_mode) { - #if CUB_INCLUDE_HOST_CODE - switch (gen_mode) - { - case RANDOM: - case RANDOM_BIT: - case RANDOM_MINUS_PLUS_ZERO: - char c; - RandomBits(c, 0, 0, 1); - value = (c > 0); - break; - case UNIFORM: - value = true; - break; - case INTEGER_SEED: - default: - value = (index > 0); - break; - } - #endif // CUB_INCLUDE_HOST_CODE + case RANDOM: + case RANDOM_BIT: + case RANDOM_MINUS_PLUS_ZERO: + _CubLog("%s\n", + "cub::InitValue cannot generate random numbers on device."); + CUB_NS_QUALIFIER::ThreadTrap(); + break; + case UNIFORM: + value = true; + break; + case INTEGER_SEED: + default: + value = (index > 0); + break; } + )); } diff --git a/test/test_warp_reduce.cu b/test/test_warp_reduce.cu index 18308c1dba..d1569585c7 100644 --- a/test/test_warp_reduce.cu +++ b/test/test_warp_reduce.cu @@ -36,8 +36,10 @@ #include #include -#include #include +#include + +#include #include "test_util.h" @@ -67,16 +69,14 @@ struct WrapperFunctor template inline __host__ __device__ T operator()(const T &a, const T &b) const { - if (CUB_IS_DEVICE_CODE) - { - #if CUB_INCLUDE_DEVICE_CODE != 0 + NV_IF_TARGET(NV_IS_DEVICE, + ( if ((cub::LaneId() % LOGICAL_WARP_THREADS) >= num_valid) { _CubLog("%s\n", "Invalid lane ID in cub::WrapperFunctor::operator()"); cub::ThreadTrap(); } - #endif - } + )); return op(a, b); } From 476c1b8849b5fb3236240a8c59738ef59be69285 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Tue, 11 May 2021 17:05:46 -0400 Subject: [PATCH 5/7] Don't use host-only functions in host-device contexts. This fixes the issue reported in NVIDIA/CUB#299. There's no clear reason why this should use `RandomBits` unconditionally. --- test/test_util.h | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/test/test_util.h b/test/test_util.h index b1f3373aa1..d7b5d28f7c 100644 --- a/test/test_util.h +++ b/test/test_util.h @@ -714,22 +714,18 @@ __host__ __device__ __forceinline__ void InitValue( // This specialization only appears to be used by test_warp_scan. // It initializes with uniform values and random keys, so we need to // protect the call to the host-only RandomBits. - if (CUB_IS_DEVICE_CODE) - { - #if CUB_INCLUDE_DEVICE_CODE - _CubLog("%s\n", - "cub::InitValue cannot generate random numbers on device."); - CUB_NS_QUALIFIER::ThreadTrap(); - #endif // CUB_INCLUDE_DEVICE_CODE - } - else - { - #if CUB_INCLUDE_HOST_CODE - // Assign corresponding flag with a likelihood of the last bit being set with entropy-reduction level 3 - RandomBits(value.key, 3); - value.key = (value.key & 0x1); - #endif // CUB_INCLUDE_HOST_CODE - } + // clang-format off + NV_IF_TARGET(NV_IS_HOST, ( + // Assign corresponding flag with a likelihood of the last bit + // being set with entropy-reduction level 3 + RandomBits(value.key, 3); + value.key = (value.key & 0x1); + ), ( // NV_IS_DEVICE + _CubLog("%s\n", + "cub::InitValue cannot generate random numbers on device."); + CUB_NS_QUALIFIER::ThreadTrap(); + )); + // clang-format on } From f4d61fbb26c0c3b2ffd392bfbe6edbc60c0d5be3 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Wed, 23 Mar 2022 15:06:39 -0400 Subject: [PATCH 6/7] Use Thrust's kernel launch helper in DispatchRadixSort. 
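
Instead of the <<<grid, block, smem, stream>>> launch syntax, the
exclusive-sum and onesweep kernels are now launched through Thrust's
triple_chevron helper, which hands back the launch status directly
rather than requiring a separate cudaPeekAtLastError() call. Roughly
(a sketch only; kernel, args, grid, block, smem and stream stand in for
the actual values used at each call site, which are shown in the diff
below):

    // Before:
    //   kernel<<<grid, block, smem, stream>>>(args...);
    //   error = CubDebug(cudaPeekAtLastError());
    // After:
    error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
      grid, block, smem, stream
    ).doit(kernel, args...);
    if (CubDebug(error))
    {
      break;
    }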
--- cub/device/dispatch/dispatch_radix_sort.cuh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh index 6265232fad..39a6441a0c 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh @@ -1344,9 +1344,14 @@ struct DispatchRadixSort : // exclusive sums to determine starts const int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS; - DeviceRadixSortExclusiveSumKernel - <<>>(d_bins); - if (CubDebug(error = cudaPeekAtLastError())) break; + error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + num_passes, SCAN_BLOCK_THREADS, 0, stream + ).doit(DeviceRadixSortExclusiveSumKernel, + d_bins); + if (CubDebug(error)) + { + break; + } // use the other buffer if no overwrite is allowed KeyT* d_keys_tmp = d_keys.Alternate(); @@ -1374,7 +1379,8 @@ struct DispatchRadixSort : stream))) break; auto onesweep_kernel = DeviceRadixSortOnesweepKernel< MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT, PortionOffsetT>; - errror = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + + error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream ).doit(onesweep_kernel, d_lookback, d_ctrs + portion * num_passes + pass, From 4de961aee49c894e9c380d7c2f7e750016976f00 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Thu, 24 Mar 2022 12:50:41 -0400 Subject: [PATCH 7/7] Skip large allocation tests that exceed device memory. The merge sort test with pow2 >20 fails on GTX 1650. Detect bad_alloc failures and skip those tests. Tests for smaller problem sizes will still fail if there's a bad_alloc. --- test/test_device_merge_sort.cu | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/test/test_device_merge_sort.cu b/test/test_device_merge_sort.cu index 1a639027e1..52154ac177 100644 --- a/test/test_device_merge_sort.cu +++ b/test/test_device_merge_sort.cu @@ -49,6 +49,7 @@ #include #include +#include // for std::bad_alloc #include #include @@ -322,12 +323,29 @@ void Test(thrust::default_random_engine &rng) { for (unsigned int pow2 = 9; pow2 < 22; pow2 += 2) { - const unsigned int num_items = 1 << pow2; - AllocateAndTestIterators(num_items); - - - TestHelper::AllocateAndTest(rng, num_items); - Test(rng, num_items); + try + { + const unsigned int num_items = 1 << pow2; + AllocateAndTestIterators(num_items); + + TestHelper::AllocateAndTest(rng, num_items); + Test(rng, num_items); + } + catch (std::bad_alloc &e) + { + if (pow2 > 20) + { // Some cards don't have enough memory for large allocations, these + // can be skipped. + printf("Skipping large memory test. (num_items=2^%u): %s\n", + pow2, + e.what()); + } + else + { // For smaller problem sizes, treat as an error: + printf("Error (num_items=2^%u): %s", pow2, e.what()); + throw; + } + } } }
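
A note on the design choice in the last patch: it reacts to the failure
after the fact by catching std::bad_alloc, rather than estimating the
test's footprint and checking free device memory up front. For
comparison, a pre-check could look roughly like the sketch below (not
part of this change; bytes_needed is a hypothetical per-test estimate,
and the snippet is meant to sit inside the pow2 loop above):

    std::size_t free_bytes = 0;
    std::size_t total_bytes = 0;
    // cudaMemGetInfo reports the free and total device memory in bytes.
    if (cudaMemGetInfo(&free_bytes, &total_bytes) == cudaSuccess &&
        free_bytes < bytes_needed)
    {
      printf("Skipping large memory test (num_items=2^%u)\n", pow2);
      continue;
    }

Catching the exception keeps the test independent of any estimate of its
own memory use, at the cost of only noticing the problem once the
allocation has already been attempted.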