From 35118103c1305b7611cef47fc43ca81271871af2 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Thu, 18 Aug 2022 17:38:36 +0400 Subject: [PATCH 1/2] Deprecate CTK<9 --- cub/util_arch.cuh | 6 -- cub/util_ptx.cuh | 39 +-------- cub/warp/specializations/warp_reduce_shfl.cuh | 71 --------------- cub/warp/specializations/warp_scan_shfl.cuh | 87 ------------------- 4 files changed, 1 insertion(+), 202 deletions(-) diff --git a/cub/util_arch.cuh b/cub/util_arch.cuh index f4013de568..9891317f8a 100644 --- a/cub/util_arch.cuh +++ b/cub/util_arch.cuh @@ -44,12 +44,6 @@ CUB_NAMESPACE_BEGIN #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document -#if ((__CUDACC_VER_MAJOR__ >= 9) || defined(_NVHPC_CUDA) || \ - CUDA_VERSION >= 9000) && \ - !defined(CUB_USE_COOPERATIVE_GROUPS) -#define CUB_USE_COOPERATIVE_GROUPS -#endif - /// In device code, CUB_PTX_ARCH expands to the PTX version for which we are /// compiling. In host code, CUB_PTX_ARCH's value is implementation defined. #ifndef CUB_PTX_ARCH diff --git a/cub/util_ptx.cuh b/cub/util_ptx.cuh index 5b2a20486c..2a35f2e01c 100644 --- a/cub/util_ptx.cuh +++ b/cub/util_ptx.cuh @@ -253,9 +253,7 @@ __device__ __forceinline__ int CTA_SYNC_OR(int p) */ __device__ __forceinline__ void WARP_SYNC(unsigned int member_mask) { -#ifdef CUB_USE_COOPERATIVE_GROUPS __syncwarp(member_mask); -#endif } @@ -264,11 +262,7 @@ __device__ __forceinline__ void WARP_SYNC(unsigned int member_mask) */ __device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask) { -#ifdef CUB_USE_COOPERATIVE_GROUPS return __any_sync(member_mask, predicate); -#else - return ::__any(predicate); -#endif } @@ -277,11 +271,7 @@ __device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask */ __device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask) { -#ifdef CUB_USE_COOPERATIVE_GROUPS return __all_sync(member_mask, predicate); -#else - return ::__all(predicate); -#endif } @@ -290,11 +280,7 @@ __device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask */ __device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask) { -#ifdef CUB_USE_COOPERATIVE_GROUPS return __ballot_sync(member_mask, predicate); -#else - return __ballot(predicate); -#endif } @@ -304,13 +290,8 @@ __device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_m __device__ __forceinline__ unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) { -#ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;" : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask)); -#else - asm volatile("shfl.up.b32 %0, %1, %2, %3;" - : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags)); -#endif return word; } @@ -320,13 +301,8 @@ unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned __device__ __forceinline__ unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) { -#ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask)); -#else - asm volatile("shfl.down.b32 %0, %1, %2, %3;" - : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags)); -#endif return word; } @@ -336,13 +312,8 @@ unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsign __device__ __forceinline__ unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask) { -#ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask)); -#else - asm volatile("shfl.idx.b32 %0, %1, %2, %3;" - : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags)); -#endif return word; } @@ -352,11 +323,7 @@ unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned __device__ __forceinline__ unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, unsigned int member_mask) { -#ifdef CUB_USE_COOPERATIVE_GROUPS - return __shfl_sync(member_mask, word, src_lane); -#else - return __shfl(word, src_lane); -#endif + return __shfl_sync(member_mask, word, src_lane); } /** @@ -739,11 +706,7 @@ inline __device__ unsigned int MatchAny(unsigned int label) " .reg .pred p;\n" " and.b32 %0, %1, %2;" " setp.eq.u32 p, %0, %2;\n" -#ifdef CUB_USE_COOPERATIVE_GROUPS " vote.ballot.sync.b32 %0, p, 0xffffffff;\n" -#else - " vote.ballot.b32 %0, p;\n" -#endif " @!p not.b32 %0, %0;\n" "}\n" : "=r"(mask) : "r"(label), "r"(current_bit)); diff --git a/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/warp/specializations/warp_reduce_shfl.cuh index 83daf733e9..b45810774f 100644 --- a/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/warp/specializations/warp_reduce_shfl.cuh @@ -141,7 +141,6 @@ struct WarpReduceShfl int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .u32 r0;" @@ -151,17 +150,6 @@ struct WarpReduceShfl " mov.u32 %0, r0;" "}" : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u32 r0;" - " .reg .pred p;" - " shfl.down.b32 r0|p, %1, %2, %3;" - " @p add.u32 r0, r0, %4;" - " mov.u32 %0, r0;" - "}" - : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); -#endif return output; } @@ -178,7 +166,6 @@ struct WarpReduceShfl int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .f32 r0;" @@ -188,17 +175,6 @@ struct WarpReduceShfl " mov.f32 %0, r0;" "}" : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .f32 r0;" - " .reg .pred p;" - " shfl.down.b32 r0|p, %1, %2, %3;" - " @p add.f32 r0, r0, %4;" - " mov.f32 %0, r0;" - "}" - : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input)); -#endif return output; } @@ -214,7 +190,6 @@ struct WarpReduceShfl unsigned long long output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) -#ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .u32 lo;" @@ -227,20 +202,6 @@ struct WarpReduceShfl " @p add.u64 %0, %0, %1;" "}" : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.down.b32 lo|p, lo, %2, %3;" - " shfl.down.b32 hi|p, hi, %2, %3;" - " mov.b64 %0, {lo, hi};" - " @p add.u64 %0, %0, %1;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c)); -#endif return output; } @@ -257,7 +218,6 @@ struct WarpReduceShfl int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .u32 lo;" @@ -270,20 +230,6 @@ struct WarpReduceShfl " @p add.s64 %0, %0, %1;" "}" : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.down.b32 lo|p, lo, %2, %3;" - " shfl.down.b32 hi|p, hi, %2, %3;" - " mov.b64 %0, {lo, hi};" - " @p add.s64 %0, %0, %1;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c)); -#endif return output; } @@ -300,7 +246,6 @@ struct WarpReduceShfl int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .u32 lo;" @@ -315,22 +260,6 @@ struct WarpReduceShfl " @p add.f64 %0, %0, r0;" "}" : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " .reg .f64 r0;" - " mov.b64 %0, %1;" - " mov.b64 {lo, hi}, %1;" - " shfl.down.b32 lo|p, lo, %2, %3;" - " shfl.down.b32 hi|p, hi, %2, %3;" - " mov.b64 r0, {lo, hi};" - " @p add.f64 %0, %0, r0;" - "}" - : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c)); -#endif return output; } diff --git a/cub/warp/specializations/warp_scan_shfl.cuh b/cub/warp/specializations/warp_scan_shfl.cuh index 2f2207c1ac..c2988711c8 100644 --- a/cub/warp/specializations/warp_scan_shfl.cuh +++ b/cub/warp/specializations/warp_scan_shfl.cuh @@ -126,7 +126,6 @@ struct WarpScanShfl int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .s32 r0;" @@ -136,17 +135,6 @@ struct WarpScanShfl " mov.s32 %0, r0;" "}" : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .s32 r0;" - " .reg .pred p;" - " shfl.up.b32 r0|p, %1, %2, %3;" - " @p add.s32 r0, r0, %4;" - " mov.s32 %0, r0;" - "}" - : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); -#endif return output; } @@ -162,7 +150,6 @@ struct WarpScanShfl int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .u32 r0;" @@ -172,17 +159,6 @@ struct WarpScanShfl " mov.u32 %0, r0;" "}" : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u32 r0;" - " .reg .pred p;" - " shfl.up.b32 r0|p, %1, %2, %3;" - " @p add.u32 r0, r0, %4;" - " mov.u32 %0, r0;" - "}" - : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); -#endif return output; } @@ -199,7 +175,6 @@ struct WarpScanShfl int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .f32 r0;" @@ -209,17 +184,6 @@ struct WarpScanShfl " mov.f32 %0, r0;" "}" : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .f32 r0;" - " .reg .pred p;" - " shfl.up.b32 r0|p, %1, %2, %3;" - " @p add.f32 r0, r0, %4;" - " mov.f32 %0, r0;" - "}" - : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input)); -#endif return output; } @@ -236,7 +200,6 @@ struct WarpScanShfl int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .u64 r0;" @@ -251,22 +214,6 @@ struct WarpScanShfl " mov.u64 %0, r0;" "}" : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u64 r0;" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.up.b32 lo|p, lo, %2, %3;" - " shfl.up.b32 hi|p, hi, %2, %3;" - " mov.b64 r0, {lo, hi};" - " @p add.u64 r0, r0, %4;" - " mov.u64 %0, r0;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); -#endif return output; } @@ -283,7 +230,6 @@ struct WarpScanShfl int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .s64 r0;" @@ -298,22 +244,6 @@ struct WarpScanShfl " mov.s64 %0, r0;" "}" : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .s64 r0;" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.up.b32 lo|p, lo, %2, %3;" - " shfl.up.b32 hi|p, hi, %2, %3;" - " mov.b64 r0, {lo, hi};" - " @p add.s64 r0, r0, %4;" - " mov.s64 %0, r0;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); -#endif return output; } @@ -330,7 +260,6 @@ struct WarpScanShfl int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS asm volatile( "{" " .reg .u32 lo;" @@ -345,22 +274,6 @@ struct WarpScanShfl " @p add.f64 %0, %0, r0;" "}" : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " .reg .f64 r0;" - " mov.b64 %0, %1;" - " mov.b64 {lo, hi}, %1;" - " shfl.up.b32 lo|p, lo, %2, %3;" - " shfl.up.b32 hi|p, hi, %2, %3;" - " mov.b64 r0, {lo, hi};" - " @p add.f64 %0, %0, r0;" - "}" - : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c)); -#endif return output; } From c2beeed383732fbc61908ff05f66f65b01abc86b Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Thu, 18 Aug 2022 23:12:02 +0400 Subject: [PATCH 2/2] Deprecate CUB_USE_COOPERATIVE_GROUPS --- cub/util_arch.cuh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cub/util_arch.cuh b/cub/util_arch.cuh index 9891317f8a..d2506e93cf 100644 --- a/cub/util_arch.cuh +++ b/cub/util_arch.cuh @@ -44,6 +44,9 @@ CUB_NAMESPACE_BEGIN #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +// \deprecated [Since 2.1.0] +#define CUB_USE_COOPERATIVE_GROUPS + /// In device code, CUB_PTX_ARCH expands to the PTX version for which we are /// compiling. In host code, CUB_PTX_ARCH's value is implementation defined. #ifndef CUB_PTX_ARCH