diff --git a/cub/agent/agent_radix_sort_onesweep.cuh b/cub/agent/agent_radix_sort_onesweep.cuh
index 61c1be05e9..d0987fe468 100644
--- a/cub/agent/agent_radix_sort_onesweep.cuh
+++ b/cub/agent/agent_radix_sort_onesweep.cuh
@@ -33,7 +33,9 @@
 
 #pragma once
 
+#if !defined(__NVCOMPILER_CUDA__)
 #include <cuda/atomic>
+#endif
 
 #include "../block/block_radix_rank.cuh"
 #include "../block/radix_rank_sort_operations.cuh"
@@ -123,7 +125,11 @@ struct AgentRadixSortOnesweep
     };
 
     typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
+#if defined(__NVCOMPILER_CUDA__)
+    typedef OffsetT AtomicOffsetT;
+#else
     typedef typename cuda::atomic<OffsetT, cuda::thread_scope_device> AtomicOffsetT;
+#endif
 
     static const RadixRankAlgorithm RANK_ALGORITHM =
         AgentRadixSortOnesweepPolicy::RANK_ALGORITHM;
@@ -211,8 +217,13 @@ struct AgentRadixSortOnesweep
             if (FULL_BINS || bin < RADIX_DIGITS)
             {
                 // write the local sum into the bin
-                d_lookback[block_idx * RADIX_DIGITS + bin].store(
-                    bins[u] | LOOKBACK_PARTIAL_MASK, cuda::memory_order_relaxed);
+                AtomicOffsetT& loc = d_lookback[block_idx * RADIX_DIGITS + bin];
+                OffsetT value = bins[u] | LOOKBACK_PARTIAL_MASK;
+#if defined(__NVCOMPILER_CUDA__)
+                (volatile OffsetT&)loc = value;
+#else
+                loc.store(value, cuda::memory_order_relaxed);
+#endif
             }
         }
     }
@@ -255,18 +266,27 @@
                 for (OffsetT block_jdx = block_idx - 1; block_jdx >= 0; --block_jdx)
                 {
                     // wait for some value to appear
-                    OffsetT val = 0;
+                    OffsetT value_j = 0;
                     do {
-                        val = d_lookback[block_jdx * RADIX_DIGITS + bin].load(
-                            cuda::memory_order_relaxed);
-                    } while (val == 0);
-
-                    inc_sum += val & LOOKBACK_VALUE_MASK;
-                    want_mask = WARP_BALLOT((val & LOOKBACK_GLOBAL_MASK) == 0, want_mask);
-                    if (val & LOOKBACK_GLOBAL_MASK) break;
+                        AtomicOffsetT& loc_j = d_lookback[block_jdx * RADIX_DIGITS + bin];
+#if defined(__NVCOMPILER_CUDA__)
+                        value_j = (volatile OffsetT&)loc_j;
+#else
+                        value_j = loc_j.load(cuda::memory_order_relaxed);
+#endif
+                    } while (value_j == 0);
+
+                    inc_sum += value_j & LOOKBACK_VALUE_MASK;
+                    want_mask = WARP_BALLOT((value_j & LOOKBACK_GLOBAL_MASK) == 0, want_mask);
+                    if (value_j & LOOKBACK_GLOBAL_MASK) break;
                 }
-                d_lookback[block_idx * RADIX_DIGITS + bin].store(
-                    inc_sum | LOOKBACK_GLOBAL_MASK, cuda::memory_order_relaxed);
+                AtomicOffsetT& loc_i = d_lookback[block_idx * RADIX_DIGITS + bin];
+                OffsetT value_i = inc_sum | LOOKBACK_GLOBAL_MASK;
+#if defined(__NVCOMPILER_CUDA__)
+                (volatile OffsetT&)loc_i = value_i;
+#else
+                loc_i.store(value_i, cuda::memory_order_relaxed);
+#endif
                 s.global_offsets[bin] += inc_sum - bins[u];
             }
         }
@@ -656,7 +676,11 @@ struct AgentRadixSortOnesweep
         // initialization
        if (threadIdx.x == 0)
        {
+#if defined(__NVCOMPILER_CUDA__)
+            s.block_idx = atomicAdd(d_ctrs, 1);
+#else
            s.block_idx = d_ctrs->fetch_add(1, cuda::memory_order_relaxed);
+#endif
        }
        CTA_SYNC();
        block_idx = s.block_idx;
diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh
index 16c00b9101..17d0b20380 100644
--- a/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -34,7 +34,10 @@
 
 #pragma once
 
+#if !defined(__NVCOMPILER_CUDA__)
 #include <cuda/atomic>
+#endif
+
 #include <stdio.h>
 #include <iterator>
 
@@ -535,7 +538,12 @@ template <
     typename KeyT,
     typename ValueT,
     typename OffsetT,
-    typename AtomicOffsetT = cuda::atomic<OffsetT, cuda::thread_scope_device> >
+#if defined(__NVCOMPILER_CUDA__)
+    typename AtomicOffsetT = OffsetT
+#else
+    typename AtomicOffsetT = cuda::atomic<OffsetT, cuda::thread_scope_device>
+#endif
+    >
 __global__ void __launch_bounds__(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS)
 DeviceRadixSortOnesweepKernel
     (AtomicOffsetT* d_lookback, AtomicOffsetT* d_ctrs, OffsetT* d_bins_out,
@@ -1360,7 +1368,11 @@ struct DispatchRadixSort :
     cudaError_t InvokeOnesweep()
     {
         typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;
+#if defined(__NVCOMPILER_CUDA__)
+        typedef OffsetT AtomicOffsetT;
+#else
        typedef cuda::atomic<OffsetT, cuda::thread_scope_device> AtomicOffsetT;
+#endif
         // compute temporary storage size
         const int RADIX_BITS = ActivePolicyT::ONESWEEP_RADIX_BITS;
         const int RADIX_DIGITS = 1 << RADIX_BITS;