diff --git a/cub/agent/agent_radix_sort_onesweep.cuh b/cub/agent/agent_radix_sort_onesweep.cuh
index 61c1be05e9..d0987fe468 100644
--- a/cub/agent/agent_radix_sort_onesweep.cuh
+++ b/cub/agent/agent_radix_sort_onesweep.cuh
@@ -33,7 +33,9 @@
 
 #pragma once
 
+#if !defined(__NVCOMPILER_CUDA__)
 #include <cuda/atomic>
+#endif
 
 #include "../block/block_radix_rank.cuh"
 #include "../block/radix_rank_sort_operations.cuh"
@@ -123,7 +125,11 @@ struct AgentRadixSortOnesweep
     };
 
     typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
+#if defined(__NVCOMPILER_CUDA__)
+    typedef OffsetT AtomicOffsetT;
+#else
     typedef typename cuda::atomic<OffsetT, cuda::thread_scope_device> AtomicOffsetT;
+#endif
 
     static const RadixRankAlgorithm RANK_ALGORITHM =
         AgentRadixSortOnesweepPolicy::RANK_ALGORITHM;
@@ -211,8 +217,13 @@ struct AgentRadixSortOnesweep
             if (FULL_BINS || bin < RADIX_DIGITS)
             {
                 // write the local sum into the bin
-                d_lookback[block_idx * RADIX_DIGITS + bin].store(
-                    bins[u] | LOOKBACK_PARTIAL_MASK, cuda::memory_order_relaxed);
+                AtomicOffsetT& loc = d_lookback[block_idx * RADIX_DIGITS + bin];
+                OffsetT value = bins[u] | LOOKBACK_PARTIAL_MASK;
+#if defined(__NVCOMPILER_CUDA__)
+                (volatile OffsetT&)loc = value;
+#else
+                loc.store(value, cuda::memory_order_relaxed);
+#endif
             }
         }
     }
@@ -255,18 +266,27 @@
                 for (OffsetT block_jdx = block_idx - 1; block_jdx >= 0; --block_jdx)
                 {
                     // wait for some value to appear
-                    OffsetT val = 0;
+                    OffsetT value_j = 0;
                     do {
-                        val = d_lookback[block_jdx * RADIX_DIGITS + bin].load(
-                            cuda::memory_order_relaxed);
-                    } while (val == 0);
-
-                    inc_sum += val & LOOKBACK_VALUE_MASK;
-                    want_mask = WARP_BALLOT((val & LOOKBACK_GLOBAL_MASK) == 0, want_mask);
-                    if (val & LOOKBACK_GLOBAL_MASK) break;
+                        AtomicOffsetT& loc_j = d_lookback[block_jdx * RADIX_DIGITS + bin];
+#if defined(__NVCOMPILER_CUDA__)
+                        value_j = (volatile OffsetT&)loc_j;
+#else
+                        value_j = loc_j.load(cuda::memory_order_relaxed);
+#endif
+                    } while (value_j == 0);
+
+                    inc_sum += value_j & LOOKBACK_VALUE_MASK;
+                    want_mask = WARP_BALLOT((value_j & LOOKBACK_GLOBAL_MASK) == 0, want_mask);
+                    if (value_j & LOOKBACK_GLOBAL_MASK) break;
                 }
-                d_lookback[block_idx * RADIX_DIGITS + bin].store(
-                    inc_sum | LOOKBACK_GLOBAL_MASK, cuda::memory_order_relaxed);
+                AtomicOffsetT& loc_i = d_lookback[block_idx * RADIX_DIGITS + bin];
+                OffsetT value_i = inc_sum | LOOKBACK_GLOBAL_MASK;
+#if defined(__NVCOMPILER_CUDA__)
+                (volatile OffsetT&)loc_i = value_i;
+#else
+                loc_i.store(value_i, cuda::memory_order_relaxed);
+#endif
                 s.global_offsets[bin] += inc_sum - bins[u];
             }
         }
@@ -656,7 +676,11 @@ struct AgentRadixSortOnesweep
         // initialization
        if (threadIdx.x == 0)
        {
+#if defined(__NVCOMPILER_CUDA__)
+            s.block_idx = atomicAdd(d_ctrs, 1);
+#else
            s.block_idx = d_ctrs->fetch_add(1, cuda::memory_order_relaxed);
+#endif
        }
        CTA_SYNC();
        block_idx = s.block_idx;
diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh
index 16c00b9101..17d0b20380 100644
--- a/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -34,7 +34,10 @@
 
 #pragma once
 
+#if !defined(__NVCOMPILER_CUDA__)
 #include <cuda/atomic>
+#endif
+
 #include <stdio.h>
 #include <iterator>
 
@@ -535,7 +538,12 @@ template <
     typename KeyT,
     typename ValueT,
     typename OffsetT,
-    typename AtomicOffsetT = cuda::atomic<OffsetT, cuda::thread_scope_device> >
+#if defined(__NVCOMPILER_CUDA__)
+    typename AtomicOffsetT = OffsetT
+#else
+    typename AtomicOffsetT = cuda::atomic<OffsetT, cuda::thread_scope_device>
+#endif
+    >
 __global__ void __launch_bounds__(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS)
 DeviceRadixSortOnesweepKernel
     (AtomicOffsetT* d_lookback, AtomicOffsetT* d_ctrs, OffsetT* d_bins_out,
@@ -1360,7 +1368,11 @@ struct DispatchRadixSort :
     cudaError_t InvokeOnesweep()
     {
         typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;
+#if defined(__NVCOMPILER_CUDA__)
+        typedef OffsetT AtomicOffsetT;
+#else
        typedef cuda::atomic<OffsetT, cuda::thread_scope_device> AtomicOffsetT;
+#endif
         // compute temporary storage size
         const int RADIX_BITS = ActivePolicyT::ONESWEEP_RADIX_BITS;
         const int RADIX_DIGITS = 1 << RADIX_BITS;