Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Support nvc++.
Browse files Browse the repository at this point in the history
  • Loading branch information
canonizer committed Sep 25, 2020
1 parent ea2f176 commit 520a7ac
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 13 deletions.
48 changes: 36 additions & 12 deletions cub/agent/agent_radix_sort_onesweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@

#pragma once

#if !defined(__NVCOMPILER_CUDA__)
#include <cuda/atomic>
#endif

#include "../block/block_radix_rank.cuh"
#include "../block/radix_rank_sort_operations.cuh"
Expand Down Expand Up @@ -123,7 +125,11 @@ struct AgentRadixSortOnesweep
};

typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
#if defined(__NVCOMPILER_CUDA__)
typedef OffsetT AtomicOffsetT;
#else
typedef typename cuda::atomic<OffsetT, cuda::thread_scope_device> AtomicOffsetT;
#endif

static const RadixRankAlgorithm RANK_ALGORITHM =
AgentRadixSortOnesweepPolicy::RANK_ALGORITHM;
Expand Down Expand Up @@ -211,8 +217,13 @@ struct AgentRadixSortOnesweep
if (FULL_BINS || bin < RADIX_DIGITS)
{
// write the local sum into the bin
d_lookback[block_idx * RADIX_DIGITS + bin].store(
bins[u] | LOOKBACK_PARTIAL_MASK, cuda::memory_order_relaxed);
AtomicOffsetT& loc = d_lookback[block_idx * RADIX_DIGITS + bin];
OffsetT value = bins[u] | LOOKBACK_PARTIAL_MASK;
#if defined(__NVCOMPILER_CUDA__)
(volatile OffsetT&)loc = value;
#else
loc.store(value, cuda::memory_order_relaxed);
#endif
}
}
}
Expand Down Expand Up @@ -255,18 +266,27 @@ struct AgentRadixSortOnesweep
for (OffsetT block_jdx = block_idx - 1; block_jdx >= 0; --block_jdx)
{
// wait for some value to appear
OffsetT val = 0;
OffsetT value_j = 0;
do {
val = d_lookback[block_jdx * RADIX_DIGITS + bin].load(
cuda::memory_order_relaxed);
} while (val == 0);

inc_sum += val & LOOKBACK_VALUE_MASK;
want_mask = WARP_BALLOT((val & LOOKBACK_GLOBAL_MASK) == 0, want_mask);
if (val & LOOKBACK_GLOBAL_MASK) break;
AtomicOffsetT& loc_j = d_lookback[block_jdx * RADIX_DIGITS + bin];
#if defined(__NVCOMPILER_CUDA__)
value_j = (volatile OffsetT&)loc_j;
#else
value_j = loc_j.load(cuda::memory_order_relaxed);
#endif
} while (value_j == 0);

inc_sum += value_j & LOOKBACK_VALUE_MASK;
want_mask = WARP_BALLOT((value_j & LOOKBACK_GLOBAL_MASK) == 0, want_mask);
if (value_j & LOOKBACK_GLOBAL_MASK) break;
}
d_lookback[block_idx * RADIX_DIGITS + bin].store(
inc_sum | LOOKBACK_GLOBAL_MASK, cuda::memory_order_relaxed);
AtomicOffsetT& loc_i = d_lookback[block_idx * RADIX_DIGITS + bin];
OffsetT value_i = inc_sum | LOOKBACK_GLOBAL_MASK;
#if defined(__NVCOMPILER_CUDA__)
(volatile OffsetT&)loc_i = value_i;
#else
loc_i.store(value_i, cuda::memory_order_relaxed);
#endif
s.global_offsets[bin] += inc_sum - bins[u];
}
}
Expand Down Expand Up @@ -656,7 +676,11 @@ struct AgentRadixSortOnesweep
// initialization
if (threadIdx.x == 0)
{
#if defined(__NVCOMPILER_CUDA__)
s.block_idx = atomicAdd(d_ctrs, 1);
#else
s.block_idx = d_ctrs->fetch_add(1, cuda::memory_order_relaxed);
#endif
}
CTA_SYNC();
block_idx = s.block_idx;
Expand Down
14 changes: 13 additions & 1 deletion cub/device/dispatch/dispatch_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,10 @@

#pragma once

#if !defined(__NVCOMPILER_CUDA__)
#include <cuda/atomic>
#endif

#include <stdio.h>
#include <iterator>

Expand Down Expand Up @@ -535,7 +538,12 @@ template <
typename KeyT,
typename ValueT,
typename OffsetT,
typename AtomicOffsetT = cuda::atomic<OffsetT, cuda::thread_scope_device> >
#if defined(__NVCOMPILER_CUDA__)
typename AtomicOffsetT = OffsetT
#else
typename AtomicOffsetT = cuda::atomic<OffsetT, cuda::thread_scope_device>
#endif
>
__global__ void __launch_bounds__(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS)
DeviceRadixSortOnesweepKernel
(AtomicOffsetT* d_lookback, AtomicOffsetT* d_ctrs, OffsetT* d_bins_out,
Expand Down Expand Up @@ -1360,7 +1368,11 @@ struct DispatchRadixSort :
cudaError_t InvokeOnesweep()
{
typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;
#if defined(__NVCOMPILER_CUDA__)
typedef OffsetT AtomicOffsetT;
#else
typedef cuda::atomic<OffsetT, cuda::thread_scope_device> AtomicOffsetT;
#endif
// compute temporary storage size
const int RADIX_BITS = ActivePolicyT::ONESWEEP_RADIX_BITS;
const int RADIX_DIGITS = 1 << RADIX_BITS;
Expand Down

0 comments on commit 520a7ac

Please sign in to comment.