This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Merge pull request #448 from allisonvacanti/if_target_prep
Add libcu++ dependency; initial round of `NV_IF_TARGET` ports.
alliepiper authored May 17, 2022
2 parents 56dcb06 + 4de961a commit 5571258
Showing 63 changed files with 1,083 additions and 1,130 deletions.
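The ports below all follow the same pattern: compile-time dispatch on `CUB_PTX_ARCH` preprocessor checks is replaced by the `NV_IF_TARGET` macros from libcu++'s `<nv/target>` header (hence the new dependency), which emit exactly one of two code fragments for the architecture being compiled. A minimal sketch of the pattern — `half_less` is a hypothetical helper for illustration, not code from this commit:

#include <cuda_fp16.h>
#include <nv/target>

// Hypothetical helper showing the port pattern: use the native __half
// comparison on SM 5.3 and newer, and a float fallback on older targets,
// instead of an #if CUB_PTX_ARCH >= 530 / #else / #endif block.
__device__ bool half_less(__half lhs, __half rhs)
{
  NV_IF_TARGET(NV_PROVIDES_SM_53,
               (return lhs < rhs;),                              // native __half operator<
               (return __half2float(lhs) < __half2float(rhs);)); // pre-SM-5.3 fallback
}

Because only the selected fragment is emitted for a given target, code that compiles solely on newer architectures (such as the native `__half` operators) can sit in the taken branch without a separate preprocessor guard.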
7 changes: 1 addition & 6 deletions cub/agent/agent_histogram.cuh
@@ -103,7 +103,7 @@ template <
typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel
typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
typename OffsetT, ///< Signed integer type for global offsets
- int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability
+ int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused)
struct AgentHistogram
{
//---------------------------------------------------------------------
@@ -562,15 +562,10 @@ struct AgentHistogram
is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);

// Accumulate samples
- #if CUB_PTX_ARCH >= 120
if (prefer_smem)
AccumulateSmemPixels(samples, is_valid);
else
AccumulateGmemPixels(samples, is_valid);
- #else
- AccumulateGmemPixels(samples, is_valid);
- #endif

}


2 changes: 1 addition & 1 deletion cub/agent/agent_rle.cuh
@@ -117,7 +117,7 @@ struct AgentRle
// Constants
enum
{
- WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+ WARP_THREADS = CUB_WARP_THREADS(0),
BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS,
ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD,
WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD,
3 changes: 1 addition & 2 deletions cub/agent/agent_segment_fixup.cuh
@@ -111,8 +111,7 @@ struct AgentSegmentFixup
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,

// Whether or not do fixup using RLE + global atomics
- USE_ATOMIC_FIXUP = (CUB_PTX_ARCH >= 350) &&
- (std::is_same<ValueT, float>::value ||
+ USE_ATOMIC_FIXUP = (std::is_same<ValueT, float>::value ||
std::is_same<ValueT, int>::value ||
std::is_same<ValueT, unsigned int>::value ||
std::is_same<ValueT, unsigned long long>::value),
2 changes: 1 addition & 1 deletion cub/agent/agent_spmv_orig.cuh
@@ -115,7 +115,7 @@ template <
typename OffsetT, ///< Signed integer type for sequence offsets
bool HAS_ALPHA, ///< Whether the input parameter \p alpha is 1
bool HAS_BETA, ///< Whether the input parameter \p beta is 0
- int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability
+ int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused)
struct AgentSpmv
{
//---------------------------------------------------------------------
33 changes: 24 additions & 9 deletions cub/agent/agent_sub_warp_merge_sort.cuh
@@ -33,6 +33,8 @@
#include <cub/warp/warp_merge_sort.cuh>
#include <cub/warp/warp_store.cuh>

+ #include <nv/target>

#include <thrust/system/cuda/detail/core/util.h>


@@ -108,6 +110,23 @@ class AgentSubWarpSort
{
template <typename T>
__device__ bool operator()(T lhs, T rhs)
{
+ return this->impl(lhs, rhs);
+ }
+
+ #if defined(__CUDA_FP16_TYPES_EXIST__)
+ __device__ bool operator()(__half lhs, __half rhs)
+ {
+ // Need to explicitly cast to float for SM <= 52.
+ NV_IF_TARGET(NV_PROVIDES_SM_53,
+ (return this->impl(lhs, rhs);),
+ (return this->impl(__half2float(lhs), __half2float(rhs));));
+ }
+ #endif
+
+ private:
+ template <typename T>
+ __device__ bool impl(T lhs, T rhs)
+ {
if (IS_DESCENDING)
{
@@ -118,19 +137,15 @@ class AgentSubWarpSort
return lhs < rhs;
}
}

- #if defined(__CUDA_FP16_TYPES_EXIST__) && (CUB_PTX_ARCH < 530)
- __device__ bool operator()(__half lhs, __half rhs)
- {
- return (*this)(__half2float(lhs), __half2float(rhs));
- }
- #endif
};

- #if defined(__CUDA_FP16_TYPES_EXIST__) && (CUB_PTX_ARCH < 530)
+ #if defined(__CUDA_FP16_TYPES_EXIST__)
__device__ static bool equal(__half lhs, __half rhs)
{
- return __half2float(lhs) == __half2float(rhs);
+ // Need to explicitly cast to float for SM <= 52.
+ NV_IF_TARGET(NV_PROVIDES_SM_53,
+ (return lhs == rhs;),
+ (return __half2float(lhs) == __half2float(rhs);));
}
#endif

4 changes: 2 additions & 2 deletions cub/agent/single_pass_scan_operators.cuh
@@ -666,11 +666,11 @@ template <
typename T,
typename ScanOpT,
typename ScanTileStateT,
- int PTX_ARCH = CUB_PTX_ARCH>
+ int LEGACY_PTX_ARCH = 0>
struct TilePrefixCallbackOp
{
// Parameterized warp reduce
- typedef WarpReduce<T, CUB_PTX_WARP_THREADS, PTX_ARCH> WarpReduceT;
+ typedef WarpReduce<T, CUB_PTX_WARP_THREADS> WarpReduceT;

// Temporary storage type
struct _TempStorage
7 changes: 3 additions & 4 deletions cub/block/block_adjacent_difference.cuh
@@ -41,7 +41,6 @@

CUB_NAMESPACE_BEGIN


/**
* @brief BlockAdjacentDifference provides
* [<em>collective</em>](index.html#sec0) methods for computing the
@@ -125,9 +124,9 @@ CUB_NAMESPACE_BEGIN
*/
template <typename T,
int BLOCK_DIM_X,
- int BLOCK_DIM_Y = 1,
- int BLOCK_DIM_Z = 1,
- int PTX_ARCH = CUB_PTX_ARCH>
+ int BLOCK_DIM_Y = 1,
+ int BLOCK_DIM_Z = 1,
+ int LEGACY_PTX_ARCH = 0>
class BlockAdjacentDifference
{
private:
4 changes: 2 additions & 2 deletions cub/block/block_discontinuity.cuh
@@ -47,7 +47,7 @@ CUB_NAMESPACE_BEGIN
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH <b>[optional]</b> \ptxversion
+ * \tparam LEGACY_PTX_ARCH <b>[optional]</b> Unused.
*
* \par Overview
* - A set of "head flags" (or "tail flags") is often used to indicate corresponding items
@@ -107,7 +107,7 @@ template <
int BLOCK_DIM_X,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
- int PTX_ARCH = CUB_PTX_ARCH>
+ int LEGACY_PTX_ARCH = 0>
class BlockDiscontinuity
{
private:
9 changes: 4 additions & 5 deletions cub/block/block_exchange.cuh
@@ -50,7 +50,7 @@ CUB_NAMESPACE_BEGIN
* \tparam WARP_TIME_SLICING <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false)
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH <b>[optional]</b> \ptxversion
+ * \tparam LEGACY_PTX_ARCH <b>[optional]</b> Unused.
*
* \par Overview
* - It is commonplace for blocks of threads to rearrange data items between
@@ -114,7 +114,7 @@ template <
bool WARP_TIME_SLICING = false,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
- int PTX_ARCH = CUB_PTX_ARCH>
+ int LEGACY_PTX_ARCH = 0>
class BlockExchange
{
private:
@@ -129,11 +129,11 @@ private:
/// The thread block size in threads
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

- LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH),
+ LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0),
WARP_THREADS = 1 << LOG_WARP_THREADS,
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

- LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH),
+ LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0),
SMEM_BANKS = 1 << LOG_SMEM_BANKS,

TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
@@ -1126,4 +1126,3 @@ public:


CUB_NAMESPACE_END

20 changes: 4 additions & 16 deletions cub/block/block_histogram.cuh
@@ -94,7 +94,7 @@ enum BlockHistogramAlgorithm
* \tparam ALGORITHM <b>[optional]</b> cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT)
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH <b>[optional]</b> \ptxversion
+ * \tparam LEGACY_PTX_ARCH <b>[optional]</b> Unused.
*
* \par Overview
* - A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
@@ -160,7 +160,7 @@ template <
BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
- int PTX_ARCH = CUB_PTX_ARCH>
+ int LEGACY_PTX_ARCH = 0>
class BlockHistogram
{
private:
@@ -176,27 +176,15 @@ private:
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
};

- /**
- * Ensure the template parameterization meets the requirements of the
- * targeted device architecture. BLOCK_HISTO_ATOMIC can only be used
- * on version SM120 or later. Otherwise BLOCK_HISTO_SORT is used
- * regardless.
- */
- static const BlockHistogramAlgorithm SAFE_ALGORITHM =
- ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (PTX_ARCH < 120)) ?
- BLOCK_HISTO_SORT :
- ALGORITHM;
-
/// Internal specialization.
using InternalBlockHistogram =
- cub::detail::conditional_t<SAFE_ALGORITHM == BLOCK_HISTO_SORT,
+ cub::detail::conditional_t<ALGORITHM == BLOCK_HISTO_SORT,
BlockHistogramSort<T,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
BINS,
BLOCK_DIM_Y,
- BLOCK_DIM_Z,
- PTX_ARCH>,
+ BLOCK_DIM_Z>,
BlockHistogramAtomic<BINS>>;

/// Shared memory storage layout type for BlockHistogram
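For background, `BLOCK_HISTO_ATOMIC` bins samples with shared-memory atomics, which every architecture CUB still targets provides; the removed `SAFE_ALGORITHM` logic only downgraded it to `BLOCK_HISTO_SORT` on pre-SM-1.2 devices, per the deleted comment. A rough sketch of that kind of privatized histogram — `smem_histogram` is an illustrative standalone kernel, not CUB's implementation:

__global__ void smem_histogram(const unsigned char *in, int n, unsigned int *out)
{
  __shared__ unsigned int bins[256];

  // Zero the block-private bins.
  for (int i = threadIdx.x; i < 256; i += blockDim.x)
    bins[i] = 0;
  __syncthreads();

  // Accumulate this block's samples with shared-memory atomics.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x)
    atomicAdd(&bins[in[i]], 1u);
  __syncthreads();

  // Fold the block-private counts into the global histogram.
  for (int i = threadIdx.x; i < 256; i += blockDim.x)
    atomicAdd(&out[i], bins[i]);
}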
14 changes: 7 additions & 7 deletions cub/block/block_load.cuh
@@ -568,7 +568,7 @@ enum BlockLoadAlgorithm
* \tparam WARP_TIME_SLICING <b>[optional]</b> Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false)
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH <b>[optional]</b> \ptxversion
+ * \tparam LEGACY_PTX_ARCH <b>[optional]</b> Unused.
*
* \par Overview
* - The BlockLoad class provides a single data movement abstraction that can be specialized
@@ -638,7 +638,7 @@ template <
BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
- int PTX_ARCH = CUB_PTX_ARCH>
+ int LEGACY_PTX_ARCH = 0>
class BlockLoad
{
private:
@@ -860,7 +860,7 @@ private:
struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
{
// BlockExchange utility type for keys
- typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+ typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockExchange;

/// Shared memory storage layout type
struct _TempStorage : BlockExchange::TempStorage
@@ -928,14 +928,14 @@ private:
{
enum
{
- WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
+ WARP_THREADS = CUB_WARP_THREADS(0)
};

// Assert BLOCK_THREADS must be a multiple of WARP_THREADS
CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

// BlockExchange utility type for keys
- typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+ typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockExchange;

/// Shared memory storage layout type
struct _TempStorage : BlockExchange::TempStorage
@@ -1003,14 +1003,14 @@ private:
{
enum
{
- WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
+ WARP_THREADS = CUB_WARP_THREADS(0)
};

// Assert BLOCK_THREADS must be a multiple of WARP_THREADS
CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

// BlockExchange utility type for keys
- typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+ typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockExchange;

/// Shared memory storage layout type
struct _TempStorage : BlockExchange::TempStorage
18 changes: 8 additions & 10 deletions cub/block/block_radix_rank.cuh
@@ -104,7 +104,7 @@ struct BlockRadixRankEmptyCallback
* \tparam SMEM_CONFIG <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH <b>[optional]</b> \ptxversion
+ * \tparam LEGACY_PTX_ARCH <b>[optional]</b> Unused.
*
* \par Overview
* Blah...
@@ -138,12 +138,12 @@ template <
int BLOCK_DIM_X,
int RADIX_BITS,
bool IS_DESCENDING,
- bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false,
+ bool MEMOIZE_OUTER_SCAN = true,
BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
- int PTX_ARCH = CUB_PTX_ARCH>
+ int LEGACY_PTX_ARCH = 0>
class BlockRadixRank
{
private:
@@ -168,7 +168,7 @@ private:

RADIX_DIGITS = 1 << RADIX_BITS,

- LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH),
+ LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0),
WARP_THREADS = 1 << LOG_WARP_THREADS,
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

@@ -203,8 +203,7 @@ private:
BLOCK_DIM_X,
INNER_SCAN_ALGORITHM,
BLOCK_DIM_Y,
- BLOCK_DIM_Z,
- PTX_ARCH>
+ BLOCK_DIM_Z>
BlockScan;


@@ -508,7 +507,7 @@ template <
BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
- int PTX_ARCH = CUB_PTX_ARCH>
+ int LEGACY_PTX_ARCH = 0>
class BlockRadixRankMatch
{
private:
@@ -527,7 +526,7 @@ private:

RADIX_DIGITS = 1 << RADIX_BITS,

- LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH),
+ LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0),
WARP_THREADS = 1 << LOG_WARP_THREADS,
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

@@ -558,8 +557,7 @@ private:
BLOCK_THREADS,
INNER_SCAN_ALGORITHM,
BLOCK_DIM_Y,
- BLOCK_DIM_Z,
- PTX_ARCH>
+ BLOCK_DIM_Z>
BlockScanT;
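Since the retired `PTX_ARCH` template parameter was trailing and defaulted, and its `LEGACY_PTX_ARCH` replacement is likewise defaulted (and ignored), typical call sites should compile unchanged. A minimal usage sketch — `exclusive_sum_kernel` is an assumed example, and it presumes `cub::BlockScan` received the same treatment as the classes shown above:

#include <cub/block/block_scan.cuh>

__global__ void exclusive_sum_kernel(int *data)
{
  // 128-thread block; the trailing LEGACY_PTX_ARCH parameter keeps its
  // default and plays no role in tuning.
  using BlockScan = cub::BlockScan<int, 128>;
  __shared__ typename BlockScan::TempStorage temp_storage;

  int item = data[threadIdx.x];
  BlockScan(temp_storage).ExclusiveSum(item, item);
  data[threadIdx.x] = item;
}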

