
Merge pull request #432 from senior-zero/fix-main/github/scan_documentation

Remove run-to-run determinism guarantee from the scan documentation
gevtushenko authored Feb 8, 2022
2 parents cba758a + 7a8320c commit 5e76177
Showing 2 changed files with 57 additions and 49 deletions.
98 changes: 53 additions & 45 deletions cub/device/device_scan.cuh
@@ -96,11 +96,10 @@ struct DeviceScan
*
* \par
* - Supports non-commutative sum operators.
- * - Provides "run-to-run" determinism for pseudo-associative reduction
- *   (e.g., addition of floating point types) on the same GPU device.
- *   However, results for pseudo-associative reduction may be inconsistent
- *   from one device to a another device of a different compute-capability
- *   because CUB can employ different tile-sizing for different architectures.
+ * - Results are not deterministic for pseudo-associative operators (e.g.,
+ *   addition of floating-point types). Results for pseudo-associative
+ *   operators may vary from run to run. Additional details can be found in
+ *   the [decoupled look-back] description.
* - \devicestorage
*
* \par Performance
@@ -139,6 +138,8 @@ struct DeviceScan
*
* \tparam InputIteratorT <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
* \tparam OutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
+ *
+ * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
*/
template <
typename InputIteratorT,
@@ -181,11 +182,10 @@ struct DeviceScan
*
* \par
* - Supports non-commutative scan operators.
- * - Provides "run-to-run" determinism for pseudo-associative reduction
- *   (e.g., addition of floating point types) on the same GPU device.
- *   However, results for pseudo-associative reduction may be inconsistent
- *   from one device to a another device of a different compute-capability
- *   because CUB can employ different tile-sizing for different architectures.
+ * - Results are not deterministic for pseudo-associative operators (e.g.,
+ *   addition of floating-point types). Results for pseudo-associative
+ *   operators may vary from run to run. Additional details can be found in
+ *   the [decoupled look-back] description.
* - \devicestorage
*
* \par Snippet
@@ -231,6 +231,8 @@ struct DeviceScan
* \tparam OutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
* \tparam ScanOp <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
 * \tparam InitValueT <b>[inferred]</b> Type of the \p init_value used in the binary scan functor having member <tt>T operator()(const T &a, const T &b)</tt>
+ *
+ * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
*/
template <
typename InputIteratorT,
@@ -272,7 +274,7 @@ struct DeviceScan
typename InitValueIterT=InitValueT*>
CUB_RUNTIME_FUNCTION
static cudaError_t ExclusiveScan(
- void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items
@@ -310,11 +312,10 @@ struct DeviceScan
*
* \par
* - Supports non-commutative sum operators.
- * - Provides "run-to-run" determinism for pseudo-associative reduction
- *   (e.g., addition of floating point types) on the same GPU device.
- *   However, results for pseudo-associative reduction may be inconsistent
- *   from one device to a another device of a different compute-capability
- *   because CUB can employ different tile-sizing for different architectures.
+ * - Results are not deterministic for pseudo-associative operators (e.g.,
+ *   addition of floating-point types). Results for pseudo-associative
+ *   operators may vary from run to run. Additional details can be found in
+ *   the [decoupled look-back] description.
* - \devicestorage
*
* \par Snippet
@@ -346,6 +347,8 @@ struct DeviceScan
*
* \tparam InputIteratorT <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
* \tparam OutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
+ *
+ * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
*/
template <
typename InputIteratorT,
@@ -381,11 +384,10 @@ struct DeviceScan
*
* \par
* - Supports non-commutative scan operators.
- * - Provides "run-to-run" determinism for pseudo-associative reduction
- *   (e.g., addition of floating point types) on the same GPU device.
- *   However, results for pseudo-associative reduction may be inconsistent
- *   from one device to a another device of a different compute-capability
- *   because CUB can employ different tile-sizing for different architectures.
+ * - Results are not deterministic for pseudo-associative operators (e.g.,
+ *   addition of floating-point types). Results for pseudo-associative
+ *   operators may vary from run to run. Additional details can be found in
+ *   the [decoupled look-back] description.
* - \devicestorage
*
* \par Snippet
@@ -430,6 +432,8 @@ struct DeviceScan
* \tparam InputIteratorT <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
* \tparam OutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
* \tparam ScanOp <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+ *
+ * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
*/
template <
typename InputIteratorT,
@@ -468,11 +472,10 @@ struct DeviceScan
*
* \par
* - Supports non-commutative sum operators.
- * - Provides "run-to-run" determinism for pseudo-associative reduction
- *   (e.g., addition of floating point types) on the same GPU device.
- *   However, results for pseudo-associative reduction may be inconsistent
- *   from one device to a another device of a different compute-capability
- *   because CUB can employ different tile-sizing for different architectures.
+ * - Results are not deterministic for pseudo-associative operators (e.g.,
+ *   addition of floating-point types). Results for pseudo-associative
+ *   operators may vary from run to run. Additional details can be found in
+ *   the [decoupled look-back] description.
* - \devicestorage
*
* \par Snippet
@@ -507,6 +510,8 @@ struct DeviceScan
* \tparam ValuesInputIteratorT <b>[inferred]</b> Random-access input iterator type for reading scan values inputs \iterator
* \tparam ValuesOutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing scan values outputs \iterator
 * \tparam EqualityOpT <b>[inferred]</b> Functor type having member <tt>T operator()(const T &a, const T &b)</tt> for binary operations that define the equality of keys
+ *
+ * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
*/
template <
typename KeysInputIteratorT,
@@ -515,7 +520,7 @@ struct DeviceScan
typename EqualityOpT = Equality>
CUB_RUNTIME_FUNCTION
static cudaError_t ExclusiveSumByKey(
- void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items
ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items
@@ -558,11 +563,10 @@ struct DeviceScan
*
* \par
* - Supports non-commutative scan operators.
- * - Provides "run-to-run" determinism for pseudo-associative reduction
- *   (e.g., addition of floating point types) on the same GPU device.
- *   However, results for pseudo-associative reduction may be inconsistent
- *   from one device to a another device of a different compute-capability
- *   because CUB can employ different tile-sizing for different architectures.
+ * - Results are not deterministic for pseudo-associative operators (e.g.,
+ *   addition of floating-point types). Results for pseudo-associative
+ *   operators may vary from run to run. Additional details can be found in
+ *   the [decoupled look-back] description.
* - \devicestorage
*
* \par Snippet
@@ -622,6 +626,8 @@ struct DeviceScan
* \tparam ScanOp <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
 * \tparam InitValueT <b>[inferred]</b> Type of the \p init_value used in the binary scan functor having member <tt>T operator()(const T &a, const T &b)</tt>
 * \tparam EqualityOpT <b>[inferred]</b> Functor type having member <tt>T operator()(const T &a, const T &b)</tt> for binary operations that define the equality of keys
+ *
+ * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
*/
template <
typename KeysInputIteratorT,
@@ -632,7 +638,7 @@ struct DeviceScan
typename EqualityOpT = Equality>
CUB_RUNTIME_FUNCTION
static cudaError_t ExclusiveScanByKey(
- void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items
ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items
@@ -668,11 +674,10 @@ struct DeviceScan
*
* \par
* - Supports non-commutative sum operators.
- * - Provides "run-to-run" determinism for pseudo-associative reduction
- *   (e.g., addition of floating point types) on the same GPU device.
- *   However, results for pseudo-associative reduction may be inconsistent
- *   from one device to a another device of a different compute-capability
- *   because CUB can employ different tile-sizing for different architectures.
+ * - Results are not deterministic for pseudo-associative operators (e.g.,
+ *   addition of floating-point types). Results for pseudo-associative
+ *   operators may vary from run to run. Additional details can be found in
+ *   the [decoupled look-back] description.
* - \devicestorage
*
* \par Snippet
@@ -707,6 +712,8 @@ struct DeviceScan
* \tparam ValuesInputIteratorT <b>[inferred]</b> Random-access input iterator type for reading scan values inputs \iterator
* \tparam ValuesOutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing scan values outputs \iterator
 * \tparam EqualityOpT <b>[inferred]</b> Functor type having member <tt>T operator()(const T &a, const T &b)</tt> for binary operations that define the equality of keys
+ *
+ * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
*/
template <
typename KeysInputIteratorT,
@@ -715,7 +722,7 @@ struct DeviceScan
typename EqualityOpT = Equality>
CUB_RUNTIME_FUNCTION
static cudaError_t InclusiveSumByKey(
- void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items
ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items
@@ -750,11 +757,10 @@ struct DeviceScan
*
* \par
* - Supports non-commutative scan operators.
- * - Provides "run-to-run" determinism for pseudo-associative reduction
- *   (e.g., addition of floating point types) on the same GPU device.
- *   However, results for pseudo-associative reduction may be inconsistent
- *   from one device to a another device of a different compute-capability
- *   because CUB can employ different tile-sizing for different architectures.
+ * - Results are not deterministic for pseudo-associative operators (e.g.,
+ *   addition of floating-point types). Results for pseudo-associative
+ *   operators may vary from run to run. Additional details can be found in
+ *   the [decoupled look-back] description.
* - \devicestorage
*
* \par Snippet
@@ -813,6 +819,8 @@ struct DeviceScan
* \tparam ValuesOutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing scan values outputs \iterator
* \tparam ScanOp <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
 * \tparam EqualityOpT <b>[inferred]</b> Functor type having member <tt>T operator()(const T &a, const T &b)</tt> for binary operations that define the equality of keys
+ *
+ * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
*/
template <
typename KeysInputIteratorT,
@@ -822,7 +830,7 @@ struct DeviceScan
typename EqualityOpT = Equality>
CUB_RUNTIME_FUNCTION
static cudaError_t InclusiveScanByKey(
- void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items
ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items
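Every entry point touched above documents the same two-phase `d_temp_storage` convention: a first call with `d_temp_storage == NULL` only writes the required allocation size to `temp_storage_bytes` and does no work, and a second call performs the scan. A minimal sketch of that pattern using `cub::DeviceScan::ExclusiveSum`; the device buffers, element type, and omitted error handling are assumptions for illustration, not part of this diff:

```cpp
#include <cub/device/device_scan.cuh>
#include <cuda_runtime.h>

// Hypothetical helper: d_in and d_out are device pointers supplied by the caller.
void exclusive_sum_sketch(const int *d_in, int *d_out, int num_items)
{
    void  *d_temp_storage     = nullptr; // NULL => size query only
    size_t temp_storage_bytes = 0;

    // Phase 1: no work is done; the required size is written to temp_storage_bytes.
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,
                                  d_in, d_out, num_items);

    // Phase 2: allocate the requested temporary storage, then run the scan.
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,
                                  d_in, d_out, num_items);

    cudaFree(d_temp_storage);
}
```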
8 changes: 4 additions & 4 deletions cub/device/dispatch/dispatch_scan_by_key.cuh
@@ -213,7 +213,7 @@ struct DispatchScanByKey:
InputT,
InitValueT>;

- void* d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ void* d_temp_storage; ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in; ///< [in] Iterator to the input sequence of key items
ValuesInputIteratorT d_values_in; ///< [in] Iterator to the input sequence of value items
@@ -228,7 +228,7 @@ struct DispatchScanByKey:

CUB_RUNTIME_FUNCTION __forceinline__
DispatchScanByKey(
- void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Iterator to the input sequence of key items
ValuesInputIteratorT d_values_in, ///< [in] Iterator to the input sequence of value items
@@ -388,12 +388,12 @@ struct DispatchScanByKey:
*/
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Dispatch(
- void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Iterator to the input sequence of key items
ValuesInputIteratorT d_values_in, ///< [in] Iterator to the input sequence of value items
ValuesOutputIteratorT d_values_out, ///< [out] Iterator to the input sequence of value items
- EqualityOp equality_op, ///< [in]Binary equality functor
+ EqualityOp equality_op, ///< [in] Binary equality functor
ScanOpT scan_op, ///< [in] Binary scan functor
InitValueT init_value, ///< [in] Initial value to seed the exclusive scan
OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
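The rewritten bullets stop promising bitwise-identical results because the scan operator is only pseudo-associative: floating-point addition is not associative, so any run-to-run change in how partial sums are grouped (for example, in how tile prefixes are combined under the [decoupled look-back] scheme the docs now link to) can change the low-order bits of the output. A short host-side sketch of the root cause; this is an illustration, not CUB code:

```cpp
#include <cstdio>

int main()
{
    // In float precision, 0.1f is smaller than one ulp of 1.0e8f,
    // so the grouping of the additions decides whether it survives.
    float a = 1.0e8f, b = -1.0e8f, c = 0.1f;

    float left  = (a + b) + c; // large terms cancel first -> 0.1f
    float right = a + (b + c); // 0.1f is absorbed into b  -> 0.0f

    std::printf("(a + b) + c = %.8f\n", left);  // prints 0.10000000
    std::printf("a + (b + c) = %.8f\n", right); // prints 0.00000000
    return 0;
}
```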
