This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Remove run-to-run determinism guarantee from the scan documentation #432

Merged
98 changes: 53 additions & 45 deletions cub/device/device_scan.cuh
@@ -96,11 +96,10 @@ struct DeviceScan
*
* \par
* - Supports non-commutative sum operators.
-* - Provides "run-to-run" determinism for pseudo-associative reduction
-* (e.g., addition of floating point types) on the same GPU device.
-* However, results for pseudo-associative reduction may be inconsistent
-* from one device to a another device of a different compute-capability
-* because CUB can employ different tile-sizing for different architectures.
+* - Results are not deterministic for pseudo-associative operators (e.g.,
+* addition of floating-point types). Results for pseudo-associative
+* operators may vary from run to run. Additional details can be found in
+* the [decoupled look-back] description.
* - \devicestorage
*
* \par Performance
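The new wording above rests on floating-point addition being only pseudo-associative: the rounded result depends on how the operands are grouped, and a parallel scan groups partial sums in an order that can depend on run-time scheduling. A minimal host-side sketch (plain C++, not CUB code) of the grouping effect:

```cpp
#include <cstdio>

int main() {
    // Floating-point addition is not associative: regrouping changes rounding.
    float a = 1e8f, b = -1e8f, c = 1e-8f;
    printf("(a + b) + c = %.10e\n", (a + b) + c); // 1.0000000000e-08
    printf("a + (b + c) = %.10e\n", a + (b + c)); // 0.0000000000e+00 (c is absorbed by b)
    return 0;
}
```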
@@ -139,6 +138,8 @@ struct DeviceScan
*
* \tparam InputIteratorT <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
* \tparam OutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
+*
+* [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
*/
template <
typename InputIteratorT,
@@ -181,11 +182,10 @@ struct DeviceScan
*
* \par
* - Supports non-commutative scan operators.
-* - Provides "run-to-run" determinism for pseudo-associative reduction
-* (e.g., addition of floating point types) on the same GPU device.
-* However, results for pseudo-associative reduction may be inconsistent
-* from one device to a another device of a different compute-capability
-* because CUB can employ different tile-sizing for different architectures.
+* - Results are not deterministic for pseudo-associative operators (e.g.,
+* addition of floating-point types). Results for pseudo-associative
+* operators may vary from run to run. Additional details can be found in
+* the [decoupled look-back] description.
* - \devicestorage
*
* \par Snippet
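ExclusiveScan accepts any binary functor with a `T operator()(const T &a, const T &b)` member, plus an initial value. A hedged usage sketch (the device pointers and helper name are illustrative; the two NULL-then-allocate calls follow the \p d_temp_storage convention documented in the parameter list below):

```cpp
#include <climits>
#include <cub/device/device_scan.cuh>

// Binary scan functor with the required T operator()(const T&, const T&) member.
struct CustomMin {
    template <typename T>
    __host__ __device__ __forceinline__
    T operator()(const T &a, const T &b) const { return (b < a) ? b : a; }
};

// Illustrative helper: d_in and d_out are assumed to be device pointers.
void exclusive_min_scan(const int *d_in, int *d_out, int num_items)
{
    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;

    // First call with d_temp_storage == NULL: only the size query runs.
    cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes,
                                   d_in, d_out, CustomMin(), INT_MAX, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call performs the scan, seeded with INT_MAX.
    cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes,
                                   d_in, d_out, CustomMin(), INT_MAX, num_items);
    cudaFree(d_temp_storage);
}
```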
@@ -231,6 +231,8 @@ struct DeviceScan
* \tparam OutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
* \tparam ScanOp <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
* \tparam InitValueT <b>[inferred]</b> Type of the \p init_value used Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+*
+* [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
*/
template <
typename InputIteratorT,
@@ -272,7 +274,7 @@ struct DeviceScan
typename InitValueIterT=InitValueT*>
CUB_RUNTIME_FUNCTION
static cudaError_t ExclusiveScan(
-void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items
@@ -310,11 +312,10 @@ struct DeviceScan
*
* \par
* - Supports non-commutative sum operators.
-* - Provides "run-to-run" determinism for pseudo-associative reduction
-* (e.g., addition of floating point types) on the same GPU device.
-* However, results for pseudo-associative reduction may be inconsistent
-* from one device to a another device of a different compute-capability
-* because CUB can employ different tile-sizing for different architectures.
+* - Results are not deterministic for pseudo-associative operators (e.g.,
+* addition of floating-point types). Results for pseudo-associative
+* operators may vary from run to run. Additional details can be found in
+* the [decoupled look-back] description.
* - \devicestorage
*
* \par Snippet
@@ -346,6 +347,8 @@ struct DeviceScan
*
* \tparam InputIteratorT <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
* \tparam OutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
+*
+* [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
*/
template <
typename InputIteratorT,
@@ -381,11 +384,10 @@ struct DeviceScan
*
* \par
* - Supports non-commutative scan operators.
-* - Provides "run-to-run" determinism for pseudo-associative reduction
-* (e.g., addition of floating point types) on the same GPU device.
-* However, results for pseudo-associative reduction may be inconsistent
-* from one device to a another device of a different compute-capability
-* because CUB can employ different tile-sizing for different architectures.
+* - Results are not deterministic for pseudo-associative operators (e.g.,
+* addition of floating-point types). Results for pseudo-associative
+* operators may vary from run to run. Additional details can be found in
+* the [decoupled look-back] description.
* - \devicestorage
*
* \par Snippet
@@ -430,6 +432,8 @@ struct DeviceScan
* \tparam InputIteratorT <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
* \tparam OutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
* \tparam ScanOp <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+*
+* [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
*/
template <
typename InputIteratorT,
@@ -468,11 +472,10 @@ struct DeviceScan
*
* \par
* - Supports non-commutative sum operators.
-* - Provides "run-to-run" determinism for pseudo-associative reduction
-* (e.g., addition of floating point types) on the same GPU device.
-* However, results for pseudo-associative reduction may be inconsistent
-* from one device to a another device of a different compute-capability
-* because CUB can employ different tile-sizing for different architectures.
+* - Results are not deterministic for pseudo-associative operators (e.g.,
+* addition of floating-point types). Results for pseudo-associative
+* operators may vary from run to run. Additional details can be found in
+* the [decoupled look-back] description.
* - \devicestorage
*
* \par Snippet
@@ -507,6 +510,8 @@ struct DeviceScan
* \tparam ValuesInputIteratorT <b>[inferred]</b> Random-access input iterator type for reading scan values inputs \iterator
* \tparam ValuesOutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing scan values outputs \iterator
* \tparam EqualityOpT <b>[inferred]</b> Functor type having member <tt>T operator()(const T &a, const T &b)</tt> for binary operations that defines the equality of keys
+*
+* [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
*/
template <
typename KeysInputIteratorT,
@@ -515,7 +520,7 @@ struct DeviceScan
typename EqualityOpT = Equality>
CUB_RUNTIME_FUNCTION
static cudaError_t ExclusiveSumByKey(
-void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items
ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items
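ExclusiveSumByKey restarts the running sum at every key-segment boundary, as decided by the equality functor. A sketch under the parameter order visible in this hunk (keys, values in, values out, then the item count; treat the exact argument list as an assumption):

```cpp
#include <cub/device/device_scan.cuh>

// Illustrative helper. With device inputs
//   keys   = {0, 0, 1, 1, 1, 2}
//   values = {1, 1, 1, 1, 1, 1}
// the expected output is
//   out    = {0, 1, 0, 1, 2, 0}   // the exclusive sum restarts per key segment
void exclusive_sum_by_key(const int *d_keys_in, const int *d_values_in,
                          int *d_values_out, int num_items)
{
    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;

    // Same two-phase idiom: size query first, then the scan itself.
    cub::DeviceScan::ExclusiveSumByKey(d_temp_storage, temp_storage_bytes,
                                       d_keys_in, d_values_in, d_values_out,
                                       num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceScan::ExclusiveSumByKey(d_temp_storage, temp_storage_bytes,
                                       d_keys_in, d_values_in, d_values_out,
                                       num_items);
    cudaFree(d_temp_storage);
}
```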
@@ -558,11 +563,10 @@ struct DeviceScan
*
* \par
* - Supports non-commutative scan operators.
-* - Provides "run-to-run" determinism for pseudo-associative reduction
-* (e.g., addition of floating point types) on the same GPU device.
-* However, results for pseudo-associative reduction may be inconsistent
-* from one device to a another device of a different compute-capability
-* because CUB can employ different tile-sizing for different architectures.
+* - Results are not deterministic for pseudo-associative operators (e.g.,
+* addition of floating-point types). Results for pseudo-associative
+* operators may vary from run to run. Additional details can be found in
+* the [decoupled look-back] description.
* - \devicestorage
*
* \par Snippet
@@ -622,6 +626,8 @@ struct DeviceScan
* \tparam ScanOp <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
* \tparam InitValueT <b>[inferred]</b> Type of the \p init_value value used in Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
* \tparam EqualityOpT <b>[inferred]</b> Functor type having member <tt>T operator()(const T &a, const T &b)</tt> for binary operations that defines the equality of keys
+*
+* [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
*/
template <
typename KeysInputIteratorT,
@@ -632,7 +638,7 @@ struct DeviceScan
typename EqualityOpT = Equality>
CUB_RUNTIME_FUNCTION
static cudaError_t ExclusiveScanByKey(
-void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items
ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items
@@ -668,11 +674,10 @@ struct DeviceScan
*
* \par
* - Supports non-commutative sum operators.
-* - Provides "run-to-run" determinism for pseudo-associative reduction
-* (e.g., addition of floating point types) on the same GPU device.
-* However, results for pseudo-associative reduction may be inconsistent
-* from one device to a another device of a different compute-capability
-* because CUB can employ different tile-sizing for different architectures.
+* - Results are not deterministic for pseudo-associative operators (e.g.,
+* addition of floating-point types). Results for pseudo-associative
+* operators may vary from run to run. Additional details can be found in
+* the [decoupled look-back] description.
* - \devicestorage
*
* \par Snippet
@@ -707,6 +712,8 @@ struct DeviceScan
* \tparam ValuesInputIteratorT <b>[inferred]</b> Random-access input iterator type for reading scan values inputs \iterator
* \tparam ValuesOutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing scan values outputs \iterator
* \tparam EqualityOpT <b>[inferred]</b> Functor type having member <tt>T operator()(const T &a, const T &b)</tt> for binary operations that defines the equality of keys
+*
+* [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
*/
template <
typename KeysInputIteratorT,
@@ -715,7 +722,7 @@ struct DeviceScan
typename EqualityOpT = Equality>
CUB_RUNTIME_FUNCTION
static cudaError_t InclusiveSumByKey(
-void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items
ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items
@@ -750,11 +757,10 @@ struct DeviceScan
*
* \par
* - Supports non-commutative scan operators.
-* - Provides "run-to-run" determinism for pseudo-associative reduction
-* (e.g., addition of floating point types) on the same GPU device.
-* However, results for pseudo-associative reduction may be inconsistent
-* from one device to a another device of a different compute-capability
-* because CUB can employ different tile-sizing for different architectures.
+* - Results are not deterministic for pseudo-associative operators (e.g.,
+* addition of floating-point types). Results for pseudo-associative
+* operators may vary from run to run. Additional details can be found in
+* the [decoupled look-back] description.
* - \devicestorage
*
* \par Snippet
@@ -813,6 +819,8 @@ struct DeviceScan
* \tparam ValuesOutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing scan values outputs \iterator
* \tparam ScanOp <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
* \tparam EqualityOpT <b>[inferred]</b> Functor type having member <tt>T operator()(const T &a, const T &b)</tt> for binary operations that defines the equality of keys
+*
+* [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
*/
template <
typename KeysInputIteratorT,
@@ -822,7 +830,7 @@ struct DeviceScan
typename EqualityOpT = Equality>
CUB_RUNTIME_FUNCTION
static cudaError_t InclusiveScanByKey(
-void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items
ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items
8 changes: 4 additions & 4 deletions cub/device/dispatch/dispatch_scan_by_key.cuh
@@ -213,7 +213,7 @@ struct DispatchScanByKey:
InputT,
InitValueT>;

-void* d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+void* d_temp_storage; ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in; ///< [in] Iterator to the input sequence of key items
ValuesInputIteratorT d_values_in; ///< [in] Iterator to the input sequence of value items
@@ -228,7 +228,7 @@ struct DispatchScanByKey:

CUB_RUNTIME_FUNCTION __forceinline__
DispatchScanByKey(
-void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Iterator to the input sequence of key items
ValuesInputIteratorT d_values_in, ///< [in] Iterator to the input sequence of value items
@@ -388,12 +388,12 @@ struct DispatchScanByKey:
*/
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Dispatch(
-void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Iterator to the input sequence of key items
ValuesInputIteratorT d_values_in, ///< [in] Iterator to the input sequence of value items
ValuesOutputIteratorT d_values_out, ///< [out] Iterator to the input sequence of value items
-EqualityOp equality_op, ///< [in]Binary equality functor
+EqualityOp equality_op, ///< [in] Binary equality functor
ScanOpT scan_op, ///< [in] Binary scan functor
InitValueT init_value, ///< [in] Initial value to seed the exclusive scan
OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)