From 7a8320c3ab1aa0d73946f443036c20edf85f68be Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Tue, 8 Feb 2022 02:58:11 +0300 Subject: [PATCH] Remove run-to-run determinism guarantee from the documentation --- cub/device/device_scan.cuh | 98 +++++++++++--------- cub/device/dispatch/dispatch_scan_by_key.cuh | 8 +- 2 files changed, 57 insertions(+), 49 deletions(-) diff --git a/cub/device/device_scan.cuh b/cub/device/device_scan.cuh index b66dcf7dd3..199afc1487 100644 --- a/cub/device/device_scan.cuh +++ b/cub/device/device_scan.cuh @@ -96,11 +96,10 @@ struct DeviceScan * * \par * - Supports non-commutative sum operators. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. * - \devicestorage * * \par Performance @@ -139,6 +138,8 @@ struct DeviceScan * * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template < typename InputIteratorT, @@ -181,11 +182,10 @@ struct DeviceScan * * \par * - Supports non-commutative scan operators. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. * - \devicestorage * * \par Snippet @@ -231,6 +231,8 @@ struct DeviceScan * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) * \tparam InitValueT [inferred] Type of the \p init_value used Binary scan functor type having member T operator()(const T &a, const T &b) + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template < typename InputIteratorT, @@ -272,7 +274,7 @@ struct DeviceScan typename InitValueIterT=InitValueT*> CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items @@ -310,11 +312,10 @@ struct DeviceScan * * \par * - Supports non-commutative sum operators. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. * - \devicestorage * * \par Snippet @@ -346,6 +347,8 @@ struct DeviceScan * * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template < typename InputIteratorT, @@ -381,11 +384,10 @@ struct DeviceScan * * \par * - Supports non-commutative scan operators. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. * - \devicestorage * * \par Snippet @@ -430,6 +432,8 @@ struct DeviceScan * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template < typename InputIteratorT, @@ -468,11 +472,10 @@ struct DeviceScan * * \par * - Supports non-commutative sum operators. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. * - \devicestorage * * \par Snippet @@ -507,6 +510,8 @@ struct DeviceScan * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading scan values inputs \iterator * \tparam ValuesOutputIteratorT [inferred] Random-access output iterator type for writing scan values outputs \iterator * \tparam EqualityOpT [inferred][/b] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template < typename KeysInputIteratorT, @@ -515,7 +520,7 @@ struct DeviceScan typename EqualityOpT = Equality> CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items @@ -558,11 +563,10 @@ struct DeviceScan * * \par * - Supports non-commutative scan operators. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. * - \devicestorage * * \par Snippet @@ -622,6 +626,8 @@ struct DeviceScan * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) * \tparam InitValueT [inferred] Type of the \p init_value value used in Binary scan functor type having member T operator()(const T &a, const T &b) * \tparam EqualityOpT [inferred][/b] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template < typename KeysInputIteratorT, @@ -632,7 +638,7 @@ struct DeviceScan typename EqualityOpT = Equality> CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items @@ -668,11 +674,10 @@ struct DeviceScan * * \par * - Supports non-commutative sum operators. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. * - \devicestorage * * \par Snippet @@ -707,6 +712,8 @@ struct DeviceScan * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading scan values inputs \iterator * \tparam ValuesOutputIteratorT [inferred] Random-access output iterator type for writing scan values outputs \iterator * \tparam EqualityOpT [inferred][/b] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template < typename KeysInputIteratorT, @@ -715,7 +722,7 @@ struct DeviceScan typename EqualityOpT = Equality> CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items @@ -750,11 +757,10 @@ struct DeviceScan * * \par * - Supports non-commutative scan operators. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. * - \devicestorage * * \par Snippet @@ -813,6 +819,8 @@ struct DeviceScan * \tparam ValuesOutputIteratorT [inferred] Random-access output iterator type for writing scan values outputs \iterator * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) * \tparam EqualityOpT [inferred][/b] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template < typename KeysInputIteratorT, @@ -822,7 +830,7 @@ struct DeviceScan typename EqualityOpT = Equality> CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items diff --git a/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/device/dispatch/dispatch_scan_by_key.cuh index 58f5c3bcdb..ab71990e88 100644 --- a/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -213,7 +213,7 @@ struct DispatchScanByKey: InputT, InitValueT>; - void* d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + void* d_temp_storage; ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation KeysInputIteratorT d_keys_in; ///< [in] Iterator to the input sequence of key items ValuesInputIteratorT d_values_in; ///< [in] Iterator to the input sequence of value items @@ -228,7 +228,7 @@ struct DispatchScanByKey: CUB_RUNTIME_FUNCTION __forceinline__ DispatchScanByKey( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation KeysInputIteratorT d_keys_in, ///< [in] Iterator to the input sequence of key items ValuesInputIteratorT d_values_in, ///< [in] Iterator to the input sequence of value items @@ -388,12 +388,12 @@ struct DispatchScanByKey: */ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation KeysInputIteratorT d_keys_in, ///< [in] Iterator to the input sequence of key items ValuesInputIteratorT d_values_in, ///< [in] Iterator to the input sequence of value items ValuesOutputIteratorT d_values_out, ///< [out] Iterator to the input sequence of value items - EqualityOp equality_op, ///< [in]Binary equality functor + EqualityOp equality_op, ///< [in] Binary equality functor ScanOpT scan_op, ///< [in] Binary scan functor InitValueT init_value, ///< [in] Initial value to seed the exclusive scan OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)