diff --git a/cub/test/c2h/generators.cu b/cub/test/c2h/generators.cu index 274afea5e3..d86e5c73a0 100644 --- a/cub/test/c2h/generators.cu +++ b/cub/test/c2h/generators.cu @@ -413,9 +413,26 @@ void init_key_segments(const thrust::device_vector &segment_offsets, template void init_key_segments(const thrust::device_vector &segment_offsets, std::int32_t *out, std::size_t element_size); +template void init_key_segments(const thrust::device_vector &segment_offsets, + std::uint8_t *out, + std::size_t element_size); +template void init_key_segments(const thrust::device_vector &segment_offsets, + float *out, + std::size_t element_size); template void init_key_segments(const thrust::device_vector &segment_offsets, custom_type_state_t *out, std::size_t element_size); +#ifdef TEST_HALF_T +template void init_key_segments(const thrust::device_vector &segment_offsets, + half_t *out, + std::size_t element_size); +#endif + +#ifdef TEST_BF_T +template void init_key_segments(const thrust::device_vector &segment_offsets, + bfloat16_t *out, + std::size_t element_size); +#endif } // namespace detail template diff --git a/cub/test/catch2_test_device_scan.cuh b/cub/test/catch2_test_device_scan.cuh index 7f317f435c..e9a0dc447c 100644 --- a/cub/test/catch2_test_device_scan.cuh +++ b/cub/test/catch2_test_device_scan.cuh @@ -27,32 +27,189 @@ #pragma once -template +#include +#include + +#include +#include + +/** + * @brief Helper class template to facilitate specifying input/output type pairs along with the key + * type for *-by-key algorithms, and an equality operator type. + */ +template +struct type_quad +{ + using input_t = InputT; + using output_t = OutputT; + using key_t = KeyT; + using eq_op_t = EqualityOpT; +}; + +/** + * @brief Mod2Equality (used for integral keys, making keys more likely to equal each other) + */ +struct Mod2Equality +{ + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return (a % 2) == (b % 2); + } +}; + +template void compute_exclusive_scan_reference(InputIt first, InputIt last, OutputIt result, - T init, + InitT init, BinaryOp op) { - T acc = init; + using value_t = cub::detail::value_t; + using accum_t = cub::detail::accumulator_t; + using output_t = cub::detail::value_t; + accum_t acc = static_cast(init); for (; first != last; ++first) { - *result++ = acc; + *result++ = static_cast(acc); acc = op(acc, *first); } } -template +template void compute_inclusive_scan_reference(InputIt first, InputIt last, OutputIt result, BinaryOp op, - T init) + InitT init) { - T acc = init; + using value_t = cub::detail::value_t; + using accum_t = cub::detail::accumulator_t; + using output_t = cub::detail::value_t; + accum_t acc = static_cast(init); for (; first != last; ++first) { acc = op(acc, *first); - *result++ = acc; + *result++ = static_cast(acc); + } +} + +template +void compute_exclusive_scan_by_key_reference(ValueInItT h_values_it, + KeyInItT h_keys_it, + ValuesOutItT result_out_it, + ScanOpT scan_op, + EqualityOpT equality_op, + InitT init, + std::size_t num_items) +{ + using value_t = cub::detail::value_t; + using accum_t = cub::detail::accumulator_t; + using output_t = cub::detail::value_t; + + if (num_items > 0) + { + for (std::size_t i = 0; i < num_items;) + { + accum_t val = static_cast(h_values_it[i]); + result_out_it[i] = init; + accum_t inclusive = static_cast(scan_op(init, val)); + + ++i; + + for (; i < num_items && equality_op(h_keys_it[i - 1], h_keys_it[i]); ++i) + { + val = static_cast(h_values_it[i]); + result_out_it[i] = static_cast(inclusive); + inclusive = static_cast(scan_op(inclusive, val)); + } + } + } +} + +template +void compute_exclusive_scan_by_key_reference(const thrust::device_vector &d_values, + const thrust::device_vector &d_keys, + ValuesOutItT result_out_it, + ScanOpT scan_op, + EqualityOpT equality_op, + InitT init) +{ + thrust::host_vector host_values(d_values); + thrust::host_vector host_keys(d_keys); + + std::size_t num_items = host_values.size(); + + compute_exclusive_scan_by_key_reference(host_values.cbegin(), + host_keys.cbegin(), + result_out_it, + scan_op, + equality_op, + init, + num_items); +} + +template +void compute_inclusive_scan_by_key_reference(ValueInItT h_values_it, + KeyInItT h_keys_it, + ValuesOutItT result_out_it, + ScanOpT scan_op, + EqualityOpT equality_op, + std::size_t num_items) +{ + using value_t = cub::detail::value_t; + using accum_t = cub::detail::accumulator_t; + using output_t = cub::detail::value_t; + + for (std::size_t i = 0; i < num_items;) + { + accum_t inclusive = h_values_it[i]; + result_out_it[i] = static_cast(inclusive); + + ++i; + + for (; i < num_items && equality_op(h_keys_it[i - 1], h_keys_it[i]); ++i) + { + accum_t val = h_values_it[i]; + inclusive = static_cast(scan_op(inclusive, val)); + result_out_it[i] = static_cast(inclusive); + } } } + +template +void compute_inclusive_scan_by_key_reference(const thrust::device_vector &d_values, + const thrust::device_vector &d_keys, + ValuesOutItT result_out_it, + ScanOpT scan_op, + EqualityOpT equality_op) +{ + thrust::host_vector host_values(d_values); + thrust::host_vector host_keys(d_keys); + + std::size_t num_items = host_values.size(); + + compute_inclusive_scan_by_key_reference(host_values.cbegin(), + host_keys.cbegin(), + result_out_it, + scan_op, + equality_op, + num_items); +} diff --git a/cub/test/catch2_test_device_scan_by_key.cu b/cub/test/catch2_test_device_scan_by_key.cu new file mode 100644 index 0000000000..113169c3b4 --- /dev/null +++ b/cub/test/catch2_test_device_scan_by_key.cu @@ -0,0 +1,451 @@ +/****************************************************************************** + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include +#include + +#include +#include + +#include + +#include "catch2_test_device_reduce.cuh" +#include "catch2_test_device_scan.cuh" + +// Has to go after all cub headers. Otherwise, this test won't catch unused +// variables in cub kernels. +#include "c2h/custom_type.cuh" +#include "c2h/extended_types.cuh" +#include "catch2/catch.hpp" +#include "catch2_test_cdp_helper.h" +#include "catch2_test_helper.h" + +DECLARE_CDP_WRAPPER(cub::DeviceScan::ExclusiveSumByKey, device_exclusive_sum_by_key); +DECLARE_CDP_WRAPPER(cub::DeviceScan::ExclusiveScanByKey, device_exclusive_scan_by_key); +DECLARE_CDP_WRAPPER(cub::DeviceScan::InclusiveSumByKey, device_inclusive_sum_by_key); +DECLARE_CDP_WRAPPER(cub::DeviceScan::InclusiveScanByKey, device_inclusive_scan_by_key); + +// %PARAM% TEST_CDP cdp 0:1 +// %PARAM% TEST_TYPES types 0:1:2:3 + +// List of types to test +using custom_t = c2h::custom_type_t; + +// type_quad's parameters and defaults: +// type_quad +#if TEST_TYPES == 0 +using full_type_list = + c2h::type_list, + type_quad>; +#elif TEST_TYPES == 1 +using full_type_list = c2h::type_list, type_quad>; +#elif TEST_TYPES == 2 +using full_type_list = + c2h::type_list, + type_quad>; +#elif TEST_TYPES == 3 +// clang-format off +using full_type_list = c2h::type_list< +type_quad +#if TEST_HALF_T +, type_quad // testing half +#endif +#if TEST_BF_T +, type_quad // testing bf16 +#endif +>; +// clang-format on +#endif + +CUB_TEST("Device scan works with all device interfaces", "[by_key][scan][device]", full_type_list) +{ + using params = params_t; + using key_t = typename params::type_pair_t::key_t; + using value_t = typename params::item_t; + using output_t = typename params::output_t; + using offset_t = std::uint32_t; + using eq_op_t = typename params::type_pair_t::eq_op_t; + + constexpr offset_t min_items = 1; + constexpr offset_t max_items = 1000000; + + // Generate the input sizes to test for + const offset_t num_items = GENERATE_COPY(take(2, random(min_items, max_items)), + values({ + min_items, + max_items, + })); + INFO("Test num_items: " << num_items); + + // Range of segment sizes to generate (a segment is a series of consecutive equal keys) + const std::tuple seg_size_range = + GENERATE_COPY(table({{1, 1}, {1, num_items}, {num_items, num_items}})); + INFO("Test seg_size_range: [" << std::get<0>(seg_size_range) << ", " + << std::get<1>(seg_size_range) << "]"); + + // Generate input segments + thrust::device_vector segment_offsets = + c2h::gen_uniform_offsets(CUB_SEED(1), + num_items, + std::get<0>(seg_size_range), + std::get<1>(seg_size_range)); + + // Get array of keys from segment offsets + thrust::device_vector segment_keys(num_items); + c2h::init_key_segments(segment_offsets, segment_keys); + auto d_keys_it = thrust::raw_pointer_cast(segment_keys.data()); + + // Generate input data + thrust::device_vector in_values(num_items); + c2h::gen(CUB_SEED(2), in_values); + auto d_values_it = thrust::raw_pointer_cast(in_values.data()); + +// Skip DeviceScan::InclusiveSum and DeviceScan::ExclusiveSum tests for extended floating-point +// types because of unbounded epsilon due to pseudo associativity of the addition operation over +// floating point numbers +#if TEST_TYPES != 3 + SECTION("inclusive sum") + { + using op_t = cub::Sum; + + // Prepare verification data + thrust::host_vector expected_result(num_items); + compute_inclusive_scan_by_key_reference(in_values, + segment_keys, + expected_result.begin(), + op_t{}, + eq_op_t{}); + + // Run test + thrust::device_vector out_values(num_items); + auto d_values_out_it = thrust::raw_pointer_cast(out_values.data()); + device_inclusive_sum_by_key(d_keys_it, d_values_it, d_values_out_it, num_items, eq_op_t{}); + + // Verify result + REQUIRE(expected_result == out_values); + + // Run test in-place + CUB_IF_CONSTEXPR(std::is_same::value) + { + // Copy input values to memory allocated for output values, to ensure in_values are + // unchanged for a (potentially) subsequent test that uses in_values as input + out_values = in_values; + auto values_in_out_it = thrust::raw_pointer_cast(out_values.data()); + device_inclusive_sum_by_key(d_keys_it, + values_in_out_it, + values_in_out_it, + num_items, + eq_op_t{}); + + // Verify result + REQUIRE(expected_result == out_values); + } + } + + SECTION("exclusive sum") + { + using op_t = cub::Sum; + + // Prepare verification data + thrust::host_vector expected_result(num_items); + compute_exclusive_scan_by_key_reference(in_values, + segment_keys, + expected_result.begin(), + op_t{}, + eq_op_t{}, + output_t{}); + + // Run test + thrust::device_vector out_values(num_items); + auto d_values_out_it = thrust::raw_pointer_cast(out_values.data()); + device_exclusive_sum_by_key(d_keys_it, d_values_it, d_values_out_it, num_items, eq_op_t{}); + + // Verify result + REQUIRE(expected_result == out_values); + + // Run test in-place + CUB_IF_CONSTEXPR(std::is_same::value) + { + // Copy input values to memory allocated for output values, to ensure in_values are + // unchanged for a (potentially) subsequent test that uses in_values as input + out_values = in_values; + auto values_in_out_it = thrust::raw_pointer_cast(out_values.data()); + device_exclusive_sum_by_key(d_keys_it, + values_in_out_it, + values_in_out_it, + num_items, + eq_op_t{}); + + // Verify result + REQUIRE(expected_result == out_values); + } + } +#endif + + SECTION("inclusive scan") + { + using op_t = cub::Min; + + // Prepare verification data + thrust::host_vector expected_result(num_items); + compute_inclusive_scan_by_key_reference(in_values, + segment_keys, + expected_result.begin(), + op_t{}, + eq_op_t{}); + + // Run test + thrust::device_vector out_values(num_items); + auto d_values_out_it = thrust::raw_pointer_cast(out_values.data()); + device_inclusive_scan_by_key(d_keys_it, + unwrap_it(d_values_it), + unwrap_it(d_values_out_it), + op_t{}, + num_items, + eq_op_t{}); + + // Verify result + REQUIRE(expected_result == out_values); + + // Run test in-place + CUB_IF_CONSTEXPR(std::is_same::value) + { + // Copy input values to memory allocated for output values, to ensure in_values are + // unchanged for a (potentially) subsequent test that uses in_values as input + out_values = in_values; + auto values_in_out_it = thrust::raw_pointer_cast(out_values.data()); + device_inclusive_scan_by_key(d_keys_it, + unwrap_it(values_in_out_it), + unwrap_it(values_in_out_it), + op_t{}, + num_items, + eq_op_t{}); + + // Verify result + REQUIRE(expected_result == out_values); + } + } + + SECTION("exclusive scan") + { + using op_t = cub::Sum; + + // Scan operator + auto scan_op = unwrap_op(reference_extended_fp(d_values_it), op_t{}); + + // Prepare verification data + thrust::host_vector expected_result(num_items); + compute_exclusive_scan_by_key_reference(in_values, + segment_keys, + expected_result.begin(), + scan_op, + eq_op_t{}, + output_t{}); + + // Run test + thrust::device_vector out_values(num_items); + auto d_values_out_it = thrust::raw_pointer_cast(out_values.data()); + using init_t = cub::detail::value_t; + device_exclusive_scan_by_key(d_keys_it, + unwrap_it(d_values_it), + unwrap_it(d_values_out_it), + scan_op, + init_t{}, + num_items, + eq_op_t{}); + + // Verify result + REQUIRE(expected_result == out_values); + + // Run test in-place + CUB_IF_CONSTEXPR(std::is_same::value) + { + // Copy input values to memory allocated for output values, to ensure in_values are + // unchanged for a (potentially) subsequent test that uses in_values as input + out_values = in_values; + auto values_in_out_it = thrust::raw_pointer_cast(out_values.data()); + device_exclusive_scan_by_key(d_keys_it, + unwrap_it(values_in_out_it), + unwrap_it(values_in_out_it), + scan_op, + init_t{}, + num_items, + eq_op_t{}); + + // Verify result + REQUIRE(expected_result == out_values); + } + } +} + +#if TEST_TYPES == 0 +using key_alias_type_list = c2h::type_list; +#elif TEST_TYPES == 1 +using key_alias_type_list = c2h::type_list; +#elif TEST_TYPES == 2 +using key_alias_type_list = c2h::type_list; +#elif TEST_TYPES == 3 +using key_alias_type_list = c2h::type_list; +#endif + +CUB_TEST("Device scan works when memory for keys and results alias one another", + "[by_key][scan][device]", + key_alias_type_list) +{ + using key_t = typename c2h::get<0, TestType>; + using value_t = key_t; + using output_t = key_t; + using offset_t = std::uint32_t; + + constexpr offset_t min_items = 1; + constexpr offset_t max_items = 1000000; + + // Generate the input sizes to test for + const offset_t num_items = GENERATE_COPY(take(2, random(min_items, max_items)), + values({ + min_items, + max_items, + })); + INFO("Test num_items: " << num_items); + + // Range of segment sizes to generate (a segment is a series of consecutive equal keys) + const std::tuple seg_size_range = + GENERATE_COPY(table({{1, 1}, {1, num_items}, {num_items, num_items}})); + INFO("Test seg_size_range: [" << std::get<0>(seg_size_range) << ", " + << std::get<1>(seg_size_range) << "]"); + + // Generate input segments + thrust::device_vector segment_offsets = + c2h::gen_uniform_offsets(CUB_SEED(1), + num_items, + std::get<0>(seg_size_range), + std::get<1>(seg_size_range)); + + // Get array of keys from segment offsets + thrust::device_vector segment_keys(num_items); + c2h::init_key_segments(segment_offsets, segment_keys); + auto d_keys_it = thrust::raw_pointer_cast(segment_keys.data()); + + // Generate input data + thrust::device_vector in_values(num_items); + c2h::gen(CUB_SEED(2), in_values); + auto d_values_it = thrust::raw_pointer_cast(in_values.data()); + + SECTION("inclusive sum") + { + using op_t = cub::Sum; + + // Prepare verification data + thrust::host_vector expected_result(num_items); + compute_inclusive_scan_by_key_reference(in_values, + segment_keys, + expected_result.begin(), + op_t{}, + cub::Equality{}); + + // Run test + auto d_values_out_it = d_keys_it; + device_inclusive_sum_by_key(d_keys_it, d_values_it, d_values_out_it, num_items); + + // Verify result + REQUIRE(expected_result == segment_keys); + } + + SECTION("exclusive sum") + { + using op_t = cub::Sum; + + // Prepare verification data + thrust::host_vector expected_result(num_items); + compute_exclusive_scan_by_key_reference(in_values, + segment_keys, + expected_result.begin(), + op_t{}, + cub::Equality{}, + output_t{}); + + // Run test + auto d_values_out_it = d_keys_it; + device_exclusive_sum_by_key(d_keys_it, d_values_it, d_values_out_it, num_items); + + // Verify result + REQUIRE(expected_result == segment_keys); + } + + SECTION("inclusive scan") + { + using op_t = cub::Min; + + // Prepare verification data + thrust::host_vector expected_result(num_items); + compute_inclusive_scan_by_key_reference(in_values, + segment_keys, + expected_result.begin(), + op_t{}, + cub::Equality{}); + + // Run test + auto d_values_out_it = d_keys_it; + device_inclusive_scan_by_key(d_keys_it, d_values_it, d_values_out_it, op_t{}, num_items); + + // Verify result + REQUIRE(expected_result == segment_keys); + } + + SECTION("exclusive scan") + { + using op_t = cub::Sum; + + // Scan operator + auto scan_op = op_t{}; + + // Prepare verification data + thrust::host_vector expected_result(num_items); + compute_exclusive_scan_by_key_reference(in_values, + segment_keys, + expected_result.begin(), + scan_op, + cub::Equality{}, + output_t{}); + + // Run test + auto d_values_out_it = d_keys_it; + using init_t = value_t; + device_exclusive_scan_by_key(d_keys_it, + d_values_it, + d_values_out_it, + scan_op, + init_t{}, + num_items); + + // Verify result + REQUIRE(expected_result == segment_keys); + } +} diff --git a/cub/test/catch2_test_device_scan_by_key_iterators.cu b/cub/test/catch2_test_device_scan_by_key_iterators.cu new file mode 100644 index 0000000000..71f4bcc821 --- /dev/null +++ b/cub/test/catch2_test_device_scan_by_key_iterators.cu @@ -0,0 +1,231 @@ +/****************************************************************************** + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include + +#include +#include + +#include + +#include "catch2_test_device_reduce.cuh" +#include "catch2_test_device_scan.cuh" + +// Has to go after all cub headers. Otherwise, this test won't catch unused +// variables in cub kernels. +#include "c2h/custom_type.cuh" +#include "c2h/extended_types.cuh" +#include "catch2/catch.hpp" +#include "catch2_test_cdp_helper.h" +#include "catch2_test_helper.h" + +DECLARE_CDP_WRAPPER(cub::DeviceScan::ExclusiveSumByKey, device_exclusive_sum_by_key); +DECLARE_CDP_WRAPPER(cub::DeviceScan::ExclusiveScanByKey, device_exclusive_scan_by_key); +DECLARE_CDP_WRAPPER(cub::DeviceScan::InclusiveSumByKey, device_inclusive_sum_by_key); +DECLARE_CDP_WRAPPER(cub::DeviceScan::InclusiveScanByKey, device_inclusive_scan_by_key); + +// %PARAM% TEST_CDP cdp 0:1 +// %PARAM% TEST_TYPES types 0:1:2:3 + +// List of types to test +using custom_t = c2h::custom_type_t; + +// type_quad's parameters and defaults: +// type_quad +#if TEST_TYPES == 0 +using full_type_list = + c2h::type_list, + type_quad>; +#elif TEST_TYPES == 1 +using full_type_list = c2h::type_list, type_quad>; +#elif TEST_TYPES == 2 +using full_type_list = + c2h::type_list, + type_quad>; +#elif TEST_TYPES == 3 +using full_type_list = c2h::type_list>; +#endif + +/** + * @brief Input data generation mode + */ +enum class gen_data_t : int +{ + /// Uniform random data generation + GEN_TYPE_RANDOM, + /// Constant value as input data + GEN_TYPE_CONST +}; + +CUB_TEST("Device scan works with fancy iterators", "[by_key][scan][device]", full_type_list) +{ + using params = params_t; + using key_t = typename params::type_pair_t::key_t; + using value_t = typename params::item_t; + using output_t = typename params::output_t; + using offset_t = std::uint32_t; + using eq_op_t = typename params::type_pair_t::eq_op_t; + + constexpr offset_t min_items = 1; + constexpr offset_t max_items = 1000000; + + // Generate the input sizes to test for + const offset_t num_items = GENERATE_COPY(take(2, random(min_items, max_items)), + values({ + min_items, + max_items, + })); + INFO("Test num_items: " << num_items); + + // Range of segment sizes to generate (a segment is a series of consecutive equal keys) + const std::tuple seg_size_range = + GENERATE_COPY(table({{1, 1}, {1, num_items}, {num_items, num_items}})); + INFO("Test seg_size_range: [" << std::get<0>(seg_size_range) << ", " + << std::get<1>(seg_size_range) << "]"); + + // Generate input segments + thrust::device_vector segment_offsets = + c2h::gen_uniform_offsets(CUB_SEED(1), + num_items, + std::get<0>(seg_size_range), + std::get<1>(seg_size_range)); + + // Get array of keys from segment offsets + thrust::device_vector segment_keys(num_items); + c2h::init_key_segments(segment_offsets, segment_keys); + auto d_keys_it = segment_keys.begin(); + thrust::host_vector h_segment_keys(segment_keys); + + // Prepare input data + value_t default_constant{}; + init_default_constant(default_constant); + auto values_in_it = thrust::make_constant_iterator(default_constant); + + SECTION("inclusive sum") + { + using op_t = cub::Sum; + + // Prepare verification data + thrust::host_vector expected_result(num_items); + compute_inclusive_scan_by_key_reference(values_in_it, + h_segment_keys.cbegin(), + expected_result.begin(), + op_t{}, + eq_op_t{}, + num_items); + + // Run test + thrust::device_vector out_values(num_items); + device_inclusive_sum_by_key(d_keys_it, values_in_it, out_values.begin(), num_items, eq_op_t{}); + + // Verify result + REQUIRE(expected_result == out_values); + } + + SECTION("exclusive sum") + { + using op_t = cub::Sum; + + // Prepare verification data + thrust::host_vector expected_result(num_items); + compute_exclusive_scan_by_key_reference(values_in_it, + h_segment_keys.cbegin(), + expected_result.begin(), + op_t{}, + eq_op_t{}, + output_t{}, + num_items); + + // Run test + thrust::device_vector out_values(num_items); + device_exclusive_sum_by_key(d_keys_it, values_in_it, out_values.begin(), num_items, eq_op_t{}); + + // Verify result + REQUIRE(expected_result == out_values); + } + + SECTION("inclusive scan") + { + using op_t = cub::Min; + + // Prepare verification data + thrust::host_vector expected_result(num_items); + compute_inclusive_scan_by_key_reference(values_in_it, + h_segment_keys.cbegin(), + expected_result.begin(), + op_t{}, + eq_op_t{}, + num_items); + + // Run test + thrust::device_vector out_values(num_items); + device_inclusive_scan_by_key(d_keys_it, + values_in_it, + out_values.begin(), + op_t{}, + num_items, + eq_op_t{}); + + // Verify result + REQUIRE(expected_result == out_values); + } + + SECTION("exclusive scan") + { + using op_t = cub::Sum; + + // Scan operator + auto scan_op = op_t{}; + + // Prepare verification data + thrust::host_vector expected_result(num_items); + compute_exclusive_scan_by_key_reference(values_in_it, + h_segment_keys.cbegin(), + expected_result.begin(), + scan_op, + eq_op_t{}, + output_t{}, + num_items); + + // Run test + thrust::device_vector out_values(num_items); + using init_t = value_t; + device_exclusive_scan_by_key(d_keys_it, + values_in_it, + out_values.begin(), + scan_op, + init_t{}, + num_items, + eq_op_t{}); + + // Verify result + REQUIRE(expected_result == out_values); + } +} diff --git a/cub/test/test_device_scan_by_key.cu b/cub/test/test_device_scan_by_key.cu deleted file mode 100644 index c1cde25cfc..0000000000 --- a/cub/test/test_device_scan_by_key.cu +++ /dev/null @@ -1,1099 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/****************************************************************************** - * Test of DeviceScan utilities - ******************************************************************************/ - -// Ensure printing of CUDA runtime errors to console -#define CUB_STDERR - -#include -#include -#include -#include -#include -#include - -#include - -#include "test_util.h" - -#include -#include -#include - -using namespace cub; - - -//--------------------------------------------------------------------- -// Globals, constants and typedefs -//--------------------------------------------------------------------- - -bool g_verbose = false; -int g_timing_iterations = 0; -double g_device_giga_bandwidth; -CachingDeviceAllocator g_allocator(true); - -// Dispatch types -enum Backend -{ - CUB, // CUB method - CDP, // GPU-based (dynamic parallelism) dispatch to CUB method -}; - - -enum AliasMode -{ - AliasNone, // output is allocated - AliasKeys, // output is an alias of input keys - AliasValues // output is an alias of input values -}; - - -/** - * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants) - */ -template -struct WrapperFunctor -{ - OpT op; - - WrapperFunctor(OpT op) : op(op) {} - - template - __host__ __device__ __forceinline__ auto operator()(const T &a, const U &b) const - -> decltype(op(a, b)) - { - return static_cast(op(a, b)); - } -}; - -/** - * \brief DivideByFiveFunctor (used by TestIterator) - */ -template -struct DivideByFiveFunctor -{ - template - __host__ __device__ __forceinline__ OutputT operator()(const T &a) const - { - return static_cast(a / 5); - } -}; - -/** - * \brief Mod2Equality (used for non-bool keys to make keys more likely to equal each other) - */ -struct Mod2Equality -{ - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return (a % 2) == (b % 2); - } -}; - - -//--------------------------------------------------------------------- -// Dispatch to different CUB DeviceScan entrypoints -//--------------------------------------------------------------------- - -/** - * Dispatch to exclusive scan entrypoint - */ -template -CUB_RUNTIME_FUNCTION __forceinline__ -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - IsPrimitiveT /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - ScanOpT scan_op, - InitialValueT initial_value, - OffsetT num_items, - EqualityOpT equality_op) -{ - cudaError_t error = cudaSuccess; - for (int i = 0; i < timing_timing_iterations; ++i) - { - error = DeviceScan::ExclusiveScanByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, initial_value, num_items, equality_op); - } - return error; -} - - -/** - * Dispatch to exclusive sum entrypoint - */ -template -CUB_RUNTIME_FUNCTION __forceinline__ -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - Int2Type /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - Sum /*scan_op*/, - InitialValueT /*initial_value*/, - OffsetT num_items, - EqualityOpT equality_op) -{ - cudaError_t error = cudaSuccess; - for (int i = 0; i < timing_timing_iterations; ++i) - { - error = DeviceScan::ExclusiveSumByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op); - } - return error; -} - - -/** - * Dispatch to inclusive scan entrypoint - */ -template -CUB_RUNTIME_FUNCTION __forceinline__ -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - IsPrimitiveT /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - ScanOpT scan_op, - NullType /*initial_value*/, - OffsetT num_items, - EqualityOpT equality_op) -{ - cudaError_t error = cudaSuccess; - for (int i = 0; i < timing_timing_iterations; ++i) - { - error = DeviceScan::InclusiveScanByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, num_items, equality_op); - } - return error; -} - -/** - * Dispatch to inclusive sum entrypoint - */ -template -CUB_RUNTIME_FUNCTION __forceinline__ -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - Int2Type /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - Sum /*scan_op*/, - NullType /*initial_value*/, - OffsetT num_items, - EqualityOpT equality_op) -{ - cudaError_t error = cudaSuccess; - for (int i = 0; i < timing_timing_iterations; ++i) - { - error = DeviceScan::InclusiveSumByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op); - } - return error; -} - -//--------------------------------------------------------------------- -// CUDA Nested Parallelism Test Kernel -//--------------------------------------------------------------------- - -#if TEST_CDP == 1 - -/** - * Simple wrapper kernel to invoke DeviceScan - */ -template -__global__ void CDPDispatchKernel(Int2Type cub_backend, - IsPrimitiveT is_primitive, - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void *d_temp_storage, - size_t temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - ScanOpT scan_op, - InitialValueT initial_value, - OffsetT num_items, - EqualityOpT equality_op) -{ - *d_cdp_error = Dispatch(cub_backend, - is_primitive, - timing_timing_iterations, - d_temp_storage_bytes, - d_cdp_error, - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - scan_op, - initial_value, - num_items, - equality_op); - - *d_temp_storage_bytes = temp_storage_bytes; -} - -/** - * Dispatch to CDP kernel - */ -template -cudaError_t Dispatch(Int2Type /*dispatch_to*/, - IsPrimitiveT is_primitive, - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void *d_temp_storage, - size_t &temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - ScanOpT scan_op, - InitialValueT initial_value, - OffsetT num_items, - EqualityOpT equality_op) -{ - // Invoke kernel to invoke device-side dispatch - cudaError_t retval = - thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0) - .doit(CDPDispatchKernel, - Int2Type{}, - is_primitive, - timing_timing_iterations, - d_temp_storage_bytes, - d_cdp_error, - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - scan_op, - initial_value, - num_items, - equality_op); - CubDebugExit(retval); - - // Copy out temp_storage_bytes - CubDebugExit(cudaMemcpy(&temp_storage_bytes, - d_temp_storage_bytes, - sizeof(size_t) * 1, - cudaMemcpyDeviceToHost)); - - // Copy out error - CubDebugExit(cudaMemcpy(&retval, - d_cdp_error, - sizeof(cudaError_t) * 1, - cudaMemcpyDeviceToHost)); - return retval; -} - -#endif // TEST_CDP - -//--------------------------------------------------------------------- -// Test generation -//--------------------------------------------------------------------- - - -/** - * Initialize problem - */ -template -void Initialize( - GenMode gen_mode, - T *h_in, - int num_items) -{ - for (int i = 0; i < num_items; ++i) - { - InitValue(gen_mode, h_in[i], i); - } - - if (g_verbose) - { - printf("Input:\n"); - DisplayResults(h_in, num_items); - printf("\n\n"); - } -} - -/** - * Solve exclusive-scan problem - */ -template < - typename KeysInputIteratorT, - typename ValuesInputIteratorT, - typename OutputT, - typename ScanOpT, - typename InitialValueT, - typename EqualityOpT> -void Solve( - KeysInputIteratorT h_keys_in, - ValuesInputIteratorT h_values_in, - OutputT *h_reference, - int num_items, - ScanOpT scan_op, - InitialValueT initial_value, - EqualityOpT equality_op) -{ - using ValueT = cub::detail::value_t; - using AccumT = cub::detail::accumulator_t; - - if (num_items > 0) - { - for (int i = 0; i < num_items;) { - AccumT val = static_cast(h_values_in[i]); - h_reference[i] = initial_value; - AccumT inclusive = static_cast(scan_op(initial_value, val)); - - ++i; - - for (; i < num_items && equality_op(h_keys_in[i - 1], h_keys_in[i]); ++i) - { - val = static_cast(h_values_in[i]); - h_reference[i] = static_cast(inclusive); - inclusive = static_cast(scan_op(inclusive, val)); - } - } - } -} - - -/** - * Solve inclusive-scan problem - */ -template < - typename KeysInputIteratorT, - typename ValuesInputIteratorT, - typename OutputT, - typename ScanOpT, - typename EqualityOpT> -void Solve( - KeysInputIteratorT h_keys_in, - ValuesInputIteratorT h_values_in, - OutputT *h_reference, - int num_items, - ScanOpT scan_op, - NullType /*initial_value*/, - EqualityOpT equality_op) -{ - using ValueT = cub::detail::value_t; - using AccumT = cub::detail::accumulator_t; - - if (num_items > 0) - { - for (int i = 0; i < num_items;) { - AccumT inclusive = h_values_in[i]; - h_reference[i] = static_cast(inclusive); - - ++i; - - for (; i < num_items && equality_op(h_keys_in[i - 1], h_keys_in[i]); ++i) - { - AccumT val = h_values_in[i]; - inclusive = static_cast(scan_op(inclusive, val)); - h_reference[i] = static_cast(inclusive); - } - } - } -} - -template -struct AllocateOutput { - static void run(OutputT *&d_out, DeviceInputIteratorT, int num_items) { - CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(OutputT) * num_items)); - } -}; - -template -struct AllocateOutput { - static void run(OutputT *&d_out, OutputT *d_in, int /* num_items */) { - d_out = d_in; - } -}; - -/** - * Test DeviceScan for a given problem input - */ -template < - Backend BACKEND, - typename KeysInputIteratorT, - typename ValuesInputIteratorT, - typename OutputT, - typename ScanOpT, - typename InitialValueT, - typename EqualityOpT, - AliasMode Mode=AliasNone> -void Test( - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - OutputT *h_reference, - int num_items, - ScanOpT scan_op, - InitialValueT initial_value, - EqualityOpT equality_op) -{ - using KeyT = cub::detail::value_t; - using InputT = cub::detail::value_t; - - // Allocate device output array - OutputT *d_values_out = NULL; - - if (Mode == AliasKeys) - { - AllocateOutput::run( - d_values_out, - d_keys_in, - num_items); - } - else - { - AllocateOutput::run( - d_values_out, - d_values_in, - num_items); - } - - // Allocate CDP device arrays - size_t *d_temp_storage_bytes = NULL; - cudaError_t *d_cdp_error = NULL; - CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); - CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1)); - - // Allocate temporary storage - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - CubDebugExit(Dispatch( - Int2Type(), - Int2Type::PRIMITIVE>(), - 1, - d_temp_storage_bytes, - d_cdp_error, - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - scan_op, - initial_value, - num_items, - equality_op)); - CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); - - // Clear device output array - if (Mode == AliasNone) - { - CubDebugExit(cudaMemset(d_values_out, 0, sizeof(OutputT) * num_items)); - } - - // Run warmup/correctness iteration - CubDebugExit(Dispatch( - Int2Type(), - Int2Type::PRIMITIVE>(), - 1, - d_temp_storage_bytes, - d_cdp_error, - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - scan_op, - initial_value, - num_items, - equality_op)); - - // Check for correctness (and display results, if specified) - const int compare = CompareDeviceResults(h_reference, - d_values_out, - num_items, - true, - g_verbose); - - printf("\t%s", compare ? "FAIL" : "PASS"); - - // Flush any stdout/stderr - fflush(stdout); - fflush(stderr); - - // Display performance - if (g_timing_iterations > 0) - { - // Performance - GpuTimer gpu_timer; - gpu_timer.Start(); - CubDebugExit(Dispatch(Int2Type(), - Int2Type::PRIMITIVE>(), - g_timing_iterations, - d_temp_storage_bytes, - d_cdp_error, - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - scan_op, - initial_value, - num_items, - equality_op)); - - gpu_timer.Stop(); - float elapsed_millis = gpu_timer.ElapsedMillis(); - float avg_millis = elapsed_millis / g_timing_iterations; - float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; - float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT)); - printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", - avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0); - } - - printf("\n\n"); - - // Cleanup - if (Mode == AliasNone) - { - if (d_values_out) - { - CubDebugExit(g_allocator.DeviceFree(d_values_out)); - } - } - - if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); - if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); - if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); - - // Correctness asserts - AssertEquals(0, compare); -} - -template -void TestInplaceValues(KeysInputIteratorT d_keys_in, - OutputT *d_values_in, - OutputT *h_reference, - int num_items, - ScanOpT scan_op, - InitialValueT initial_value, - EqualityOpT equality_op) -{ - Test(d_keys_in, - d_values_in, - h_reference, - num_items, - scan_op, - initial_value, - equality_op); -} - -template -void TestInplaceValues(KeysInputIteratorT, - ValuesInputIteratorT, - OutputT *, - int, - ScanOpT, - InitialValueT, - EqualityOpT) -{} - -template -void TestInplaceKeys(T *d_keys_in, - ValuesInputIteratorT d_values_in, - T *h_reference, - int num_items, - ScanOpT scan_op, - InitialValueT initial_value, - EqualityOpT equality_op) -{ - Test(d_keys_in, - d_values_in, - h_reference, - num_items, - scan_op, - initial_value, - equality_op); -} - -template -void TestInplaceKeys(KeysInputIteratorT, - ValuesInputIteratorT, - OutputT *, - int, - ScanOpT, - InitialValueT, - EqualityOpT) -{} - -/** - * Test DeviceScan on pointer type - */ -template < - Backend BACKEND, - typename KeyT, - typename InputT, - typename OutputT, - typename ScanOpT, - typename InitialValueT, - typename EqualityOpT> -void TestPointer( - int num_items, - GenMode gen_mode, - ScanOpT scan_op, - InitialValueT initial_value, - EqualityOpT equality_op) -{ - printf("\nPointer %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes) , gen-mode %s\n", - (BACKEND == CDP) ? "CDP CUB" : "CUB", - (std::is_same::value) ? "Inclusive" : "Exclusive", - (std::is_same::value) ? "Sum" : "Scan", - num_items, - typeid(InputT).name(), typeid(OutputT).name(), (int) sizeof(InputT), (int) sizeof(OutputT), - (gen_mode == RANDOM) ? "RANDOM" : (gen_mode == INTEGER_SEED) ? "SEQUENTIAL" : "HOMOGENOUS"); - fflush(stdout); - - // Allocate host arrays - KeyT* h_keys_in = new KeyT[num_items]; - InputT* h_values_in = new InputT[num_items]; - OutputT* h_reference = new OutputT[num_items]; - - // Initialize problem and solution - Initialize(gen_mode, h_keys_in, num_items); - Initialize(gen_mode, h_values_in, num_items); - - // If the output type is primitive and the operator is cub::Sum, the test - // dispatcher throws away scan_op and initial_value for exclusive scan. - // Without an initial_value arg, the accumulator switches to the input value - // type. - // Do the same thing here: - if (Traits::PRIMITIVE && - std::is_same::value && - !std::is_same::value) - { - Solve(h_keys_in, h_values_in, h_reference, num_items, cub::Sum{}, InputT{}, equality_op); - } - else - { - Solve(h_keys_in, h_values_in, h_reference, num_items, scan_op, initial_value, equality_op); - } - - // Allocate problem device arrays - KeyT *d_keys_in = NULL; - CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items)); - InputT *d_values_in = NULL; - CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_in, sizeof(InputT) * num_items)); - - // Initialize device input - CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice)); - CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice)); - - // Run Test - Test(d_keys_in, - d_values_in, - h_reference, - num_items, - scan_op, - initial_value, - equality_op); - - // Test in/out values aliasing - TestInplaceValues(d_keys_in, - d_values_in, - h_reference, - num_items, - scan_op, - initial_value, - equality_op); - - CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice)); - - // Test keys/values aliasing (should go last, changes keys) - TestInplaceKeys(d_keys_in, - d_values_in, - h_reference, - num_items, - scan_op, - initial_value, - equality_op); - - // Cleanup - if (h_keys_in) delete[] h_keys_in; - if (h_values_in) delete[] h_values_in; - if (h_reference) delete[] h_reference; - if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in)); - if (d_values_in) CubDebugExit(g_allocator.DeviceFree(d_values_in)); -} - - -/** - * Test DeviceScan on iterator type - */ -template < - Backend BACKEND, - typename KeyT, - typename InputT, - typename OutputT, - typename ScanOpT, - typename InitialValueT, - typename EqualityOpT> -void TestIterator( - int num_items, - ScanOpT scan_op, - InitialValueT initial_value, - EqualityOpT equality_op) -{ - printf("\nIterator %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes)\n", - (BACKEND == CDP) ? "CDP CUB" : "CUB", - (std::is_same::value) ? "Inclusive" : "Exclusive", - (std::is_same::value) ? "Sum" : "Scan", - num_items, - typeid(InputT).name(), typeid(OutputT).name(), (int) sizeof(InputT), (int) sizeof(OutputT)); - fflush(stdout); - - // Use a counting iterator followed by div as the keys - using CountingIterT = CountingInputIterator; - CountingIterT h_keys_in_helper(0); - TransformInputIterator, CountingIterT> h_keys_in(h_keys_in_helper, DivideByFiveFunctor()); - - // Use a constant iterator as the input - InputT val = InputT(); - ConstantInputIterator h_values_in(val); - - // Allocate host arrays - OutputT* h_reference = new OutputT[num_items]; - - // Initialize problem and solution - Solve(h_keys_in, h_values_in, h_reference, num_items, scan_op, initial_value, equality_op); - - // Run Test - Test(h_keys_in, h_values_in, h_reference, num_items, scan_op, initial_value, equality_op); - - // Cleanup - if (h_reference) delete[] h_reference; -} - - -/** - * Test different gen modes - */ -template < - Backend BACKEND, - typename KeyT, - typename InputT, - typename OutputT, - typename ScanOpT, - typename InitialValueT, - typename EqualityOpT> -void Test( - int num_items, - ScanOpT scan_op, - InitialValueT initial_value, - EqualityOpT equality_op) -{ - TestPointer( num_items, UNIFORM, scan_op, initial_value, equality_op); - TestPointer( num_items, RANDOM, scan_op, initial_value, equality_op); - TestIterator( num_items, scan_op, initial_value, equality_op); -} - - -/** - * Test different dispatch - */ -template < - typename KeyT, - typename InputT, - typename OutputT, - typename ScanOpT, - typename InitialValueT, - typename EqualityOpT> -void Test( - int num_items, - ScanOpT scan_op, - InitialValueT initial_value, - EqualityOpT equality_op) -{ -#if TEST_CDP == 0 - Test(num_items, scan_op, initial_value, equality_op); -#elif TEST_CDP == 1 - Test(num_items, scan_op, initial_value, equality_op); -#endif // TEST_CDP -} - - -/** - * Test different operators - */ -template -void TestOp( - int num_items, - OutputT identity, - OutputT initial_value, - EqualityOpT equality_op) -{ - // Exclusive (use identity as initial value because it will dispatch to *Sum variants that don't take initial values) - Test(num_items, cub::Sum(), identity, equality_op); - Test(num_items, cub::Max(), identity, equality_op); - - // Exclusive (non-specialized, so we can test initial-value) - Test(num_items, WrapperFunctor(cub::Sum()), initial_value, equality_op); - Test(num_items, WrapperFunctor(cub::Max()), initial_value, equality_op); - - // Inclusive (no initial value) - Test(num_items, cub::Sum(), NullType(), equality_op); - Test(num_items, cub::Max(), NullType(), equality_op); -} - -/** - * Test different key type and equality operator - */ -template -void TestKeyTAndEqualityOp( - int num_items, - OutputT identity, - OutputT initial_value) -{ - TestOp(num_items, identity, initial_value, Equality()); - TestOp( num_items, identity, initial_value, Mod2Equality()); -} - -/** - * Test different input sizes - */ -template < - typename InputT, - typename OutputT> -void TestSize( - int num_items, - OutputT identity, - OutputT initial_value) -{ - if (num_items < 0) - { - TestKeyTAndEqualityOp(0, identity, initial_value); - TestKeyTAndEqualityOp(1, identity, initial_value); - TestKeyTAndEqualityOp(100, identity, initial_value); - TestKeyTAndEqualityOp(10000, identity, initial_value); - TestKeyTAndEqualityOp(1000000, identity, initial_value); - } - else - { - TestKeyTAndEqualityOp(num_items, identity, initial_value); - } -} - - - -//--------------------------------------------------------------------- -// Main -//--------------------------------------------------------------------- - -/** - * Main - */ -int main(int argc, char** argv) -{ - int num_items = -1; - - // Initialize command line - CommandLineArgs args(argc, argv); - g_verbose = args.CheckCmdLineFlag("v"); - args.GetCmdLineArgument("n", num_items); - args.GetCmdLineArgument("i", g_timing_iterations); - - // Print usage - if (args.CheckCmdLineFlag("help")) - { - printf("%s " - "[--n= " - "[--i= " - "[--device=] " - "[--v] " - "\n", argv[0]); - exit(0); - } - - // Initialize device - CubDebugExit(args.DeviceInit()); - g_device_giga_bandwidth = args.device_giga_bandwidth; - printf("\n"); - - // %PARAM% TEST_CDP cdp 0:1 - // %PARAM% TEST_VALUE_TYPES types 0:1:2:3:4:5 - -#if TEST_VALUE_TYPES == 0 - - // Test different input+output data types - TestSize(num_items, (int)0, (int)99); - - // Test same input+output data types - TestSize(num_items, (unsigned char)0, (unsigned char)99); - TestSize(num_items, (char)0, (char)99); - -#elif TEST_VALUE_TYPES == 1 - - TestSize(num_items, (unsigned short)0, (unsigned short)99); - TestSize(num_items, (unsigned int)0, (unsigned int)99); - TestSize(num_items, - (unsigned long long)0, - (unsigned long long)99); -#elif TEST_VALUE_TYPES == 2 - - TestSize(num_items, make_uchar2(0, 0), make_uchar2(17, 21)); - TestSize(num_items, make_char2(0, 0), make_char2(17, 21)); - TestSize(num_items, make_ushort2(0, 0), make_ushort2(17, 21)); - -#elif TEST_VALUE_TYPES == 3 - - TestSize(num_items, make_uint2(0, 0), make_uint2(17, 21)); - TestSize(num_items, - make_ulonglong2(0, 0), - make_ulonglong2(17, 21)); - TestSize(num_items, - make_uchar4(0, 0, 0, 0), - make_uchar4(17, 21, 32, 85)); - -#elif TEST_VALUE_TYPES == 4 - - TestSize(num_items, - make_char4(0, 0, 0, 0), - make_char4(17, 21, 32, 85)); - - TestSize(num_items, - make_ushort4(0, 0, 0, 0), - make_ushort4(17, 21, 32, 85)); - TestSize(num_items, - make_uint4(0, 0, 0, 0), - make_uint4(17, 21, 32, 85)); - -#elif TEST_VALUE_TYPES == 5 - - TestSize(num_items, - make_ulonglong4(0, 0, 0, 0), - make_ulonglong4(17, 21, 32, 85)); - - TestSize(num_items, - TestFoo::MakeTestFoo(0, 0, 0, 0), - TestFoo::MakeTestFoo(std::numeric_limits::max(), - std::numeric_limits::max(), - std::numeric_limits::max(), - std::numeric_limits::max())); - - TestSize(num_items, - TestBar(0, 0), - TestBar(std::numeric_limits::max(), - std::numeric_limits::max())); - -#endif - - return 0; -}