diff --git a/dependencies/cub b/dependencies/cub index a39e385cc6..f88e0c7fe3 160000 --- a/dependencies/cub +++ b/dependencies/cub @@ -1 +1 @@ -Subproject commit a39e385cc6be20754f859dd266021ab1d88459d3 +Subproject commit f88e0c7fe39db3c6161ea4bbcc7228d30e12410f diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index fdfc04e97b..244d839fed 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -151,6 +151,7 @@ foreach(thrust_target IN LISTS THRUST_TARGETS) endforeach() # Add specialized tests: +add_subdirectory(async) add_subdirectory(cpp) add_subdirectory(cuda) add_subdirectory(omp) diff --git a/testing/async/CMakeLists.txt b/testing/async/CMakeLists.txt new file mode 100644 index 0000000000..00d50f097f --- /dev/null +++ b/testing/async/CMakeLists.txt @@ -0,0 +1,80 @@ +# The async tests perform a large amount of codegen, making them expensive to +# build and test. To keep compilation and runtimes manageable, the tests are +# broken up into many files per algorithm to enable parallelism during +# compilation and testing. The structure of these test directories are: +# +# thrust/testing/async//.cu +# +# These generate executables and CTest tests named +# ${config_prefix}.test.async... + +# The async tests only support CUDA enabled configs. Create a list of valid +# thrust targets: +set(cuda_configs) +foreach(thrust_target IN LISTS THRUST_TARGETS) + thrust_get_target_property(config_device ${thrust_target} DEVICE) + if (config_device STREQUAL CUDA) + list(APPEND cuda_configs ${thrust_target}) + endif() +endforeach() + +list(LENGTH cuda_configs num_cuda_configs) +if (num_cuda_configs EQUAL 0) + return() # No valid configs found, nothing to do. +endif() + +# Process a single algorithm directory, adding all .cu/cpp files as tests for +# each valid backend. algo_name is the name of the subdir ( +# above) and is used for naming the executable/targets. +function(thrust_add_async_test_dir algo_name) + file(GLOB test_srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + "${algo_name}/*.cu" + "${algo_name}/*.cpp" + ) + + # Per-algorithm, all-config metatarget: thrust.all.test.async.[algo].all + set(algo_meta_target thrust.all.test.async.${algo_name}.all) + add_custom_target(${algo_meta_target}) + + foreach(thrust_target IN LISTS cuda_configs) + thrust_get_target_property(config_prefix ${thrust_target} PREFIX) + + # Per-algorithm, per-config metatarget: thrust.[config].test.async.[algo].all + set(algo_config_meta_target ${config_prefix}.test.async.${algo_name}.all) + add_custom_target(${algo_config_meta_target}) + add_dependencies(${algo_meta_target} ${algo_config_meta_target}) + + foreach(test_src IN LISTS test_srcs) + get_filename_component(test_name "${test_src}" NAME_WLE) + string(PREPEND test_name async.${algo_name}.) + + thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target}) + if(THRUST_ENABLE_TESTS_WITH_RDC) + thrust_enable_rdc_for_cuda_target(${test_target}) + endif() + + add_dependencies(${algo_config_meta_target} ${test_target}) + endforeach() + endforeach() +endfunction() + +# Grab all algorithm subdirectories: +set(test_dirs) +file(GLOB contents + CONFIGURE_DEPENDS + "${CMAKE_CURRENT_LIST_DIR}/*" +) + +foreach(test_dir IN LISTS contents) + if(IS_DIRECTORY "${test_dir}") + list(APPEND test_dirs "${test_dir}") + endif() +endforeach() + +# Process all test dirs: +foreach(test_dir IN LISTS test_dirs) + get_filename_component(algo_name "${test_dir}" NAME_WLE) + thrust_add_async_test_dir(${algo_name}) +endforeach() diff --git a/testing/async/exclusive_scan/basic.cu b/testing/async/exclusive_scan/basic.cu new file mode 100644 index 0000000000..3fcd3c4a4b --- /dev/null +++ b/testing/async/exclusive_scan/basic.cu @@ -0,0 +1,72 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +template > +struct basic_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::exclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::exclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "basic invocation with device vectors"; + } +}; + +template +struct TestBasic +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestBasic, NumericTypes); + +// Testing the in-place algorithm uses the exact same instantiations of the +// underlying scan implementation as above. Test them here to avoid compiling +// them twice. +template > +struct basic_inplace_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector_reuse_input + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous< + input_value_type> + , testing::async::exclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "basic in-place invocation with device vectors"; + } +}; + +template +struct TestBasicInPlace +{ + void operator()(std::size_t num_values) const + { + using invoker = basic_inplace_invoker; + testing::async::test_policy_overloads::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestBasicInPlace, NumericTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/counting_iterator.cu b/testing/async/exclusive_scan/counting_iterator.cu new file mode 100644 index 0000000000..d056c6e782 --- /dev/null +++ b/testing/async/exclusive_scan/counting_iterator.cu @@ -0,0 +1,46 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include +#include + +template > +struct invoker + : testing::async::mixin::input::counting_iterator_0 + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::exclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::exclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "fancy input iterator (counting_iterator)"; + } +}; + +template +struct TestCountingIterator +{ + void operator()(std::size_t num_values) const + { + num_values = unittest::truncate_to_max_representable(num_values); + testing::async::test_policy_overloads>::run(num_values); + } +}; +// Use built-in types only, counting_iterator doesn't seem to be compatible with +// the custom_numeric. +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestCountingIterator, + BuiltinNumericTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/discard_output.cu b/testing/async/exclusive_scan/discard_output.cu new file mode 100644 index 0000000000..692fb35b3d --- /dev/null +++ b/testing/async/exclusive_scan/discard_output.cu @@ -0,0 +1,38 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Compilation test with discard iterators. No runtime validation is actually +// performed, other than testing whether the algorithm completes without +// exception. + +template > +struct discard_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::discard_iterator + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::exclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::noop +{ + static std::string description() { return "discard output"; } +}; + +template +struct TestDiscard +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestDiscard, NumericTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/large_indices.cu b/testing/async/exclusive_scan/large_indices.cu new file mode 100644 index 0000000000..0d5a7aaa24 --- /dev/null +++ b/testing/async/exclusive_scan/large_indices.cu @@ -0,0 +1,239 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include +#include +#include +#include + +#include +#include + +#include + +// This test is an adaptation of TestInclusiveScanWithBigIndices from scan.cu. + +namespace +{ + +// Fake iterator that asserts +// (a) it is written with a sequence and +// (b) a defined maximum value is written at some point +struct assert_sequence_iterator +{ + using value_type = std::int64_t; + using difference_type = std::int64_t; + + // Defined for thrust::iterator_traits: + using pointer = value_type*; + using reference = assert_sequence_iterator; // weird but convenient + using iterator_category = + typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference>::type; + + std::int64_t expected{0}; + std::int64_t max{0}; + mutable thrust::device_ptr found_max{nullptr}; + mutable thrust::device_ptr unexpected_value{nullptr}; + + // Should be called on the first iterator generated. + void initialize_shared_state() + { + found_max = thrust::device_malloc(1); + unexpected_value = thrust::device_malloc(1); + *found_max = false; + *unexpected_value = false; + } + + // Should be called only once on the initialized iterator. + void free_shared_state() const + { + thrust::device_free(found_max); + thrust::device_free(unexpected_value); + found_max = nullptr; + unexpected_value = nullptr; + } + + __host__ __device__ assert_sequence_iterator operator+(difference_type i) const + { + return clone(expected + i); + } + + __host__ __device__ reference operator[](difference_type i) const + { + return clone(expected + i); + } + + // Some weirdness, this iterator acts like its own reference + __device__ assert_sequence_iterator operator=(value_type val) + { + if (val != expected) + { + printf("Error: expected %lld, got %lld\n", expected, val); + *unexpected_value = true; + } + else if (val == max) + { + *found_max = true; + } + + return *this; + } + +private: + __host__ __device__ + assert_sequence_iterator clone(value_type new_expected) const + { + return {new_expected, max, found_max, unexpected_value}; + } +}; + +// output mixin that generates assert_sequence_iterators. +// Must be paired with validate_assert_sequence_iterators mixin to free +// shared state. +struct assert_sequence_output +{ + struct output_type + { + using iterator = assert_sequence_iterator; + + iterator iter; + + explicit output_type(iterator&& it) + : iter{std::move(it)} + { + iter.initialize_shared_state(); + } + + ~output_type() + { + iter.free_shared_state(); + } + + iterator begin() { return iter; } + }; + + template + static output_type generate_output(std::size_t num_values, InputType&) + { + using value_type = typename assert_sequence_iterator::value_type; + assert_sequence_iterator it{0, + // minus one bc exclusive scan: + static_cast(num_values - 1), + nullptr, + nullptr}; + return output_type{std::move(it)}; + } +}; + +struct validate_assert_sequence_iterators +{ + using output_t = assert_sequence_output::output_type; + + template + static void compare_outputs(EventType& e, + output_t const&, + output_t const& test) + { + testing::async::mixin::compare_outputs::detail::basic_event_validation(e); + + ASSERT_EQUAL(*test.iter.unexpected_value, false); + ASSERT_EQUAL(*test.iter.found_max, true); + } +}; + +//------------------------------------------------------------------------------ +// Overloads without custom binary operators use thrust::plus<>, so use +// constant input iterator to generate the output sequence: +struct default_bin_op_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple<>, // - no extra args + std::tuple // - initial_value + >; + + static postfix_args_type generate_postfix_args() + { + return {{}, {0}}; + } +}; + +struct default_bin_op_invoker + : testing::async::mixin::input::constant_iterator_1 + , assert_sequence_output + , default_bin_op_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::exclusive_scan::mixin::invoke_async::basic + , validate_assert_sequence_iterators +{ + static std::string description() + { + return "test large array indices with default binary operator"; + } +}; + +} // anon namespace + +void TestLargeIndicesDefaultScanOp() +{ + // Test problem sizes around signed/unsigned int max: + testing::async::test_policy_overloads::run(1ll << 30); + testing::async::test_policy_overloads::run(1ll << 31); + testing::async::test_policy_overloads::run(1ll << 32); + testing::async::test_policy_overloads::run(1ll << 33); +} +DECLARE_UNITTEST(TestLargeIndicesDefaultScanOp); + +namespace +{ + +//------------------------------------------------------------------------------ +// Generate the output sequence using counting iterators and thrust::max<> for +// custom operator overloads. +struct custom_bin_op_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple> // - initial_value, binop + >; + + static postfix_args_type generate_postfix_args() + { + return {{0, {}}}; + } +}; + +struct custom_bin_op_invoker + : testing::async::mixin::input::counting_iterator_1 + , assert_sequence_output + , custom_bin_op_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::exclusive_scan::mixin::invoke_async::basic + , validate_assert_sequence_iterators +{ + static std::string description() + { + return "test large array indices with custom binary operator"; + } +}; + +} // namespace + +void TestLargeIndicesCustomScanOp() +{ + // Test problem sizes around signed/unsigned int max: + testing::async::test_policy_overloads::run(1ll << 30); + testing::async::test_policy_overloads::run(1ll << 31); + testing::async::test_policy_overloads::run(1ll << 32); + testing::async::test_policy_overloads::run(1ll << 33); +} +DECLARE_UNITTEST(TestLargeIndicesCustomScanOp); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/large_types.cu b/testing/async/exclusive_scan/large_types.cu new file mode 100644 index 0000000000..90bfdc683d --- /dev/null +++ b/testing/async/exclusive_scan/large_types.cu @@ -0,0 +1,63 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include + +// This test is an adaptation of TestScanWithLargeTypes from scan.cu. + +// Need special initialization for the FixedVector type: +template +struct device_vector_fill +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::fill(input.begin(), input.end(), value_type{2}); + return input; + } +}; + +template > +struct invoker + : device_vector_fill + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous< + value_type> + , testing::async::exclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "scan with large value types."; + } +}; + +struct TestLargeTypes +{ + void operator()(std::size_t num_values) const + { + using testing::async::test_policy_overloads; + + test_policy_overloads>>::run(num_values); + +#ifndef __QNX__ // These are excluded on QNX for scan.cu's version of this test. + test_policy_overloads>>::run(num_values); + test_policy_overloads>>::run(num_values); + test_policy_overloads>>::run(num_values); +#else + KNOWN_FAILURE; +#endif + } +}; +DECLARE_UNITTEST(TestLargeTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/mixed_types.cu b/testing/async/exclusive_scan/mixed_types.cu new file mode 100644 index 0000000000..174880b246 --- /dev/null +++ b/testing/async/exclusive_scan/mixed_types.cu @@ -0,0 +1,105 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Test using mixed int/float types for: +// - input_value_type | (int, float) +// - output_value_type | (int, float) +// - initial_value_type | (int, float, ) +// - thrust::plus T-type | (int, float, void>) +// +// The initial_value_type and thrust::plus types are covered by the +// mixin::postfix_args::scan_mixed_types_overloads component. +// +// The testing/scan.cu TestMixedTypes test spells out the expected behavior, +// which is defined by https://wg21.link/P0571. + +namespace +{ + +template +struct mixed_type_input_generator +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::sequence(input.begin(), + input.end(), + // fractional values are chosen deliberately to test + // casting orders and accumulator types: + static_cast(1.5), + static_cast(1)); + return input; + } +}; + +// A fractional value is used to ensure that a different result is obtained when +// using float vs. int. +template +struct mixed_types_postfix_args +{ + using postfix_args_type = std::tuple< // Overloads to test: + std::tuple<>, // - no extra args + std::tuple, // - initial_value + std::tuple>, // - initial_value, plus<> + std::tuple>, // - initial_value, plus + std::tuple> // - initial_value, plus + >; + + static postfix_args_type generate_postfix_args() + { + return {{}, + {static_cast(5.5)}, + {static_cast(5.5), thrust::plus<>{}}, + {static_cast(5.5), thrust::plus{}}, + {static_cast(5.5), thrust::plus{}}}; + } +}; + +template +struct invoker + : mixed_type_input_generator + , testing::async::mixin::output::device_vector + , mixed_types_postfix_args + , testing::async::exclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::exclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "mixed input/output/initial type tests"; + } +}; + +} // namespace + +void TestScanMixedTypes() +{ + // 10 values are enough to check the behavior we want to test while staying + // small enough to reason about. + constexpr std::size_t num_values = 10; + + // invoker template params are input_value_type, output_vt, initial_vt: + using testing::async::test_policy_overloads; + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + // We all float down here + test_policy_overloads>::run(num_values); +} +DECLARE_UNITTEST(TestScanMixedTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/mixin.h b/testing/async/exclusive_scan/mixin.h new file mode 100644 index 0000000000..719e7dabaf --- /dev/null +++ b/testing/async/exclusive_scan/mixin.h @@ -0,0 +1,116 @@ +#pragma once + +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include + +namespace testing +{ +namespace async +{ +namespace exclusive_scan +{ + +namespace mixin +{ + +//------------------------------------------------------------------------------ +namespace postfix_args +{ + +template > +struct all_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple<>, // - no extra args + std::tuple, // - initial_value + std::tuple // - initial_value, binary_op + >; + + static postfix_args_type generate_postfix_args() + { + return {{}, {42}, {42, alternate_binary_op{}}}; + } +}; + +} // namespace postfix_args + +//------------------------------------------------------------------------------ +namespace invoke_reference +{ + +template +struct host_synchronous +{ + template + static void invoke_reference(InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Create host versions of the input/output: + thrust::host_vector host_input(input.cbegin(), + input.cend()); + thrust::host_vector host_output(host_input.size()); + + // Run host synchronous algorithm to generate reference. + thrust::exclusive_scan(host_input.cbegin(), + host_input.cend(), + host_output.begin(), + std::get( + THRUST_FWD(postfix_tuple))...); + + // Copy back to device. + output = host_output; + } +}; + +} // namespace invoke_reference + +//------------------------------------------------------------------------------ +namespace invoke_async +{ + +struct basic +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + auto e = thrust::async::exclusive_scan( + std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +} // namespace invoke_async + +} // namespace mixin +} // namespace exclusive_scan +} // namespace async +} // namespace testing + +#endif // C++14 diff --git a/testing/async/exclusive_scan/stateful_operator.cu b/testing/async/exclusive_scan/stateful_operator.cu new file mode 100644 index 0000000000..34b4adde14 --- /dev/null +++ b/testing/async/exclusive_scan/stateful_operator.cu @@ -0,0 +1,60 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +namespace +{ + +// Custom binary operator for scan: +template +struct stateful_operator +{ + T offset; + + __host__ __device__ T operator()(T v1, T v2) { return v1 + v2 + offset; } +}; + +// Postfix args overload definition that uses a stateful custom binary operator +template +struct use_stateful_operator +{ + using postfix_args_type = std::tuple< // Single overload: + std::tuple> // init_val, bin_op + >; + + static postfix_args_type generate_postfix_args() + { + return {{value_type{42}, {value_type{2}}}}; + } +}; + +template +struct invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , use_stateful_operator + , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous< + value_type> + , testing::async::exclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() { return "scan with stateful operator"; } +}; + +} // namespace + +template +struct TestStatefulOperator +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestStatefulOperator, NumericTypes); + +#endif // C++14 diff --git a/testing/async/exclusive_scan/using_vs_adl.cu b/testing/async/exclusive_scan/using_vs_adl.cu new file mode 100644 index 0000000000..4415cfda44 --- /dev/null +++ b/testing/async/exclusive_scan/using_vs_adl.cu @@ -0,0 +1,171 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Verify what happens when calling the algorithm without any namespace +// qualifiers: +// - If the async entry point is available in the global namespace due to a +// using statement, the async algorithm should be called. +// - Otherwise, ADL should resolve the call to the synchronous algo in the +// thrust:: namespace. + +namespace invoke_reference +{ + +template +struct adl_host_synchronous +{ + template + static void invoke_reference(InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Create host versions of the input/output: + thrust::host_vector host_input(input.cbegin(), + input.cend()); + thrust::host_vector host_output(host_input.size()); + + using OutIter = thrust::remove_cvref_t; + + // ADL should resolve this to the synchronous `thrust::` algorithm. + // This is checked by ensuring that the call returns an output iterator. + OutIter result = + exclusive_scan(host_input.cbegin(), + host_input.cend(), + host_output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + (void)result; + + // Copy back to device. + output = host_output; + } +}; + +} // namespace invoke_reference + +namespace invoke_async +{ + +struct using_namespace +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Importing the CPO into the current namespace should unambiguously resolve + // this call to the CPO, as opposed to resolving to the thrust:: algorithm + // via ADL. This is verified by checking that an event is returned. + using namespace thrust::async; + thrust::device_event e = + exclusive_scan(std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +struct using_cpo +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Importing the CPO into the current namespace should unambiguously resolve + // this call to the CPO, as opposed to resolving to the thrust:: algorithm + // via ADL. This is verified by checking that an event is returned. + using thrust::async::exclusive_scan; + thrust::device_event e = + exclusive_scan(std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +} // namespace invoke_async + +template > +struct using_namespace_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , invoke_reference::adl_host_synchronous + , invoke_async::using_namespace + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "importing async CPO with `using namespace thrust::async`"; + } +}; + +void TestUsingNamespace() +{ + using invoker = using_namespace_invoker; + testing::async::test_policy_overloads::run(128); +} +DECLARE_UNITTEST(TestUsingNamespace); + +template > +struct using_cpo_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::exclusive_scan::mixin::postfix_args:: + all_overloads + , invoke_reference::adl_host_synchronous + , invoke_async::using_cpo + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "importing async CPO with " + "`using namespace thrust::async::exclusive_scan`"; + } +}; + +void TestUsingCPO() +{ + using invoker = using_cpo_invoker; + testing::async::test_policy_overloads::run(128); +} +DECLARE_UNITTEST(TestUsingCPO); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/basic.cu b/testing/async/inclusive_scan/basic.cu new file mode 100644 index 0000000000..a4437c638e --- /dev/null +++ b/testing/async/inclusive_scan/basic.cu @@ -0,0 +1,70 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +template > +struct basic_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::inclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::inclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "basic invocation with device vectors"; + } +}; + +template +struct TestBasic +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestBasic, NumericTypes); + +// Testing the in-place algorithm uses the exact same instantiations of the +// underlying scan implementation as above. Test them here to avoid compiling +// them twice. +template > +struct basic_inplace_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector_reuse_input + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous< + input_value_type> + , testing::async::inclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "basic in-place invocation with device vectors"; + } +}; + +template +struct TestBasicInPlace +{ + void operator()(std::size_t num_values) const + { + using invoker = basic_inplace_invoker; + testing::async::test_policy_overloads::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestBasicInPlace, NumericTypes); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/counting_iterator.cu b/testing/async/inclusive_scan/counting_iterator.cu new file mode 100644 index 0000000000..7818bf018b --- /dev/null +++ b/testing/async/inclusive_scan/counting_iterator.cu @@ -0,0 +1,45 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include +#include + +template > +struct invoker + : testing::async::mixin::input::counting_iterator_0 + , testing::async::mixin::output::device_vector + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::inclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::inclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "fancy input iterator (counting_iterator)"; + } +}; + +template +struct TestCountingIterator +{ + void operator()(std::size_t num_values) const + { + num_values = unittest::truncate_to_max_representable(num_values); + testing::async::test_policy_overloads>::run(num_values); + } +}; +// Use built-in types only, counting_iterator doesn't seem to be compatible with +// the custom_numeric. +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestCountingIterator, + BuiltinNumericTypes); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/discard_output.cu b/testing/async/inclusive_scan/discard_output.cu new file mode 100644 index 0000000000..b310ee3a4d --- /dev/null +++ b/testing/async/inclusive_scan/discard_output.cu @@ -0,0 +1,37 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Compilation test with discard iterators. No runtime validation is actually +// performed, other than testing whether the algorithm completes without +// exception. + +template > +struct discard_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::discard_iterator + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::inclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::noop +{ + static std::string description() { return "discard output"; } +}; + +template +struct TestDiscard +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestDiscard, NumericTypes); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/large_indices.cu b/testing/async/inclusive_scan/large_indices.cu new file mode 100644 index 0000000000..a73d32e8b6 --- /dev/null +++ b/testing/async/inclusive_scan/large_indices.cu @@ -0,0 +1,228 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include +#include +#include +#include + +#include +#include + +#include + +// This test is an adaptation of TestInclusiveScanWithBigIndices from scan.cu. + +namespace +{ + +// Fake iterator that asserts +// (a) it is written with a sequence and +// (b) a defined maximum value is written at some point +struct assert_sequence_iterator +{ + using value_type = std::int64_t; + using difference_type = std::int64_t; + + // Defined for thrust::iterator_traits: + using pointer = value_type *; + using reference = assert_sequence_iterator; // weird but convenient + using iterator_category = typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference>::type; + + std::int64_t expected{0}; + std::int64_t max{0}; + mutable thrust::device_ptr found_max{nullptr}; + mutable thrust::device_ptr unexpected_value{nullptr}; + + // Should be called on the first iterator generated. + void initialize_shared_state() + { + found_max = thrust::device_malloc(1); + unexpected_value = thrust::device_malloc(1); + *found_max = false; + *unexpected_value = false; + } + + // Should be called only once on the initialized iterator. + void free_shared_state() const + { + thrust::device_free(found_max); + thrust::device_free(unexpected_value); + found_max = nullptr; + unexpected_value = nullptr; + } + + __host__ __device__ assert_sequence_iterator operator+(difference_type i) const + { + return clone(expected + i); + } + + __host__ __device__ reference operator[](difference_type i) const + { + return clone(expected + i); + } + + // Some weirdness, this iterator acts like its own reference + __device__ assert_sequence_iterator operator=(value_type val) + { + if (val != expected) + { + printf("Error: expected %lld, got %lld\n", expected, val); + + *unexpected_value = true; + } + else if (val == max) + { + *found_max = true; + } + + return *this; + } + +private: + __host__ __device__ assert_sequence_iterator + clone(value_type new_expected) const + { + return {new_expected, max, found_max, unexpected_value}; + } +}; + +// output mixin that generates assert_sequence_iterators. +// Must be paired with validate_assert_sequence_iterators mixin to free +// shared state. +struct assert_sequence_output +{ + struct output_type + { + using iterator = assert_sequence_iterator; + + iterator iter; + + explicit output_type(iterator &&it) + : iter{std::move(it)} + { + iter.initialize_shared_state(); + } + + ~output_type() { iter.free_shared_state(); } + + iterator begin() { return iter; } + }; + + template + static output_type generate_output(std::size_t num_values, InputType &) + { + using value_type = typename assert_sequence_iterator::value_type; + assert_sequence_iterator it{1, + static_cast(num_values), + nullptr, + nullptr}; + return output_type{std::move(it)}; + } +}; + +struct validate_assert_sequence_iterators +{ + using output_t = assert_sequence_output::output_type; + + template + static void compare_outputs(EventType &e, + output_t const &, + output_t const &test) + { + testing::async::mixin::compare_outputs::detail::basic_event_validation(e); + + ASSERT_EQUAL(*test.iter.unexpected_value, false); + ASSERT_EQUAL(*test.iter.found_max, true); + } +}; + +//------------------------------------------------------------------------------ +// Overloads without custom binary operators use thrust::plus<>, so use +// constant input iterator to generate the output sequence: +struct default_bin_op_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple<> // - no extra args + >; + + static postfix_args_type generate_postfix_args() { return {{}}; } +}; + +struct default_bin_op_invoker + : testing::async::mixin::input::constant_iterator_1 + , assert_sequence_output + , default_bin_op_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::inclusive_scan::mixin::invoke_async::basic + , validate_assert_sequence_iterators +{ + static std::string description() + { + return "test large array indices with default binary operator"; + } +}; + +} // end anon namespace + +void TestLargeIndicesDefaultScanOp() +{ + // Test problem sizes around signed/unsigned int max: + testing::async::test_policy_overloads::run(1ll << 30); + testing::async::test_policy_overloads::run(1ll << 31); + testing::async::test_policy_overloads::run(1ll << 32); + testing::async::test_policy_overloads::run(1ll << 33); +} +DECLARE_UNITTEST(TestLargeIndicesDefaultScanOp); + +namespace +{ + +//------------------------------------------------------------------------------ +// Generate the output sequence using counting iterators and thrust::max<> for +// custom operator overloads. +struct custom_bin_op_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple> // - custom binary op + >; + + static postfix_args_type generate_postfix_args() { return {{{}}}; } +}; + +struct custom_bin_op_invoker + : testing::async::mixin::input::counting_iterator_1 + , assert_sequence_output + , custom_bin_op_overloads + , testing::async::mixin::invoke_reference::noop + , testing::async::inclusive_scan::mixin::invoke_async::basic + , validate_assert_sequence_iterators +{ + static std::string description() + { + return "test large array indices with custom binary operator"; + } +}; + +} // end anon namespace + +void TestLargeIndicesCustomScanOp() +{ + // Test problem sizes around signed/unsigned int max: + testing::async::test_policy_overloads::run(1ll << 30); + testing::async::test_policy_overloads::run(1ll << 31); + testing::async::test_policy_overloads::run(1ll << 32); + testing::async::test_policy_overloads::run(1ll << 33); +} +DECLARE_UNITTEST(TestLargeIndicesCustomScanOp); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/large_types.cu b/testing/async/inclusive_scan/large_types.cu new file mode 100644 index 0000000000..f4bc196a65 --- /dev/null +++ b/testing/async/inclusive_scan/large_types.cu @@ -0,0 +1,63 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include + +// This test is an adaptation of TestScanWithLargeTypes from scan.cu. + +// Need special initialization for the FixedVector type: +template +struct device_vector_fill +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::fill(input.begin(), input.end(), value_type{2}); + return input; + } +}; + +template > +struct invoker + : device_vector_fill + , testing::async::mixin::output::device_vector + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous< + value_type> + , testing::async::inclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "scan with large value types."; + } +}; + +struct TestLargeTypes +{ + void operator()(std::size_t num_values) const + { + using testing::async::test_policy_overloads; + + test_policy_overloads>>::run(num_values); + +#ifndef __QNX__ // These are excluded on QNX for scan.cu's version of this test. + test_policy_overloads>>::run(num_values); + test_policy_overloads>>::run(num_values); + test_policy_overloads>>::run(num_values); +#else + KNOWN_FAILURE; +#endif + } +}; +DECLARE_UNITTEST(TestLargeTypes); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/mixed_types.cu b/testing/async/inclusive_scan/mixed_types.cu new file mode 100644 index 0000000000..733eddc9c6 --- /dev/null +++ b/testing/async/inclusive_scan/mixed_types.cu @@ -0,0 +1,95 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Test using mixed int/float types for: +// - input_value_type | (int, float) +// - output_value_type | (int, float) +// - thrust::plus T-type | (int, float, void>) +// +// The thrust::plus types are covered by the +// scan_mixed_types_overloads component. +// +// The testing/scan.cu TestMixedTypes test spells out the expected behavior, +// which is defined by https://wg21.link/P0571. + +namespace +{ + +template +struct mixed_type_input_generator +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::sequence(input.begin(), + input.end(), + // fractional values are chosen deliberately to test + // casting orders and accumulator types: + static_cast(1.5), + static_cast(1)); + return input; + } +}; + +// A fractional value is used to ensure that a different result is obtained when +// using float vs. int. +struct mixed_types_postfix_args +{ + using postfix_args_type = std::tuple< // Overloads to test: + std::tuple<>, // - no extra args + std::tuple>, // - plus<> + std::tuple>, // - plus + std::tuple> // - plus + >; + + static postfix_args_type generate_postfix_args() + { + return {{}, + {thrust::plus<>{}}, + {thrust::plus{}}, + {thrust::plus{}}}; + } +}; + +template +struct invoker + : mixed_type_input_generator + , testing::async::mixin::output::device_vector + , mixed_types_postfix_args + , testing::async::inclusive_scan::mixin::invoke_reference:: + host_synchronous + , testing::async::inclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal +{ + static std::string description() + { + return "mixed input/output/functor value_type tests"; + } +}; + +} // namespace + +void TestScanMixedTypes() +{ + // 10 values are enough to check the behavior we want to test while staying + // small enough to reason about during debugging. + constexpr std::size_t num_values = 10; + + // invoker template params are input_value_type, output_vt: + using testing::async::test_policy_overloads; + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); + test_policy_overloads>::run(num_values); +} +DECLARE_UNITTEST(TestScanMixedTypes); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/mixin.h b/testing/async/inclusive_scan/mixin.h new file mode 100644 index 0000000000..8431526f49 --- /dev/null +++ b/testing/async/inclusive_scan/mixin.h @@ -0,0 +1,115 @@ +#pragma once + +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +#include + +namespace testing +{ +namespace async +{ +namespace inclusive_scan +{ + +namespace mixin +{ + +//------------------------------------------------------------------------------ +namespace postfix_args +{ + +template > +struct all_overloads +{ + using postfix_args_type = std::tuple< // List any extra arg overloads: + std::tuple<>, // - no extra args + std::tuple // - binary_op + >; + + static postfix_args_type generate_postfix_args() + { + return {{}, {alternate_binary_op{}}}; + } +}; + +} // namespace postfix_args + +//------------------------------------------------------------------------------ +namespace invoke_reference +{ + +template +struct host_synchronous +{ + template + static void invoke_reference(InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Create host versions of the input/output: + thrust::host_vector host_input(input.cbegin(), + input.cend()); + thrust::host_vector host_output(host_input.size()); + + // Run host synchronous algorithm to generate reference. + thrust::inclusive_scan(host_input.cbegin(), + host_input.cend(), + host_output.begin(), + std::get( + THRUST_FWD(postfix_tuple))...); + + // Copy back to device. + output = host_output; + } +}; + +} // namespace invoke_reference + +//------------------------------------------------------------------------------ +namespace invoke_async +{ + +struct basic +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + auto e = thrust::async::inclusive_scan( + std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +} // namespace invoke_async + +} // namespace mixin +} // namespace inclusive_scan +} // namespace async +} // namespace testing + +#endif // C++14 diff --git a/testing/async/inclusive_scan/stateful_operator.cu b/testing/async/inclusive_scan/stateful_operator.cu new file mode 100644 index 0000000000..946cc4a940 --- /dev/null +++ b/testing/async/inclusive_scan/stateful_operator.cu @@ -0,0 +1,60 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +namespace +{ + +// Custom binary operator for scan: +template +struct stateful_operator +{ + T offset; + + __host__ __device__ T operator()(T v1, T v2) { return v1 + v2 + offset; } +}; + +// Postfix args overload definition that uses a stateful custom binary operator +template +struct use_stateful_operator +{ + using postfix_args_type = std::tuple< // Single overload: + std::tuple> // bin_op + >; + + static postfix_args_type generate_postfix_args() + { + return {{{value_type{2}}}}; + } +}; + +template +struct invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , use_stateful_operator + , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous< + value_type> + , testing::async::inclusive_scan::mixin::invoke_async::basic + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() { return "scan with stateful operator"; } +}; + +} // namespace + +template +struct TestStatefulOperator +{ + void operator()(std::size_t num_values) const + { + testing::async::test_policy_overloads>::run(num_values); + } +}; +DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestStatefulOperator, NumericTypes); + +#endif // C++14 diff --git a/testing/async/inclusive_scan/using_vs_adl.cu b/testing/async/inclusive_scan/using_vs_adl.cu new file mode 100644 index 0000000000..e7fca8b0a9 --- /dev/null +++ b/testing/async/inclusive_scan/using_vs_adl.cu @@ -0,0 +1,169 @@ +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include + +#include + +// Verify what happens when calling the algorithm without any namespace +// qualifiers: +// - If the async entry point is available in the global namespace due to a +// using statement, the async algorithm should be called. +// - Otherwise, ADL should resolve the call to the synchronous algo in the +// thrust:: namespace. + +namespace invoke_reference +{ + +template +struct adl_host_synchronous +{ + template + static void invoke_reference(InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Create host versions of the input/output: + thrust::host_vector host_input(input.cbegin(), + input.cend()); + thrust::host_vector host_output(host_input.size()); + + using OutIter = thrust::remove_cvref_t; + + // ADL should resolve this to the synchronous `thrust::` algorithm. + // This is checked by ensuring that the call returns an output iterator. + OutIter result = + inclusive_scan(host_input.cbegin(), + host_input.cend(), + host_output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + (void)result; + + // Copy back to device. + output = host_output; + } +}; + +} // namespace invoke_reference + +namespace invoke_async +{ + +struct using_namespace +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Importing the CPO into the current namespace should unambiguously resolve + // this call to the CPO, as opposed to resolving to the thrust:: algorithm + // via ADL. This is verified by checking that an event is returned. + using namespace thrust::async; + thrust::device_event e = + inclusive_scan(std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +struct using_cpo +{ + template + static auto invoke_async(PrefixArgTuple&& prefix_tuple, + std::index_sequence, + InputType const& input, + OutputType& output, + PostfixArgTuple&& postfix_tuple, + std::index_sequence) + { + // Importing the CPO into the current namespace should unambiguously resolve + // this call to the CPO, as opposed to resolving to the thrust:: algorithm + // via ADL. This is verified by checking that an event is returned. + using thrust::async::inclusive_scan; + thrust::device_event e = + inclusive_scan(std::get(THRUST_FWD(prefix_tuple))..., + input.cbegin(), + input.cend(), + output.begin(), + std::get(THRUST_FWD(postfix_tuple))...); + return e; + } +}; + +} // namespace invoke_async + +template > +struct using_namespace_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , invoke_reference::adl_host_synchronous + , invoke_async::using_namespace + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "importing async CPO with `using namespace thrust::async`"; + } +}; + +void TestUsingNamespace() +{ + using invoker = using_namespace_invoker; + testing::async::test_policy_overloads::run(128); +} +DECLARE_UNITTEST(TestUsingNamespace); + +template > +struct using_cpo_invoker + : testing::async::mixin::input::device_vector + , testing::async::mixin::output::device_vector + , testing::async::inclusive_scan::mixin::postfix_args:: + all_overloads + , invoke_reference::adl_host_synchronous + , invoke_async::using_cpo + , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet +{ + static std::string description() + { + return "importing async CPO with " + "`using namespace thrust::async::inclusive_scan`"; + } +}; + +void TestUsingCPO() +{ + using invoker = using_cpo_invoker; + testing::async::test_policy_overloads::run(128); +} +DECLARE_UNITTEST(TestUsingCPO); + +#endif // C++14 diff --git a/testing/async/mixin.h b/testing/async/mixin.h new file mode 100644 index 0000000000..a58886ce00 --- /dev/null +++ b/testing/async/mixin.h @@ -0,0 +1,645 @@ +#pragma once + +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include + +// This file contains a set of mix-in classes that define an algorithm +// definition for use with test_policy_overloads. The algorithm +// definition describes the details of a thrust::async algorithm invocation: +// +// - Input type and initialization +// - Output type and initialization (supports in-place, too) +// - Postfix arguments that define the algorithm's overload set +// - Abstracted invocation of the async algorithm +// - Abstracted invocation of a reference algorithm +// - Validation of async vs. reference output +// - A description string. +// +// This definition is used by test_policy_overloads to test each overload +// against a reference while injecting a variety of execution policies. This +// validates that each overload behaves correctly according to some reference. +// +// Since much of the algorithm definition is generic and may be reused in +// multiple tests with slight changes, a mix-in system is used to simplify +// the creation of algorithm definitions. The following namespace hierarchy is +// used to organize these generic components: +// +// * testing::async::mixin:: +// ** ::input - Input types/values (device vectors, counting iterators, etc) +// ** ::output - Output types/values (device vectors, inplace device vectors, +// discard iterators, etc) +// ** ::postfix_args - Algorithm specific overload sets +// ** ::invoke_reference - Algorithm specific reference invocation +// ** ::invoke_async - Algorithm specific async algo invocation +// ** ::compare_outputs - Compare output values. +// +// Each algorithm should define its own `mixins.h` header to declare algorithm +// specific mixins (e.g. postfix_args, invoke_reference, and invoke_async) +// in a testing::async::::mixins namespace structure. +// +// For example, the test.async.exclusive_scan.basic test uses the following +// algorithm definition from mix-ins: +// +// ``` +// #include +// #include +// #include +// template > +// struct basic_invoker +// : testing::async::mixin::input::device_vector +// , testing::async::mixin::output::device_vector +// , testing::async::exclusive_scan::mixin::postfix_args:: +// all_overloads +// , testing::async::exclusive_scan::mixin::invoke_reference:: +// host_synchronous +// , testing::async::exclusive_scan::mixin::invoke_async::basic +// , testing::async::mixin::compare_outputs::assert_equal_quiet +// { +// static std::string description() +// { +// return "basic invocation with device vectors"; +// } +// }; +// +// ... +// +// testing::async::test_policy_overloads>::run(num_values); +// ``` +// +// The basic_invoker class expands to something similar to the following: +// +// ``` +// template > +// struct basic_invoker +// { +// public: +// +// static std::string description() +// { +// return "basic invocation with device vectors"; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::input::device_vector +// using input_type = thrust::device_vector; +// +// // Generate an instance of the input: +// static input_type generate_input(std::size_t num_values) +// { +// input_type input(num_values); +// thrust::sequence(input.begin(), input.end(), 25, 3); +// return input; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::output::device_vector +// using output_type = thrust::device_vector; +// +// // Generate an instance of the output: +// // Might be more complicated, eg. fancy iterators, etc +// static output_type generate_output(std::size_t num_values) +// { +// return output_type(num_values); +// } +// +// //------------------------------------------------------------------------- +// // +// testing::async::mixin::exclusive_scan::mixin::postfix_args::all_overloads +// // +// using postfix_args_type = std::tuple< // List any extra +// arg overloads: +// std::tuple<>, // - no extra args +// std::tuple, // - initial_value +// std::tuple // - initial_value, +// binary_op +// >; +// +// // Create instances of the extra arguments to use when invoking the +// // algorithm: +// static postfix_args_type generate_postfix_args() +// { +// return { +// {}, // no extra args +// {42}, // initial_value +// {57, alternate_binary_op{}} // initial_value, binary_op +// }; +// } +// +// //------------------------------------------------------------------------- +// // +// testing::async::mixin::exclusive_scan::mixin::invoke_reference::host_synchronous +// // +// // Invoke a reference implementation for a single overload as described by +// // postfix_tuple. This tuple contains instances of any additional +// arguments +// // to pass to the algorithm. The tuple/index_sequence pattern is used to +// // support the "no extra args" overload, since the parameter pack +// expansion +// // will do exactly what we want in all cases. +// template +// static void invoke_reference(input_type const &input, +// output_type &output, +// PostfixArgTuple &&postfix_tuple, +// std::index_sequence) +// { +// // Create host versions of the input/output: +// thrust::host_vector host_input(input.cbegin(), +// input.cend()); +// thrust::host_vector host_output(host_input.size()); +// +// // Run host synchronous algorithm to generate reference. +// thrust::exclusive_scan(host_input.cbegin(), +// host_input.cend(), +// host_output.begin(), +// std::get( +// THRUST_FWD(postfix_tuple))...); +// +// // Copy back to device. +// output = host_output; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::exclusive_scan::mixin::invoke_async::basic +// // +// // Invoke the async algorithm for a single overload as described by +// // the prefix and postfix tuples. These tuples contains instances of any +// // additional arguments to pass to the algorithm. The tuple/index_sequence +// // pattern is used to support the "no extra args" overload, since the +// // parameter pack expansion will do exactly what we want in all cases. +// // Prefix args are included here (but not for invoke_reference) to allow +// the +// // test framework to change the execution policy. +// // This method must return an event or future. +// template +// static auto invoke_async(PrefixArgTuple &&prefix_tuple, +// std::index_sequence, +// input_type const &input, +// output_type &output, +// PostfixArgTuple &&postfix_tuple, +// std::index_sequence) +// { +// output.resize(input.size()); +// auto e = thrust::async::exclusive_scan( +// std::get(THRUST_FWD(prefix_tuple))..., +// input.cbegin(), +// input.cend(), +// output.begin(), +// std::get(THRUST_FWD(postfix_tuple))...); +// return e; +// } +// +// //------------------------------------------------------------------------- +// // testing::async::mixin::compare_outputs::assert_equal_quiet +// // +// // Wait on and validate the event/future (usually with TEST_EVENT_WAIT / +// // TEST_FUTURE_VALUE_RETRIEVAL), then check that the reference output +// // matches the testing output. +// template +// static void compare_outputs(EventType &e, +// output_type const &ref, +// output_type const &test) +// { +// TEST_EVENT_WAIT(e); +// ASSERT_EQUAL_QUIET(ref, test); +// } +// }; +// ``` +// +// Similar invokers with slight tweaks are used in other +// async/exclusive_scan/*.cu tests. + +namespace testing +{ +namespace async +{ +namespace mixin +{ + +//------------------------------------------------------------------------------ +namespace input +{ + +// TODO it'd be nice to specify a lambda expression that'd replace the call to +// thrust::sequence when desired. +template +struct device_vector +{ + using input_type = thrust::device_vector; + + static input_type generate_input(std::size_t num_values) + { + input_type input(num_values); + thrust::sequence(input.begin(), + input.end(), + static_cast(1.5), + static_cast(1)); + return input; + } +}; + +template +struct counting_iterator_0 +{ + struct input_type + { + using iterator = thrust::counting_iterator; + + std::size_t num_values; + + iterator begin() { return iterator{static_cast(0)}; } + iterator begin() const { return iterator{static_cast(0)}; } + iterator cbegin() const { return iterator{static_cast(0)}; } + + iterator end() { return iterator{static_cast(num_values)}; } + iterator end() const { return iterator{static_cast(num_values)}; } + iterator cend() const { return iterator{static_cast(num_values)}; } + + std::size_t size() const { return num_values; } + }; + + static input_type generate_input(std::size_t num_values) + { + return {num_values}; + } +}; + +template +struct counting_iterator_1 +{ + struct input_type + { + using iterator = thrust::counting_iterator; + + std::size_t num_values; + + iterator begin() { return iterator{static_cast(1)}; } + iterator begin() const { return iterator{static_cast(1)}; } + iterator cbegin() const { return iterator{static_cast(1)}; } + + iterator end() { return iterator{static_cast(1 + num_values)}; } + iterator end() const { return iterator{static_cast(1 + num_values)}; } + iterator cend() const { return iterator{static_cast(1 + num_values)}; } + + std::size_t size() const { return num_values; } + }; + + static input_type generate_input(std::size_t num_values) + { + return {num_values}; + } +}; + +template +struct constant_iterator_1 +{ + struct input_type + { + using iterator = thrust::constant_iterator; + + std::size_t num_values; + + iterator begin() { return iterator{static_cast(1)}; } + iterator begin() const { return iterator{static_cast(1)}; } + iterator cbegin() const { return iterator{static_cast(1)}; } + + iterator end() { return iterator{static_cast(1)} + num_values; } + iterator end() const + { + return iterator{static_cast(1)} + num_values; + } + iterator cend() const + { + return iterator{static_cast(1)} + num_values; + } + + std::size_t size() const { return num_values; } + }; + + static input_type generate_input(std::size_t num_values) + { + return {num_values}; + } +}; + +} // namespace input + +//------------------------------------------------------------------------------ +namespace output +{ + +template +struct device_vector +{ + using output_type = thrust::device_vector; + + template + static output_type generate_output(std::size_t num_values, + InputType& /* unused */) + { + return output_type(num_values); + } +}; + +template +struct device_vector_reuse_input +{ + using output_type = thrust::device_vector&; + + template + static output_type generate_output(std::size_t /*num_values*/, + InputType& input) + { + return input; + } +}; + +struct discard_iterator +{ + struct output_type + { + using iterator = thrust::discard_iterator<>; + + iterator begin() { return thrust::make_discard_iterator(); } + iterator begin() const { return thrust::make_discard_iterator(); } + iterator cbegin() const { return thrust::make_discard_iterator(); } + }; + + template + static output_type generate_output(std::size_t /* num_values */, + InputType& /* input */) + { + return output_type{}; + } +}; + +} // namespace output + +//------------------------------------------------------------------------------ +namespace postfix_args +{ +/* Defined per algorithm. Example: + * + * // Defines several overloads: + * // algorithm([policy,] input, output) // no postfix args + * // algorithm([policy,] input, output, initial_value) + * // algorithm([policy,] input, output, initial_value, binary_op) + * template > struct all_overloads + * { + * using postfix_args_type = std::tuple< // List any extra arg + * overloads: std::tuple<>, // - no extra args + * std::tuple, // - initial_value + * std::tuple // - initial_value, binary_op + * >; + * + * static postfix_args_type generate_postfix_args() + * { + * return {{}, {42}, {42, alternate_binary_op{}}}; + * } + * }; + * + */ +} + +//------------------------------------------------------------------------------ +namespace invoke_reference +{ + +/* Defined per algorithm. Example: + * + * template + * struct host_synchronous + * { + * template + * static void invoke_reference(InputType const& input, + * OutputType& output, + * PostfixArgTuple&& postfix_tuple, + * std::index_sequence) + * { + * // Create host versions of the input/output: + * thrust::host_vector host_input(input.cbegin(), + * input.cend()); + * thrust::host_vector host_output(host_input.size()); + * + * // Run host synchronous algorithm to generate reference. + * // Be sure to call a backend that doesn't use the same underlying + * // implementation. + * thrust::exclusive_scan(host_input.cbegin(), + * host_input.cend(), + * host_output.begin(), + * std::get( + * THRUST_FWD(postfix_tuple))...); + * + * // Copy back to device. + * output = host_output; + * } + * }; + * + */ + +// Used to save time when testing unverifiable invocations (discard_iterators) +struct noop +{ + template + static void invoke_reference(Ts&&...) + {} +}; + +} // namespace invoke_reference + +//------------------------------------------------------------------------------ +namespace invoke_async +{ + +/* Defined per algorithm. Example: + * + * struct basic + * { + * template + * static auto invoke_async(PrefixArgTuple&& prefix_tuple, + * std::index_sequence, + * InputType const& input, + * OutputType& output, + * PostfixArgTuple&& postfix_tuple, + * std::index_sequence) + * { + * auto e = thrust::async::exclusive_scan( + * std::get(THRUST_FWD(prefix_tuple))..., + * input.cbegin(), + * input.cend(), + * output.begin(), + * std::get(THRUST_FWD(postfix_tuple))...); + * return e; + * } + * }; + */ + +} // namespace invoke_async + +//------------------------------------------------------------------------------ +namespace compare_outputs +{ + +namespace detail +{ + +void basic_event_validation(thrust::device_event& e) +{ + TEST_EVENT_WAIT(e); +} + +template +void basic_event_validation(thrust::device_future& f) +{ + TEST_FUTURE_VALUE_RETRIEVAL(f); +} + +} // namespace detail + +struct assert_equal +{ + template + static void compare_outputs(EventType& e, + OutputType const& ref, + OutputType const& test) + { + detail::basic_event_validation(e); + ASSERT_EQUAL(ref, test); + } +}; + +// Does an 'almost_equal' comparison for floating point types, since fp +// addition is non-associative +struct assert_maybe_fuzzy_equal +{ +private: + template + static void compare_outputs_impl(EventType& e, + OutputType const& ref, + OutputType const& test, + std::false_type /* is_floating_point */) + { + detail::basic_event_validation(e); + ASSERT_EQUAL(ref, test); + } + + template + static void compare_outputs_impl(EventType& e, + OutputType const& ref, + OutputType const& test, + std::true_type /* is_floating_point */) + { + detail::basic_event_validation(e); + ASSERT_ALMOST_EQUAL(ref, test); + } + +public: + template + static void compare_outputs(EventType& e, + OutputType const& ref, + OutputType const& test) + { + using value_type = typename OutputType::value_type; + compare_outputs_impl(e, ref, test, std::is_floating_point{}); + } +}; + +struct assert_equal_quiet +{ + template + static void compare_outputs(EventType& e, + OutputType const& ref, + OutputType const& test) + { + detail::basic_event_validation(e); + ASSERT_EQUAL_QUIET(ref, test); + } +}; + +// Does an 'almost_equal' comparison for floating point types, since fp +// addition is non-associative +struct assert_maybe_fuzzy_equal_quiet +{ +private: + template + static void compare_outputs_impl(EventType& e, + OutputType const& ref, + OutputType const& test, + std::false_type /* is_floating_point */) + { + detail::basic_event_validation(e); + ASSERT_EQUAL_QUIET(ref, test); + } + + template + static void compare_outputs_impl(EventType& e, + OutputType const& ref, + OutputType const& test, + std::true_type /* is_floating_point */) + { + detail::basic_event_validation(e); + ASSERT_ALMOST_EQUAL(ref, test); + } + +public: + template + static void compare_outputs(EventType& e, + OutputType const& ref, + OutputType const& test) + { + using value_type = typename OutputType::value_type; + compare_outputs_impl(e, ref, test, std::is_floating_point{}); + } +}; + +// Used to save time when testing unverifiable invocations (discard_iterators). +// Just does basic validation of the future/event. +struct noop +{ + template + static void compare_outputs(EventType &e, Ts&&...) + { + detail::basic_event_validation(e); + } +}; + +} // namespace compare_outputs + +} // namespace mixin +} // namespace async +} // namespace testing + +#endif // C++14 diff --git a/testing/async/test_policy_overloads.h b/testing/async/test_policy_overloads.h new file mode 100644 index 0000000000..00a55a8b2d --- /dev/null +++ b/testing/async/test_policy_overloads.h @@ -0,0 +1,421 @@ +#pragma once + +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include +#include + +#include + +#include + +// TODO Cover these cases from testing/async_reduce.cu: +// - [x] test_async_reduce_after ("after_future" in test_policy_overloads) +// - [ ] test_async_reduce_on_then_after (KNOWN_FAILURE, see #1195) +// - [ ] all the child variants (e.g. with allocator) too +// - [ ] test_async_copy_then_reduce (Need to figure out how to fit this in) +// - [ ] test_async_reduce_caching (only useful when returning future) + +namespace testing +{ + +namespace async +{ + +// Tests that policies are handled correctly for all overloads of an async +// algorithm. +// +// The AlgoDef parameter type defines an async algorithm, its overloads, and +// abstracts its invocation. See the async/mixins.h for a documented example of +// this interface and some convenience mixins that can be used to construct a +// definition quickly. +// +// The AlgoDef interface is used to run several tests of the algorithm, +// exhaustively testing all overloads for algorithm correctness and proper +// policy handling. +// +// ## Basic tests +// +// In the basic tests, each overload is called repeatedly with: +// 1) No policy +// 2) thrust::device +// 3) thrust::device(thrust::device_allocator) +// 4) thrust::device.on(stream) +// 5) thrust::device(thrust::device_allocator).on(stream) +// +// The output of the async algorithm is compared against a reference output, +// and the returned event/future is tested to make sure it holds a reference to +// the expected stream. +// +// ## After Future tests +// +// The after_future tests check that the future/event returned from an algorithm +// behaves properly when consumed by a policy's `.after` method. +template +struct test_policy_overloads +{ + using algo_def = AlgoDef; + using input_type = typename algo_def::input_type; + using output_type = typename algo_def::output_type; + using postfix_args_type = typename algo_def::postfix_args_type; + + static constexpr std::size_t num_postfix_arg_sets = + std::tuple_size::value; + + // Main entry point; call this from a unit test function. + static void run(std::size_t num_values) + { + test_postfix_overloads(num_values); + } + +private: + template + using size_const = std::integral_constant; + + //---------------------------------------------------------------------------- + // Recursively call sub tests for each overload set in postfix_args: + template + static void test_postfix_overloads(std::size_t const num_values, + size_const = {}) + { + static_assert(postfix_idx < num_postfix_arg_sets, "Internal error."); + + run_basic_policy_tests(num_values); + run_after_future_test(num_values); + + // Recurse to test next round of overloads: + test_postfix_overloads(num_values, size_const{}); + } + + static void test_postfix_overloads(std::size_t const, + size_const) + { + // terminal case, no-op + } + + //---------------------------------------------------------------------------- + // For the specified postfix overload set, test the algorithm with several + // different policy configurations. + template + static void run_basic_policy_tests(std::size_t const num_values) + { + // When a policy uses the default stream, the algorithm implementation + // should spawn a new stream in the returned event: + auto using_default_stream = [](auto& e) { + ASSERT_NOT_EQUAL(thrust::cuda_cub::default_stream(), + e.stream().native_handle()); + }; + + // When a policy uses a non-default stream, the implementation should pass + // the stream through to the output: + thrust::system::cuda::detail::unique_stream test_stream{}; + auto using_test_stream = [&test_stream](auto& e) { + ASSERT_EQUAL(test_stream.native_handle(), e.stream().native_handle()); + }; + + // Test the different types of policies: + run_basic_policy_test("(no policy)", + std::make_tuple(), + using_default_stream, + num_values); + + run_basic_policy_test("thrust::device", + std::make_tuple(thrust::device), + using_default_stream, + num_values); + + run_basic_policy_test( + "thrust::device(thrust::device_allocator{})", + std::make_tuple(thrust::device(thrust::device_allocator{})), + using_default_stream, + num_values); + + run_basic_policy_test("thrust::device.on(test_stream.get())", + std::make_tuple( + thrust::device.on(test_stream.get())), + using_test_stream, + num_values); + + run_basic_policy_test( + "thrust::device(thrust::device_allocator{}).on(test_stream.get())", + std::make_tuple( + thrust::device(thrust::device_allocator{}).on(test_stream.get())), + using_test_stream, + num_values); + } + + // Wrap the basic_policy_test in a try/catch block to inject helpful info into + // any thrown unit test exceptions. + template + static void run_basic_policy_test(std::string const &policy_desc, + PolicyTuple &&policy_tuple, + ValidateEvent &&validate, + std::size_t const num_values) + { + try + { + basic_policy_test(policy_tuple, + validate, + num_values); + } + catch (unittest::UnitTestException &exc) + { + // Append some identifying information to the exception to help with + // debugging: + using overload_t = std::tuple_element_t; + + std::string const overload_desc = + unittest::demangle(typeid(overload_t).name()); + std::string const input_desc = + unittest::demangle(typeid(input_type).name()); + std::string const output_desc = + unittest::demangle(typeid(output_type).name()); + + exc << "\n" + << " - algo_def::description = " << algo_def::description() << "\n" + << " - test = basic_policy\n" + << " - policy = " << policy_desc << "\n" + << " - input_type = " << input_desc << "\n" + << " - output_type = " << output_desc << "\n" + << " - tuple of trailing arguments = " << overload_desc << "\n" + << " - num_values = " << num_values; + throw; + } + } + + // Invoke the algorithm multiple times and validate the results. + template + static void basic_policy_test(PrefixArgTuple &&prefix_tuple_ref, + ValidateEvent const &validate, + std::size_t num_values) + { + // Sink the prefix tuple into a const local so it can be safely passed to + // multiple invocations without worrying about potential modifications. + using prefix_tuple_type = thrust::remove_cvref_t; + prefix_tuple_type const prefix_tuple = THRUST_FWD(prefix_tuple_ref); + + using postfix_tuple_type = + std::tuple_element_t; + postfix_tuple_type const postfix_tuple = get_postfix_tuple(); + + // Generate index sequences for the tuples: + constexpr auto prefix_tuple_size = std::tuple_size{}; + constexpr auto postfix_tuple_size = std::tuple_size{}; + using prefix_index_seq = std::make_index_sequence; + using postfix_index_seq = std::make_index_sequence; + + // Use unique, non-const inputs for each invocation to support in-place + // algo_def configurations. + input_type input_a = algo_def::generate_input(num_values); + input_type input_b = algo_def::generate_input(num_values); + input_type input_c = algo_def::generate_input(num_values); + input_type input_d = algo_def::generate_input(num_values); + input_type input_ref = algo_def::generate_input(num_values); + + output_type output_a = algo_def::generate_output(num_values, input_a); + output_type output_b = algo_def::generate_output(num_values, input_b); + output_type output_c = algo_def::generate_output(num_values, input_c); + output_type output_d = algo_def::generate_output(num_values, input_d); + output_type output_ref = algo_def::generate_output(num_values, input_ref); + + // Invoke multiple overlapping async algorithms, capturing their outputs + // and events/futures: + auto e_a = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_a, + output_a, + postfix_tuple, + postfix_index_seq{}); + auto e_b = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_b, + output_b, + postfix_tuple, + postfix_index_seq{}); + auto e_c = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_c, + output_c, + postfix_tuple, + postfix_index_seq{}); + auto e_d = algo_def::invoke_async(prefix_tuple, + prefix_index_seq{}, + input_d, + output_d, + postfix_tuple, + postfix_index_seq{}); + + // Let reference calc overlap with async testing: + algo_def::invoke_reference(input_ref, + output_ref, + postfix_tuple, + postfix_index_seq{}); + + algo_def::compare_outputs(e_a, output_ref, output_a); + algo_def::compare_outputs(e_b, output_ref, output_b); + algo_def::compare_outputs(e_c, output_ref, output_c); + algo_def::compare_outputs(e_d, output_ref, output_d); + + validate(e_a); + validate(e_b); + validate(e_c); + validate(e_d); + } + + //---------------------------------------------------------------------------- + // Test .after(event/future) handling: + template + static void run_after_future_test(std::size_t const num_values) + { + try + { + after_future_test(num_values); + } + catch (unittest::UnitTestException &exc) + { + // Append some identifying information to the exception to help with + // debugging: + using postfix_t = std::tuple_element_t; + + std::string const postfix_desc = + unittest::demangle(typeid(postfix_t).name()); + std::string const input_desc = + unittest::demangle(typeid(input_type).name()); + std::string const output_desc = + unittest::demangle(typeid(output_type).name()); + + exc << "\n" + << " - algo_def::description = " << algo_def::description() << "\n" + << " - test = after_future\n" + << " - input_type = " << input_desc << "\n" + << " - output_type = " << output_desc << "\n" + << " - tuple of trailing arguments = " << postfix_desc << "\n" + << " - num_values = " << num_values; + throw; + } + } + + template + static void after_future_test(std::size_t const num_values) + { + using postfix_tuple_type = + std::tuple_element_t; + postfix_tuple_type const postfix_tuple = get_postfix_tuple(); + + // Generate index sequences for the tuples. Prefix size always = 1 here. + constexpr auto postfix_tuple_size = std::tuple_size{}; + using prefix_index_seq = std::make_index_sequence<1>; + using postfix_index_seq = std::make_index_sequence; + + // Use unique, non-const inputs for each invocation to support in-place + // algo_def configurations. + input_type input_a = algo_def::generate_input(num_values); + input_type input_b = algo_def::generate_input(num_values); + input_type input_c = algo_def::generate_input(num_values); + input_type input_tmp = algo_def::generate_input(num_values); + input_type input_ref = algo_def::generate_input(num_values); + + output_type output_a = algo_def::generate_output(num_values, input_a); + output_type output_b = algo_def::generate_output(num_values, input_b); + output_type output_c = algo_def::generate_output(num_values, input_c); + output_type output_tmp = algo_def::generate_output(num_values, input_tmp); + output_type output_ref = algo_def::generate_output(num_values, input_ref); + + auto e_a = algo_def::invoke_async(std::make_tuple(thrust::device), + prefix_index_seq{}, + input_a, + output_a, + postfix_tuple, + postfix_index_seq{}); + ASSERT_EQUAL(true, e_a.valid_stream()); + auto const stream_a = e_a.stream().native_handle(); + + // Execution on default stream should create a new stream in the result: + ASSERT_NOT_EQUAL_QUIET(thrust::cuda_cub::default_stream(), stream_a); + + // Explicitly order this invocation after the previous one: + auto e_b = + algo_def::invoke_async(std::make_tuple(thrust::device.after(e_a)), + prefix_index_seq{}, + input_b, + output_b, + postfix_tuple, + postfix_index_seq{}); + ASSERT_EQUAL(true, e_b.valid_stream()); + auto const stream_b = e_b.stream().native_handle(); + + // Second invocation should use same stream as before: + ASSERT_EQUAL_QUIET(stream_a, stream_b); + + // Verify that double consumption of e_a produces an exception: + ASSERT_THROWS_EQUAL(auto x = algo_def::invoke_async( + std::make_tuple(thrust::device.after(e_a)), + prefix_index_seq{}, + input_tmp, + output_tmp, + postfix_tuple, + postfix_index_seq{}); + THRUST_UNUSED_VAR(x), + thrust::event_error, + thrust::event_error(thrust::event_errc::no_state)); + + // Explicitly order this invocation after e_b: + auto policy_after_e_b = thrust::device.after(e_b); + // Make sure the tuple contains an lvalue ref: + auto policy_after_e_b_tuple = std::forward_as_tuple(policy_after_e_b); + auto e_c = + algo_def::invoke_async(policy_after_e_b_tuple, + prefix_index_seq{}, + input_c, + output_c, + postfix_tuple, + postfix_index_seq{}); + ASSERT_EQUAL(true, e_c.valid_stream()); + auto const stream_c = e_c.stream().native_handle(); + + // Should use same stream as e_b: + ASSERT_EQUAL_QUIET(stream_b, stream_c); + + // Verify that indirect double consumption of e_b produces an exception: + ASSERT_THROWS_EQUAL( + auto x = algo_def::invoke_async(policy_after_e_b_tuple, + prefix_index_seq{}, + input_tmp, + output_tmp, + postfix_tuple, + postfix_index_seq{}); + THRUST_UNUSED_VAR(x), + thrust::event_error, + thrust::event_error(thrust::event_errc::no_state)); + + // Let reference calc overlap with async testing: + algo_def::invoke_reference(input_ref, + output_ref, + postfix_tuple, + postfix_index_seq{}); + + // Validate results + // Use e_c for all three checks -- e_a and e_b will not pass the event + // checks since their streams were stolen by dependencies. + algo_def::compare_outputs(e_c, output_ref, output_a); + algo_def::compare_outputs(e_c, output_ref, output_b); + algo_def::compare_outputs(e_c, output_ref, output_c); + } + + //---------------------------------------------------------------------------- + // Various helper functions: + template + static auto get_postfix_tuple() + { + return std::get(algo_def::generate_postfix_args()); + } +}; + +} // namespace async +} // namespace testing + +#endif // C++14 diff --git a/testing/event.cu b/testing/event.cu index 5833d41452..5814269195 100644 --- a/testing/event.cu +++ b/testing/event.cu @@ -1,6 +1,6 @@ #include -#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +#if THRUST_CPP_DIALECT >= 2014 #include #include diff --git a/testing/future.cu b/testing/future.cu index 1375588607..eb1ab582a0 100644 --- a/testing/future.cu +++ b/testing/future.cu @@ -1,6 +1,6 @@ #include -#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +#if THRUST_CPP_DIALECT >= 2014 #include #include diff --git a/testing/unittest/util_async.h b/testing/unittest/util_async.h index 984cc61c6b..9a3454efd5 100644 --- a/testing/unittest/util_async.h +++ b/testing/unittest/util_async.h @@ -1,9 +1,9 @@ #pragma once #include -#include +#include -#if THRUST_CPP_DIALECT >= 2011 +#if THRUST_CPP_DIALECT >= 2014 #include @@ -73,5 +73,4 @@ auto test_future_value_retrieval( } // namespace unittest -#endif // THRUST_CPP_DIALECT >= 2011 - +#endif // THRUST_CPP_DIALECT >= 2014 diff --git a/thrust/async/scan.h b/thrust/async/scan.h new file mode 100644 index 0000000000..d7adfac983 --- /dev/null +++ b/thrust/async/scan.h @@ -0,0 +1,355 @@ +/* + * Copyright 2008-2020 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file async/scan.h + * \brief Functions for asynchronously computing prefix scans. + */ + +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#include +#include +#include + +#include + +#include +#include +#include + +#include + +namespace thrust +{ + +namespace async +{ + +// Fallback implementations used when no overloads are found via ADL: +namespace unimplemented +{ + +template +event +async_inclusive_scan(thrust::execution_policy&, + ForwardIt, + Sentinel, + OutputIt, + BinaryOp) +{ + THRUST_STATIC_ASSERT_MSG( + (thrust::detail::depend_on_instantiation::value), + "this algorithm is not implemented for the specified system" + ); + return {}; +} + +template +event +async_exclusive_scan(thrust::execution_policy&, + ForwardIt, + Sentinel, + OutputIt, + InitialValueType, + BinaryOp) +{ + THRUST_STATIC_ASSERT_MSG( + (thrust::detail::depend_on_instantiation::value), + "this algorithm is not implemented for the specified system" + ); + return {}; +} + +} // namespace unimplemented + +namespace inclusive_scan_detail +{ + +// Include fallback implementation for ADL failures +using thrust::async::unimplemented::async_inclusive_scan; + +// Implementation of the thrust::async::inclusive_scan CPO. +struct inclusive_scan_fn final +{ + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(op) + ) + ) + + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + thrust::plus<>{} + ) + ) + + template >>> + THRUST_NODISCARD + auto operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(op) + ) + ) + + template + THRUST_NODISCARD + auto operator()(ForwardIt&& first, Sentinel&& last, OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_inclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + thrust::plus<>{} + ) + ) +}; + +} // namespace inclusive_scan_detail + +THRUST_INLINE_CONSTANT inclusive_scan_detail::inclusive_scan_fn inclusive_scan{}; + +namespace exclusive_scan_detail +{ + +// Include fallback implementation for ADL failures +using thrust::async::unimplemented::async_exclusive_scan; + +// Implementation of the thrust::async::exclusive_scan CPO. +struct exclusive_scan_fn final +{ + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op) + ) + ) + + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + thrust::plus<>{} + ) + ) + + template + THRUST_NODISCARD + auto + operator()(thrust::detail::execution_policy_base const& exec, + ForwardIt&& first, + Sentinel&& last, + OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + iterator_value_t>{}, + thrust::plus<>{} + ) + ) + + template >>> + THRUST_NODISCARD + auto + operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init, + BinaryOp&& op) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op) + ) + ) + + template >>> + THRUST_NODISCARD + auto + operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out, + InitialValueType&& init) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + THRUST_FWD(init), + thrust::plus<>{} + ) + ) + + template + THRUST_NODISCARD + auto operator()(ForwardIt&& first, + Sentinel&& last, + OutputIt&& out) const + // ADL dispatch. + THRUST_RETURNS( + async_exclusive_scan( + thrust::detail::select_system( + iterator_system_t>{}, + iterator_system_t>{} + ), + THRUST_FWD(first), + THRUST_FWD(last), + THRUST_FWD(out), + iterator_value_t>{}, + thrust::plus<>{} + ) + ) +}; + +} // namespace exclusive_scan_detail + +THRUST_INLINE_CONSTANT exclusive_scan_detail::exclusive_scan_fn exclusive_scan{}; + +} // namespace async + +} // end namespace thrust + +#endif diff --git a/thrust/future.h b/thrust/future.h index 12bebf8c6e..03c5d79f6c 100644 --- a/thrust/future.h +++ b/thrust/future.h @@ -24,7 +24,7 @@ #include #include -#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC) +#if THRUST_CPP_DIALECT >= 2014 #include #include diff --git a/thrust/system/cuda/detail/async/exclusive_scan.h b/thrust/system/cuda/detail/async/exclusive_scan.h new file mode 100644 index 0000000000..3ce27d2e11 --- /dev/null +++ b/thrust/system/cuda/detail/async/exclusive_scan.h @@ -0,0 +1,199 @@ +/****************************************************************************** + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +// TODO specialize for thrust::plus to use e.g. ExclusiveSum instead of ExcScan +// - Note that thrust::plus<> is transparent, cub::Sum is not. This should be +// fixed in CUB first). +// - Need to check if CUB actually optimizes for sums before putting in effort + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template +auto async_exclusive_scan_n(execution_policy& policy, + ForwardIt first, + Size n, + OutputIt out, + InitialValueType init, + BinaryOp op) + -> unique_eager_event +{ + using Dispatch32 = cub::DispatchScan; + using Dispatch64 = cub::DispatchScan; + + auto const device_alloc = get_async_device_allocator(policy); + unique_eager_event ev; + + // Determine temporary device storage requirements. + cudaError_t status; + size_t tmp_size = 0; + { + THRUST_INDEX_TYPE_DISPATCH2(status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + n, + (nullptr, + tmp_size, + first, + out, + op, + init, + n_fixed, + nullptr, + THRUST_DEBUG_SYNC_FLAG)); + thrust::cuda_cub::throw_on_error(status, + "after determining tmp storage " + "requirements for exclusive_scan"); + } + + // Allocate temporary storage. + auto content = uninitialized_allocate_unique_n( + device_alloc, tmp_size + ); + void* const tmp_ptr = raw_pointer_cast(content.get()); + + // Set up stream with dependencies. + cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy); + + if (thrust::cuda_cub::default_stream() != user_raw_stream) + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple( + std::move(content), + unique_stream(nonowning, user_raw_stream) + ), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + else + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple(std::move(content)), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + + // Run scan. + { + THRUST_INDEX_TYPE_DISPATCH2(status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + n, + (tmp_ptr, + tmp_size, + first, + out, + op, + init, + n_fixed, + user_raw_stream, + THRUST_DEBUG_SYNC_FLAG)); + thrust::cuda_cub::throw_on_error(status, + "after dispatching exclusive_scan kernel"); + } + + return std::move(ev); +} + +}}} // namespace system::cuda::detail + +namespace cuda_cub +{ + +// ADL entry point. +template +auto async_exclusive_scan(execution_policy& policy, + ForwardIt first, + Sentinel&& last, + OutputIt&& out, + InitialValueType &&init, + BinaryOp&& op) +THRUST_RETURNS( + thrust::system::cuda::detail::async_exclusive_scan_n( + policy, + first, + distance(first, THRUST_FWD(last)), + THRUST_FWD(out), + THRUST_FWD(init), + THRUST_FWD(op) + ) +) + +} // namespace cuda_cub + +} // namespace thrust + +#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#endif // C++14 + diff --git a/thrust/system/cuda/detail/async/inclusive_scan.h b/thrust/system/cuda/detail/async/inclusive_scan.h new file mode 100644 index 0000000000..1600e15fab --- /dev/null +++ b/thrust/system/cuda/detail/async/inclusive_scan.h @@ -0,0 +1,195 @@ +/****************************************************************************** + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include + +#if THRUST_CPP_DIALECT >= 2014 + +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +// TODO specialize for thrust::plus to use e.g. InclusiveSum instead of IncScan +// - Note that thrust::plus<> is transparent, cub::Sum is not. This should be +// fixed in CUB first). +// - Need to check if CUB actually optimizes for sums before putting in effort + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template +auto async_inclusive_scan_n(execution_policy& policy, + ForwardIt first, + Size n, + OutputIt out, + BinaryOp op) + -> unique_eager_event +{ + using Dispatch32 = cub::DispatchScan; + + using Dispatch64 = cub::DispatchScan; + + auto const device_alloc = get_async_device_allocator(policy); + unique_eager_event ev; + + // Determine temporary device storage requirements. + cudaError_t status; + size_t tmp_size = 0; + { + THRUST_INDEX_TYPE_DISPATCH2(status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + n, + (nullptr, + tmp_size, + first, + out, + op, + cub::NullType{}, + n_fixed, + nullptr, + THRUST_DEBUG_SYNC_FLAG)); + thrust::cuda_cub::throw_on_error(status, + "after determining tmp storage " + "requirements for inclusive_scan"); + } + + // Allocate temporary storage. + auto content = uninitialized_allocate_unique_n( + device_alloc, tmp_size + ); + void* const tmp_ptr = raw_pointer_cast(content.get()); + + // Set up stream with dependencies. + cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy); + + if (thrust::cuda_cub::default_stream() != user_raw_stream) + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple( + std::move(content), + unique_stream(nonowning, user_raw_stream) + ), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + else + { + ev = make_dependent_event( + std::tuple_cat( + std::make_tuple(std::move(content)), + extract_dependencies(std::move(thrust::detail::derived_cast(policy))))); + } + + // Run scan. + { + THRUST_INDEX_TYPE_DISPATCH2(status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + n, + (tmp_ptr, + tmp_size, + first, + out, + op, + cub::NullType{}, + n_fixed, + user_raw_stream, + THRUST_DEBUG_SYNC_FLAG)); + thrust::cuda_cub::throw_on_error(status, + "after dispatching inclusive_scan kernel"); + } + + return std::move(ev); +} + +}}} // namespace system::cuda::detail + +namespace cuda_cub +{ + +// ADL entry point. +template +auto async_inclusive_scan(execution_policy& policy, + ForwardIt first, + Sentinel&& last, + OutputIt&& out, + BinaryOp&& op) +THRUST_RETURNS( + thrust::system::cuda::detail::async_inclusive_scan_n( + policy, + first, + distance(first, THRUST_FWD(last)), + THRUST_FWD(out), + THRUST_FWD(op) + ) +) + +} // namespace cuda_cub + +} // namespace thrust + +#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#endif // C++14 + diff --git a/thrust/system/cuda/detail/async/scan.h b/thrust/system/cuda/detail/async/scan.h new file mode 100644 index 0000000000..7d993e6641 --- /dev/null +++ b/thrust/system/cuda/detail/async/scan.h @@ -0,0 +1,33 @@ +/****************************************************************************** + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include + +#include +#include diff --git a/thrust/system/detail/adl/async/scan.h b/thrust/system/detail/adl/async/scan.h new file mode 100644 index 0000000000..a2a90618b4 --- /dev/null +++ b/thrust/system/detail/adl/async/scan.h @@ -0,0 +1,34 @@ +/* + * Copyright 2008-2020 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// The purpose of this header is to #include the async/scan.h header of the +// sequential, host, and device systems. It should be #included in any code +// which uses ADL to dispatch async scans. + +#pragma once + +#include + +//#include + +//#define __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/scan.h> +//#include __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER +//#undef __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER + +#define __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/scan.h> +#include __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER +#undef __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER + diff --git a/thrust/system/detail/generic/scan.inl b/thrust/system/detail/generic/scan.inl index 300b697b26..83d272c3e9 100644 --- a/thrust/system/detail/generic/scan.inl +++ b/thrust/system/detail/generic/scan.inl @@ -61,9 +61,7 @@ __host__ __device__ { // Use the input iterator's value type per https://wg21.link/P0571 using ValueType = typename thrust::iterator_value::type; - - // assume 0 as the initialization value - return thrust::exclusive_scan(exec, first, last, result, ValueType(0)); + return thrust::exclusive_scan(exec, first, last, result, ValueType{}); } // end exclusive_scan()