diff --git a/dependencies/cub b/dependencies/cub
index a39e385cc6..f88e0c7fe3 160000
--- a/dependencies/cub
+++ b/dependencies/cub
@@ -1 +1 @@
-Subproject commit a39e385cc6be20754f859dd266021ab1d88459d3
+Subproject commit f88e0c7fe39db3c6161ea4bbcc7228d30e12410f
diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt
index fdfc04e97b..244d839fed 100644
--- a/testing/CMakeLists.txt
+++ b/testing/CMakeLists.txt
@@ -151,6 +151,7 @@ foreach(thrust_target IN LISTS THRUST_TARGETS)
 endforeach()
 
 # Add specialized tests:
+add_subdirectory(async)
 add_subdirectory(cpp)
 add_subdirectory(cuda)
 add_subdirectory(omp)
diff --git a/testing/async/CMakeLists.txt b/testing/async/CMakeLists.txt
new file mode 100644
index 0000000000..00d50f097f
--- /dev/null
+++ b/testing/async/CMakeLists.txt
@@ -0,0 +1,80 @@
+# The async tests perform a large amount of codegen, making them expensive to
+# build and test. To keep compilation and runtimes manageable, the tests are
+# broken up into many files per algorithm to enable parallelism during
+# compilation and testing. The structure of these test directories are:
+#
+# thrust/testing/async/<algorithm_name>/<unit_test>.cu
+#
+# These generate executables and CTest tests named
+# ${config_prefix}.test.async.<algorithm_name>.<unit_test>.
+
+# The async tests only support CUDA enabled configs. Create a list of valid
+# thrust targets:
+set(cuda_configs)
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  if (config_device STREQUAL CUDA)
+    list(APPEND cuda_configs ${thrust_target})
+  endif()
+endforeach()
+
+list(LENGTH cuda_configs num_cuda_configs)
+if (num_cuda_configs EQUAL 0)
+  return() # No valid configs found, nothing to do.
+endif()
+
+# Process a single algorithm directory, adding all .cu/cpp files as tests for
+# each valid backend. algo_name is the name of the subdir (<algorithm_name>
+# above) and is used for naming the executable/targets.
+function(thrust_add_async_test_dir algo_name)
+  file(GLOB test_srcs
+    RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+    CONFIGURE_DEPENDS
+    "${algo_name}/*.cu"
+    "${algo_name}/*.cpp"
+  )
+
+  # Per-algorithm, all-config metatarget: thrust.all.test.async.[algo].all
+  set(algo_meta_target thrust.all.test.async.${algo_name}.all)
+  add_custom_target(${algo_meta_target})
+
+  foreach(thrust_target IN LISTS cuda_configs)
+    thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+
+    # Per-algorithm, per-config metatarget: thrust.[config].test.async.[algo].all
+    set(algo_config_meta_target ${config_prefix}.test.async.${algo_name}.all)
+    add_custom_target(${algo_config_meta_target})
+    add_dependencies(${algo_meta_target} ${algo_config_meta_target})
+
+    foreach(test_src IN LISTS test_srcs)
+      get_filename_component(test_name "${test_src}" NAME_WLE)
+      string(PREPEND test_name async.${algo_name}.)
+
+      thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})
+      if(THRUST_ENABLE_TESTS_WITH_RDC)
+        thrust_enable_rdc_for_cuda_target(${test_target})
+      endif()
+
+      add_dependencies(${algo_config_meta_target} ${test_target})
+    endforeach()
+  endforeach()
+endfunction()
+
+# Grab all algorithm subdirectories:
+set(test_dirs)
+file(GLOB contents
+  CONFIGURE_DEPENDS
+  "${CMAKE_CURRENT_LIST_DIR}/*"
+)
+
+foreach(test_dir IN LISTS contents)
+  if(IS_DIRECTORY "${test_dir}")
+    list(APPEND test_dirs "${test_dir}")
+  endif()
+endforeach()
+
+# Process all test dirs:
+foreach(test_dir IN LISTS test_dirs)
+  get_filename_component(algo_name "${test_dir}" NAME_WLE)
+  thrust_add_async_test_dir(${algo_name})
+endforeach()
diff --git a/testing/async/exclusive_scan/basic.cu b/testing/async/exclusive_scan/basic.cu
new file mode 100644
index 0000000000..3fcd3c4a4b
--- /dev/null
+++ b/testing/async/exclusive_scan/basic.cu
@@ -0,0 +1,72 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename initial_value_type  = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct basic_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<initial_value_type, alternate_binary_op>
+    , testing::async::exclusive_scan::mixin::invoke_reference::
+        host_synchronous<input_value_type, output_value_type>
+    , testing::async::exclusive_scan::mixin::invoke_async::basic
+    , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet
+{
+  static std::string description()
+  {
+    return "basic invocation with device vectors";
+  }
+};
+
+template <typename T>
+struct TestBasic
+{
+  void operator()(std::size_t num_values) const
+  {
+    testing::async::test_policy_overloads<basic_invoker<T>>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestBasic, NumericTypes);
+
+// Testing the in-place algorithm uses the exact same instantiations of the
+// underlying scan implementation as above. Test them here to avoid compiling
+// them twice.
+template <typename input_value_type,
+          typename initial_value_type  = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct basic_inplace_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector_reuse_input<input_value_type>
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<initial_value_type, alternate_binary_op>
+    , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous<
+        input_value_type>
+    , testing::async::exclusive_scan::mixin::invoke_async::basic
+    , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet
+{
+  static std::string description()
+  {
+    return "basic in-place invocation with device vectors";
+  }
+};
+
+template <typename T>
+struct TestBasicInPlace
+{
+  void operator()(std::size_t num_values) const
+  {
+    using invoker = basic_inplace_invoker<T>;
+    testing::async::test_policy_overloads<invoker>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestBasicInPlace, NumericTypes);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/counting_iterator.cu b/testing/async/exclusive_scan/counting_iterator.cu
new file mode 100644
index 0000000000..d056c6e782
--- /dev/null
+++ b/testing/async/exclusive_scan/counting_iterator.cu
@@ -0,0 +1,46 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+#include <algorithm>
+#include <limits>
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename initial_value_type  = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct invoker
+    : testing::async::mixin::input::counting_iterator_0<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<initial_value_type, alternate_binary_op>
+    , testing::async::exclusive_scan::mixin::invoke_reference::
+        host_synchronous<input_value_type, output_value_type>
+    , testing::async::exclusive_scan::mixin::invoke_async::basic
+    , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet
+{
+  static std::string description()
+  {
+    return "fancy input iterator (counting_iterator)";
+  }
+};
+
+template <typename T>
+struct TestCountingIterator
+{
+  void operator()(std::size_t num_values) const
+  {
+    num_values = unittest::truncate_to_max_representable<T>(num_values);
+    testing::async::test_policy_overloads<invoker<T>>::run(num_values);
+  }
+};
+// Use built-in types only, counting_iterator doesn't seem to be compatible with
+// the custom_numeric.
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestCountingIterator,
+                                          BuiltinNumericTypes);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/discard_output.cu b/testing/async/exclusive_scan/discard_output.cu
new file mode 100644
index 0000000000..692fb35b3d
--- /dev/null
+++ b/testing/async/exclusive_scan/discard_output.cu
@@ -0,0 +1,38 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+// Compilation test with discard iterators. No runtime validation is actually
+// performed, other than testing whether the algorithm completes without
+// exception.
+
+template <typename input_value_type,
+          typename initial_value_type  = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct discard_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::discard_iterator
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<initial_value_type, alternate_binary_op>
+    , testing::async::mixin::invoke_reference::noop
+    , testing::async::exclusive_scan::mixin::invoke_async::basic
+    , testing::async::mixin::compare_outputs::noop
+{
+  static std::string description() { return "discard output"; }
+};
+
+template <typename T>
+struct TestDiscard
+{
+  void operator()(std::size_t num_values) const
+  {
+    testing::async::test_policy_overloads<discard_invoker<T>>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestDiscard, NumericTypes);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/large_indices.cu b/testing/async/exclusive_scan/large_indices.cu
new file mode 100644
index 0000000000..0d5a7aaa24
--- /dev/null
+++ b/testing/async/exclusive_scan/large_indices.cu
@@ -0,0 +1,239 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+#include <thrust/device_free.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_ptr.h>
+#include <thrust/optional.h>
+
+#include <thrust/iterator/detail/device_system_tag.h>
+#include <thrust/iterator/detail/iterator_facade_category.h>
+
+#include <cstdint>
+
+// This test is an adaptation of TestInclusiveScanWithBigIndices from scan.cu.
+
+namespace
+{
+
+// Fake iterator that asserts
+// (a) it is written with a sequence and
+// (b) a defined maximum value is written at some point
+struct assert_sequence_iterator
+{
+  using value_type      = std::int64_t;
+  using difference_type = std::int64_t;
+
+  // Defined for thrust::iterator_traits:
+  using pointer           = value_type*;
+  using reference         = assert_sequence_iterator; // weird but convenient
+  using iterator_category =
+    typename thrust::detail::iterator_facade_category<
+      thrust::device_system_tag,
+      thrust::random_access_traversal_tag,
+      value_type,
+      reference>::type;
+
+  std::int64_t expected{0};
+  std::int64_t max{0};
+  mutable thrust::device_ptr<bool> found_max{nullptr};
+  mutable thrust::device_ptr<bool> unexpected_value{nullptr};
+
+  // Should be called on the first iterator generated.
+  void initialize_shared_state()
+  {
+    found_max        = thrust::device_malloc<bool>(1);
+    unexpected_value = thrust::device_malloc<bool>(1);
+    *found_max        = false;
+    *unexpected_value = false;
+  }
+
+  // Should be called only once on the initialized iterator.
+  void free_shared_state() const
+  {
+    thrust::device_free(found_max);
+    thrust::device_free(unexpected_value);
+    found_max        = nullptr;
+    unexpected_value = nullptr;
+  }
+
+  __host__ __device__ assert_sequence_iterator operator+(difference_type i) const
+  {
+    return clone(expected + i);
+  }
+
+  __host__ __device__ reference operator[](difference_type i) const
+  {
+    return clone(expected + i);
+  }
+
+  // Some weirdness, this iterator acts like its own reference
+  __device__ assert_sequence_iterator operator=(value_type val)
+  {
+    if (val != expected)
+    {
+      printf("Error: expected %lld, got %lld\n", expected, val);
+      *unexpected_value = true;
+    }
+    else if (val == max)
+    {
+      *found_max = true;
+    }
+
+    return *this;
+  }
+
+private:
+  __host__ __device__
+  assert_sequence_iterator clone(value_type new_expected) const
+  {
+    return {new_expected, max, found_max, unexpected_value};
+  }
+};
+
+// output mixin that generates assert_sequence_iterators.
+// Must be paired with validate_assert_sequence_iterators mixin to free
+// shared state.
+struct assert_sequence_output
+{
+  struct output_type
+  {
+    using iterator = assert_sequence_iterator;
+
+    iterator iter;
+
+    explicit output_type(iterator&& it)
+        : iter{std::move(it)}
+    {
+      iter.initialize_shared_state();
+    }
+
+    ~output_type()
+    {
+      iter.free_shared_state();
+    }
+
+    iterator begin() { return iter; }
+  };
+
+  template <typename InputType>
+  static output_type generate_output(std::size_t num_values, InputType&)
+  {
+    using value_type = typename assert_sequence_iterator::value_type;
+    assert_sequence_iterator it{0,
+                                // minus one bc exclusive scan:
+                                static_cast<value_type>(num_values - 1),
+                                nullptr,
+                                nullptr};
+    return output_type{std::move(it)};
+  }
+};
+
+struct validate_assert_sequence_iterators
+{
+  using output_t = assert_sequence_output::output_type;
+
+  template <typename EventType>
+  static void compare_outputs(EventType& e,
+                              output_t const&,
+                              output_t const& test)
+  {
+    testing::async::mixin::compare_outputs::detail::basic_event_validation(e);
+
+    ASSERT_EQUAL(*test.iter.unexpected_value, false);
+    ASSERT_EQUAL(*test.iter.found_max, true);
+  }
+};
+
+//------------------------------------------------------------------------------
+// Overloads without custom binary operators use thrust::plus<>, so use
+// constant input iterator to generate the output sequence:
+struct default_bin_op_overloads
+{
+  using postfix_args_type = std::tuple< // List any extra arg overloads:
+    std::tuple<>,                       // - no extra args
+    std::tuple<uint64_t>                // - initial_value
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return {{}, {0}};
+  }
+};
+
+struct default_bin_op_invoker
+    : testing::async::mixin::input::constant_iterator_1<std::int64_t>
+    , assert_sequence_output
+    , default_bin_op_overloads
+    , testing::async::mixin::invoke_reference::noop
+    , testing::async::exclusive_scan::mixin::invoke_async::basic
+    , validate_assert_sequence_iterators
+{
+  static std::string description()
+  {
+    return "test large array indices with default binary operator";
+  }
+};
+
+} // anon namespace
+
+void TestLargeIndicesDefaultScanOp()
+{
+  // Test problem sizes around signed/unsigned int max:
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 30);
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 31);
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 32);
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 33);
+}
+DECLARE_UNITTEST(TestLargeIndicesDefaultScanOp);
+
+namespace
+{
+
+//------------------------------------------------------------------------------
+// Generate the output sequence using counting iterators and thrust::max<> for
+// custom operator overloads.
+struct custom_bin_op_overloads
+{
+  using postfix_args_type = std::tuple<     // List any extra arg overloads:
+    std::tuple<uint64_t, thrust::maximum<>> // - initial_value, binop
+  >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return {{0, {}}};
+  }
+};
+
+struct custom_bin_op_invoker
+  : testing::async::mixin::input::counting_iterator_1<std::int64_t>
+    , assert_sequence_output
+    , custom_bin_op_overloads
+    , testing::async::mixin::invoke_reference::noop
+    , testing::async::exclusive_scan::mixin::invoke_async::basic
+    , validate_assert_sequence_iterators
+{
+  static std::string description()
+  {
+    return "test large array indices with custom binary operator";
+  }
+};
+
+} // namespace
+
+void TestLargeIndicesCustomScanOp()
+{
+  // Test problem sizes around signed/unsigned int max:
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 30);
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 31);
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 32);
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 33);
+}
+DECLARE_UNITTEST(TestLargeIndicesCustomScanOp);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/large_types.cu b/testing/async/exclusive_scan/large_types.cu
new file mode 100644
index 0000000000..90bfdc683d
--- /dev/null
+++ b/testing/async/exclusive_scan/large_types.cu
@@ -0,0 +1,63 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+#include <unittest/special_types.h>
+
+// This test is an adaptation of TestScanWithLargeTypes from scan.cu.
+
+// Need special initialization for the FixedVector type:
+template <typename value_type>
+struct device_vector_fill
+{
+  using input_type = thrust::device_vector<value_type>;
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    input_type input(num_values);
+    thrust::fill(input.begin(), input.end(), value_type{2});
+    return input;
+  }
+};
+
+template <typename value_type, typename alternate_binary_op = thrust::maximum<>>
+struct invoker
+    : device_vector_fill<value_type>
+    , testing::async::mixin::output::device_vector<value_type>
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<value_type, alternate_binary_op>
+    , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous<
+        value_type>
+    , testing::async::exclusive_scan::mixin::invoke_async::basic
+    , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet
+{
+  static std::string description()
+  {
+    return "scan with large value types.";
+  }
+};
+
+struct TestLargeTypes
+{
+  void operator()(std::size_t num_values) const
+  {
+    using testing::async::test_policy_overloads;
+
+    test_policy_overloads<invoker<FixedVector<int, 1>>>::run(num_values);
+
+#ifndef __QNX__ // These are excluded on QNX for scan.cu's version of this test.
+    test_policy_overloads<invoker<FixedVector<int, 8>>>::run(num_values);
+    test_policy_overloads<invoker<FixedVector<int, 32>>>::run(num_values);
+    test_policy_overloads<invoker<FixedVector<int, 64>>>::run(num_values);
+#else
+    KNOWN_FAILURE;
+#endif
+  }
+};
+DECLARE_UNITTEST(TestLargeTypes);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/mixed_types.cu b/testing/async/exclusive_scan/mixed_types.cu
new file mode 100644
index 0000000000..174880b246
--- /dev/null
+++ b/testing/async/exclusive_scan/mixed_types.cu
@@ -0,0 +1,105 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+// Test using mixed int/float types for:
+// - input_value_type       | (int, float)
+// - output_value_type      | (int, float)
+// - initial_value_type     | (int, float, <none>)
+// - thrust::plus<T> T-type | (int, float, void>)
+//
+// The initial_value_type and thrust::plus<T> types are covered by the
+// mixin::postfix_args::scan_mixed_types_overloads component.
+//
+// The testing/scan.cu TestMixedTypes test spells out the expected behavior,
+// which is defined by https://wg21.link/P0571.
+
+namespace
+{
+
+template <typename value_type>
+struct mixed_type_input_generator
+{
+  using input_type = thrust::device_vector<value_type>;
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    input_type input(num_values);
+    thrust::sequence(input.begin(),
+                     input.end(),
+                     // fractional values are chosen deliberately to test
+                     // casting orders and accumulator types:
+                     static_cast<value_type>(1.5),
+                     static_cast<value_type>(1));
+    return input;
+  }
+};
+
+// A fractional value is used to ensure that a different result is obtained when
+// using float vs. int.
+template <typename value_type>
+struct mixed_types_postfix_args
+{
+  using postfix_args_type = std::tuple<         // Overloads to test:
+    std::tuple<>,                               // - no extra args
+    std::tuple<value_type>,                     // - initial_value
+    std::tuple<value_type, thrust::plus<>>,     // - initial_value, plus<>
+    std::tuple<value_type, thrust::plus<int>>,  // - initial_value, plus<int>
+    std::tuple<value_type, thrust::plus<float>> // - initial_value, plus<float>
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return {{},
+            {static_cast<value_type>(5.5)},
+            {static_cast<value_type>(5.5), thrust::plus<>{}},
+            {static_cast<value_type>(5.5), thrust::plus<int>{}},
+            {static_cast<value_type>(5.5), thrust::plus<float>{}}};
+  }
+};
+
+template <typename input_value_type,
+          typename output_value_type,
+          typename initial_value_type>
+struct invoker
+    : mixed_type_input_generator<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , mixed_types_postfix_args<initial_value_type>
+    , testing::async::exclusive_scan::mixin::invoke_reference::
+        host_synchronous<input_value_type, output_value_type>
+    , testing::async::exclusive_scan::mixin::invoke_async::basic
+    , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet
+{
+  static std::string description()
+  {
+    return "mixed input/output/initial type tests";
+  }
+};
+
+} // namespace
+
+void TestScanMixedTypes()
+{
+  // 10 values are enough to check the behavior we want to test while staying
+  // small enough to reason about.
+  constexpr std::size_t num_values = 10;
+
+  // invoker template params are input_value_type, output_vt, initial_vt:
+  using testing::async::test_policy_overloads;
+  test_policy_overloads<invoker<int, int, int>>::run(num_values);
+  test_policy_overloads<invoker<int, int, float>>::run(num_values);
+  test_policy_overloads<invoker<int, float, int>>::run(num_values);
+  test_policy_overloads<invoker<int, float, float>>::run(num_values);
+  test_policy_overloads<invoker<float, int, int>>::run(num_values);
+  test_policy_overloads<invoker<float, int, float>>::run(num_values);
+  test_policy_overloads<invoker<float, float, int>>::run(num_values);
+  // We all float down here
+  test_policy_overloads<invoker<float, float, float>>::run(num_values);
+}
+DECLARE_UNITTEST(TestScanMixedTypes);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/mixin.h b/testing/async/exclusive_scan/mixin.h
new file mode 100644
index 0000000000..719e7dabaf
--- /dev/null
+++ b/testing/async/exclusive_scan/mixin.h
@@ -0,0 +1,116 @@
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/async/scan.h>
+
+#include <thrust/scan.h>
+
+#include <async/mixin.h>
+
+namespace testing
+{
+namespace async
+{
+namespace exclusive_scan
+{
+
+namespace mixin
+{
+
+//------------------------------------------------------------------------------
+namespace postfix_args
+{
+
+template <typename value_type, typename alternate_binary_op = thrust::maximum<>>
+struct all_overloads
+{
+  using postfix_args_type = std::tuple<         // List any extra arg overloads:
+    std::tuple<>,                               // - no extra args
+    std::tuple<value_type>,                     // - initial_value
+    std::tuple<value_type, alternate_binary_op> // - initial_value, binary_op
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return {{}, {42}, {42, alternate_binary_op{}}};
+  }
+};
+
+} // namespace postfix_args
+
+//------------------------------------------------------------------------------
+namespace invoke_reference
+{
+
+template <typename input_value_type,
+          typename output_value_type = input_value_type>
+struct host_synchronous
+{
+  template <typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static void invoke_reference(InputType const& input,
+                               OutputType& output,
+                               PostfixArgTuple&& postfix_tuple,
+                               std::index_sequence<PostfixArgIndices...>)
+  {
+    // Create host versions of the input/output:
+    thrust::host_vector<input_value_type> host_input(input.cbegin(),
+                                                     input.cend());
+    thrust::host_vector<output_value_type> host_output(host_input.size());
+
+    // Run host synchronous algorithm to generate reference.
+    thrust::exclusive_scan(host_input.cbegin(),
+                           host_input.cend(),
+                           host_output.begin(),
+                           std::get<PostfixArgIndices>(
+                             THRUST_FWD(postfix_tuple))...);
+
+    // Copy back to device.
+    output = host_output;
+  }
+};
+
+} // namespace invoke_reference
+
+//------------------------------------------------------------------------------
+namespace invoke_async
+{
+
+struct basic
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const& input,
+                           OutputType& output,
+                           PostfixArgTuple&& postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    auto e = thrust::async::exclusive_scan(
+      std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+      input.cbegin(),
+      input.cend(),
+      output.begin(),
+      std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+} // namespace invoke_async
+
+} // namespace mixin
+} // namespace exclusive_scan
+} // namespace async
+} // namespace testing
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/stateful_operator.cu b/testing/async/exclusive_scan/stateful_operator.cu
new file mode 100644
index 0000000000..34b4adde14
--- /dev/null
+++ b/testing/async/exclusive_scan/stateful_operator.cu
@@ -0,0 +1,60 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+namespace
+{
+
+// Custom binary operator for scan:
+template <typename T>
+struct stateful_operator
+{
+  T offset;
+
+  __host__ __device__ T operator()(T v1, T v2) { return v1 + v2 + offset; }
+};
+
+// Postfix args overload definition that uses a stateful custom binary operator
+template <typename value_type>
+struct use_stateful_operator
+{
+  using postfix_args_type = std::tuple<                   // Single overload:
+    std::tuple<value_type, stateful_operator<value_type>> // init_val, bin_op
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return {{value_type{42}, {value_type{2}}}};
+  }
+};
+
+template <typename value_type>
+struct invoker
+    : testing::async::mixin::input::device_vector<value_type>
+    , testing::async::mixin::output::device_vector<value_type>
+    , use_stateful_operator<value_type>
+    , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous<
+        value_type>
+    , testing::async::exclusive_scan::mixin::invoke_async::basic
+    , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet
+{
+  static std::string description() { return "scan with stateful operator"; }
+};
+
+} // namespace
+
+template <typename T>
+struct TestStatefulOperator
+{
+  void operator()(std::size_t num_values) const
+  {
+    testing::async::test_policy_overloads<invoker<T>>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestStatefulOperator, NumericTypes);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/using_vs_adl.cu b/testing/async/exclusive_scan/using_vs_adl.cu
new file mode 100644
index 0000000000..4415cfda44
--- /dev/null
+++ b/testing/async/exclusive_scan/using_vs_adl.cu
@@ -0,0 +1,171 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+// Verify what happens when calling the algorithm without any namespace
+// qualifiers:
+// - If the async entry point is available in the global namespace due to a
+//   using statement, the async algorithm should be called.
+// - Otherwise, ADL should resolve the call to the synchronous algo in the
+//   thrust:: namespace.
+
+namespace invoke_reference
+{
+
+template <typename input_value_type,
+          typename output_value_type = input_value_type>
+struct adl_host_synchronous
+{
+  template <typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static void invoke_reference(InputType const& input,
+                               OutputType& output,
+                               PostfixArgTuple&& postfix_tuple,
+                               std::index_sequence<PostfixArgIndices...>)
+  {
+    // Create host versions of the input/output:
+    thrust::host_vector<input_value_type> host_input(input.cbegin(),
+                                                     input.cend());
+    thrust::host_vector<output_value_type> host_output(host_input.size());
+
+    using OutIter = thrust::remove_cvref_t<decltype(host_output.begin())>;
+
+    // ADL should resolve this to the synchronous `thrust::` algorithm.
+    // This is checked by ensuring that the call returns an output iterator.
+    OutIter result =
+      exclusive_scan(host_input.cbegin(),
+                     host_input.cend(),
+                     host_output.begin(),
+                     std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    (void)result;
+
+    // Copy back to device.
+    output = host_output;
+  }
+};
+
+} // namespace invoke_reference
+
+namespace invoke_async
+{
+
+struct using_namespace
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const& input,
+                           OutputType& output,
+                           PostfixArgTuple&& postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    // Importing the CPO into the current namespace should unambiguously resolve
+    // this call to the CPO, as opposed to resolving to the thrust:: algorithm
+    // via ADL. This is verified by checking that an event is returned.
+    using namespace thrust::async;
+    thrust::device_event e =
+      exclusive_scan(std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+                     input.cbegin(),
+                     input.cend(),
+                     output.begin(),
+                     std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+struct using_cpo
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const& input,
+                           OutputType& output,
+                           PostfixArgTuple&& postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    // Importing the CPO into the current namespace should unambiguously resolve
+    // this call to the CPO, as opposed to resolving to the thrust:: algorithm
+    // via ADL. This is verified by checking that an event is returned.
+    using thrust::async::exclusive_scan;
+    thrust::device_event e =
+      exclusive_scan(std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+                     input.cbegin(),
+                     input.cend(),
+                     output.begin(),
+                     std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+} // namespace invoke_async
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename initial_value_type  = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct using_namespace_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<initial_value_type, alternate_binary_op>
+    , invoke_reference::adl_host_synchronous<input_value_type, output_value_type>
+    , invoke_async::using_namespace
+    , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet
+{
+  static std::string description()
+  {
+    return "importing async CPO with `using namespace thrust::async`";
+  }
+};
+
+void TestUsingNamespace()
+{
+  using invoker = using_namespace_invoker<int>;
+  testing::async::test_policy_overloads<invoker>::run(128);
+}
+DECLARE_UNITTEST(TestUsingNamespace);
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename initial_value_type  = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct using_cpo_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<initial_value_type, alternate_binary_op>
+    , invoke_reference::adl_host_synchronous<input_value_type, output_value_type>
+    , invoke_async::using_cpo
+    , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet
+{
+  static std::string description()
+  {
+    return "importing async CPO with "
+           "`using namespace thrust::async::exclusive_scan`";
+  }
+};
+
+void TestUsingCPO()
+{
+  using invoker = using_cpo_invoker<int>;
+  testing::async::test_policy_overloads<invoker>::run(128);
+}
+DECLARE_UNITTEST(TestUsingCPO);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/basic.cu b/testing/async/inclusive_scan/basic.cu
new file mode 100644
index 0000000000..a4437c638e
--- /dev/null
+++ b/testing/async/inclusive_scan/basic.cu
@@ -0,0 +1,70 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct basic_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , testing::async::inclusive_scan::mixin::invoke_reference::
+        host_synchronous<input_value_type, output_value_type>
+    , testing::async::inclusive_scan::mixin::invoke_async::basic
+    , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet
+{
+  static std::string description()
+  {
+    return "basic invocation with device vectors";
+  }
+};
+
+template <typename T>
+struct TestBasic
+{
+  void operator()(std::size_t num_values) const
+  {
+    testing::async::test_policy_overloads<basic_invoker<T>>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestBasic, NumericTypes);
+
+// Testing the in-place algorithm uses the exact same instantiations of the
+// underlying scan implementation as above. Test them here to avoid compiling
+// them twice.
+template <typename input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct basic_inplace_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector_reuse_input<input_value_type>
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous<
+        input_value_type>
+    , testing::async::inclusive_scan::mixin::invoke_async::basic
+    , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet
+{
+  static std::string description()
+  {
+    return "basic in-place invocation with device vectors";
+  }
+};
+
+template <typename T>
+struct TestBasicInPlace
+{
+  void operator()(std::size_t num_values) const
+  {
+    using invoker = basic_inplace_invoker<T>;
+    testing::async::test_policy_overloads<invoker>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestBasicInPlace, NumericTypes);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/counting_iterator.cu b/testing/async/inclusive_scan/counting_iterator.cu
new file mode 100644
index 0000000000..7818bf018b
--- /dev/null
+++ b/testing/async/inclusive_scan/counting_iterator.cu
@@ -0,0 +1,45 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+#include <algorithm>
+#include <limits>
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct invoker
+    : testing::async::mixin::input::counting_iterator_0<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , testing::async::inclusive_scan::mixin::invoke_reference::
+        host_synchronous<input_value_type, output_value_type>
+    , testing::async::inclusive_scan::mixin::invoke_async::basic
+    , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet
+{
+  static std::string description()
+  {
+    return "fancy input iterator (counting_iterator)";
+  }
+};
+
+template <typename T>
+struct TestCountingIterator
+{
+  void operator()(std::size_t num_values) const
+  {
+    num_values = unittest::truncate_to_max_representable<T>(num_values);
+    testing::async::test_policy_overloads<invoker<T>>::run(num_values);
+  }
+};
+// Use built-in types only, counting_iterator doesn't seem to be compatible with
+// the custom_numeric.
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestCountingIterator,
+                                          BuiltinNumericTypes);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/discard_output.cu b/testing/async/inclusive_scan/discard_output.cu
new file mode 100644
index 0000000000..b310ee3a4d
--- /dev/null
+++ b/testing/async/inclusive_scan/discard_output.cu
@@ -0,0 +1,37 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+// Compilation test with discard iterators. No runtime validation is actually
+// performed, other than testing whether the algorithm completes without
+// exception.
+
+template <typename input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct discard_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::discard_iterator
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , testing::async::mixin::invoke_reference::noop
+    , testing::async::inclusive_scan::mixin::invoke_async::basic
+    , testing::async::mixin::compare_outputs::noop
+{
+  static std::string description() { return "discard output"; }
+};
+
+template <typename T>
+struct TestDiscard
+{
+  void operator()(std::size_t num_values) const
+  {
+    testing::async::test_policy_overloads<discard_invoker<T>>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestDiscard, NumericTypes);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/large_indices.cu b/testing/async/inclusive_scan/large_indices.cu
new file mode 100644
index 0000000000..a73d32e8b6
--- /dev/null
+++ b/testing/async/inclusive_scan/large_indices.cu
@@ -0,0 +1,228 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+#include <thrust/device_free.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_ptr.h>
+#include <thrust/optional.h>
+
+#include <thrust/iterator/detail/device_system_tag.h>
+#include <thrust/iterator/detail/iterator_facade_category.h>
+
+#include <cstdint>
+
+// This test is an adaptation of TestInclusiveScanWithBigIndices from scan.cu.
+
+namespace
+{
+
+// Fake iterator that asserts
+// (a) it is written with a sequence and
+// (b) a defined maximum value is written at some point
+struct assert_sequence_iterator
+{
+  using value_type      = std::int64_t;
+  using difference_type = std::int64_t;
+
+  // Defined for thrust::iterator_traits:
+  using pointer           = value_type *;
+  using reference         = assert_sequence_iterator; // weird but convenient
+  using iterator_category = typename thrust::detail::iterator_facade_category<
+    thrust::device_system_tag,
+    thrust::random_access_traversal_tag,
+    value_type,
+    reference>::type;
+
+  std::int64_t expected{0};
+  std::int64_t max{0};
+  mutable thrust::device_ptr<bool> found_max{nullptr};
+  mutable thrust::device_ptr<bool> unexpected_value{nullptr};
+
+  // Should be called on the first iterator generated.
+  void initialize_shared_state()
+  {
+    found_max         = thrust::device_malloc<bool>(1);
+    unexpected_value  = thrust::device_malloc<bool>(1);
+    *found_max        = false;
+    *unexpected_value = false;
+  }
+
+  // Should be called only once on the initialized iterator.
+  void free_shared_state() const
+  {
+    thrust::device_free(found_max);
+    thrust::device_free(unexpected_value);
+    found_max        = nullptr;
+    unexpected_value = nullptr;
+  }
+
+  __host__ __device__ assert_sequence_iterator operator+(difference_type i) const
+  {
+    return clone(expected + i);
+  }
+
+  __host__ __device__ reference operator[](difference_type i) const
+  {
+    return clone(expected + i);
+  }
+
+  // Some weirdness, this iterator acts like its own reference
+  __device__ assert_sequence_iterator operator=(value_type val)
+  {
+    if (val != expected)
+    {
+      printf("Error: expected %lld, got %lld\n", expected, val);
+
+      *unexpected_value = true;
+    }
+    else if (val == max)
+    {
+      *found_max = true;
+    }
+
+    return *this;
+  }
+
+private:
+  __host__ __device__ assert_sequence_iterator
+  clone(value_type new_expected) const
+  {
+    return {new_expected, max, found_max, unexpected_value};
+  }
+};
+
+// output mixin that generates assert_sequence_iterators.
+// Must be paired with validate_assert_sequence_iterators mixin to free
+// shared state.
+struct assert_sequence_output
+{
+  struct output_type
+  {
+    using iterator = assert_sequence_iterator;
+
+    iterator iter;
+
+    explicit output_type(iterator &&it)
+        : iter{std::move(it)}
+    {
+      iter.initialize_shared_state();
+    }
+
+    ~output_type() { iter.free_shared_state(); }
+
+    iterator begin() { return iter; }
+  };
+
+  template <typename InputType>
+  static output_type generate_output(std::size_t num_values, InputType &)
+  {
+    using value_type = typename assert_sequence_iterator::value_type;
+    assert_sequence_iterator it{1,
+                                static_cast<value_type>(num_values),
+                                nullptr,
+                                nullptr};
+    return output_type{std::move(it)};
+  }
+};
+
+struct validate_assert_sequence_iterators
+{
+  using output_t = assert_sequence_output::output_type;
+
+  template <typename EventType>
+  static void compare_outputs(EventType &e,
+                              output_t const &,
+                              output_t const &test)
+  {
+    testing::async::mixin::compare_outputs::detail::basic_event_validation(e);
+
+    ASSERT_EQUAL(*test.iter.unexpected_value, false);
+    ASSERT_EQUAL(*test.iter.found_max, true);
+  }
+};
+
+//------------------------------------------------------------------------------
+// Overloads without custom binary operators use thrust::plus<>, so use
+// constant input iterator to generate the output sequence:
+struct default_bin_op_overloads
+{
+  using postfix_args_type = std::tuple< // List any extra arg overloads:
+    std::tuple<>                        // - no extra args
+    >;
+
+  static postfix_args_type generate_postfix_args() { return {{}}; }
+};
+
+struct default_bin_op_invoker
+    : testing::async::mixin::input::constant_iterator_1<std::int64_t>
+    , assert_sequence_output
+    , default_bin_op_overloads
+    , testing::async::mixin::invoke_reference::noop
+    , testing::async::inclusive_scan::mixin::invoke_async::basic
+    , validate_assert_sequence_iterators
+{
+  static std::string description()
+  {
+    return "test large array indices with default binary operator";
+  }
+};
+
+} // end anon namespace
+
+void TestLargeIndicesDefaultScanOp()
+{
+  // Test problem sizes around signed/unsigned int max:
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 30);
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 31);
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 32);
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 33);
+}
+DECLARE_UNITTEST(TestLargeIndicesDefaultScanOp);
+
+namespace
+{
+
+//------------------------------------------------------------------------------
+// Generate the output sequence using counting iterators and thrust::max<> for
+// custom operator overloads.
+struct custom_bin_op_overloads
+{
+  using postfix_args_type = std::tuple< // List any extra arg overloads:
+    std::tuple<thrust::maximum<>>       // - custom binary op
+    >;
+
+  static postfix_args_type generate_postfix_args() { return {{{}}}; }
+};
+
+struct custom_bin_op_invoker
+    : testing::async::mixin::input::counting_iterator_1<std::int64_t>
+    , assert_sequence_output
+    , custom_bin_op_overloads
+    , testing::async::mixin::invoke_reference::noop
+    , testing::async::inclusive_scan::mixin::invoke_async::basic
+    , validate_assert_sequence_iterators
+{
+  static std::string description()
+  {
+    return "test large array indices with custom binary operator";
+  }
+};
+
+} // end anon namespace
+
+void TestLargeIndicesCustomScanOp()
+{
+  // Test problem sizes around signed/unsigned int max:
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 30);
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 31);
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 32);
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 33);
+}
+DECLARE_UNITTEST(TestLargeIndicesCustomScanOp);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/large_types.cu b/testing/async/inclusive_scan/large_types.cu
new file mode 100644
index 0000000000..f4bc196a65
--- /dev/null
+++ b/testing/async/inclusive_scan/large_types.cu
@@ -0,0 +1,63 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+#include <unittest/special_types.h>
+
+// This test is an adaptation of TestScanWithLargeTypes from scan.cu.
+
+// Need special initialization for the FixedVector type:
+template <typename value_type>
+struct device_vector_fill
+{
+  using input_type = thrust::device_vector<value_type>;
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    input_type input(num_values);
+    thrust::fill(input.begin(), input.end(), value_type{2});
+    return input;
+  }
+};
+
+template <typename value_type, typename alternate_binary_op = thrust::maximum<>>
+struct invoker
+    : device_vector_fill<value_type>
+    , testing::async::mixin::output::device_vector<value_type>
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous<
+        value_type>
+    , testing::async::inclusive_scan::mixin::invoke_async::basic
+    , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet
+{
+  static std::string description()
+  {
+    return "scan with large value types.";
+  }
+};
+
+struct TestLargeTypes
+{
+  void operator()(std::size_t num_values) const
+  {
+    using testing::async::test_policy_overloads;
+
+    test_policy_overloads<invoker<FixedVector<int, 1>>>::run(num_values);
+
+#ifndef __QNX__ // These are excluded on QNX for scan.cu's version of this test.
+    test_policy_overloads<invoker<FixedVector<int, 8>>>::run(num_values);
+    test_policy_overloads<invoker<FixedVector<int, 32>>>::run(num_values);
+    test_policy_overloads<invoker<FixedVector<int, 64>>>::run(num_values);
+#else
+    KNOWN_FAILURE;
+#endif
+  }
+};
+DECLARE_UNITTEST(TestLargeTypes);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/mixed_types.cu b/testing/async/inclusive_scan/mixed_types.cu
new file mode 100644
index 0000000000..733eddc9c6
--- /dev/null
+++ b/testing/async/inclusive_scan/mixed_types.cu
@@ -0,0 +1,95 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+// Test using mixed int/float types for:
+// - input_value_type       | (int, float)
+// - output_value_type      | (int, float)
+// - thrust::plus<T> T-type | (int, float, void>)
+//
+// The thrust::plus<T> types are covered by the
+// scan_mixed_types_overloads component.
+//
+// The testing/scan.cu TestMixedTypes test spells out the expected behavior,
+// which is defined by https://wg21.link/P0571.
+
+namespace
+{
+
+template <typename value_type>
+struct mixed_type_input_generator
+{
+  using input_type = thrust::device_vector<value_type>;
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    input_type input(num_values);
+    thrust::sequence(input.begin(),
+                     input.end(),
+                     // fractional values are chosen deliberately to test
+                     // casting orders and accumulator types:
+                     static_cast<value_type>(1.5),
+                     static_cast<value_type>(1));
+    return input;
+  }
+};
+
+// A fractional value is used to ensure that a different result is obtained when
+// using float vs. int.
+struct mixed_types_postfix_args
+{
+  using postfix_args_type = std::tuple<  // Overloads to test:
+    std::tuple<>,                        // - no extra args
+    std::tuple<thrust::plus<>>,          // - plus<>
+    std::tuple<thrust::plus<int>>,       // - plus<int>
+    std::tuple<thrust::plus<float>>      // - plus<float>
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return {{},
+            {thrust::plus<>{}},
+            {thrust::plus<int>{}},
+            {thrust::plus<float>{}}};
+  }
+};
+
+template <typename input_value_type,
+          typename output_value_type>
+struct invoker
+    : mixed_type_input_generator<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , mixed_types_postfix_args
+    , testing::async::inclusive_scan::mixin::invoke_reference::
+        host_synchronous<input_value_type, output_value_type>
+    , testing::async::inclusive_scan::mixin::invoke_async::basic
+    , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal
+{
+  static std::string description()
+  {
+    return "mixed input/output/functor value_type tests";
+  }
+};
+
+} // namespace
+
+void TestScanMixedTypes()
+{
+  // 10 values are enough to check the behavior we want to test while staying
+  // small enough to reason about during debugging.
+  constexpr std::size_t num_values = 10;
+
+  // invoker template params are input_value_type, output_vt:
+  using testing::async::test_policy_overloads;
+  test_policy_overloads<invoker<int, int>>::run(num_values);
+  test_policy_overloads<invoker<int, float>>::run(num_values);
+  test_policy_overloads<invoker<float, int>>::run(num_values);
+  test_policy_overloads<invoker<float, float>>::run(num_values);
+}
+DECLARE_UNITTEST(TestScanMixedTypes);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/mixin.h b/testing/async/inclusive_scan/mixin.h
new file mode 100644
index 0000000000..8431526f49
--- /dev/null
+++ b/testing/async/inclusive_scan/mixin.h
@@ -0,0 +1,115 @@
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/async/scan.h>
+
+#include <thrust/scan.h>
+
+#include <async/mixin.h>
+
+namespace testing
+{
+namespace async
+{
+namespace inclusive_scan
+{
+
+namespace mixin
+{
+
+//------------------------------------------------------------------------------
+namespace postfix_args
+{
+
+template <typename alternate_binary_op = thrust::maximum<>>
+struct all_overloads
+{
+  using postfix_args_type = std::tuple< // List any extra arg overloads:
+    std::tuple<>,                       // - no extra args
+    std::tuple<alternate_binary_op>     // - binary_op
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return {{}, {alternate_binary_op{}}};
+  }
+};
+
+} // namespace postfix_args
+
+//------------------------------------------------------------------------------
+namespace invoke_reference
+{
+
+template <typename input_value_type,
+          typename output_value_type = input_value_type>
+struct host_synchronous
+{
+  template <typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static void invoke_reference(InputType const& input,
+                               OutputType& output,
+                               PostfixArgTuple&& postfix_tuple,
+                               std::index_sequence<PostfixArgIndices...>)
+  {
+    // Create host versions of the input/output:
+    thrust::host_vector<input_value_type> host_input(input.cbegin(),
+                                                     input.cend());
+    thrust::host_vector<output_value_type> host_output(host_input.size());
+
+    // Run host synchronous algorithm to generate reference.
+    thrust::inclusive_scan(host_input.cbegin(),
+                           host_input.cend(),
+                           host_output.begin(),
+                           std::get<PostfixArgIndices>(
+                             THRUST_FWD(postfix_tuple))...);
+
+    // Copy back to device.
+    output = host_output;
+  }
+};
+
+} // namespace invoke_reference
+
+//------------------------------------------------------------------------------
+namespace invoke_async
+{
+
+struct basic
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const& input,
+                           OutputType& output,
+                           PostfixArgTuple&& postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    auto e = thrust::async::inclusive_scan(
+      std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+      input.cbegin(),
+      input.cend(),
+      output.begin(),
+      std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+} // namespace invoke_async
+
+} // namespace mixin
+} // namespace inclusive_scan
+} // namespace async
+} // namespace testing
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/stateful_operator.cu b/testing/async/inclusive_scan/stateful_operator.cu
new file mode 100644
index 0000000000..946cc4a940
--- /dev/null
+++ b/testing/async/inclusive_scan/stateful_operator.cu
@@ -0,0 +1,60 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+namespace
+{
+
+// Custom binary operator for scan:
+template <typename T>
+struct stateful_operator
+{
+  T offset;
+
+  __host__ __device__ T operator()(T v1, T v2) { return v1 + v2 + offset; }
+};
+
+// Postfix args overload definition that uses a stateful custom binary operator
+template <typename value_type>
+struct use_stateful_operator
+{
+  using postfix_args_type = std::tuple<       // Single overload:
+    std::tuple<stateful_operator<value_type>> // bin_op
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return {{{value_type{2}}}};
+  }
+};
+
+template <typename value_type>
+struct invoker
+    : testing::async::mixin::input::device_vector<value_type>
+    , testing::async::mixin::output::device_vector<value_type>
+    , use_stateful_operator<value_type>
+    , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous<
+        value_type>
+    , testing::async::inclusive_scan::mixin::invoke_async::basic
+    , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet
+{
+  static std::string description() { return "scan with stateful operator"; }
+};
+
+} // namespace
+
+template <typename T>
+struct TestStatefulOperator
+{
+  void operator()(std::size_t num_values) const
+  {
+    testing::async::test_policy_overloads<invoker<T>>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TestStatefulOperator, NumericTypes);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/using_vs_adl.cu b/testing/async/inclusive_scan/using_vs_adl.cu
new file mode 100644
index 0000000000..e7fca8b0a9
--- /dev/null
+++ b/testing/async/inclusive_scan/using_vs_adl.cu
@@ -0,0 +1,169 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+// Verify what happens when calling the algorithm without any namespace
+// qualifiers:
+// - If the async entry point is available in the global namespace due to a
+//   using statement, the async algorithm should be called.
+// - Otherwise, ADL should resolve the call to the synchronous algo in the
+//   thrust:: namespace.
+
+namespace invoke_reference
+{
+
+template <typename input_value_type,
+          typename output_value_type = input_value_type>
+struct adl_host_synchronous
+{
+  template <typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static void invoke_reference(InputType const& input,
+                               OutputType& output,
+                               PostfixArgTuple&& postfix_tuple,
+                               std::index_sequence<PostfixArgIndices...>)
+  {
+    // Create host versions of the input/output:
+    thrust::host_vector<input_value_type> host_input(input.cbegin(),
+                                                     input.cend());
+    thrust::host_vector<output_value_type> host_output(host_input.size());
+
+    using OutIter = thrust::remove_cvref_t<decltype(host_output.begin())>;
+
+    // ADL should resolve this to the synchronous `thrust::` algorithm.
+    // This is checked by ensuring that the call returns an output iterator.
+    OutIter result =
+      inclusive_scan(host_input.cbegin(),
+                     host_input.cend(),
+                     host_output.begin(),
+                     std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    (void)result;
+
+    // Copy back to device.
+    output = host_output;
+  }
+};
+
+} // namespace invoke_reference
+
+namespace invoke_async
+{
+
+struct using_namespace
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const& input,
+                           OutputType& output,
+                           PostfixArgTuple&& postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    // Importing the CPO into the current namespace should unambiguously resolve
+    // this call to the CPO, as opposed to resolving to the thrust:: algorithm
+    // via ADL. This is verified by checking that an event is returned.
+    using namespace thrust::async;
+    thrust::device_event e =
+      inclusive_scan(std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+                     input.cbegin(),
+                     input.cend(),
+                     output.begin(),
+                     std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+struct using_cpo
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const& input,
+                           OutputType& output,
+                           PostfixArgTuple&& postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    // Importing the CPO into the current namespace should unambiguously resolve
+    // this call to the CPO, as opposed to resolving to the thrust:: algorithm
+    // via ADL. This is verified by checking that an event is returned.
+    using thrust::async::inclusive_scan;
+    thrust::device_event e =
+      inclusive_scan(std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+                     input.cbegin(),
+                     input.cend(),
+                     output.begin(),
+                     std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+} // namespace invoke_async
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct using_namespace_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , invoke_reference::adl_host_synchronous<input_value_type, output_value_type>
+    , invoke_async::using_namespace
+    , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet
+{
+  static std::string description()
+  {
+    return "importing async CPO with `using namespace thrust::async`";
+  }
+};
+
+void TestUsingNamespace()
+{
+  using invoker = using_namespace_invoker<int>;
+  testing::async::test_policy_overloads<invoker>::run(128);
+}
+DECLARE_UNITTEST(TestUsingNamespace);
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct using_cpo_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , invoke_reference::adl_host_synchronous<input_value_type, output_value_type>
+    , invoke_async::using_cpo
+    , testing::async::mixin::compare_outputs::assert_maybe_fuzzy_equal_quiet
+{
+  static std::string description()
+  {
+    return "importing async CPO with "
+           "`using namespace thrust::async::inclusive_scan`";
+  }
+};
+
+void TestUsingCPO()
+{
+  using invoker = using_cpo_invoker<int>;
+  testing::async::test_policy_overloads<invoker>::run(128);
+}
+DECLARE_UNITTEST(TestUsingCPO);
+
+#endif // C++14
diff --git a/testing/async/mixin.h b/testing/async/mixin.h
new file mode 100644
index 0000000000..a58886ce00
--- /dev/null
+++ b/testing/async/mixin.h
@@ -0,0 +1,645 @@
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/sequence.h>
+
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
+
+#include <unittest/unittest.h>
+#include <unittest/util_async.h>
+
+#include <tuple>
+#include <type_traits>
+
+// This file contains a set of mix-in classes that define an algorithm
+// definition for use with test_policy_overloads<algo_def>. The algorithm
+// definition describes the details of a thrust::async algorithm invocation:
+//
+// - Input type and initialization
+// - Output type and initialization (supports in-place, too)
+// - Postfix arguments that define the algorithm's overload set
+// - Abstracted invocation of the async algorithm
+// - Abstracted invocation of a reference algorithm
+// - Validation of async vs. reference output
+// - A description string.
+//
+// This definition is used by test_policy_overloads to test each overload
+// against a reference while injecting a variety of execution policies. This
+// validates that each overload behaves correctly according to some reference.
+//
+// Since much of the algorithm definition is generic and may be reused in
+// multiple tests with slight changes, a mix-in system is used to simplify
+// the creation of algorithm definitions. The following namespace hierarchy is
+// used to organize these generic components:
+//
+// * testing::async::mixin::
+// ** ::input - Input types/values (device vectors, counting iterators, etc)
+// ** ::output - Output types/values (device vectors, inplace device vectors,
+//                                    discard iterators, etc)
+// ** ::postfix_args - Algorithm specific overload sets
+// ** ::invoke_reference - Algorithm specific reference invocation
+// ** ::invoke_async - Algorithm specific async algo invocation
+// ** ::compare_outputs - Compare output values.
+//
+// Each algorithm should define its own `mixins.h` header to declare algorithm
+// specific mixins (e.g. postfix_args, invoke_reference, and invoke_async)
+// in a testing::async::<algorithm_name>::mixins namespace structure.
+//
+// For example, the test.async.exclusive_scan.basic test uses the following
+// algorithm definition from mix-ins:
+//
+// ```
+//   #include <async/test_policy_overloads.h>
+//   #include <async/mixin.h>
+//   #include <async/exclusive_scan/mixin.h>
+//   template <typename input_value_type,
+//            typename output_value_type   = input_value_type,
+//            typename initial_value_type  = input_value_type,
+//            typename alternate_binary_op = thrust::maximum<>>
+//   struct basic_invoker
+//      : testing::async::mixin::input::device_vector<input_value_type>
+//      , testing::async::mixin::output::device_vector<output_value_type>
+//      , testing::async::exclusive_scan::mixin::postfix_args::
+//          all_overloads<initial_value_type, alternate_binary_op>
+//      , testing::async::exclusive_scan::mixin::invoke_reference::
+//          host_synchronous<input_value_type, output_value_type>
+//      , testing::async::exclusive_scan::mixin::invoke_async::basic
+//      , testing::async::mixin::compare_outputs::assert_equal_quiet
+//   {
+//     static std::string description()
+//     {
+//       return "basic invocation with device vectors";
+//     }
+//   };
+//
+//   ...
+//
+//   testing::async::test_policy_overloads<basic_invoker<T>>::run(num_values);
+// ```
+//
+// The basic_invoker class expands to something similar to the following:
+//
+// ```
+//  template <typename input_value_type,
+//            typename output_value_type   = input_value_type,
+//            typename initial_value_type  = input_value_type,
+//            typename alternate_binary_op = thrust::maximum<>>
+//  struct basic_invoker
+//  {
+//  public:
+//
+//    static std::string description()
+//    {
+//      return "basic invocation with device vectors";
+//    }
+//
+//    //-------------------------------------------------------------------------
+//    // testing::async::mixin::input::device_vector
+//    using input_type  = thrust::device_vector<input_value_type>;
+//
+//    // Generate an instance of the input:
+//    static input_type generate_input(std::size_t num_values)
+//    {
+//      input_type input(num_values);
+//      thrust::sequence(input.begin(), input.end(), 25, 3);
+//      return input;
+//    }
+//
+//    //-------------------------------------------------------------------------
+//    // testing::async::mixin::output::device_vector
+//    using output_type = thrust::device_vector<output_value_type>;
+//
+//    // Generate an instance of the output:
+//    // Might be more complicated, eg. fancy iterators, etc
+//    static output_type generate_output(std::size_t num_values)
+//    {
+//      return output_type(num_values);
+//    }
+//
+//    //-------------------------------------------------------------------------
+//    //
+//    testing::async::mixin::exclusive_scan::mixin::postfix_args::all_overloads
+//    //
+//    using postfix_args_type = std::tuple<                 // List any extra
+//    arg overloads:
+//      std::tuple<>,                                       // - no extra args
+//      std::tuple<initial_value_type>,                     // - initial_value
+//      std::tuple<initial_value_type, alternate_binary_op> // - initial_value,
+//      binary_op
+//      >;
+//
+//    // Create instances of the extra arguments to use when invoking the
+//    // algorithm:
+//    static postfix_args_type generate_postfix_args()
+//    {
+//      return {
+//        {},                         // no extra args
+//        {42},                       // initial_value
+//        {57, alternate_binary_op{}} // initial_value, binary_op
+//      };
+//    }
+//
+//    //-------------------------------------------------------------------------
+//    //
+//    testing::async::mixin::exclusive_scan::mixin::invoke_reference::host_synchronous
+//    //
+//    // Invoke a reference implementation for a single overload as described by
+//    // postfix_tuple. This tuple contains instances of any additional
+//    arguments
+//    // to pass to the algorithm. The tuple/index_sequence pattern is used to
+//    // support the "no extra args" overload, since the parameter pack
+//    expansion
+//    // will do exactly what we want in all cases.
+//    template <typename PostfixArgTuple, std::size_t... PostfixArgIndices>
+//    static void invoke_reference(input_type const &input,
+//                                 output_type &output,
+//                                 PostfixArgTuple &&postfix_tuple,
+//                                 std::index_sequence<PostfixArgIndices...>)
+//    {
+//      // Create host versions of the input/output:
+//      thrust::host_vector<input_value_type> host_input(input.cbegin(),
+//                                                       input.cend());
+//      thrust::host_vector<output_value_type> host_output(host_input.size());
+//
+//      // Run host synchronous algorithm to generate reference.
+//      thrust::exclusive_scan(host_input.cbegin(),
+//                             host_input.cend(),
+//                             host_output.begin(),
+//                             std::get<PostfixArgIndices>(
+//                               THRUST_FWD(postfix_tuple))...);
+//
+//      // Copy back to device.
+//      output = host_output;
+//    }
+//
+//    //-------------------------------------------------------------------------
+//    // testing::async::mixin::exclusive_scan::mixin::invoke_async::basic
+//    //
+//    // Invoke the async algorithm for a single overload as described by
+//    // the prefix and postfix tuples. These tuples contains instances of any
+//    // additional arguments to pass to the algorithm. The tuple/index_sequence
+//    // pattern is used to support the "no extra args" overload, since the
+//    // parameter pack expansion will do exactly what we want in all cases.
+//    // Prefix args are included here (but not for invoke_reference) to allow
+//    the
+//    // test framework to change the execution policy.
+//    // This method must return an event or future.
+//    template <typename PrefixArgTuple,
+//              std::size_t... PrefixArgIndices,
+//              typename PostfixArgTuple,
+//              std::size_t... PostfixArgIndices>
+//    static auto invoke_async(PrefixArgTuple &&prefix_tuple,
+//                             std::index_sequence<PrefixArgIndices...>,
+//                             input_type const &input,
+//                             output_type &output,
+//                             PostfixArgTuple &&postfix_tuple,
+//                             std::index_sequence<PostfixArgIndices...>)
+//    {
+//      output.resize(input.size());
+//      auto e = thrust::async::exclusive_scan(
+//        std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+//        input.cbegin(),
+//        input.cend(),
+//        output.begin(),
+//        std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+//      return e;
+//    }
+//
+//    //-------------------------------------------------------------------------
+//    // testing::async::mixin::compare_outputs::assert_equal_quiet
+//    //
+//    // Wait on and validate the event/future (usually with TEST_EVENT_WAIT /
+//    // TEST_FUTURE_VALUE_RETRIEVAL), then check that the reference output
+//    // matches the testing output.
+//    template <typename EventType>
+//    static void compare_outputs(EventType &e,
+//                                output_type const &ref,
+//                                output_type const &test)
+//    {
+//      TEST_EVENT_WAIT(e);
+//      ASSERT_EQUAL_QUIET(ref, test);
+//    }
+// };
+// ```
+//
+// Similar invokers with slight tweaks are used in other
+// async/exclusive_scan/*.cu tests.
+
+namespace testing
+{
+namespace async
+{
+namespace mixin
+{
+
+//------------------------------------------------------------------------------
+namespace input
+{
+
+// TODO it'd be nice to specify a lambda expression that'd replace the call to
+// thrust::sequence when desired.
+template <typename value_type>
+struct device_vector
+{
+  using input_type = thrust::device_vector<value_type>;
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    input_type input(num_values);
+    thrust::sequence(input.begin(),
+                     input.end(),
+                     static_cast<value_type>(1.5),
+                     static_cast<value_type>(1));
+    return input;
+  }
+};
+
+template <typename value_type>
+struct counting_iterator_0
+{
+  struct input_type
+  {
+    using iterator = thrust::counting_iterator<value_type>;
+
+    std::size_t num_values;
+
+    iterator begin() { return iterator{static_cast<value_type>(0)}; }
+    iterator begin() const { return iterator{static_cast<value_type>(0)}; }
+    iterator cbegin() const { return iterator{static_cast<value_type>(0)}; }
+
+    iterator end() { return iterator{static_cast<value_type>(num_values)}; }
+    iterator end() const { return iterator{static_cast<value_type>(num_values)}; }
+    iterator cend() const { return iterator{static_cast<value_type>(num_values)}; }
+
+    std::size_t size() const { return num_values; }
+  };
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    return {num_values};
+  }
+};
+
+template <typename value_type>
+struct counting_iterator_1
+{
+  struct input_type
+  {
+    using iterator = thrust::counting_iterator<value_type>;
+
+    std::size_t num_values;
+
+    iterator begin() { return iterator{static_cast<value_type>(1)}; }
+    iterator begin() const { return iterator{static_cast<value_type>(1)}; }
+    iterator cbegin() const { return iterator{static_cast<value_type>(1)}; }
+
+    iterator end() { return iterator{static_cast<value_type>(1 + num_values)}; }
+    iterator end() const { return iterator{static_cast<value_type>(1 + num_values)}; }
+    iterator cend() const { return iterator{static_cast<value_type>(1 + num_values)}; }
+
+    std::size_t size() const { return num_values; }
+  };
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    return {num_values};
+  }
+};
+
+template <typename value_type>
+struct constant_iterator_1
+{
+  struct input_type
+  {
+    using iterator = thrust::constant_iterator<value_type>;
+
+    std::size_t num_values;
+
+    iterator begin() { return iterator{static_cast<value_type>(1)}; }
+    iterator begin() const { return iterator{static_cast<value_type>(1)}; }
+    iterator cbegin() const { return iterator{static_cast<value_type>(1)}; }
+
+    iterator end() { return iterator{static_cast<value_type>(1)} + num_values; }
+    iterator end() const
+    {
+      return iterator{static_cast<value_type>(1)} + num_values;
+    }
+    iterator cend() const
+    {
+      return iterator{static_cast<value_type>(1)} + num_values;
+    }
+
+    std::size_t size() const { return num_values; }
+  };
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    return {num_values};
+  }
+};
+
+} // namespace input
+
+//------------------------------------------------------------------------------
+namespace output
+{
+
+template <typename value_type>
+struct device_vector
+{
+  using output_type = thrust::device_vector<value_type>;
+
+  template <typename InputType>
+  static output_type generate_output(std::size_t num_values,
+                                     InputType& /* unused */)
+  {
+    return output_type(num_values);
+  }
+};
+
+template <typename value_type>
+struct device_vector_reuse_input
+{
+  using output_type = thrust::device_vector<value_type>&;
+
+  template <typename InputType>
+  static output_type generate_output(std::size_t /*num_values*/,
+                                     InputType& input)
+  {
+    return input;
+  }
+};
+
+struct discard_iterator
+{
+  struct output_type
+  {
+    using iterator = thrust::discard_iterator<>;
+
+    iterator begin() { return thrust::make_discard_iterator(); }
+    iterator begin() const { return thrust::make_discard_iterator(); }
+    iterator cbegin() const { return thrust::make_discard_iterator(); }
+  };
+
+  template <typename InputType>
+  static output_type generate_output(std::size_t /* num_values */,
+                                     InputType& /* input */)
+  {
+    return output_type{};
+  }
+};
+
+} // namespace output
+
+//------------------------------------------------------------------------------
+namespace postfix_args
+{
+/* Defined per algorithm. Example:
+ *
+ * // Defines several overloads:
+ * // algorithm([policy,] input, output) // no postfix args
+ * // algorithm([policy,] input, output, initial_value)
+ * // algorithm([policy,] input, output, initial_value, binary_op)
+ * template <typename value_type, typename alternate_binary_op =
+ * thrust::maximum<>> struct all_overloads
+ * {
+ *   using postfix_args_type = std::tuple<         // List any extra arg
+ * overloads: std::tuple<>,                               // - no extra args
+ *     std::tuple<value_type>,                     // - initial_value
+ *     std::tuple<value_type, alternate_binary_op> // - initial_value, binary_op
+ *     >;
+ *
+ *   static postfix_args_type generate_postfix_args()
+ *   {
+ *     return {{}, {42}, {42, alternate_binary_op{}}};
+ *   }
+ * };
+ *
+ */
+}
+
+//------------------------------------------------------------------------------
+namespace invoke_reference
+{
+
+/* Defined per algorithm. Example:
+ *
+ * template <typename input_value_type,
+ *           typename output_value_type = input_value_type>
+ * struct host_synchronous
+ * {
+ *   template <typename InputType,
+ *             typename OutputType,
+ *             typename PostfixArgTuple,
+ *             std::size_t... PostfixArgIndices>
+ *   static void invoke_reference(InputType const& input,
+ *                                OutputType& output,
+ *                                PostfixArgTuple&& postfix_tuple,
+ *                                std::index_sequence<PostfixArgIndices...>)
+ *   {
+ *     // Create host versions of the input/output:
+ *     thrust::host_vector<input_value_type> host_input(input.cbegin(),
+ *                                                      input.cend());
+ *     thrust::host_vector<output_value_type> host_output(host_input.size());
+ *
+ *     // Run host synchronous algorithm to generate reference.
+ *     // Be sure to call a backend that doesn't use the same underlying
+ *     // implementation.
+ *     thrust::exclusive_scan(host_input.cbegin(),
+ *                            host_input.cend(),
+ *                            host_output.begin(),
+ *                            std::get<PostfixArgIndices>(
+ *                              THRUST_FWD(postfix_tuple))...);
+ *
+ *     // Copy back to device.
+ *     output = host_output;
+ *   }
+ * };
+ *
+ */
+
+// Used to save time when testing unverifiable invocations (discard_iterators)
+struct noop
+{
+  template <typename... Ts>
+  static void invoke_reference(Ts&&...)
+  {}
+};
+
+} // namespace invoke_reference
+
+//------------------------------------------------------------------------------
+namespace invoke_async
+{
+
+/* Defined per algorithm. Example:
+ *
+ * struct basic
+ * {
+ *   template <typename PrefixArgTuple,
+ *             std::size_t... PrefixArgIndices,
+ *             typename InputType,
+ *             typename OutputType,
+ *             typename PostfixArgTuple,
+ *             std::size_t... PostfixArgIndices>
+ *   static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+ *                            std::index_sequence<PrefixArgIndices...>,
+ *                            InputType const& input,
+ *                            OutputType& output,
+ *                            PostfixArgTuple&& postfix_tuple,
+ *                            std::index_sequence<PostfixArgIndices...>)
+ *   {
+ *     auto e = thrust::async::exclusive_scan(
+ *       std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+ *       input.cbegin(),
+ *       input.cend(),
+ *       output.begin(),
+ *       std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+ *     return e;
+ *   }
+ * };
+ */
+
+} // namespace invoke_async
+
+//------------------------------------------------------------------------------
+namespace compare_outputs
+{
+
+namespace detail
+{
+
+void basic_event_validation(thrust::device_event& e)
+{
+  TEST_EVENT_WAIT(e);
+}
+
+template <typename T>
+void basic_event_validation(thrust::device_future<T>& f)
+{
+  TEST_FUTURE_VALUE_RETRIEVAL(f);
+}
+
+} // namespace detail
+
+struct assert_equal
+{
+  template <typename EventType, typename OutputType>
+  static void compare_outputs(EventType& e,
+                              OutputType const& ref,
+                              OutputType const& test)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_EQUAL(ref, test);
+  }
+};
+
+// Does an 'almost_equal' comparison for floating point types, since fp
+// addition is non-associative
+struct assert_maybe_fuzzy_equal
+{
+private:
+  template <typename EventType, typename OutputType>
+  static void compare_outputs_impl(EventType& e,
+                                   OutputType const& ref,
+                                   OutputType const& test,
+                                   std::false_type /* is_floating_point */)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_EQUAL(ref, test);
+  }
+
+  template <typename EventType, typename OutputType>
+  static void compare_outputs_impl(EventType& e,
+                                   OutputType const& ref,
+                                   OutputType const& test,
+                                   std::true_type /* is_floating_point */)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_ALMOST_EQUAL(ref, test);
+  }
+
+public:
+  template <typename EventType, typename OutputType>
+  static void compare_outputs(EventType& e,
+                              OutputType const& ref,
+                              OutputType const& test)
+  {
+    using value_type = typename OutputType::value_type;
+    compare_outputs_impl(e, ref, test, std::is_floating_point<value_type>{});
+  }
+};
+
+struct assert_equal_quiet
+{
+  template <typename EventType, typename OutputType>
+  static void compare_outputs(EventType& e,
+                              OutputType const& ref,
+                              OutputType const& test)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_EQUAL_QUIET(ref, test);
+  }
+};
+
+// Does an 'almost_equal' comparison for floating point types, since fp
+// addition is non-associative
+struct assert_maybe_fuzzy_equal_quiet
+{
+private:
+  template <typename EventType, typename OutputType>
+  static void compare_outputs_impl(EventType& e,
+                                   OutputType const& ref,
+                                   OutputType const& test,
+                                   std::false_type /* is_floating_point */)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_EQUAL_QUIET(ref, test);
+  }
+
+  template <typename EventType, typename OutputType>
+  static void compare_outputs_impl(EventType& e,
+                                   OutputType const& ref,
+                                   OutputType const& test,
+                                   std::true_type /* is_floating_point */)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_ALMOST_EQUAL(ref, test);
+  }
+
+public:
+  template <typename EventType, typename OutputType>
+  static void compare_outputs(EventType& e,
+                              OutputType const& ref,
+                              OutputType const& test)
+  {
+    using value_type = typename OutputType::value_type;
+    compare_outputs_impl(e, ref, test, std::is_floating_point<value_type>{});
+  }
+};
+
+// Used to save time when testing unverifiable invocations (discard_iterators).
+// Just does basic validation of the future/event.
+struct noop
+{
+  template <typename EventType, typename... Ts>
+  static void compare_outputs(EventType &e, Ts&&...)
+  {
+    detail::basic_event_validation(e);
+  }
+};
+
+} // namespace compare_outputs
+
+} // namespace mixin
+} // namespace async
+} // namespace testing
+
+#endif // C++14
diff --git a/testing/async/test_policy_overloads.h b/testing/async/test_policy_overloads.h
new file mode 100644
index 0000000000..00a55a8b2d
--- /dev/null
+++ b/testing/async/test_policy_overloads.h
@@ -0,0 +1,421 @@
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/device_allocator.h>
+#include <thrust/future.h>
+
+#include <unittest/unittest.h>
+
+#include <string>
+
+// TODO Cover these cases from testing/async_reduce.cu:
+//   - [x] test_async_reduce_after ("after_future" in test_policy_overloads)
+//   - [ ] test_async_reduce_on_then_after (KNOWN_FAILURE, see #1195)
+//     - [ ] all the child variants (e.g. with allocator) too
+//   - [ ] test_async_copy_then_reduce (Need to figure out how to fit this in)
+//   - [ ] test_async_reduce_caching (only useful when returning future)
+
+namespace testing
+{
+
+namespace async
+{
+
+// Tests that policies are handled correctly for all overloads of an async
+// algorithm.
+//
+// The AlgoDef parameter type defines an async algorithm, its overloads, and
+// abstracts its invocation. See the async/mixins.h for a documented example of
+// this interface and some convenience mixins that can be used to construct a
+// definition quickly.
+//
+// The AlgoDef interface is used to run several tests of the algorithm,
+// exhaustively testing all overloads for algorithm correctness and proper
+// policy handling.
+//
+// ## Basic tests
+//
+// In the basic tests, each overload is called repeatedly with:
+// 1) No policy
+// 2) thrust::device
+// 3) thrust::device(thrust::device_allocator<void>)
+// 4) thrust::device.on(stream)
+// 5) thrust::device(thrust::device_allocator<void>).on(stream)
+//
+// The output of the async algorithm is compared against a reference output,
+// and the returned event/future is tested to make sure it holds a reference to
+// the expected stream.
+//
+// ## After Future tests
+//
+// The after_future tests check that the future/event returned from an algorithm
+// behaves properly when consumed by a policy's `.after` method.
+template <typename AlgoDef>
+struct test_policy_overloads
+{
+  using algo_def          = AlgoDef;
+  using input_type        = typename algo_def::input_type;
+  using output_type       = typename algo_def::output_type;
+  using postfix_args_type = typename algo_def::postfix_args_type;
+
+  static constexpr std::size_t num_postfix_arg_sets =
+    std::tuple_size<postfix_args_type>::value;
+
+  // Main entry point; call this from a unit test function.
+  static void run(std::size_t num_values)
+  {
+    test_postfix_overloads(num_values);
+  }
+
+private:
+  template <std::size_t size>
+  using size_const = std::integral_constant<std::size_t, size>;
+
+  //----------------------------------------------------------------------------
+  // Recursively call sub tests for each overload set in postfix_args:
+  template <std::size_t postfix_idx = 0>
+  static void test_postfix_overloads(std::size_t const num_values,
+                                     size_const<postfix_idx> = {})
+  {
+    static_assert(postfix_idx < num_postfix_arg_sets, "Internal error.");
+
+    run_basic_policy_tests<postfix_idx>(num_values);
+    run_after_future_test<postfix_idx>(num_values);
+
+    // Recurse to test next round of overloads:
+    test_postfix_overloads(num_values, size_const<postfix_idx + 1>{});
+  }
+
+  static void test_postfix_overloads(std::size_t const,
+                                     size_const<num_postfix_arg_sets>)
+  {
+    // terminal case, no-op
+  }
+
+  //----------------------------------------------------------------------------
+  // For the specified postfix overload set, test the algorithm with several
+  // different policy configurations.
+  template <std::size_t postfix_idx>
+  static void run_basic_policy_tests(std::size_t const num_values)
+  {
+    // When a policy uses the default stream, the algorithm implementation
+    // should spawn a new stream in the returned event:
+    auto using_default_stream = [](auto& e) {
+      ASSERT_NOT_EQUAL(thrust::cuda_cub::default_stream(),
+                       e.stream().native_handle());
+    };
+
+    // When a policy uses a non-default stream, the implementation should pass
+    // the stream through to the output:
+    thrust::system::cuda::detail::unique_stream test_stream{};
+    auto using_test_stream = [&test_stream](auto& e) {
+      ASSERT_EQUAL(test_stream.native_handle(), e.stream().native_handle());
+    };
+
+    // Test the different types of policies:
+    run_basic_policy_test<postfix_idx>("(no policy)",
+                                       std::make_tuple(),
+                                       using_default_stream,
+                                       num_values);
+
+    run_basic_policy_test<postfix_idx>("thrust::device",
+                                       std::make_tuple(thrust::device),
+                                       using_default_stream,
+                                       num_values);
+
+    run_basic_policy_test<postfix_idx>(
+      "thrust::device(thrust::device_allocator<void>{})",
+      std::make_tuple(thrust::device(thrust::device_allocator<void>{})),
+      using_default_stream,
+      num_values);
+
+    run_basic_policy_test<postfix_idx>("thrust::device.on(test_stream.get())",
+                                       std::make_tuple(
+                                         thrust::device.on(test_stream.get())),
+                                       using_test_stream,
+                                       num_values);
+
+    run_basic_policy_test<postfix_idx>(
+      "thrust::device(thrust::device_allocator<void>{}).on(test_stream.get())",
+      std::make_tuple(
+        thrust::device(thrust::device_allocator<void>{}).on(test_stream.get())),
+      using_test_stream,
+      num_values);
+  }
+
+  // Wrap the basic_policy_test in a try/catch block to inject helpful info into
+  // any thrown unit test exceptions.
+  template <std::size_t postfix_idx, typename PolicyTuple, typename ValidateEvent>
+  static void run_basic_policy_test(std::string const &policy_desc,
+                                    PolicyTuple &&policy_tuple,
+                                    ValidateEvent &&validate,
+                                    std::size_t const num_values)
+  {
+    try
+    {
+      basic_policy_test<postfix_idx>(policy_tuple,
+                                     validate,
+                                     num_values);
+    }
+    catch (unittest::UnitTestException &exc)
+    {
+      // Append some identifying information to the exception to help with
+      // debugging:
+      using overload_t = std::tuple_element_t<postfix_idx, postfix_args_type>;
+
+      std::string const overload_desc =
+        unittest::demangle(typeid(overload_t).name());
+      std::string const input_desc =
+        unittest::demangle(typeid(input_type).name());
+      std::string const output_desc =
+        unittest::demangle(typeid(output_type).name());
+
+      exc << "\n"
+          << " - algo_def::description = " << algo_def::description() << "\n"
+          << " - test = basic_policy\n"
+          << " - policy = " << policy_desc << "\n"
+          << " - input_type = " << input_desc << "\n"
+          << " - output_type = " << output_desc << "\n"
+          << " - tuple of trailing arguments = " << overload_desc << "\n"
+          << " - num_values = " << num_values;
+      throw;
+    }
+  }
+
+  // Invoke the algorithm multiple times and validate the results.
+  template <std::size_t postfix_idx,
+            typename PrefixArgTuple,
+            typename ValidateEvent>
+  static void basic_policy_test(PrefixArgTuple &&prefix_tuple_ref,
+                                ValidateEvent const &validate,
+                                std::size_t num_values)
+  {
+    // Sink the prefix tuple into a const local so it can be safely passed to
+    // multiple invocations without worrying about potential modifications.
+    using prefix_tuple_type = thrust::remove_cvref_t<PrefixArgTuple>;
+    prefix_tuple_type const prefix_tuple = THRUST_FWD(prefix_tuple_ref);
+
+    using postfix_tuple_type =
+      std::tuple_element_t<postfix_idx, postfix_args_type>;
+    postfix_tuple_type const postfix_tuple = get_postfix_tuple<postfix_idx>();
+
+    // Generate index sequences for the tuples:
+    constexpr auto prefix_tuple_size  = std::tuple_size<prefix_tuple_type>{};
+    constexpr auto postfix_tuple_size = std::tuple_size<postfix_tuple_type>{};
+    using prefix_index_seq  = std::make_index_sequence<prefix_tuple_size>;
+    using postfix_index_seq = std::make_index_sequence<postfix_tuple_size>;
+
+    // Use unique, non-const inputs for each invocation to support in-place
+    // algo_def configurations.
+    input_type input_a   = algo_def::generate_input(num_values);
+    input_type input_b   = algo_def::generate_input(num_values);
+    input_type input_c   = algo_def::generate_input(num_values);
+    input_type input_d   = algo_def::generate_input(num_values);
+    input_type input_ref = algo_def::generate_input(num_values);
+
+    output_type output_a   = algo_def::generate_output(num_values, input_a);
+    output_type output_b   = algo_def::generate_output(num_values, input_b);
+    output_type output_c   = algo_def::generate_output(num_values, input_c);
+    output_type output_d   = algo_def::generate_output(num_values, input_d);
+    output_type output_ref = algo_def::generate_output(num_values, input_ref);
+
+    // Invoke multiple overlapping async algorithms, capturing their outputs
+    // and events/futures:
+    auto e_a = algo_def::invoke_async(prefix_tuple,
+                                      prefix_index_seq{},
+                                      input_a,
+                                      output_a,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+    auto e_b = algo_def::invoke_async(prefix_tuple,
+                                      prefix_index_seq{},
+                                      input_b,
+                                      output_b,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+    auto e_c = algo_def::invoke_async(prefix_tuple,
+                                      prefix_index_seq{},
+                                      input_c,
+                                      output_c,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+    auto e_d = algo_def::invoke_async(prefix_tuple,
+                                      prefix_index_seq{},
+                                      input_d,
+                                      output_d,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+
+    // Let reference calc overlap with async testing:
+    algo_def::invoke_reference(input_ref,
+                               output_ref,
+                               postfix_tuple,
+                               postfix_index_seq{});
+
+    algo_def::compare_outputs(e_a, output_ref, output_a);
+    algo_def::compare_outputs(e_b, output_ref, output_b);
+    algo_def::compare_outputs(e_c, output_ref, output_c);
+    algo_def::compare_outputs(e_d, output_ref, output_d);
+
+    validate(e_a);
+    validate(e_b);
+    validate(e_c);
+    validate(e_d);
+  }
+
+  //----------------------------------------------------------------------------
+  // Test .after(event/future) handling:
+  template <std::size_t postfix_idx>
+  static void run_after_future_test(std::size_t const num_values)
+  {
+    try
+    {
+      after_future_test<postfix_idx>(num_values);
+    }
+    catch (unittest::UnitTestException &exc)
+    {
+      // Append some identifying information to the exception to help with
+      // debugging:
+      using postfix_t = std::tuple_element_t<postfix_idx, postfix_args_type>;
+
+      std::string const postfix_desc =
+        unittest::demangle(typeid(postfix_t).name());
+      std::string const input_desc =
+        unittest::demangle(typeid(input_type).name());
+      std::string const output_desc =
+        unittest::demangle(typeid(output_type).name());
+
+      exc << "\n"
+          << " - algo_def::description = " << algo_def::description() << "\n"
+          << " - test = after_future\n"
+          << " - input_type = " << input_desc << "\n"
+          << " - output_type = " << output_desc << "\n"
+          << " - tuple of trailing arguments = " << postfix_desc << "\n"
+          << " - num_values = " << num_values;
+      throw;
+    }
+  }
+
+  template <std::size_t postfix_idx>
+  static void after_future_test(std::size_t const num_values)
+  {
+    using postfix_tuple_type =
+    std::tuple_element_t<postfix_idx, postfix_args_type>;
+    postfix_tuple_type const postfix_tuple = get_postfix_tuple<postfix_idx>();
+
+    // Generate index sequences for the tuples. Prefix size always = 1 here.
+    constexpr auto postfix_tuple_size = std::tuple_size<postfix_tuple_type>{};
+    using prefix_index_seq  = std::make_index_sequence<1>;
+    using postfix_index_seq = std::make_index_sequence<postfix_tuple_size>;
+
+    // Use unique, non-const inputs for each invocation to support in-place
+    // algo_def configurations.
+    input_type input_a   = algo_def::generate_input(num_values);
+    input_type input_b   = algo_def::generate_input(num_values);
+    input_type input_c   = algo_def::generate_input(num_values);
+    input_type input_tmp = algo_def::generate_input(num_values);
+    input_type input_ref = algo_def::generate_input(num_values);
+
+    output_type output_a   = algo_def::generate_output(num_values, input_a);
+    output_type output_b   = algo_def::generate_output(num_values, input_b);
+    output_type output_c   = algo_def::generate_output(num_values, input_c);
+    output_type output_tmp = algo_def::generate_output(num_values, input_tmp);
+    output_type output_ref = algo_def::generate_output(num_values, input_ref);
+
+    auto e_a = algo_def::invoke_async(std::make_tuple(thrust::device),
+                                      prefix_index_seq{},
+                                      input_a,
+                                      output_a,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+    ASSERT_EQUAL(true, e_a.valid_stream());
+    auto const stream_a = e_a.stream().native_handle();
+
+    // Execution on default stream should create a new stream in the result:
+    ASSERT_NOT_EQUAL_QUIET(thrust::cuda_cub::default_stream(), stream_a);
+
+    // Explicitly order this invocation after the previous one:
+    auto e_b =
+      algo_def::invoke_async(std::make_tuple(thrust::device.after(e_a)),
+                             prefix_index_seq{},
+                             input_b,
+                             output_b,
+                             postfix_tuple,
+                             postfix_index_seq{});
+    ASSERT_EQUAL(true, e_b.valid_stream());
+    auto const stream_b = e_b.stream().native_handle();
+
+    // Second invocation should use same stream as before:
+    ASSERT_EQUAL_QUIET(stream_a, stream_b);
+
+    // Verify that double consumption of e_a produces an exception:
+    ASSERT_THROWS_EQUAL(auto x = algo_def::invoke_async(
+                          std::make_tuple(thrust::device.after(e_a)),
+                          prefix_index_seq{},
+                          input_tmp,
+                          output_tmp,
+                          postfix_tuple,
+                          postfix_index_seq{});
+                        THRUST_UNUSED_VAR(x),
+                        thrust::event_error,
+                        thrust::event_error(thrust::event_errc::no_state));
+
+    // Explicitly order this invocation after e_b:
+    auto policy_after_e_b = thrust::device.after(e_b);
+    // Make sure the tuple contains an lvalue ref:
+    auto policy_after_e_b_tuple = std::forward_as_tuple(policy_after_e_b);
+    auto e_c =
+      algo_def::invoke_async(policy_after_e_b_tuple,
+                             prefix_index_seq{},
+                             input_c,
+                             output_c,
+                             postfix_tuple,
+                             postfix_index_seq{});
+    ASSERT_EQUAL(true, e_c.valid_stream());
+    auto const stream_c = e_c.stream().native_handle();
+
+    // Should use same stream as e_b:
+    ASSERT_EQUAL_QUIET(stream_b, stream_c);
+
+    // Verify that indirect double consumption of e_b produces an exception:
+    ASSERT_THROWS_EQUAL(
+      auto x = algo_def::invoke_async(policy_after_e_b_tuple,
+                                      prefix_index_seq{},
+                                      input_tmp,
+                                      output_tmp,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+      THRUST_UNUSED_VAR(x),
+      thrust::event_error,
+      thrust::event_error(thrust::event_errc::no_state));
+
+    // Let reference calc overlap with async testing:
+    algo_def::invoke_reference(input_ref,
+                               output_ref,
+                               postfix_tuple,
+                               postfix_index_seq{});
+
+    // Validate results
+    // Use e_c for all three checks -- e_a and e_b will not pass the event
+    // checks since their streams were stolen by dependencies.
+    algo_def::compare_outputs(e_c, output_ref, output_a);
+    algo_def::compare_outputs(e_c, output_ref, output_b);
+    algo_def::compare_outputs(e_c, output_ref, output_c);
+  }
+
+  //----------------------------------------------------------------------------
+  // Various helper functions:
+  template <std::size_t postfix_idx>
+  static auto get_postfix_tuple()
+  {
+    return std::get<postfix_idx>(algo_def::generate_postfix_args());
+  }
+};
+
+} // namespace async
+} // namespace testing
+
+#endif // C++14
diff --git a/testing/event.cu b/testing/event.cu
index 5833d41452..5814269195 100644
--- a/testing/event.cu
+++ b/testing/event.cu
@@ -1,6 +1,6 @@
 #include <thrust/detail/config.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <unittest/unittest.h>
 #include <unittest/util_async.h>
diff --git a/testing/future.cu b/testing/future.cu
index 1375588607..eb1ab582a0 100644
--- a/testing/future.cu
+++ b/testing/future.cu
@@ -1,6 +1,6 @@
 #include <thrust/detail/config.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <unittest/unittest.h>
 #include <unittest/util_async.h>
diff --git a/testing/unittest/util_async.h b/testing/unittest/util_async.h
index 984cc61c6b..9a3454efd5 100644
--- a/testing/unittest/util_async.h
+++ b/testing/unittest/util_async.h
@@ -1,9 +1,9 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/cpp11_required.h>
+#include <thrust/detail/cpp14_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <unittest/unittest.h>
 
@@ -73,5 +73,4 @@ auto test_future_value_retrieval(
 
 } // namespace unittest
 
-#endif // THRUST_CPP_DIALECT >= 2011
-
+#endif // THRUST_CPP_DIALECT >= 2014
diff --git a/thrust/async/scan.h b/thrust/async/scan.h
new file mode 100644
index 0000000000..d7adfac983
--- /dev/null
+++ b/thrust/async/scan.h
@@ -0,0 +1,355 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file async/scan.h
+ *  \brief Functions for asynchronously computing prefix scans.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/detail/execution_policy.h>
+#include <thrust/detail/select_system.h>
+#include <thrust/detail/static_assert.h>
+
+#include <thrust/system/detail/adl/async/scan.h>
+
+#include <thrust/type_traits/is_execution_policy.h>
+#include <thrust/type_traits/logical_metafunctions.h>
+#include <thrust/type_traits/remove_cvref.h>
+
+#include <thrust/future.h>
+
+namespace thrust
+{
+
+namespace async
+{
+
+// Fallback implementations used when no overloads are found via ADL:
+namespace unimplemented
+{
+
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Sentinel,
+          typename OutputIt,
+          typename BinaryOp>
+event<DerivedPolicy>
+async_inclusive_scan(thrust::execution_policy<DerivedPolicy>&,
+                     ForwardIt,
+                     Sentinel,
+                     OutputIt,
+                     BinaryOp)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value),
+    "this algorithm is not implemented for the specified system"
+  );
+  return {};
+}
+
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Sentinel,
+          typename OutputIt,
+          typename InitialValueType,
+          typename BinaryOp>
+event<DerivedPolicy>
+async_exclusive_scan(thrust::execution_policy<DerivedPolicy>&,
+                     ForwardIt,
+                     Sentinel,
+                     OutputIt,
+                     InitialValueType,
+                     BinaryOp)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value),
+    "this algorithm is not implemented for the specified system"
+  );
+  return {};
+}
+
+} // namespace unimplemented
+
+namespace inclusive_scan_detail
+{
+
+// Include fallback implementation for ADL failures
+using thrust::async::unimplemented::async_inclusive_scan;
+
+// Implementation of the thrust::async::inclusive_scan CPO.
+struct inclusive_scan_fn final
+{
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename BinaryOp>
+  THRUST_NODISCARD
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             BinaryOp&& op) const
+    // ADL dispatch.
+  THRUST_RETURNS(
+    async_inclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(op)
+    )
+  )
+
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt>
+  THRUST_NODISCARD
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_inclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      thrust::plus<>{}
+    )
+  )
+
+  template <typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename BinaryOp,
+            typename = std::enable_if_t<!is_execution_policy_v<remove_cvref_t<ForwardIt>>>>
+  THRUST_NODISCARD
+  auto operator()(ForwardIt&& first,
+                  Sentinel&& last,
+                  OutputIt&& out,
+                  BinaryOp&& op) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_inclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(op)
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename OutputIt>
+  THRUST_NODISCARD
+  auto operator()(ForwardIt&& first, Sentinel&& last, OutputIt&& out) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_inclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      thrust::plus<>{}
+    )
+  )
+};
+
+} // namespace inclusive_scan_detail
+
+THRUST_INLINE_CONSTANT inclusive_scan_detail::inclusive_scan_fn inclusive_scan{};
+
+namespace exclusive_scan_detail
+{
+
+// Include fallback implementation for ADL failures
+using thrust::async::unimplemented::async_exclusive_scan;
+
+// Implementation of the thrust::async::exclusive_scan CPO.
+struct exclusive_scan_fn final
+{
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename InitialValueType,
+            typename BinaryOp>
+  THRUST_NODISCARD
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             InitialValueType&& init,
+             BinaryOp&& op) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(init),
+      THRUST_FWD(op)
+    )
+  )
+
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename InitialValueType>
+  THRUST_NODISCARD
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             InitialValueType&& init) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(init),
+      thrust::plus<>{}
+    )
+  )
+
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt>
+  THRUST_NODISCARD
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      iterator_value_t<remove_cvref_t<ForwardIt>>{},
+      thrust::plus<>{}
+    )
+  )
+
+  template <typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename InitialValueType,
+            typename BinaryOp,
+            typename = std::enable_if_t<!is_execution_policy_v<remove_cvref_t<ForwardIt>>>>
+  THRUST_NODISCARD
+  auto
+  operator()(ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             InitialValueType&& init,
+             BinaryOp&& op) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(init),
+      THRUST_FWD(op)
+    )
+  )
+
+  template <typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename InitialValueType,
+            typename = std::enable_if_t<!is_execution_policy_v<remove_cvref_t<ForwardIt>>>>
+  THRUST_NODISCARD
+  auto
+  operator()(ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             InitialValueType&& init) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(init),
+      thrust::plus<>{}
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename OutputIt>
+  THRUST_NODISCARD
+  auto operator()(ForwardIt&& first,
+                  Sentinel&& last,
+                  OutputIt&& out) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      iterator_value_t<remove_cvref_t<ForwardIt>>{},
+      thrust::plus<>{}
+    )
+  )
+};
+
+} // namespace exclusive_scan_detail
+
+THRUST_INLINE_CONSTANT exclusive_scan_detail::exclusive_scan_fn exclusive_scan{};
+
+} // namespace async
+
+} // end namespace thrust
+
+#endif
diff --git a/thrust/future.h b/thrust/future.h
index 12bebf8c6e..03c5d79f6c 100644
--- a/thrust/future.h
+++ b/thrust/future.h
@@ -24,7 +24,7 @@
 #include <thrust/detail/cpp11_required.h>
 #include <thrust/detail/modern_gcc_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <thrust/execution_policy.h>
 #include <thrust/detail/static_assert.h>
diff --git a/thrust/system/cuda/detail/async/exclusive_scan.h b/thrust/system/cuda/detail/async/exclusive_scan.h
new file mode 100644
index 0000000000..3ce27d2e11
--- /dev/null
+++ b/thrust/system/cuda/detail/async/exclusive_scan.h
@@ -0,0 +1,199 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/iterator/iterator_traits.h>
+
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/async/customization.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/future.h>
+
+#include <thrust/type_traits/remove_cvref.h>
+
+#include <thrust/distance.h>
+
+#include <type_traits>
+
+// TODO specialize for thrust::plus to use e.g. ExclusiveSum instead of ExcScan
+//  - Note that thrust::plus<> is transparent, cub::Sum is not. This should be
+//    fixed in CUB first).
+//  - Need to check if CUB actually optimizes for sums before putting in effort
+
+namespace thrust
+{
+namespace system
+{
+namespace cuda
+{
+namespace detail
+{
+
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Size,
+          typename OutputIt,
+          typename InitialValueType,
+          typename BinaryOp>
+auto async_exclusive_scan_n(execution_policy<DerivedPolicy>& policy,
+                            ForwardIt first,
+                            Size n,
+                            OutputIt out,
+                            InitialValueType init,
+                            BinaryOp op)
+  -> unique_eager_event
+{
+  using Dispatch32 = cub::DispatchScan<ForwardIt,
+                                       OutputIt,
+                                       BinaryOp,
+                                       InitialValueType,
+                                       thrust::detail::int32_t>;
+  using Dispatch64 = cub::DispatchScan<ForwardIt,
+                                       OutputIt,
+                                       BinaryOp,
+                                       InitialValueType,
+                                       thrust::detail::int64_t>;
+
+  auto const device_alloc = get_async_device_allocator(policy);
+  unique_eager_event ev;
+
+  // Determine temporary device storage requirements.
+  cudaError_t status;
+  size_t tmp_size = 0;
+  {
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                n,
+                                (nullptr,
+                                  tmp_size,
+                                  first,
+                                  out,
+                                  op,
+                                  init,
+                                  n_fixed,
+                                  nullptr,
+                                  THRUST_DEBUG_SYNC_FLAG));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after determining tmp storage "
+                                     "requirements for exclusive_scan");
+  }
+
+  // Allocate temporary storage.
+  auto content = uninitialized_allocate_unique_n<thrust::detail::uint8_t>(
+    device_alloc, tmp_size
+  );
+  void* const tmp_ptr = raw_pointer_cast(content.get());
+
+  // Set up stream with dependencies.
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    ev = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content),
+          unique_stream(nonowning, user_raw_stream)
+        ),
+        extract_dependencies(std::move(thrust::detail::derived_cast(policy)))));
+  }
+  else
+  {
+    ev = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(std::move(content)),
+        extract_dependencies(std::move(thrust::detail::derived_cast(policy)))));
+  }
+
+  // Run scan.
+  {
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                n,
+                                (tmp_ptr,
+                                  tmp_size,
+                                  first,
+                                  out,
+                                  op,
+                                  init,
+                                  n_fixed,
+                                  user_raw_stream,
+                                  THRUST_DEBUG_SYNC_FLAG));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after dispatching exclusive_scan kernel");
+  }
+
+  return std::move(ev);
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Sentinel,
+          typename OutputIt,
+          typename InitialValueType,
+          typename BinaryOp>
+auto async_exclusive_scan(execution_policy<DerivedPolicy>& policy,
+                          ForwardIt first,
+                          Sentinel&& last,
+                          OutputIt&& out,
+                          InitialValueType &&init,
+                          BinaryOp&& op)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_exclusive_scan_n(
+    policy,
+    first,
+    distance(first, THRUST_FWD(last)),
+    THRUST_FWD(out),
+    THRUST_FWD(init),
+    THRUST_FWD(op)
+  )
+)
+
+} // namespace cuda_cub
+
+} // namespace thrust
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif // C++14
+
diff --git a/thrust/system/cuda/detail/async/inclusive_scan.h b/thrust/system/cuda/detail/async/inclusive_scan.h
new file mode 100644
index 0000000000..1600e15fab
--- /dev/null
+++ b/thrust/system/cuda/detail/async/inclusive_scan.h
@@ -0,0 +1,195 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/iterator/iterator_traits.h>
+
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/async/customization.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/future.h>
+
+#include <thrust/type_traits/remove_cvref.h>
+
+#include <thrust/distance.h>
+
+#include <type_traits>
+
+// TODO specialize for thrust::plus to use e.g. InclusiveSum instead of IncScan
+//  - Note that thrust::plus<> is transparent, cub::Sum is not. This should be
+//    fixed in CUB first).
+//  - Need to check if CUB actually optimizes for sums before putting in effort
+
+namespace thrust
+{
+namespace system
+{
+namespace cuda
+{
+namespace detail
+{
+
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Size,
+          typename OutputIt,
+          typename BinaryOp>
+auto async_inclusive_scan_n(execution_policy<DerivedPolicy>& policy,
+                            ForwardIt first,
+                            Size n,
+                            OutputIt out,
+                            BinaryOp op)
+  -> unique_eager_event
+{
+  using Dispatch32 = cub::DispatchScan<ForwardIt,
+                                       OutputIt,
+                                       BinaryOp,
+                                       cub::NullType,
+                                       thrust::detail::int32_t>;
+
+  using Dispatch64 = cub::DispatchScan<ForwardIt,
+                                       OutputIt,
+                                       BinaryOp,
+                                       cub::NullType,
+                                       thrust::detail::int64_t>;
+
+  auto const device_alloc = get_async_device_allocator(policy);
+  unique_eager_event ev;
+
+  // Determine temporary device storage requirements.
+  cudaError_t status;
+  size_t tmp_size = 0;
+  {
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                n,
+                                (nullptr,
+                                  tmp_size,
+                                  first,
+                                  out,
+                                  op,
+                                  cub::NullType{},
+                                  n_fixed,
+                                  nullptr,
+                                  THRUST_DEBUG_SYNC_FLAG));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after determining tmp storage "
+                                     "requirements for inclusive_scan");
+  }
+
+  // Allocate temporary storage.
+  auto content = uninitialized_allocate_unique_n<thrust::detail::uint8_t>(
+    device_alloc, tmp_size
+  );
+  void* const tmp_ptr = raw_pointer_cast(content.get());
+
+  // Set up stream with dependencies.
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    ev = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content),
+          unique_stream(nonowning, user_raw_stream)
+        ),
+        extract_dependencies(std::move(thrust::detail::derived_cast(policy)))));
+  }
+  else
+  {
+    ev = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(std::move(content)),
+        extract_dependencies(std::move(thrust::detail::derived_cast(policy)))));
+  }
+
+  // Run scan.
+  {
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                n,
+                                (tmp_ptr,
+                                 tmp_size,
+                                 first,
+                                 out,
+                                 op,
+                                 cub::NullType{},
+                                 n_fixed,
+                                 user_raw_stream,
+                                 THRUST_DEBUG_SYNC_FLAG));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after dispatching inclusive_scan kernel");
+  }
+
+  return std::move(ev);
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Sentinel,
+          typename OutputIt,
+          typename BinaryOp>
+auto async_inclusive_scan(execution_policy<DerivedPolicy>& policy,
+                          ForwardIt first,
+                          Sentinel&& last,
+                          OutputIt&& out,
+                          BinaryOp&& op)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_inclusive_scan_n(
+    policy,
+    first,
+    distance(first, THRUST_FWD(last)),
+    THRUST_FWD(out),
+    THRUST_FWD(op)
+  )
+)
+
+} // namespace cuda_cub
+
+} // namespace thrust
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif // C++14
+
diff --git a/thrust/system/cuda/detail/async/scan.h b/thrust/system/cuda/detail/async/scan.h
new file mode 100644
index 0000000000..7d993e6641
--- /dev/null
+++ b/thrust/system/cuda/detail/async/scan.h
@@ -0,0 +1,33 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <thrust/detail/cpp14_required.h>
+
+#include <thrust/system/cuda/detail/async/exclusive_scan.h>
+#include <thrust/system/cuda/detail/async/inclusive_scan.h>
diff --git a/thrust/system/detail/adl/async/scan.h b/thrust/system/detail/adl/async/scan.h
new file mode 100644
index 0000000000..a2a90618b4
--- /dev/null
+++ b/thrust/system/detail/adl/async/scan.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// The purpose of this header is to #include the async/scan.h header of the
+// sequential, host, and device systems. It should be #included in any code
+// which uses ADL to dispatch async scans.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+//#include <thrust/system/detail/sequential/async/scan.h>
+
+//#define __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/scan.h>
+//#include __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER
+//#undef __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/scan.h>
+#include __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER
+
diff --git a/thrust/system/detail/generic/scan.inl b/thrust/system/detail/generic/scan.inl
index 300b697b26..83d272c3e9 100644
--- a/thrust/system/detail/generic/scan.inl
+++ b/thrust/system/detail/generic/scan.inl
@@ -61,9 +61,7 @@ __host__ __device__
 {
   // Use the input iterator's value type per https://wg21.link/P0571
   using ValueType = typename thrust::iterator_value<InputIterator>::type;
-
-  // assume 0 as the initialization value
-  return thrust::exclusive_scan(exec, first, last, result, ValueType(0));
+  return thrust::exclusive_scan(exec, first, last, result, ValueType{});
 } // end exclusive_scan()