From f1e2b916dc3d98f9e781ba093326346bd13f2c82 Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Fri, 2 Feb 2024 12:53:14 +0000 Subject: [PATCH 1/4] [SYCL][Fusion] Take auxiliary resources from fused command groups Reductions use auxiliary resources to handle reduction temporal buffers. Assign those auxiliary resources from each fused command group to the placeholder fusion event. This event will not be marked as completed after the fused reductions finish execution either if fusion is completed, cancelled or aborted. Test is updated to check every algorithm that is selected automatically by `sycl::reduction`, i.e., every supported algorithm. We also cover both cases (fusion taking and not taking place). Signed-off-by: Victor Perez --- sycl/source/detail/jit_compiler.cpp | 4 +- .../source/detail/scheduler/graph_builder.cpp | 3 + sycl/source/detail/scheduler/scheduler.cpp | 33 ++++++- sycl/source/detail/scheduler/scheduler.hpp | 2 + .../group_reduce_and_last_wg_detection.cpp | 9 ++ .../local_atomic_and_atomic_cross_wg.cpp | 8 ++ .../KernelFusion/Reduction/range_basic.cpp | 6 ++ .../KernelFusion/Reduction/reduction.cpp | 62 ------------- .../KernelFusion/Reduction/reduction.hpp | 86 +++++++++++++++++++ 9 files changed, 145 insertions(+), 68 deletions(-) create mode 100644 sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp create mode 100644 sycl/test-e2e/KernelFusion/Reduction/local_atomic_and_atomic_cross_wg.cpp create mode 100644 sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp delete mode 100644 sycl/test-e2e/KernelFusion/Reduction/reduction.cpp create mode 100644 sycl/test-e2e/KernelFusion/Reduction/reduction.hpp diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index b07647707766f..469c445b0e66c 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -667,8 +667,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, unsigned KernelIndex = 0; ParamList FusedParams; PromotionMap PromotedAccs; - // TODO: Collect information about streams and auxiliary resources (which - // contain reductions) and figure out how to fuse them. + // TODO: Collect information about streams and figure out how + // to fuse them. for (auto &RawCmd : InputKernels) { auto *KernelCmd = static_cast(RawCmd); auto &CG = KernelCmd->getCG(); diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index b3b4343b530ad..104c38dcdd99b 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1615,6 +1615,9 @@ Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue, auto FusedKernelCmd = std::make_unique(std::move(FusedCG), Queue); + // Inherit auxiliary resources from fused command groups + Scheduler::getInstance().takeAuxiliaryResources(FusedKernelCmd->getEvent(), + PlaceholderCmd->getEvent()); assert(PlaceholderCmd->MDeps.empty()); // Next, backwards iterate over all the commands in the fusion list and remove // them from the graph to restore the state before starting fusion, so we can diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index c4771a9ed8c59..a83298a628539 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -154,11 +154,11 @@ EventImplPtr Scheduler::addCG( for (const auto &StreamImplPtr : Streams) { StreamImplPtr->flush(NewEvent); } - - if (!AuxiliaryResources.empty()) - registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources)); } + if (!AuxiliaryResources.empty()) + registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources)); + return NewEvent; } @@ -558,10 +558,35 @@ void Scheduler::cleanupDeferredMemObjects(BlockingT Blocking) { } } +static void registerAuxiliaryResourcesNoLock( + std::unordered_map>> + &AuxiliaryResources, + const EventImplPtr &Event, + std::vector> &&Resources) { + std::vector> &StoredResources = + AuxiliaryResources[Event]; + StoredResources.insert(StoredResources.end(), + std::make_move_iterator(Resources.begin()), + std::make_move_iterator(Resources.end())); +} + +void Scheduler::takeAuxiliaryResources(const EventImplPtr &Dst, + const EventImplPtr &Src) { + std::unique_lock Lock{MAuxiliaryResourcesMutex}; + auto Iter = MAuxiliaryResources.find(Src); + if (Iter == MAuxiliaryResources.end()) { + return; + } + registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Dst, + std::move(Iter->second)); + MAuxiliaryResources.erase(Iter); +} + void Scheduler::registerAuxiliaryResources( EventImplPtr &Event, std::vector> Resources) { std::unique_lock Lock{MAuxiliaryResourcesMutex}; - MAuxiliaryResources.insert({Event, std::move(Resources)}); + registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Event, + std::move(Resources)); } void Scheduler::cleanupAuxiliaryResources(BlockingT Blocking) { diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 09ba43dbd1d4e..53ce295626045 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -543,6 +543,8 @@ class Scheduler { bool ShouldEnqueue; }; + /// Assign \p Src's auxiliary resources to \p Dst. + void takeAuxiliaryResources(const EventImplPtr &Dst, const EventImplPtr &Src); void registerAuxiliaryResources( EventImplPtr &Event, std::vector> Resources); void cleanupAuxiliaryResources(BlockingT Blocking); diff --git a/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp new file mode 100644 index 0000000000000..fe1b5893a7b6c --- /dev/null +++ b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp @@ -0,0 +1,9 @@ +// RUN: %{build} -fsycl-embed-ir -o %t.out +// RUN: %{run} %t.out +// UNSUPPORTED: hip || cuda + +#include "./reduction.hpp" + +int main() { + test(); +} diff --git a/sycl/test-e2e/KernelFusion/Reduction/local_atomic_and_atomic_cross_wg.cpp b/sycl/test-e2e/KernelFusion/Reduction/local_atomic_and_atomic_cross_wg.cpp new file mode 100644 index 0000000000000..fb3702907e17f --- /dev/null +++ b/sycl/test-e2e/KernelFusion/Reduction/local_atomic_and_atomic_cross_wg.cpp @@ -0,0 +1,8 @@ +// RUN: %{build} -fsycl-embed-ir -o %t.out +// RUN: %{run} %t.out + +#include "./reduction.hpp" + +int main() { + test(); +} diff --git a/sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp b/sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp new file mode 100644 index 0000000000000..2c59b4e0369eb --- /dev/null +++ b/sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp @@ -0,0 +1,6 @@ +// RUN: %{build} -fsycl-embed-ir -o %t.out +// RUN: %{run} %t.out + +#include "./reduction.hpp" + +int main() { test(); } diff --git a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp b/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp deleted file mode 100644 index 120679246c64c..0000000000000 --- a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp +++ /dev/null @@ -1,62 +0,0 @@ -// RUN: %{build} -fsycl-embed-ir -o %t.out -// RUN: %{run} %t.out -// -// The test fails on opencl:cpu when running on AMD runner and passes when -// running on Intel Arc GPU runner. -// UNSUPPORTED: cpu - -// Test fusion works with reductions. - -#include - -#include "../helpers.hpp" - -using namespace sycl; - -template class ReductionTest; - -int main() { - constexpr size_t dataSize = 512; - - int sumRes = -1; - int maxRes = -1; - - { - queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; - - buffer dataBuf{dataSize}; - buffer sumBuf{&sumRes, 1}; - buffer maxBuf{&maxRes, 1}; - - ext::codeplay::experimental::fusion_wrapper fw{q}; - fw.start_fusion(); - - iota(q, dataBuf, 0); - - q.submit([&](handler &cgh) { - accessor in(dataBuf, cgh, read_only); - auto sumRed = reduction(sumBuf, cgh, plus<>{}, - property::reduction::initialize_to_identity{}); - auto maxRed = reduction(maxBuf, cgh, maximum<>{}, - property::reduction::initialize_to_identity{}); - cgh.parallel_for(dataSize, sumRed, maxRed, - [=](id<1> i, auto &sum, auto &max) { - sum.combine(in[i]); - max.combine(in[i]); - }); - }); - - complete_fusion_with_check( - fw, ext::codeplay::experimental::property::no_barriers{}); - } - - constexpr int expectedMax = dataSize - 1; - constexpr int expectedSum = dataSize * expectedMax / 2; - - std::cerr << sumRes << "\n"; - - assert(maxRes == expectedMax && "Unexpected max value"); - assert(sumRes == expectedSum && "Unexpected sum value"); - - return 0; -} diff --git a/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp b/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp new file mode 100644 index 0000000000000..34d66870f855b --- /dev/null +++ b/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp @@ -0,0 +1,86 @@ +// Test fusion works with reductions. Some algorithms will lead to fusion being +// cancelled in some devices. These should work properly anyway. + +#include + +#include "../helpers.hpp" +#include "sycl/detail/reduction_forward.hpp" + +using namespace sycl; + +constexpr inline size_t globalSize = 512; + +template struct is_fusion_supported { + constexpr static inline bool value = + detail::reduction::strategy::group_reduce_and_last_wg_detection <= + Strategy && + Strategy < detail::reduction::strategy::group_reduce_and_atomic_cross_wg; +}; + +template +constexpr inline bool is_fusion_supported_v = + is_fusion_supported::value; + +template +void test(nd_range<1> ndr) { + static_assert(is_fusion_supported_v, + "Testing unsupported algorithm"); + std::array data; + int sumRes = 0; + int maxRes = 0; + + { + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + buffer dataBuf{data}; + buffer sumBuf{&sumRes, 1}; + buffer maxBuf{&maxRes, 1}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + + fw.start_fusion(); + iota(q, dataBuf, 0); + + q.submit([&](handler &cgh) { + accessor in(dataBuf, cgh, read_only); + auto sumRed = reduction(sumBuf, cgh, plus<>{}, + property::reduction::initialize_to_identity{}); + detail::reduction_parallel_for( + cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, sumRed, + [=](nd_item<1> Item, auto &Red) { + Red.combine(in[Item.get_global_id()]); + }); + }); + + q.submit([&](handler &cgh) { + accessor in(dataBuf, cgh, read_only); + auto maxRed = reduction(maxBuf, cgh, maximum<>{}, + property::reduction::initialize_to_identity{}); + detail::reduction_parallel_for( + cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, maxRed, + [=](nd_item<1> Item, auto &Red) { + Red.combine(in[Item.get_global_id()]); + }); + }); + + if constexpr (Fuse) { + fw.complete_fusion(ext::codeplay::experimental::property::no_barriers{}); + } else { + fw.cancel_fusion(); + } + } + + constexpr int expectedMax = globalSize - 1; + constexpr int expectedSum = globalSize * expectedMax / 2; + + assert(sumRes == expectedSum); + assert(maxRes == expectedMax); +} + +template void test() { + for (size_t localSize = 1; localSize <= globalSize; localSize *= 2) { + nd_range<1> ndr{globalSize, localSize}; + test(ndr); + test(ndr); + } +} From 9dab85a100055c9d169eec997a74bbfa34387a1c Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Tue, 6 Feb 2024 16:14:09 +0000 Subject: [PATCH 2/4] Update comment --- sycl/test-e2e/KernelFusion/Reduction/reduction.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp b/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp index 34d66870f855b..8827a8f344bec 100644 --- a/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp +++ b/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp @@ -1,5 +1,5 @@ -// Test fusion works with reductions. Some algorithms will lead to fusion being -// cancelled in some devices. These should work properly anyway. +// Test fusion works with reductions. Only algorithms automatically selected by +// `sycl::reduction` are supported. #include From a143612df1140bd314e6b1bd513887f72687f7f5 Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Tue, 6 Feb 2024 16:32:09 +0000 Subject: [PATCH 3/4] Test more algorithms --- .../group_reduce_and_atomic_cross_wg.cpp | 8 +++++++ .../local_mem_tree_and_atomic_cross_wg.cpp | 8 +++++++ .../KernelFusion/Reduction/reduction.hpp | 22 ++++++++++--------- 3 files changed, 28 insertions(+), 10 deletions(-) create mode 100644 sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_atomic_cross_wg.cpp create mode 100644 sycl/test-e2e/KernelFusion/Reduction/local_mem_tree_and_atomic_cross_wg.cpp diff --git a/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_atomic_cross_wg.cpp b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_atomic_cross_wg.cpp new file mode 100644 index 0000000000000..e35d36b897cd2 --- /dev/null +++ b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_atomic_cross_wg.cpp @@ -0,0 +1,8 @@ +// RUN: %{build} -fsycl-embed-ir -o %t.out +// RUN: %{run} %t.out + +#include "./reduction.hpp" + +int main() { + test(); +} diff --git a/sycl/test-e2e/KernelFusion/Reduction/local_mem_tree_and_atomic_cross_wg.cpp b/sycl/test-e2e/KernelFusion/Reduction/local_mem_tree_and_atomic_cross_wg.cpp new file mode 100644 index 0000000000000..53768affcbbf0 --- /dev/null +++ b/sycl/test-e2e/KernelFusion/Reduction/local_mem_tree_and_atomic_cross_wg.cpp @@ -0,0 +1,8 @@ +// RUN: %{build} -fsycl-embed-ir -o %t.out +// RUN: %{run} %t.out + +#include "./reduction.hpp" + +int main() { + test(); +} diff --git a/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp b/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp index 8827a8f344bec..c6e7c9ec4c139 100644 --- a/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp +++ b/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp @@ -8,13 +8,11 @@ using namespace sycl; -constexpr inline size_t globalSize = 512; - template struct is_fusion_supported { constexpr static inline bool value = detail::reduction::strategy::group_reduce_and_last_wg_detection <= Strategy && - Strategy < detail::reduction::strategy::group_reduce_and_atomic_cross_wg; + Strategy < detail::reduction::strategy::group_reduce_and_multiple_kernels; }; template @@ -25,7 +23,9 @@ template void test(nd_range<1> ndr) { static_assert(is_fusion_supported_v, "Testing unsupported algorithm"); - std::array data; + + auto globalSize = static_cast(ndr.get_global_range().size()); + std::vector data(globalSize); int sumRes = 0; int maxRes = 0; @@ -70,17 +70,19 @@ void test(nd_range<1> ndr) { } } - constexpr int expectedMax = globalSize - 1; - constexpr int expectedSum = globalSize * expectedMax / 2; + int expectedMax = globalSize - 1; + int expectedSum = globalSize * expectedMax / 2; assert(sumRes == expectedSum); assert(maxRes == expectedMax); } template void test() { - for (size_t localSize = 1; localSize <= globalSize; localSize *= 2) { - nd_range<1> ndr{globalSize, localSize}; - test(ndr); - test(ndr); + for (size_t globalSize : {16, 512}) { + for (size_t localSize = 1; localSize <= globalSize; localSize *= 2) { + nd_range<1> ndr{globalSize, localSize}; + test(ndr); + test(ndr); + } } } From db44bb5619cf0a49a48aa128e4e0378b9c0e94b1 Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Wed, 7 Feb 2024 13:38:29 +0000 Subject: [PATCH 4/4] Add comment --- .../Reduction/group_reduce_and_last_wg_detection.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp index fe1b5893a7b6c..500847379d864 100644 --- a/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp +++ b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp @@ -2,6 +2,9 @@ // RUN: %{run} %t.out // UNSUPPORTED: hip || cuda +// COM: When ran on HIP and CUDA, this algorithm launches 'memcpy' commands +// leading to an infinite loop due to a bug in kernel fusion. + #include "./reduction.hpp" int main() {