diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index b07647707766f..469c445b0e66c 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -667,8 +667,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, unsigned KernelIndex = 0; ParamList FusedParams; PromotionMap PromotedAccs; - // TODO: Collect information about streams and auxiliary resources (which - // contain reductions) and figure out how to fuse them. + // TODO: Collect information about streams and figure out how + // to fuse them. for (auto &RawCmd : InputKernels) { auto *KernelCmd = static_cast(RawCmd); auto &CG = KernelCmd->getCG(); diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index b3b4343b530ad..104c38dcdd99b 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1615,6 +1615,9 @@ Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue, auto FusedKernelCmd = std::make_unique(std::move(FusedCG), Queue); + // Inherit auxiliary resources from fused command groups + Scheduler::getInstance().takeAuxiliaryResources(FusedKernelCmd->getEvent(), + PlaceholderCmd->getEvent()); assert(PlaceholderCmd->MDeps.empty()); // Next, backwards iterate over all the commands in the fusion list and remove // them from the graph to restore the state before starting fusion, so we can diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index c4771a9ed8c59..a83298a628539 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -154,11 +154,11 @@ EventImplPtr Scheduler::addCG( for (const auto &StreamImplPtr : Streams) { StreamImplPtr->flush(NewEvent); } - - if (!AuxiliaryResources.empty()) - registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources)); } + if (!AuxiliaryResources.empty()) + registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources)); + return NewEvent; } @@ -558,10 +558,35 @@ void Scheduler::cleanupDeferredMemObjects(BlockingT Blocking) { } } +static void registerAuxiliaryResourcesNoLock( + std::unordered_map>> + &AuxiliaryResources, + const EventImplPtr &Event, + std::vector> &&Resources) { + std::vector> &StoredResources = + AuxiliaryResources[Event]; + StoredResources.insert(StoredResources.end(), + std::make_move_iterator(Resources.begin()), + std::make_move_iterator(Resources.end())); +} + +void Scheduler::takeAuxiliaryResources(const EventImplPtr &Dst, + const EventImplPtr &Src) { + std::unique_lock Lock{MAuxiliaryResourcesMutex}; + auto Iter = MAuxiliaryResources.find(Src); + if (Iter == MAuxiliaryResources.end()) { + return; + } + registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Dst, + std::move(Iter->second)); + MAuxiliaryResources.erase(Iter); +} + void Scheduler::registerAuxiliaryResources( EventImplPtr &Event, std::vector> Resources) { std::unique_lock Lock{MAuxiliaryResourcesMutex}; - MAuxiliaryResources.insert({Event, std::move(Resources)}); + registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Event, + std::move(Resources)); } void Scheduler::cleanupAuxiliaryResources(BlockingT Blocking) { diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 09ba43dbd1d4e..53ce295626045 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -543,6 +543,8 @@ class Scheduler { bool ShouldEnqueue; }; + /// Assign \p Src's auxiliary resources to \p Dst. + void takeAuxiliaryResources(const EventImplPtr &Dst, const EventImplPtr &Src); void registerAuxiliaryResources( EventImplPtr &Event, std::vector> Resources); void cleanupAuxiliaryResources(BlockingT Blocking); diff --git a/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_atomic_cross_wg.cpp b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_atomic_cross_wg.cpp new file mode 100644 index 0000000000000..e35d36b897cd2 --- /dev/null +++ b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_atomic_cross_wg.cpp @@ -0,0 +1,8 @@ +// RUN: %{build} -fsycl-embed-ir -o %t.out +// RUN: %{run} %t.out + +#include "./reduction.hpp" + +int main() { + test(); +} diff --git a/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp new file mode 100644 index 0000000000000..500847379d864 --- /dev/null +++ b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp @@ -0,0 +1,12 @@ +// RUN: %{build} -fsycl-embed-ir -o %t.out +// RUN: %{run} %t.out +// UNSUPPORTED: hip || cuda + +// COM: When ran on HIP and CUDA, this algorithm launches 'memcpy' commands +// leading to an infinite loop due to a bug in kernel fusion. + +#include "./reduction.hpp" + +int main() { + test(); +} diff --git a/sycl/test-e2e/KernelFusion/Reduction/local_atomic_and_atomic_cross_wg.cpp b/sycl/test-e2e/KernelFusion/Reduction/local_atomic_and_atomic_cross_wg.cpp new file mode 100644 index 0000000000000..fb3702907e17f --- /dev/null +++ b/sycl/test-e2e/KernelFusion/Reduction/local_atomic_and_atomic_cross_wg.cpp @@ -0,0 +1,8 @@ +// RUN: %{build} -fsycl-embed-ir -o %t.out +// RUN: %{run} %t.out + +#include "./reduction.hpp" + +int main() { + test(); +} diff --git a/sycl/test-e2e/KernelFusion/Reduction/local_mem_tree_and_atomic_cross_wg.cpp b/sycl/test-e2e/KernelFusion/Reduction/local_mem_tree_and_atomic_cross_wg.cpp new file mode 100644 index 0000000000000..53768affcbbf0 --- /dev/null +++ b/sycl/test-e2e/KernelFusion/Reduction/local_mem_tree_and_atomic_cross_wg.cpp @@ -0,0 +1,8 @@ +// RUN: %{build} -fsycl-embed-ir -o %t.out +// RUN: %{run} %t.out + +#include "./reduction.hpp" + +int main() { + test(); +} diff --git a/sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp b/sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp new file mode 100644 index 0000000000000..2c59b4e0369eb --- /dev/null +++ b/sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp @@ -0,0 +1,6 @@ +// RUN: %{build} -fsycl-embed-ir -o %t.out +// RUN: %{run} %t.out + +#include "./reduction.hpp" + +int main() { test(); } diff --git a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp b/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp deleted file mode 100644 index 120679246c64c..0000000000000 --- a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp +++ /dev/null @@ -1,62 +0,0 @@ -// RUN: %{build} -fsycl-embed-ir -o %t.out -// RUN: %{run} %t.out -// -// The test fails on opencl:cpu when running on AMD runner and passes when -// running on Intel Arc GPU runner. -// UNSUPPORTED: cpu - -// Test fusion works with reductions. - -#include - -#include "../helpers.hpp" - -using namespace sycl; - -template class ReductionTest; - -int main() { - constexpr size_t dataSize = 512; - - int sumRes = -1; - int maxRes = -1; - - { - queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; - - buffer dataBuf{dataSize}; - buffer sumBuf{&sumRes, 1}; - buffer maxBuf{&maxRes, 1}; - - ext::codeplay::experimental::fusion_wrapper fw{q}; - fw.start_fusion(); - - iota(q, dataBuf, 0); - - q.submit([&](handler &cgh) { - accessor in(dataBuf, cgh, read_only); - auto sumRed = reduction(sumBuf, cgh, plus<>{}, - property::reduction::initialize_to_identity{}); - auto maxRed = reduction(maxBuf, cgh, maximum<>{}, - property::reduction::initialize_to_identity{}); - cgh.parallel_for(dataSize, sumRed, maxRed, - [=](id<1> i, auto &sum, auto &max) { - sum.combine(in[i]); - max.combine(in[i]); - }); - }); - - complete_fusion_with_check( - fw, ext::codeplay::experimental::property::no_barriers{}); - } - - constexpr int expectedMax = dataSize - 1; - constexpr int expectedSum = dataSize * expectedMax / 2; - - std::cerr << sumRes << "\n"; - - assert(maxRes == expectedMax && "Unexpected max value"); - assert(sumRes == expectedSum && "Unexpected sum value"); - - return 0; -} diff --git a/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp b/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp new file mode 100644 index 0000000000000..c6e7c9ec4c139 --- /dev/null +++ b/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp @@ -0,0 +1,88 @@ +// Test fusion works with reductions. Only algorithms automatically selected by +// `sycl::reduction` are supported. + +#include + +#include "../helpers.hpp" +#include "sycl/detail/reduction_forward.hpp" + +using namespace sycl; + +template struct is_fusion_supported { + constexpr static inline bool value = + detail::reduction::strategy::group_reduce_and_last_wg_detection <= + Strategy && + Strategy < detail::reduction::strategy::group_reduce_and_multiple_kernels; +}; + +template +constexpr inline bool is_fusion_supported_v = + is_fusion_supported::value; + +template +void test(nd_range<1> ndr) { + static_assert(is_fusion_supported_v, + "Testing unsupported algorithm"); + + auto globalSize = static_cast(ndr.get_global_range().size()); + std::vector data(globalSize); + int sumRes = 0; + int maxRes = 0; + + { + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + buffer dataBuf{data}; + buffer sumBuf{&sumRes, 1}; + buffer maxBuf{&maxRes, 1}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + + fw.start_fusion(); + iota(q, dataBuf, 0); + + q.submit([&](handler &cgh) { + accessor in(dataBuf, cgh, read_only); + auto sumRed = reduction(sumBuf, cgh, plus<>{}, + property::reduction::initialize_to_identity{}); + detail::reduction_parallel_for( + cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, sumRed, + [=](nd_item<1> Item, auto &Red) { + Red.combine(in[Item.get_global_id()]); + }); + }); + + q.submit([&](handler &cgh) { + accessor in(dataBuf, cgh, read_only); + auto maxRed = reduction(maxBuf, cgh, maximum<>{}, + property::reduction::initialize_to_identity{}); + detail::reduction_parallel_for( + cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, maxRed, + [=](nd_item<1> Item, auto &Red) { + Red.combine(in[Item.get_global_id()]); + }); + }); + + if constexpr (Fuse) { + fw.complete_fusion(ext::codeplay::experimental::property::no_barriers{}); + } else { + fw.cancel_fusion(); + } + } + + int expectedMax = globalSize - 1; + int expectedSum = globalSize * expectedMax / 2; + + assert(sumRes == expectedSum); + assert(maxRes == expectedMax); +} + +template void test() { + for (size_t globalSize : {16, 512}) { + for (size_t localSize = 1; localSize <= globalSize; localSize *= 2) { + nd_range<1> ndr{globalSize, localSize}; + test(ndr); + test(ndr); + } + } +}