From f1e2b916dc3d98f9e781ba093326346bd13f2c82 Mon Sep 17 00:00:00 2001
From: Victor Perez <victor.perez@codeplay.com>
Date: Fri, 2 Feb 2024 12:53:14 +0000
Subject: [PATCH 1/4] [SYCL][Fusion] Take auxiliary resources from fused
 command groups

Reductions use auxiliary resources to handle reduction temporal
buffers. Assign those auxiliary resources from each fused command
group to the placeholder fusion event. This event will not be marked
as completed after the fused reductions finish execution either if
fusion is completed, cancelled or aborted.

Test is updated to check every algorithm that is selected
automatically by `sycl::reduction`, i.e., every supported
algorithm. We also cover both cases (fusion taking and not taking
place).

Signed-off-by: Victor Perez <victor.perez@codeplay.com>
---
 sycl/source/detail/jit_compiler.cpp           |  4 +-
 .../source/detail/scheduler/graph_builder.cpp |  3 +
 sycl/source/detail/scheduler/scheduler.cpp    | 33 ++++++-
 sycl/source/detail/scheduler/scheduler.hpp    |  2 +
 .../group_reduce_and_last_wg_detection.cpp    |  9 ++
 .../local_atomic_and_atomic_cross_wg.cpp      |  8 ++
 .../KernelFusion/Reduction/range_basic.cpp    |  6 ++
 .../KernelFusion/Reduction/reduction.cpp      | 62 -------------
 .../KernelFusion/Reduction/reduction.hpp      | 86 +++++++++++++++++++
 9 files changed, 145 insertions(+), 68 deletions(-)
 create mode 100644 sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp
 create mode 100644 sycl/test-e2e/KernelFusion/Reduction/local_atomic_and_atomic_cross_wg.cpp
 create mode 100644 sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp
 delete mode 100644 sycl/test-e2e/KernelFusion/Reduction/reduction.cpp
 create mode 100644 sycl/test-e2e/KernelFusion/Reduction/reduction.hpp
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index b07647707766f..469c445b0e66c 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -667,8 +667,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
   unsigned KernelIndex = 0;
   ParamList FusedParams;
   PromotionMap PromotedAccs;
-  // TODO: Collect information about streams and auxiliary resources (which
-  // contain reductions) and figure out how to fuse them.
+  // TODO: Collect information about streams and figure out how
+  // to fuse them.
   for (auto &RawCmd : InputKernels) {
     auto *KernelCmd = static_cast<ExecCGCommand *>(RawCmd);
     auto &CG = KernelCmd->getCG();
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index b3b4343b530ad..104c38dcdd99b 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -1615,6 +1615,9 @@ Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue,
   auto FusedKernelCmd =
       std::make_unique<ExecCGCommand>(std::move(FusedCG), Queue);
 
+  // Inherit auxiliary resources from fused command groups
+  Scheduler::getInstance().takeAuxiliaryResources(FusedKernelCmd->getEvent(),
+                                                  PlaceholderCmd->getEvent());
   assert(PlaceholderCmd->MDeps.empty());
   // Next, backwards iterate over all the commands in the fusion list and remove
   // them from the graph to restore the state before starting fusion, so we can
diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp
index c4771a9ed8c59..a83298a628539 100644
--- a/sycl/source/detail/scheduler/scheduler.cpp
+++ b/sycl/source/detail/scheduler/scheduler.cpp
@@ -154,11 +154,11 @@ EventImplPtr Scheduler::addCG(
     for (const auto &StreamImplPtr : Streams) {
       StreamImplPtr->flush(NewEvent);
     }
-
-    if (!AuxiliaryResources.empty())
-      registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources));
   }
 
+  if (!AuxiliaryResources.empty())
+    registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources));
+
   return NewEvent;
 }
 
@@ -558,10 +558,35 @@ void Scheduler::cleanupDeferredMemObjects(BlockingT Blocking) {
   }
 }
 
+static void registerAuxiliaryResourcesNoLock(
+    std::unordered_map<EventImplPtr, std::vector<std::shared_ptr<const void>>>
+        &AuxiliaryResources,
+    const EventImplPtr &Event,
+    std::vector<std::shared_ptr<const void>> &&Resources) {
+  std::vector<std::shared_ptr<const void>> &StoredResources =
+      AuxiliaryResources[Event];
+  StoredResources.insert(StoredResources.end(),
+                         std::make_move_iterator(Resources.begin()),
+                         std::make_move_iterator(Resources.end()));
+}
+
+void Scheduler::takeAuxiliaryResources(const EventImplPtr &Dst,
+                                       const EventImplPtr &Src) {
+  std::unique_lock<std::mutex> Lock{MAuxiliaryResourcesMutex};
+  auto Iter = MAuxiliaryResources.find(Src);
+  if (Iter == MAuxiliaryResources.end()) {
+    return;
+  }
+  registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Dst,
+                                   std::move(Iter->second));
+  MAuxiliaryResources.erase(Iter);
+}
+
 void Scheduler::registerAuxiliaryResources(
     EventImplPtr &Event, std::vector<std::shared_ptr<const void>> Resources) {
   std::unique_lock<std::mutex> Lock{MAuxiliaryResourcesMutex};
-  MAuxiliaryResources.insert({Event, std::move(Resources)});
+  registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Event,
+                                   std::move(Resources));
 }
 
 void Scheduler::cleanupAuxiliaryResources(BlockingT Blocking) {
diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp
index 09ba43dbd1d4e..53ce295626045 100644
--- a/sycl/source/detail/scheduler/scheduler.hpp
+++ b/sycl/source/detail/scheduler/scheduler.hpp
@@ -543,6 +543,8 @@ class Scheduler {
     bool ShouldEnqueue;
   };
 
+  /// Assign \p Src's auxiliary resources to \p Dst.
+  void takeAuxiliaryResources(const EventImplPtr &Dst, const EventImplPtr &Src);
   void registerAuxiliaryResources(
       EventImplPtr &Event, std::vector<std::shared_ptr<const void>> Resources);
   void cleanupAuxiliaryResources(BlockingT Blocking);
diff --git a/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp
new file mode 100644
index 0000000000000..fe1b5893a7b6c
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp
@@ -0,0 +1,9 @@
+// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{run} %t.out
+// UNSUPPORTED: hip || cuda
+
+#include "./reduction.hpp"
+
+int main() {
+  test<detail::reduction::strategy::group_reduce_and_last_wg_detection>();
+}
diff --git a/sycl/test-e2e/KernelFusion/Reduction/local_atomic_and_atomic_cross_wg.cpp b/sycl/test-e2e/KernelFusion/Reduction/local_atomic_and_atomic_cross_wg.cpp
new file mode 100644
index 0000000000000..fb3702907e17f
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/Reduction/local_atomic_and_atomic_cross_wg.cpp
@@ -0,0 +1,8 @@
+// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{run} %t.out
+
+#include "./reduction.hpp"
+
+int main() {
+  test<detail::reduction::strategy::local_atomic_and_atomic_cross_wg>();
+}
diff --git a/sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp b/sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp
new file mode 100644
index 0000000000000..2c59b4e0369eb
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp
@@ -0,0 +1,6 @@
+// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{run} %t.out
+
+#include "./reduction.hpp"
+
+int main() { test<detail::reduction::strategy::range_basic>(); }
diff --git a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp b/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp
deleted file mode 100644
index 120679246c64c..0000000000000
--- a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
-// RUN: %{run} %t.out
-//
-// The test fails on opencl:cpu when running on AMD runner and passes when
-// running on Intel Arc GPU runner.
-// UNSUPPORTED: cpu
-
-// Test fusion works with reductions.
-
-#include <sycl/sycl.hpp>
-
-#include "../helpers.hpp"
-
-using namespace sycl;
-
-template <typename BinaryOperation> class ReductionTest;
-
-int main() {
-  constexpr size_t dataSize = 512;
-
-  int sumRes = -1;
-  int maxRes = -1;
-
-  {
-    queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
-
-    buffer<int> dataBuf{dataSize};
-    buffer<int> sumBuf{&sumRes, 1};
-    buffer<int> maxBuf{&maxRes, 1};
-
-    ext::codeplay::experimental::fusion_wrapper fw{q};
-    fw.start_fusion();
-
-    iota(q, dataBuf, 0);
-
-    q.submit([&](handler &cgh) {
-      accessor in(dataBuf, cgh, read_only);
-      auto sumRed = reduction(sumBuf, cgh, plus<>{},
-                              property::reduction::initialize_to_identity{});
-      auto maxRed = reduction(maxBuf, cgh, maximum<>{},
-                              property::reduction::initialize_to_identity{});
-      cgh.parallel_for(dataSize, sumRed, maxRed,
-                       [=](id<1> i, auto &sum, auto &max) {
-                         sum.combine(in[i]);
-                         max.combine(in[i]);
-                       });
-    });
-
-    complete_fusion_with_check(
-        fw, ext::codeplay::experimental::property::no_barriers{});
-  }
-
-  constexpr int expectedMax = dataSize - 1;
-  constexpr int expectedSum = dataSize * expectedMax / 2;
-
-  std::cerr << sumRes << "\n";
-
-  assert(maxRes == expectedMax && "Unexpected max value");
-  assert(sumRes == expectedSum && "Unexpected sum value");
-
-  return 0;
-}
diff --git a/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp b/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp
new file mode 100644
index 0000000000000..34d66870f855b
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp
@@ -0,0 +1,86 @@
+// Test fusion works with reductions. Some algorithms will lead to fusion being
+// cancelled in some devices. These should work properly anyway.
+
+#include <sycl/sycl.hpp>
+
+#include "../helpers.hpp"
+#include "sycl/detail/reduction_forward.hpp"
+
+using namespace sycl;
+
+constexpr inline size_t globalSize = 512;
+
+template <detail::reduction::strategy Strategy> struct is_fusion_supported {
+  constexpr static inline bool value =
+      detail::reduction::strategy::group_reduce_and_last_wg_detection <=
+          Strategy &&
+      Strategy < detail::reduction::strategy::group_reduce_and_atomic_cross_wg;
+};
+
+template <detail::reduction::strategy Strategy>
+constexpr inline bool is_fusion_supported_v =
+    is_fusion_supported<Strategy>::value;
+
+template <detail::reduction::strategy Strategy, bool Fuse>
+void test(nd_range<1> ndr) {
+  static_assert(is_fusion_supported_v<Strategy>,
+                "Testing unsupported algorithm");
+  std::array<int, globalSize> data;
+  int sumRes = 0;
+  int maxRes = 0;
+
+  {
+    queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+    buffer<int> dataBuf{data};
+    buffer<int> sumBuf{&sumRes, 1};
+    buffer<int> maxBuf{&maxRes, 1};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+
+    fw.start_fusion();
+    iota(q, dataBuf, 0);
+
+    q.submit([&](handler &cgh) {
+      accessor in(dataBuf, cgh, read_only);
+      auto sumRed = reduction(sumBuf, cgh, plus<>{},
+                              property::reduction::initialize_to_identity{});
+      detail::reduction_parallel_for<detail::auto_name, Strategy>(
+          cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, sumRed,
+          [=](nd_item<1> Item, auto &Red) {
+            Red.combine(in[Item.get_global_id()]);
+          });
+    });
+
+    q.submit([&](handler &cgh) {
+      accessor in(dataBuf, cgh, read_only);
+      auto maxRed = reduction(maxBuf, cgh, maximum<>{},
+                              property::reduction::initialize_to_identity{});
+      detail::reduction_parallel_for<detail::auto_name, Strategy>(
+          cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, maxRed,
+          [=](nd_item<1> Item, auto &Red) {
+            Red.combine(in[Item.get_global_id()]);
+          });
+    });
+
+    if constexpr (Fuse) {
+      fw.complete_fusion(ext::codeplay::experimental::property::no_barriers{});
+    } else {
+      fw.cancel_fusion();
+    }
+  }
+
+  constexpr int expectedMax = globalSize - 1;
+  constexpr int expectedSum = globalSize * expectedMax / 2;
+
+  assert(sumRes == expectedSum);
+  assert(maxRes == expectedMax);
+}
+
+template <detail::reduction::strategy Strategy> void test() {
+  for (size_t localSize = 1; localSize <= globalSize; localSize *= 2) {
+    nd_range<1> ndr{globalSize, localSize};
+    test<Strategy, true>(ndr);
+    test<Strategy, false>(ndr);
+  }
+}

From 9dab85a100055c9d169eec997a74bbfa34387a1c Mon Sep 17 00:00:00 2001
From: Victor Perez <victor.perez@codeplay.com>
Date: Tue, 6 Feb 2024 16:14:09 +0000
Subject: [PATCH 2/4] Update comment

---
 sycl/test-e2e/KernelFusion/Reduction/reduction.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp b/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp
index 34d66870f855b..8827a8f344bec 100644
--- a/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp
+++ b/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp
@@ -1,5 +1,5 @@
-// Test fusion works with reductions. Some algorithms will lead to fusion being
-// cancelled in some devices. These should work properly anyway.
+// Test fusion works with reductions. Only algorithms automatically selected by
+// `sycl::reduction` are supported.
 
 #include <sycl/sycl.hpp>
 

From a143612df1140bd314e6b1bd513887f72687f7f5 Mon Sep 17 00:00:00 2001
From: Victor Perez <victor.perez@codeplay.com>
Date: Tue, 6 Feb 2024 16:32:09 +0000
Subject: [PATCH 3/4] Test more algorithms

---
 .../group_reduce_and_atomic_cross_wg.cpp      |  8 +++++++
 .../local_mem_tree_and_atomic_cross_wg.cpp    |  8 +++++++
 .../KernelFusion/Reduction/reduction.hpp      | 22 ++++++++++---------
 3 files changed, 28 insertions(+), 10 deletions(-)
 create mode 100644 sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_atomic_cross_wg.cpp
 create mode 100644 sycl/test-e2e/KernelFusion/Reduction/local_mem_tree_and_atomic_cross_wg.cpp

diff --git a/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_atomic_cross_wg.cpp b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_atomic_cross_wg.cpp
new file mode 100644
index 0000000000000..e35d36b897cd2
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_atomic_cross_wg.cpp
@@ -0,0 +1,8 @@
+// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{run} %t.out
+
+#include "./reduction.hpp"
+
+int main() {
+  test<detail::reduction::strategy::group_reduce_and_atomic_cross_wg>();
+}
diff --git a/sycl/test-e2e/KernelFusion/Reduction/local_mem_tree_and_atomic_cross_wg.cpp b/sycl/test-e2e/KernelFusion/Reduction/local_mem_tree_and_atomic_cross_wg.cpp
new file mode 100644
index 0000000000000..53768affcbbf0
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/Reduction/local_mem_tree_and_atomic_cross_wg.cpp
@@ -0,0 +1,8 @@
+// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{run} %t.out
+
+#include "./reduction.hpp"
+
+int main() {
+  test<detail::reduction::strategy::local_mem_tree_and_atomic_cross_wg>();
+}
diff --git a/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp b/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp
index 8827a8f344bec..c6e7c9ec4c139 100644
--- a/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp
+++ b/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp
@@ -8,13 +8,11 @@
 
 using namespace sycl;
 
-constexpr inline size_t globalSize = 512;
-
 template <detail::reduction::strategy Strategy> struct is_fusion_supported {
   constexpr static inline bool value =
       detail::reduction::strategy::group_reduce_and_last_wg_detection <=
           Strategy &&
-      Strategy < detail::reduction::strategy::group_reduce_and_atomic_cross_wg;
+      Strategy < detail::reduction::strategy::group_reduce_and_multiple_kernels;
 };
 
 template <detail::reduction::strategy Strategy>
@@ -25,7 +23,9 @@ template <detail::reduction::strategy Strategy, bool Fuse>
 void test(nd_range<1> ndr) {
   static_assert(is_fusion_supported_v<Strategy>,
                 "Testing unsupported algorithm");
-  std::array<int, globalSize> data;
+
+  auto globalSize = static_cast<int>(ndr.get_global_range().size());
+  std::vector<int> data(globalSize);
   int sumRes = 0;
   int maxRes = 0;
 
@@ -70,17 +70,19 @@ void test(nd_range<1> ndr) {
     }
   }
 
-  constexpr int expectedMax = globalSize - 1;
-  constexpr int expectedSum = globalSize * expectedMax / 2;
+  int expectedMax = globalSize - 1;
+  int expectedSum = globalSize * expectedMax / 2;
 
   assert(sumRes == expectedSum);
   assert(maxRes == expectedMax);
 }
 
 template <detail::reduction::strategy Strategy> void test() {
-  for (size_t localSize = 1; localSize <= globalSize; localSize *= 2) {
-    nd_range<1> ndr{globalSize, localSize};
-    test<Strategy, true>(ndr);
-    test<Strategy, false>(ndr);
+  for (size_t globalSize : {16, 512}) {
+    for (size_t localSize = 1; localSize <= globalSize; localSize *= 2) {
+      nd_range<1> ndr{globalSize, localSize};
+      test<Strategy, true>(ndr);
+      test<Strategy, false>(ndr);
+    }
   }
 }

From db44bb5619cf0a49a48aa128e4e0378b9c0e94b1 Mon Sep 17 00:00:00 2001
From: Victor Perez <victor.perez@codeplay.com>
Date: Wed, 7 Feb 2024 13:38:29 +0000
Subject: [PATCH 4/4] Add comment

---
 .../Reduction/group_reduce_and_last_wg_detection.cpp           | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp
index fe1b5893a7b6c..500847379d864 100644
--- a/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp
+++ b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp
@@ -2,6 +2,9 @@
 // RUN: %{run} %t.out
 // UNSUPPORTED: hip || cuda
 
+// COM: When ran on HIP and CUDA, this algorithm launches 'memcpy' commands
+// leading to an infinite loop due to a bug in kernel fusion.
+
 #include "./reduction.hpp"
 
 int main() {