diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index b07647707766f..469c445b0e66c 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -667,8 +667,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
   unsigned KernelIndex = 0;
   ParamList FusedParams;
   PromotionMap PromotedAccs;
-  // TODO: Collect information about streams and auxiliary resources (which
-  // contain reductions) and figure out how to fuse them.
+  // TODO: Collect information about streams and figure out how
+  // to fuse them.
   for (auto &RawCmd : InputKernels) {
     auto *KernelCmd = static_cast<ExecCGCommand *>(RawCmd);
     auto &CG = KernelCmd->getCG();
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index b3b4343b530ad..104c38dcdd99b 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -1615,6 +1615,9 @@ Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue,
   auto FusedKernelCmd =
       std::make_unique<ExecCGCommand>(std::move(FusedCG), Queue);
 
+  // Inherit auxiliary resources from fused command groups
+  Scheduler::getInstance().takeAuxiliaryResources(FusedKernelCmd->getEvent(),
+                                                  PlaceholderCmd->getEvent());
   assert(PlaceholderCmd->MDeps.empty());
   // Next, backwards iterate over all the commands in the fusion list and remove
   // them from the graph to restore the state before starting fusion, so we can
diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp
index c4771a9ed8c59..a83298a628539 100644
--- a/sycl/source/detail/scheduler/scheduler.cpp
+++ b/sycl/source/detail/scheduler/scheduler.cpp
@@ -154,11 +154,11 @@ EventImplPtr Scheduler::addCG(
     for (const auto &StreamImplPtr : Streams) {
       StreamImplPtr->flush(NewEvent);
     }
-
-    if (!AuxiliaryResources.empty())
-      registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources));
   }
 
+  if (!AuxiliaryResources.empty())
+    registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources));
+
   return NewEvent;
 }
 
@@ -558,10 +558,35 @@ void Scheduler::cleanupDeferredMemObjects(BlockingT Blocking) {
   }
 }
 
+static void registerAuxiliaryResourcesNoLock(
+    std::unordered_map<EventImplPtr, std::vector<std::shared_ptr<const void>>>
+        &AuxiliaryResources,
+    const EventImplPtr &Event,
+    std::vector<std::shared_ptr<const void>> &&Resources) {
+  std::vector<std::shared_ptr<const void>> &StoredResources =
+      AuxiliaryResources[Event];
+  StoredResources.insert(StoredResources.end(),
+                         std::make_move_iterator(Resources.begin()),
+                         std::make_move_iterator(Resources.end()));
+}
+
+void Scheduler::takeAuxiliaryResources(const EventImplPtr &Dst,
+                                       const EventImplPtr &Src) {
+  std::unique_lock<std::mutex> Lock{MAuxiliaryResourcesMutex};
+  auto Iter = MAuxiliaryResources.find(Src);
+  if (Iter == MAuxiliaryResources.end()) {
+    return;
+  }
+  registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Dst,
+                                   std::move(Iter->second));
+  MAuxiliaryResources.erase(Iter);
+}
+
 void Scheduler::registerAuxiliaryResources(
     EventImplPtr &Event, std::vector<std::shared_ptr<const void>> Resources) {
   std::unique_lock<std::mutex> Lock{MAuxiliaryResourcesMutex};
-  MAuxiliaryResources.insert({Event, std::move(Resources)});
+  registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Event,
+                                   std::move(Resources));
 }
 
 void Scheduler::cleanupAuxiliaryResources(BlockingT Blocking) {
diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp
index 09ba43dbd1d4e..53ce295626045 100644
--- a/sycl/source/detail/scheduler/scheduler.hpp
+++ b/sycl/source/detail/scheduler/scheduler.hpp
@@ -543,6 +543,8 @@ class Scheduler {
     bool ShouldEnqueue;
   };
 
+  /// Assign \p Src's auxiliary resources to \p Dst.
+  void takeAuxiliaryResources(const EventImplPtr &Dst, const EventImplPtr &Src);
   void registerAuxiliaryResources(
       EventImplPtr &Event, std::vector<std::shared_ptr<const void>> Resources);
   void cleanupAuxiliaryResources(BlockingT Blocking);
diff --git a/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_atomic_cross_wg.cpp b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_atomic_cross_wg.cpp
new file mode 100644
index 0000000000000..e35d36b897cd2
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_atomic_cross_wg.cpp
@@ -0,0 +1,8 @@
+// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{run} %t.out
+
+#include "./reduction.hpp"
+
+int main() {
+  test<detail::reduction::strategy::group_reduce_and_atomic_cross_wg>();
+}
diff --git a/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp
new file mode 100644
index 0000000000000..500847379d864
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp
@@ -0,0 +1,12 @@
+// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{run} %t.out
+// UNSUPPORTED: hip || cuda
+
+// COM: When ran on HIP and CUDA, this algorithm launches 'memcpy' commands
+// leading to an infinite loop due to a bug in kernel fusion.
+
+#include "./reduction.hpp"
+
+int main() {
+  test<detail::reduction::strategy::group_reduce_and_last_wg_detection>();
+}
diff --git a/sycl/test-e2e/KernelFusion/Reduction/local_atomic_and_atomic_cross_wg.cpp b/sycl/test-e2e/KernelFusion/Reduction/local_atomic_and_atomic_cross_wg.cpp
new file mode 100644
index 0000000000000..fb3702907e17f
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/Reduction/local_atomic_and_atomic_cross_wg.cpp
@@ -0,0 +1,8 @@
+// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{run} %t.out
+
+#include "./reduction.hpp"
+
+int main() {
+  test<detail::reduction::strategy::local_atomic_and_atomic_cross_wg>();
+}
diff --git a/sycl/test-e2e/KernelFusion/Reduction/local_mem_tree_and_atomic_cross_wg.cpp b/sycl/test-e2e/KernelFusion/Reduction/local_mem_tree_and_atomic_cross_wg.cpp
new file mode 100644
index 0000000000000..53768affcbbf0
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/Reduction/local_mem_tree_and_atomic_cross_wg.cpp
@@ -0,0 +1,8 @@
+// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{run} %t.out
+
+#include "./reduction.hpp"
+
+int main() {
+  test<detail::reduction::strategy::local_mem_tree_and_atomic_cross_wg>();
+}
diff --git a/sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp b/sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp
new file mode 100644
index 0000000000000..2c59b4e0369eb
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp
@@ -0,0 +1,6 @@
+// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{run} %t.out
+
+#include "./reduction.hpp"
+
+int main() { test<detail::reduction::strategy::range_basic>(); }
diff --git a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp b/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp
deleted file mode 100644
index 120679246c64c..0000000000000
--- a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
-// RUN: %{run} %t.out
-//
-// The test fails on opencl:cpu when running on AMD runner and passes when
-// running on Intel Arc GPU runner.
-// UNSUPPORTED: cpu
-
-// Test fusion works with reductions.
-
-#include <sycl/sycl.hpp>
-
-#include "../helpers.hpp"
-
-using namespace sycl;
-
-template <typename BinaryOperation> class ReductionTest;
-
-int main() {
-  constexpr size_t dataSize = 512;
-
-  int sumRes = -1;
-  int maxRes = -1;
-
-  {
-    queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
-
-    buffer<int> dataBuf{dataSize};
-    buffer<int> sumBuf{&sumRes, 1};
-    buffer<int> maxBuf{&maxRes, 1};
-
-    ext::codeplay::experimental::fusion_wrapper fw{q};
-    fw.start_fusion();
-
-    iota(q, dataBuf, 0);
-
-    q.submit([&](handler &cgh) {
-      accessor in(dataBuf, cgh, read_only);
-      auto sumRed = reduction(sumBuf, cgh, plus<>{},
-                              property::reduction::initialize_to_identity{});
-      auto maxRed = reduction(maxBuf, cgh, maximum<>{},
-                              property::reduction::initialize_to_identity{});
-      cgh.parallel_for(dataSize, sumRed, maxRed,
-                       [=](id<1> i, auto &sum, auto &max) {
-                         sum.combine(in[i]);
-                         max.combine(in[i]);
-                       });
-    });
-
-    complete_fusion_with_check(
-        fw, ext::codeplay::experimental::property::no_barriers{});
-  }
-
-  constexpr int expectedMax = dataSize - 1;
-  constexpr int expectedSum = dataSize * expectedMax / 2;
-
-  std::cerr << sumRes << "\n";
-
-  assert(maxRes == expectedMax && "Unexpected max value");
-  assert(sumRes == expectedSum && "Unexpected sum value");
-
-  return 0;
-}
diff --git a/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp b/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp
new file mode 100644
index 0000000000000..c6e7c9ec4c139
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/Reduction/reduction.hpp
@@ -0,0 +1,88 @@
+// Test fusion works with reductions. Only algorithms automatically selected by
+// `sycl::reduction` are supported.
+
+#include <sycl/sycl.hpp>
+
+#include "../helpers.hpp"
+#include "sycl/detail/reduction_forward.hpp"
+
+using namespace sycl;
+
+template <detail::reduction::strategy Strategy> struct is_fusion_supported {
+  constexpr static inline bool value =
+      detail::reduction::strategy::group_reduce_and_last_wg_detection <=
+          Strategy &&
+      Strategy < detail::reduction::strategy::group_reduce_and_multiple_kernels;
+};
+
+template <detail::reduction::strategy Strategy>
+constexpr inline bool is_fusion_supported_v =
+    is_fusion_supported<Strategy>::value;
+
+template <detail::reduction::strategy Strategy, bool Fuse>
+void test(nd_range<1> ndr) {
+  static_assert(is_fusion_supported_v<Strategy>,
+                "Testing unsupported algorithm");
+
+  auto globalSize = static_cast<int>(ndr.get_global_range().size());
+  std::vector<int> data(globalSize);
+  int sumRes = 0;
+  int maxRes = 0;
+
+  {
+    queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+    buffer<int> dataBuf{data};
+    buffer<int> sumBuf{&sumRes, 1};
+    buffer<int> maxBuf{&maxRes, 1};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+
+    fw.start_fusion();
+    iota(q, dataBuf, 0);
+
+    q.submit([&](handler &cgh) {
+      accessor in(dataBuf, cgh, read_only);
+      auto sumRed = reduction(sumBuf, cgh, plus<>{},
+                              property::reduction::initialize_to_identity{});
+      detail::reduction_parallel_for<detail::auto_name, Strategy>(
+          cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, sumRed,
+          [=](nd_item<1> Item, auto &Red) {
+            Red.combine(in[Item.get_global_id()]);
+          });
+    });
+
+    q.submit([&](handler &cgh) {
+      accessor in(dataBuf, cgh, read_only);
+      auto maxRed = reduction(maxBuf, cgh, maximum<>{},
+                              property::reduction::initialize_to_identity{});
+      detail::reduction_parallel_for<detail::auto_name, Strategy>(
+          cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, maxRed,
+          [=](nd_item<1> Item, auto &Red) {
+            Red.combine(in[Item.get_global_id()]);
+          });
+    });
+
+    if constexpr (Fuse) {
+      fw.complete_fusion(ext::codeplay::experimental::property::no_barriers{});
+    } else {
+      fw.cancel_fusion();
+    }
+  }
+
+  int expectedMax = globalSize - 1;
+  int expectedSum = globalSize * expectedMax / 2;
+
+  assert(sumRes == expectedSum);
+  assert(maxRes == expectedMax);
+}
+
+template <detail::reduction::strategy Strategy> void test() {
+  for (size_t globalSize : {16, 512}) {
+    for (size_t localSize = 1; localSize <= globalSize; localSize *= 2) {
+      nd_range<1> ndr{globalSize, localSize};
+      test<Strategy, true>(ndr);
+      test<Strategy, false>(ndr);
+    }
+  }
+}