From 62bd64e85d294700ce52351ccd53c340ea965ef8 Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Tue, 6 Feb 2024 14:58:28 +0000 Subject: [PATCH] [SYCL][Fusion] Avoid deadlock on events with no command group `sycl::queue` shortcut functions, e.g., `sycl::queue::memcpy`, may not create a command group, so events coming from those will not have a command group attached. Avoid deadlock when going through event list in fusion mode. Signed-off-by: Victor Perez --- .../source/detail/scheduler/graph_builder.cpp | 1 + .../group_reduce_and_last_wg_detection.cpp | 1 - .../KernelFusion/queue-shortcut-functions.cpp | 99 +++++++++++++++++++ 3 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 sycl/test-e2e/KernelFusion/queue-shortcut-functions.cpp diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 104c38dcdd99b..5a26623ce4003 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -963,6 +963,7 @@ Scheduler::GraphBuildResult Scheduler::GraphBuilder::addCG( for (auto Ev = Events.begin(); Ev != Events.end();) { auto *EvDepCmd = static_cast((*Ev)->getCommand()); if (!EvDepCmd) { + ++Ev; continue; } // Handle event dependencies on any commands part of another active diff --git a/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp index 500847379d864..0868428cc2ef6 100644 --- a/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp +++ b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp @@ -1,6 +1,5 @@ // RUN: %{build} -fsycl-embed-ir -o %t.out // RUN: %{run} %t.out -// UNSUPPORTED: hip || cuda // COM: When ran on HIP and CUDA, this algorithm launches 'memcpy' commands // leading to an infinite loop due to a bug in kernel fusion. diff --git a/sycl/test-e2e/KernelFusion/queue-shortcut-functions.cpp b/sycl/test-e2e/KernelFusion/queue-shortcut-functions.cpp new file mode 100644 index 0000000000000..7696c18adc7ed --- /dev/null +++ b/sycl/test-e2e/KernelFusion/queue-shortcut-functions.cpp @@ -0,0 +1,99 @@ +// RUN: %{build} -fsycl-embed-ir -o %t.out +// RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 \ +// RUN: | FileCheck %s --implicit-check-not=ERROR + +// Test fusion with queue shortcut functions being involved. + +#include + +using namespace sycl; + +template class Kernel; + +template void test() { + static_assert(0 <= FusionStartPoint && FusionStartPoint < 3, + "Invalid fusion start point"); + + constexpr size_t size = 1024; + constexpr float value = 10; + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + std::array h; + h.fill(0); + auto *ptr0 = sycl::malloc_device(size, q); + auto *ptr1 = sycl::malloc_device(size, q); + + { + range<1> r{size}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + + if constexpr (FusionStartPoint == 0) { + fw.start_fusion(); + } + + // ptr0(x) = value + auto e0 = q.parallel_for>( + r, [=](sycl::id<1> i) { ptr0[i] = value; }); + // ptr1(x) = value / 2 + auto e1 = q.parallel_for>( + r, [=](sycl::id<1> i) { ptr1[i] = value / 2; }); + + if constexpr (FusionStartPoint == 1) { + fw.start_fusion(); + } + + // ptr0(x) = value / 2 if x < size / 2 else value + auto e2 = q.memcpy(ptr0, ptr1, sizeof(float) * size / 2, {e0, e1}); + + if constexpr (FusionStartPoint == 2) { + fw.start_fusion(); + } + + // ptr0(x) = value / 2 + 1 if x < size / 2 else value + 1 + auto e3 = q.parallel_for>( + r, e2, [=](sycl::id<1> i) { ptr0[i]++; }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + // Copyback + q.memcpy(h.data(), ptr0, sizeof(float) * size, e3).wait(); + } + + sycl::free(ptr0, q); + sycl::free(ptr1, q); + + assert(std::all_of(h.begin(), h.begin() + size / 2, + [=](float f) { return f == value / 2 + 1; }) && + "ERROR"); + assert(std::all_of(h.begin() + size / 2, h.end(), + [=](float f) { return f == value + 1; }) && + "ERROR"); +} + +int main() { + std::cerr << "FusionStartPoint = 0:\n"; + // COM: memcpy leads to a CG being created as it depends on CGs not producing + // a PI event (coming from the CGs to be fused), so not safe to bypass. Fusion + // should be cancelled as a dependency with an event to be fused is found. + + // CHECK: FusionStartPoint = 0: + // CHECK-NEXT: WARNING: Not fusing 'copy usm' command group. Can only fuse device kernel command groups. + // CHECK-NEXT: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested + test<0>(); + + std::cerr << "FusionStartPoint = 1:\n"; + // COM: memcpy does not create CG, memory manager handles the operation + // instead. As no dependency with a CG to be fused is found, events are issued + // as usual and fusion takes place. + + // CHECK-NEXT: FusionStartPoint = 1: + // CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found + test<1>(); + + std::cerr << "FusionStartPoint = 2:\n"; + // COM: Same as above. + + // CHECK-NEXT: FusionStartPoint = 2: + // CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found + test<2>(); +}