Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL][Fusion] Take auxiliary resources from fused command groups #12593

Merged
merged 4 commits into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions sycl/source/detail/jit_compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -667,8 +667,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
unsigned KernelIndex = 0;
ParamList FusedParams;
PromotionMap PromotedAccs;
// TODO: Collect information about streams and auxiliary resources (which
// contain reductions) and figure out how to fuse them.
// TODO: Collect information about streams and figure out how
// to fuse them.
for (auto &RawCmd : InputKernels) {
auto *KernelCmd = static_cast<ExecCGCommand *>(RawCmd);
auto &CG = KernelCmd->getCG();
Expand Down
3 changes: 3 additions & 0 deletions sycl/source/detail/scheduler/graph_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1615,6 +1615,9 @@ Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue,
auto FusedKernelCmd =
std::make_unique<ExecCGCommand>(std::move(FusedCG), Queue);

// Inherit auxiliary resources from fused command groups
Scheduler::getInstance().takeAuxiliaryResources(FusedKernelCmd->getEvent(),
PlaceholderCmd->getEvent());
assert(PlaceholderCmd->MDeps.empty());
// Next, backwards iterate over all the commands in the fusion list and remove
// them from the graph to restore the state before starting fusion, so we can
Expand Down
33 changes: 29 additions & 4 deletions sycl/source/detail/scheduler/scheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,11 +154,11 @@ EventImplPtr Scheduler::addCG(
for (const auto &StreamImplPtr : Streams) {
StreamImplPtr->flush(NewEvent);
}

if (!AuxiliaryResources.empty())
registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources));
}

if (!AuxiliaryResources.empty())
registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources));

return NewEvent;
}

Expand Down Expand Up @@ -558,10 +558,35 @@ void Scheduler::cleanupDeferredMemObjects(BlockingT Blocking) {
}
}

static void registerAuxiliaryResourcesNoLock(
std::unordered_map<EventImplPtr, std::vector<std::shared_ptr<const void>>>
&AuxiliaryResources,
const EventImplPtr &Event,
std::vector<std::shared_ptr<const void>> &&Resources) {
std::vector<std::shared_ptr<const void>> &StoredResources =
AuxiliaryResources[Event];
StoredResources.insert(StoredResources.end(),
std::make_move_iterator(Resources.begin()),
std::make_move_iterator(Resources.end()));
}

void Scheduler::takeAuxiliaryResources(const EventImplPtr &Dst,
const EventImplPtr &Src) {
std::unique_lock<std::mutex> Lock{MAuxiliaryResourcesMutex};
auto Iter = MAuxiliaryResources.find(Src);
if (Iter == MAuxiliaryResources.end()) {
return;
}
registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Dst,
std::move(Iter->second));
MAuxiliaryResources.erase(Iter);
}

void Scheduler::registerAuxiliaryResources(
EventImplPtr &Event, std::vector<std::shared_ptr<const void>> Resources) {
std::unique_lock<std::mutex> Lock{MAuxiliaryResourcesMutex};
MAuxiliaryResources.insert({Event, std::move(Resources)});
registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Event,
std::move(Resources));
}

void Scheduler::cleanupAuxiliaryResources(BlockingT Blocking) {
Expand Down
2 changes: 2 additions & 0 deletions sycl/source/detail/scheduler/scheduler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,8 @@ class Scheduler {
bool ShouldEnqueue;
};

/// Assign \p Src's auxiliary resources to \p Dst.
void takeAuxiliaryResources(const EventImplPtr &Dst, const EventImplPtr &Src);
void registerAuxiliaryResources(
EventImplPtr &Event, std::vector<std::shared_ptr<const void>> Resources);
void cleanupAuxiliaryResources(BlockingT Blocking);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// RUN: %{build} -fsycl-embed-ir -o %t.out
// RUN: %{run} %t.out

#include "./reduction.hpp"

int main() {
test<detail::reduction::strategy::group_reduce_and_atomic_cross_wg>();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// RUN: %{build} -fsycl-embed-ir -o %t.out
// RUN: %{run} %t.out
// UNSUPPORTED: hip || cuda
victor-eds marked this conversation as resolved.
Show resolved Hide resolved

#include "./reduction.hpp"

int main() {
test<detail::reduction::strategy::group_reduce_and_last_wg_detection>();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// RUN: %{build} -fsycl-embed-ir -o %t.out
// RUN: %{run} %t.out

#include "./reduction.hpp"

int main() {
test<detail::reduction::strategy::local_atomic_and_atomic_cross_wg>();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// RUN: %{build} -fsycl-embed-ir -o %t.out
// RUN: %{run} %t.out

#include "./reduction.hpp"

int main() {
test<detail::reduction::strategy::local_mem_tree_and_atomic_cross_wg>();
}
6 changes: 6 additions & 0 deletions sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// RUN: %{build} -fsycl-embed-ir -o %t.out
// RUN: %{run} %t.out

#include "./reduction.hpp"

int main() { test<detail::reduction::strategy::range_basic>(); }
62 changes: 0 additions & 62 deletions sycl/test-e2e/KernelFusion/Reduction/reduction.cpp

This file was deleted.

88 changes: 88 additions & 0 deletions sycl/test-e2e/KernelFusion/Reduction/reduction.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// Test fusion works with reductions. Only algorithms automatically selected by
// `sycl::reduction` are supported.

#include <sycl/sycl.hpp>

#include "../helpers.hpp"
#include "sycl/detail/reduction_forward.hpp"

using namespace sycl;

template <detail::reduction::strategy Strategy> struct is_fusion_supported {
constexpr static inline bool value =
detail::reduction::strategy::group_reduce_and_last_wg_detection <=
Strategy &&
Strategy < detail::reduction::strategy::group_reduce_and_multiple_kernels;
};

template <detail::reduction::strategy Strategy>
constexpr inline bool is_fusion_supported_v =
is_fusion_supported<Strategy>::value;

template <detail::reduction::strategy Strategy, bool Fuse>
void test(nd_range<1> ndr) {
static_assert(is_fusion_supported_v<Strategy>,
"Testing unsupported algorithm");

auto globalSize = static_cast<int>(ndr.get_global_range().size());
std::vector<int> data(globalSize);
int sumRes = 0;
int maxRes = 0;

{
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};

buffer<int> dataBuf{data};
buffer<int> sumBuf{&sumRes, 1};
buffer<int> maxBuf{&maxRes, 1};

ext::codeplay::experimental::fusion_wrapper fw{q};

fw.start_fusion();
iota(q, dataBuf, 0);

q.submit([&](handler &cgh) {
accessor in(dataBuf, cgh, read_only);
auto sumRed = reduction(sumBuf, cgh, plus<>{},
property::reduction::initialize_to_identity{});
detail::reduction_parallel_for<detail::auto_name, Strategy>(
cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, sumRed,
[=](nd_item<1> Item, auto &Red) {
Red.combine(in[Item.get_global_id()]);
});
});

q.submit([&](handler &cgh) {
accessor in(dataBuf, cgh, read_only);
auto maxRed = reduction(maxBuf, cgh, maximum<>{},
property::reduction::initialize_to_identity{});
detail::reduction_parallel_for<detail::auto_name, Strategy>(
cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, maxRed,
[=](nd_item<1> Item, auto &Red) {
Red.combine(in[Item.get_global_id()]);
});
});

if constexpr (Fuse) {
fw.complete_fusion(ext::codeplay::experimental::property::no_barriers{});
} else {
fw.cancel_fusion();
}
}

int expectedMax = globalSize - 1;
int expectedSum = globalSize * expectedMax / 2;

assert(sumRes == expectedSum);
assert(maxRes == expectedMax);
}

template <detail::reduction::strategy Strategy> void test() {
for (size_t globalSize : {16, 512}) {
for (size_t localSize = 1; localSize <= globalSize; localSize *= 2) {
nd_range<1> ndr{globalSize, localSize};
test<Strategy, true>(ndr);
test<Strategy, false>(ndr);
}
}
}
Loading