Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL] Improve record and replay edge determination #46

Merged
merged 5 commits into from
Dec 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 48 additions & 5 deletions sycl/source/detail/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,20 +54,63 @@ void graph_impl::remove_root(node_ptr n) {
MSchedule.clear();
}

// Recursive check if a graph node or its successors contains a given kernel
// argument.
//
// @param[in] arg The kernel argument to check for.
// @param[in] currentNode The current graph node being checked.
// @param[in,out] deps The unique list of dependencies which have been
// identified for this arg.
// @param[in] dereferencePtr if true arg comes direct from the handler in which
// case it will need to be deferenced to check actual value.
//
// @returns True if a dependency was added in this node of any of its
// successors.
bool check_for_arg(const sycl::detail::ArgDesc &arg, node_ptr currentNode,
std::set<node_ptr> &deps, bool dereferencePtr = false) {
bool successorAddedDep = false;
for (auto &successor : currentNode->MSuccessors) {
successorAddedDep |= check_for_arg(arg, successor, deps, dereferencePtr);
}

if (deps.find(currentNode) == deps.end() &&
currentNode->has_arg(arg, dereferencePtr) && !successorAddedDep) {
deps.insert(currentNode);
return true;
}
return successorAddedDep;
}

template <typename T>
node_ptr graph_impl::add(graph_ptr impl, T cgf,
const std::vector<sycl::detail::ArgDesc> &args,
const std::vector<node_ptr> &dep) {
node_ptr nodeImpl = std::make_shared<node_impl>(impl, cgf);
if (!dep.empty()) {
for (auto n : dep) {
node_ptr nodeImpl = std::make_shared<node_impl>(impl, cgf, args);
// Copy deps so we can modify them
auto deps = dep;
// A unique set of dependencies obtained by checking kernel arguments
std::set<node_ptr> uniqueDeps;
for (auto &arg : args) {
if (arg.MType != sycl::detail::kernel_param_kind_t::kind_pointer) {
continue;
}
// Look through the graph for nodes which share this argument
for (auto nodePtr : MRoots) {
check_for_arg(arg, nodePtr, uniqueDeps, true);
}
}

// Add any deps determined from arguments into the dependency list
deps.insert(deps.end(), uniqueDeps.begin(), uniqueDeps.end());
if (!deps.empty()) {
for (auto n : deps) {
n->register_successor(nodeImpl); // register successor
this->remove_root(nodeImpl); // remove receiver from root node
// list
}
} else {
this->add_root(nodeImpl);
}
MLastNode = nodeImpl;
return nodeImpl;
}

Expand All @@ -93,7 +136,7 @@ node command_graph<graph_state::modifiable>::add_impl(
depImpls.push_back(sycl::detail::getSyclObjImpl(d));
}

auto nodeImpl = impl->add(impl, cgf, depImpls);
auto nodeImpl = impl->add(impl, cgf, {}, depImpls);
return sycl::detail::createSyclObjFromImpl<node>(nodeImpl);
}

Expand Down
37 changes: 31 additions & 6 deletions sycl/source/detail/graph_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#pragma once

#include <sycl/detail/cg_types.hpp>
#include <sycl/ext/oneapi/experimental/graph.hpp>
#include <sycl/handler.hpp>

Expand Down Expand Up @@ -53,6 +54,8 @@ struct node_impl {

std::function<void(sycl::handler &)> MBody;

std::vector<sycl::detail::ArgDesc> MArgs;

void exec(sycl::detail::queue_ptr q);

void register_successor(node_ptr n) {
Expand All @@ -65,7 +68,17 @@ struct node_impl {
sycl::event get_event(void) const { return MEvent; }

template <typename T>
node_impl(graph_ptr g, T cgf) : MScheduled(false), MGraph(g), MBody(cgf) {}
node_impl(graph_ptr g, T cgf, const std::vector<sycl::detail::ArgDesc> &args)
: MScheduled(false), MGraph(g), MBody(cgf), MArgs(args) {
for (size_t i = 0; i < MArgs.size(); i++) {
if (MArgs[i].MType == sycl::detail::kernel_param_kind_t::kind_pointer) {
// Make sure we are storing the actual USM pointer for comparison
// purposes, note we couldn't actually submit using these copies of the
// args if subsequent code expects a void**.
MArgs[i].MPtr = *(void **)(MArgs[i].MPtr);
}
}
}

// Recursively adding nodes to execution stack:
void topology_sort(std::list<node_ptr> &schedule) {
Expand All @@ -76,11 +89,23 @@ struct node_impl {
}
schedule.push_front(node_ptr(this));
}

bool has_arg(const sycl::detail::ArgDesc &arg, bool dereferencePtr = false) {
for (auto &nodeArg : MArgs) {
if (arg.MType == nodeArg.MType && arg.MSize == nodeArg.MSize) {
// Args coming directly from the handler will need to be dereferenced
// since they are actually void**
void *incomingPtr = dereferencePtr ? *(void **)arg.MPtr : arg.MPtr;
if (incomingPtr == nodeArg.MPtr) {
return true;
}
}
}
return false;
}
};

struct graph_impl {
// The last node added to the graph.
node_ptr MLastNode;
std::set<node_ptr> MRoots;
std::list<node_ptr> MSchedule;
// TODO: Change one time initialization to per executable object
Expand All @@ -95,9 +120,9 @@ struct graph_impl {
void remove_root(node_ptr n);

template <typename T>
node_ptr add(graph_ptr impl, T cgf, const std::vector<node_ptr> &dep = {});

node_ptr getLastNode() const { return MLastNode; }
node_ptr add(graph_ptr impl, T cgf,
const std::vector<sycl::detail::ArgDesc> &args,
const std::vector<node_ptr> &dep = {});

graph_impl() : MFirst(true) {}
};
Expand Down
18 changes: 6 additions & 12 deletions sycl/source/detail/queue_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -537,20 +537,14 @@ class queue_impl {
const SubmitPostProcessF *PostProcess) {
event Event = detail::createSyclObjFromImpl<event>(
std::make_shared<detail::event_impl>());
handler Handler(Self, PrimaryQueue, SecondaryQueue, MHostQueue);
Handler.saveCodeLoc(Loc);
CGF(Handler);
if (auto graphImpl = Self->getCommandGraph(); graphImpl != nullptr) {

// TODO: Simple implementation schedules all recorded nodes in order with
// each having a dependency on the previous node. This should be improved
// to correctly determine edges and dependencies.
std::vector<ext::oneapi::experimental::detail::node_ptr> deps;
if (auto lastNode = graphImpl->getLastNode(); lastNode != nullptr) {
deps.push_back(lastNode);
}
graphImpl->add(graphImpl, CGF, deps);
// Pass the args obtained by the handler to the graph to use in
// determining edges between this node and previously submitted nodes.
graphImpl->add(graphImpl, CGF, Handler.MArgs, {});
} else {
handler Handler(Self, PrimaryQueue, SecondaryQueue, MHostQueue);
Handler.saveCodeLoc(Loc);
CGF(Handler);

// Scheduler will later omit events, that are not required to execute
// tasks. Host and interop tasks, however, are not submitted to low-level
Expand Down
112 changes: 112 additions & 0 deletions sycl/test/graph/graph-record-dotp-buffer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
#include <CL/sycl.hpp>
#include <iostream>
#include <thread>

#include <sycl/ext/oneapi/experimental/graph.hpp>

const size_t n = 10;

float host_gold_result() {
float alpha = 1.0f;
float beta = 2.0f;
float gamma = 3.0f;

float sum = 0.0f;

for (size_t i = 0; i < n; ++i) {
sum += (alpha * 1.0f + beta * 2.0f) * (gamma * 3.0f + beta * 2.0f);
}

return sum;
}

int main() {
float alpha = 1.0f;
float beta = 2.0f;
float gamma = 3.0f;

sycl::property_list properties{
sycl::property::queue::in_order(),
sycl::ext::oneapi::property::queue::lazy_execution{}};

sycl::queue q{sycl::gpu_selector_v, properties};

sycl::ext::oneapi::experimental::command_graph g;

float dotpData = 0.f;
std::vector<float> xData(n);
std::vector<float> yData(n);
std::vector<float> zData(n);

{
sycl::buffer dotpBuf(&dotpData, sycl::range<1>(1));

sycl::buffer xBuf(xData);
sycl::buffer yBuf(yData);
sycl::buffer zBuf(zData);

q.begin_recording(g);

/* init data on the device */
q.submit([&](sycl::handler &h) {
auto x = xBuf.get_access(h);
auto y = yBuf.get_access(h);
auto z = zBuf.get_access(h);
h.parallel_for(n, [=](sycl::id<1> it) {
const size_t i = it[0];
x[i] = 1.0f;
y[i] = 2.0f;
z[i] = 3.0f;
});
});

q.submit([&](sycl::handler &h) {
auto x = xBuf.get_access(h);
auto y = yBuf.get_access(h);
h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
const size_t i = it[0];
x[i] = alpha * x[i] + beta * y[i];
});
});

q.submit([&](sycl::handler &h) {
auto y = yBuf.get_access(h);
auto z = zBuf.get_access(h);
h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
const size_t i = it[0];
z[i] = gamma * z[i] + beta * y[i];
});
});

q.submit([&](sycl::handler &h) {
auto dotp = dotpBuf.get_access(h);
auto x = xBuf.get_access(h);
auto z = zBuf.get_access(h);
h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
const size_t i = it[0];
// Doing a manual reduction here because reduction objects cause issues
// with graphs.
if (i == 0) {
for (size_t j = 0; j < n; j++) {
dotp[0] += x[j] * z[j];
}
}
});
});

q.end_recording();

auto exec_graph = g.finalize(q.get_context());

q.submit([&](sycl::handler &h) { h.exec_graph(exec_graph); });
}

if (dotpData != host_gold_result()) {
std::cout << "Error unexpected result!\n";
}

std::cout << "done.\n";

return 0;
}