[SYCL][NFCI][ABI-Break] Move handler members to impl #14460

Merged
merged 24 commits into sycl from steffen/move_handler_elems on Jul 16, 2024

(The file diff below shows changes from 2 of the 24 commits.)
24 commits
98c691c
[SYCL][NFCI][ABI-Break] Move handler members to impl
steffenlarsen Jul 5, 2024
5805367
Fix shallowing integer
steffenlarsen Jul 5, 2024
8878682
Merge remote-tracking branch 'intel/sycl' into steffen/move_handler_e…
steffenlarsen Jul 8, 2024
e339faf
Fix failure
steffenlarsen Jul 8, 2024
d93ef7b
Move a few more members
steffenlarsen Jul 8, 2024
4fa05e8
Merge remote-tracking branch 'intel/sycl' into steffen/move_handler_e…
steffenlarsen Jul 8, 2024
69abfe6
Fix AddLifetimeSharedPtrStorage arg
steffenlarsen Jul 8, 2024
d1ca544
Fix padding
steffenlarsen Jul 8, 2024
a2f49a5
Merge remote-tracking branch 'intel/sycl' into steffen/move_handler_e…
steffenlarsen Jul 8, 2024
5f99c91
Pad NDRDescT global size in NumWorkGroups case
steffenlarsen Jul 8, 2024
8fdfcdd
Merge remote-tracking branch 'intel/sycl' into steffen/move_handler_e…
steffenlarsen Jul 8, 2024
ad2e20c
Fix build issue
steffenlarsen Jul 8, 2024
c389dc0
Fix formatting and missed padding case
steffenlarsen Jul 9, 2024
72f7fb6
Move back to letting NDRDescT govern its padding
steffenlarsen Jul 9, 2024
dc76694
Merge remote-tracking branch 'intel/sycl' into steffen/move_handler_e…
steffenlarsen Jul 9, 2024
adf52d4
Include interop handle header
steffenlarsen Jul 9, 2024
e51e47a
Fix broken include paths
steffenlarsen Jul 9, 2024
9b33cf5
Add back the case for interop_handle host_task
steffenlarsen Jul 9, 2024
1d9e1f5
Add interop handler include on hip and cuda tests
steffenlarsen Jul 10, 2024
a0aba15
Remove unnecessary includes
steffenlarsen Jul 10, 2024
628ee06
Merge remote-tracking branch 'intel/sycl' into steffen/move_handler_e…
steffenlarsen Jul 11, 2024
abb11cc
Add Windows symbols and change namings to match other members
steffenlarsen Jul 11, 2024
1a4f3a3
Remove unreachable asserts causing Windows warnings
steffenlarsen Jul 11, 2024
affd0ba
Merge branch 'sycl' into steffen/move_handler_elems
steffenlarsen Jul 12, 2024
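The PR title describes moving sycl::handler's data members into an implementation object, so the public class layout stops being part of the ABI surface (hence the [ABI-Break] tag). As a rough, hypothetical sketch of that pointer-to-implementation pattern (not the actual handler code from this PR), the public class keeps a single pointer-sized member while all fields live in the impl type:

// Illustrative pimpl sketch; the names handler/handler_impl and the members
// shown are hypothetical, not taken from this PR.
#include <memory>

struct handler_impl {
  // Data members live here, so adding or reordering them later does not
  // change sizeof(handler) or the offsets seen by already-built binaries.
  int PackFlags = 0;
  void *Queue = nullptr;
};

class handler {
public:
  handler() : impl(std::make_unique<handler_impl>()) {}

private:
  std::unique_ptr<handler_impl> impl; // the only member in the public class
};

int main() { handler H; }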
297 changes: 28 additions & 269 deletions sycl/include/sycl/detail/cg_types.hpp
@@ -37,105 +37,34 @@ class handler;
namespace detail {
class HostTask;

// The structure represents a kernel argument.
class ArgDesc {
public:
ArgDesc(sycl::detail::kernel_param_kind_t Type, void *Ptr, int Size,
int Index)
: MType(Type), MPtr(Ptr), MSize(Size), MIndex(Index) {}

sycl::detail::kernel_param_kind_t MType;
void *MPtr;
int MSize;
int MIndex;
};
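As a small usage illustration (not code from this PR), an ArgDesc for a pointer-style kernel argument at index 0 could be built as follows; the kind_pointer enumerator name is assumed here and may not match the real kernel_param_kind_t:

// Hypothetical usage of the ArgDesc class above; the enumerator name is an
// assumption.
void *DevPtr = nullptr; // stand-in for a device pointer obtained elsewhere
ArgDesc PtrArg(sycl::detail::kernel_param_kind_t::kind_pointer, DevPtr,
               sizeof(void *), /*Index=*/0);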

// The structure represents an NDRange: global and local sizes, global offset,
// and number of dimensions.
class NDRDescT {
// The method initializes all sizes for dimensions greater than the passed one
// to the default values, so they will not affect execution.
void setNDRangeLeftover(int Dims_) {
for (int I = Dims_; I < 3; ++I) {
GlobalSize[I] = 1;
LocalSize[I] = LocalSize[0] ? 1 : 0;
GlobalOffset[I] = 0;
NumWorkGroups[I] = 0;
}
}

public:
NDRDescT()
: GlobalSize{0, 0, 0}, LocalSize{0, 0, 0}, NumWorkGroups{0, 0, 0},
Dims{0} {}

template <int Dims_> void set(sycl::range<Dims_> NumWorkItems) {
for (int I = 0; I < Dims_; ++I) {
GlobalSize[I] = NumWorkItems[I];
LocalSize[I] = 0;
GlobalOffset[I] = 0;
NumWorkGroups[I] = 0;
}
setNDRangeLeftover(Dims_);
Dims = Dims_;
}

// Initializes this ND range descriptor with given range of work items and
// offset.
template <int Dims_>
void set(sycl::range<Dims_> NumWorkItems, sycl::id<Dims_> Offset) {
for (int I = 0; I < Dims_; ++I) {
GlobalSize[I] = NumWorkItems[I];
LocalSize[I] = 0;
GlobalOffset[I] = Offset[I];
NumWorkGroups[I] = 0;
}
setNDRangeLeftover(Dims_);
Dims = Dims_;
}

template <int Dims_> void set(sycl::nd_range<Dims_> ExecutionRange) {
for (int I = 0; I < Dims_; ++I) {
GlobalSize[I] = ExecutionRange.get_global_range()[I];
LocalSize[I] = ExecutionRange.get_local_range()[I];
GlobalOffset[I] = ExecutionRange.get_offset()[I];
NumWorkGroups[I] = 0;
}
setNDRangeLeftover(Dims_);
Dims = Dims_;
}

void set(int Dims_, sycl::nd_range<3> ExecutionRange) {
for (int I = 0; I < Dims_; ++I) {
GlobalSize[I] = ExecutionRange.get_global_range()[I];
LocalSize[I] = ExecutionRange.get_local_range()[I];
GlobalOffset[I] = ExecutionRange.get_offset()[I];
NumWorkGroups[I] = 0;
}
setNDRangeLeftover(Dims_);
Dims = Dims_;
}

template <int Dims_> void setNumWorkGroups(sycl::range<Dims_> N) {
for (int I = 0; I < Dims_; ++I) {
GlobalSize[I] = 0;
// '0' is a mark to adjust before kernel launch when there is enough info:
LocalSize[I] = 0;
GlobalOffset[I] = 0;
NumWorkGroups[I] = N[I];
}
setNDRangeLeftover(Dims_);
Dims = Dims_;
}

sycl::range<3> GlobalSize;
sycl::range<3> LocalSize;
sycl::id<3> GlobalOffset;
/// Number of workgroups, used to record the number of workgroups from the
/// simplest form of parallel_for_work_group. If set, all other fields must be
/// zero
sycl::range<3> NumWorkGroups;
size_t Dims;
/// Type of the command group.
/// NOTE: Changing the values of any of these enumerators is an API-break.
enum class CGType : unsigned int {
None = 0,
Kernel = 1,
CopyAccToPtr = 2,
CopyPtrToAcc = 3,
CopyAccToAcc = 4,
Barrier = 5,
BarrierWaitlist = 6,
Fill = 7,
UpdateHost = 8,
CopyUSM = 10,
FillUSM = 11,
PrefetchUSM = 12,
CodeplayHostTask = 14,
AdviseUSM = 15,
Copy2DUSM = 16,
Fill2DUSM = 17,
Memset2DUSM = 18,
CopyToDeviceGlobal = 19,
CopyFromDeviceGlobal = 20,
ReadWriteHostPipe = 21,
ExecCommandBuffer = 22,
CopyImage = 23,
SemaphoreWait = 24,
SemaphoreSignal = 25,
ProfilingTag = 26,
};
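Since the NOTE above states that renumbering these enumerators is an API break, one possible guard (a sketch, not something this PR adds) is to pin the documented values with static_asserts so an accidental reordering fails to compile:

// Hypothetical compile-time guard for the CGType values defined above.
static_assert(static_cast<unsigned>(CGType::Kernel) == 1,
              "CGType::Kernel value is part of the ABI");
static_assert(static_cast<unsigned>(CGType::CopyUSM) == 10,
              "CGType::CopyUSM value is part of the ABI");
static_assert(static_cast<unsigned>(CGType::ProfilingTag) == 26,
              "CGType::ProfilingTag value is part of the ABI");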

template <typename, typename T> struct check_fn_signature {
@@ -224,8 +153,6 @@ runKernelWithArg(KernelType KernelName, ArgType Arg) {
// Pure virtual class used to store lambdas/functors of any type.
class HostKernelBase {
public:
// Executes the stored lambda using the NDRange passed.
virtual void call(const NDRDescT &NDRDesc, HostProfilingInfo *HPI) = 0;
// Return pointer to the lambda object.
// Used to extract captured variables.
virtual char *getPtr() = 0;
@@ -243,177 +170,9 @@ class HostKernel : public HostKernelBase {

public:
HostKernel(KernelType Kernel) : MKernel(Kernel) {}
void call(const NDRDescT &NDRDesc, HostProfilingInfo *HPI) override {
// adjust ND range for serial host:
NDRDescT AdjustedRange = NDRDesc;

if (NDRDesc.GlobalSize[0] == 0 && NDRDesc.NumWorkGroups[0] != 0) {
// This is a special case - NDRange information is not complete, only the
// desired number of work groups is set by the user. Choose work group
// size (LocalSize), calculate the missing NDRange characteristics
// needed to invoke the kernel and adjust the NDRange descriptor
// accordingly. For some devices the work group size selection requires
// access to the device's properties, hence such late "adjustment".
range<3> WGsize{1, 1, 1}; // no better alternative for serial host?
AdjustedRange.set(NDRDesc.Dims,
nd_range<3>(NDRDesc.NumWorkGroups * WGsize, WGsize));
}
// If local size for host is not set explicitly, let's adjust it to 1,
// so nd_range_error for zero local size is not thrown.
if (AdjustedRange.LocalSize[0] == 0)
for (size_t I = 0; I < AdjustedRange.Dims; ++I)
AdjustedRange.LocalSize[I] = 1;
if (HPI)
HPI->start();
runOnHost(AdjustedRange);
if (HPI)
HPI->end();
}

char *getPtr() override { return reinterpret_cast<char *>(&MKernel); }

template <class ArgT = KernelArgType>
typename std::enable_if_t<std::is_same_v<ArgT, void>>
runOnHost(const NDRDescT &) {
runKernelWithoutArg(MKernel);
}

template <class ArgT = KernelArgType>
typename std::enable_if_t<std::is_same_v<ArgT, sycl::id<Dims>>>
runOnHost(const NDRDescT &NDRDesc) {
sycl::range<Dims> Range(InitializedVal<Dims, range>::template get<0>());
sycl::id<Dims> Offset;
sycl::range<Dims> Stride(
InitializedVal<Dims, range>::template get<1>()); // initialized to 1
sycl::range<Dims> UpperBound(
InitializedVal<Dims, range>::template get<0>());
for (int I = 0; I < Dims; ++I) {
Range[I] = NDRDesc.GlobalSize[I];
Offset[I] = NDRDesc.GlobalOffset[I];
UpperBound[I] = Range[I] + Offset[I];
}

detail::NDLoop<Dims>::iterate(
/*LowerBound=*/Offset, Stride, UpperBound,
[&](const sycl::id<Dims> &ID) {
sycl::item<Dims, /*Offset=*/true> Item =
IDBuilder::createItem<Dims, true>(Range, ID, Offset);

runKernelWithArg<const sycl::id<Dims> &>(MKernel, ID);
});
}

template <class ArgT = KernelArgType>
typename std::enable_if_t<std::is_same_v<ArgT, item<Dims, /*Offset=*/false>>>
runOnHost(const NDRDescT &NDRDesc) {
sycl::id<Dims> ID;
sycl::range<Dims> Range(InitializedVal<Dims, range>::template get<0>());
for (int I = 0; I < Dims; ++I)
Range[I] = NDRDesc.GlobalSize[I];

detail::NDLoop<Dims>::iterate(Range, [&](const sycl::id<Dims> ID) {
sycl::item<Dims, /*Offset=*/false> Item =
IDBuilder::createItem<Dims, false>(Range, ID);
sycl::item<Dims, /*Offset=*/true> ItemWithOffset = Item;

runKernelWithArg<sycl::item<Dims, /*Offset=*/false>>(MKernel, Item);
});
}

template <class ArgT = KernelArgType>
typename std::enable_if_t<std::is_same_v<ArgT, item<Dims, /*Offset=*/true>>>
runOnHost(const NDRDescT &NDRDesc) {
sycl::range<Dims> Range(InitializedVal<Dims, range>::template get<0>());
sycl::id<Dims> Offset;
sycl::range<Dims> Stride(
InitializedVal<Dims, range>::template get<1>()); // initialized to 1
sycl::range<Dims> UpperBound(
InitializedVal<Dims, range>::template get<0>());
for (int I = 0; I < Dims; ++I) {
Range[I] = NDRDesc.GlobalSize[I];
Offset[I] = NDRDesc.GlobalOffset[I];
UpperBound[I] = Range[I] + Offset[I];
}

detail::NDLoop<Dims>::iterate(
/*LowerBound=*/Offset, Stride, UpperBound,
[&](const sycl::id<Dims> &ID) {
sycl::item<Dims, /*Offset=*/true> Item =
IDBuilder::createItem<Dims, true>(Range, ID, Offset);

runKernelWithArg<sycl::item<Dims, /*Offset=*/true>>(MKernel, Item);
});
}

template <class ArgT = KernelArgType>
typename std::enable_if_t<std::is_same_v<ArgT, nd_item<Dims>>>
runOnHost(const NDRDescT &NDRDesc) {
sycl::range<Dims> GroupSize(InitializedVal<Dims, range>::template get<0>());
for (int I = 0; I < Dims; ++I) {
if (NDRDesc.LocalSize[I] == 0 ||
NDRDesc.GlobalSize[I] % NDRDesc.LocalSize[I] != 0)
throw sycl::nd_range_error("Invalid local size for global size",
PI_ERROR_INVALID_WORK_GROUP_SIZE);
GroupSize[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I];
}

sycl::range<Dims> LocalSize(InitializedVal<Dims, range>::template get<0>());
sycl::range<Dims> GlobalSize(
InitializedVal<Dims, range>::template get<0>());
sycl::id<Dims> GlobalOffset;
for (int I = 0; I < Dims; ++I) {
GlobalOffset[I] = NDRDesc.GlobalOffset[I];
LocalSize[I] = NDRDesc.LocalSize[I];
GlobalSize[I] = NDRDesc.GlobalSize[I];
}

detail::NDLoop<Dims>::iterate(GroupSize, [&](const id<Dims> &GroupID) {
sycl::group<Dims> Group = IDBuilder::createGroup<Dims>(
GlobalSize, LocalSize, GroupSize, GroupID);

detail::NDLoop<Dims>::iterate(LocalSize, [&](const id<Dims> &LocalID) {
id<Dims> GlobalID =
GroupID * id<Dims>{LocalSize} + LocalID + GlobalOffset;
const sycl::item<Dims, /*Offset=*/true> GlobalItem =
IDBuilder::createItem<Dims, true>(GlobalSize, GlobalID,
GlobalOffset);
const sycl::item<Dims, /*Offset=*/false> LocalItem =
IDBuilder::createItem<Dims, false>(LocalSize, LocalID);
const sycl::nd_item<Dims> NDItem =
IDBuilder::createNDItem<Dims>(GlobalItem, LocalItem, Group);

runKernelWithArg<const sycl::nd_item<Dims>>(MKernel, NDItem);
});
});
}

template <typename ArgT = KernelArgType>
std::enable_if_t<std::is_same_v<ArgT, sycl::group<Dims>>>
runOnHost(const NDRDescT &NDRDesc) {
sycl::range<Dims> NGroups(InitializedVal<Dims, range>::template get<0>());

for (int I = 0; I < Dims; ++I) {
if (NDRDesc.LocalSize[I] == 0 ||
NDRDesc.GlobalSize[I] % NDRDesc.LocalSize[I] != 0)
throw sycl::nd_range_error("Invalid local size for global size",
PI_ERROR_INVALID_WORK_GROUP_SIZE);
NGroups[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I];
}

sycl::range<Dims> LocalSize(InitializedVal<Dims, range>::template get<0>());
sycl::range<Dims> GlobalSize(
InitializedVal<Dims, range>::template get<0>());
for (int I = 0; I < Dims; ++I) {
LocalSize[I] = NDRDesc.LocalSize[I];
GlobalSize[I] = NDRDesc.GlobalSize[I];
}
detail::NDLoop<Dims>::iterate(NGroups, [&](const id<Dims> &GroupID) {
sycl::group<Dims> Group =
IDBuilder::createGroup<Dims>(GlobalSize, LocalSize, NGroups, GroupID);
runKernelWithArg<sycl::group<Dims>>(MKernel, Group);
});
}

~HostKernel() = default;
};
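To make the special case in call() above concrete: when only the number of work-groups is known (the parallel_for_work_group path), the host fallback picks a work-group size of one per dimension and derives the global size from it, and a zero local size is likewise bumped to one. A standalone sketch of that arithmetic, using plain arrays instead of the SYCL range types, might look like this:

#include <cassert>
#include <cstddef>

// Hypothetical stand-in for the ND-range adjustment done in HostKernel::call().
struct SimpleNDR {
  size_t GlobalSize[3];
  size_t LocalSize[3];
  size_t NumWorkGroups[3];
  size_t Dims;
};

SimpleNDR adjustForSerialHost(SimpleNDR NDR) {
  if (NDR.GlobalSize[0] == 0 && NDR.NumWorkGroups[0] != 0)
    for (size_t I = 0; I < NDR.Dims; ++I) {
      NDR.LocalSize[I] = 1;                     // chosen work-group size
      NDR.GlobalSize[I] = NDR.NumWorkGroups[I]; // groups * work-group size of 1
    }
  if (NDR.LocalSize[0] == 0)
    for (size_t I = 0; I < NDR.Dims; ++I)
      NDR.LocalSize[I] = 1;
  return NDR;
}

int main() {
  // Only the number of work-groups is set: {4, 2, 1} groups in 3 dimensions.
  SimpleNDR Adjusted =
      adjustForSerialHost({{0, 0, 0}, {0, 0, 0}, {4, 2, 1}, 3});
  assert(Adjusted.GlobalSize[0] == 4 && Adjusted.LocalSize[2] == 1);
  (void)Adjusted;
}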

1 change: 0 additions & 1 deletion sycl/include/sycl/ext/oneapi/memcpy2d.hpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#pragma once

#include <sycl/detail/host_task_impl.hpp>
#include <sycl/handler.hpp>
#include <sycl/queue.hpp>
#include <sycl/usm/usm_enums.hpp>