Skip to content

Commit

Permalink
[ABI-Break][SYCL] Restrict nd_range parallel_for to nd_item
Browse files Browse the repository at this point in the history
Enables the change from intel#11067 by default.
  • Loading branch information
aelovikov-intel committed Apr 1, 2024
1 parent ff9e48a commit 3bf8a8b
Show file tree
Hide file tree
Showing 23 changed files with 105 additions and 84 deletions.
11 changes: 0 additions & 11 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1326,8 +1326,6 @@ class __SYCL_EXPORT handler {
"Kernel argument cannot have a sycl::nd_item type in "
"sycl::parallel_for with sycl::range");

#if defined(SYCL2020_CONFORMANT_APIS) || \
defined(__INTEL_PREVIEW_BREAKING_CHANGES)
static_assert(std::is_convertible_v<item<Dims>, LambdaArgType> ||
std::is_convertible_v<item<Dims, false>, LambdaArgType>,
"sycl::parallel_for(sycl::range) kernel must have the "
Expand All @@ -1340,7 +1338,6 @@ class __SYCL_EXPORT handler {
std::is_invocable_v<KernelType, RefLambdaArgType, kernel_handler>),
"SYCL kernel lambda/functor has an unexpected signature, it should be "
"invocable with sycl::item and optionally sycl::kernel_handler");
#endif

// TODO: Properties may change the kernel function, so in order to avoid
// conflicts they should be included in the name.
Expand Down Expand Up @@ -1432,19 +1429,11 @@ class __SYCL_EXPORT handler {
verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
using LambdaArgType =
sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
#if defined(SYCL2020_CONFORMANT_APIS) || \
defined(__INTEL_PREVIEW_BREAKING_CHANGES)
static_assert(
std::is_convertible_v<sycl::nd_item<Dims>, LambdaArgType>,
"Kernel argument of a sycl::parallel_for with sycl::nd_range "
"must be either sycl::nd_item or be convertible from sycl::nd_item");
using TransformedArgType = sycl::nd_item<Dims>;
#else
// If user type is convertible from sycl::item/sycl::nd_item, use
// sycl::item/sycl::nd_item to transport item information
using TransformedArgType =
typename TransformUserItemType<Dims, LambdaArgType>::type;
#endif

(void)ExecutionRange;
(void)Props;
Expand Down
7 changes: 4 additions & 3 deletions sycl/test-e2e/Assert/assert_in_multiple_tus.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,12 @@ void enqueueKernel_1_fromFile1(queue *Q) {

CGH.parallel_for<class Kernel_1>(
sycl::nd_range(Buf.get_range(), sycl::range<1>(4)),
[=](sycl::id<1> wiID) {
[=](sycl::nd_item<1> ndi) {
auto gid = ndi.get_global_id(0);
int X = 0;
if (wiID == 5)
if (gid == 5)
X = checkFunction();
Acc[wiID] = X;
Acc[gid] = X;
});
});
}
Expand Down
4 changes: 2 additions & 2 deletions sycl/test-e2e/DiscardEvents/discard_events_accessors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ int main(int Argc, const char *Argv[]) {
sycl::local_accessor<int, 1> LocalAcc(LocalMemSize, CGH);

CGH.parallel_for<class kernel_using_local_memory>(
NDRange, [=](sycl::item<1> itemID) {
size_t i = itemID.get_id(0);
NDRange, [=](sycl::nd_item<1> ndi) {
size_t i = ndi.get_global_id(0);
int *Ptr = LocalAcc.get_pointer();
Ptr[i] = i + 5;
Harray[i] = Ptr[i] + 5;
Expand Down
7 changes: 4 additions & 3 deletions sycl/test-e2e/ESIMD/bfn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,16 +107,17 @@ struct DeviceFunc {
DeviceFunc(const T *In0, const T *In1, const T *In2, T *Out)
: In0(In0), In1(In1), In2(In2), Out(Out) {}

void operator()(id<1> I) const SYCL_ESIMD_KERNEL {
unsigned int Offset = I * N;
void operator()(nd_item<1> ndi) const SYCL_ESIMD_KERNEL {
auto gid = ndi.get_global_id(0);
unsigned int Offset = gid * N;
esimd::simd<T, N> V0;
esimd::simd<T, N> V1;
esimd::simd<T, N> V2;
V0.copy_from(In0 + Offset);
V1.copy_from(In1 + Offset);
V2.copy_from(In2 + Offset);

if (I.get(0) % 2 == 0) {
if (gid % 2 == 0) {
for (int J = 0; J < N; J++) {
Kernel<T, N, Op, AllSca> DevF{};
T Val0 = V0[J];
Expand Down
14 changes: 8 additions & 6 deletions sycl/test-e2e/ESIMD/ext_math.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -238,12 +238,13 @@ struct UnaryDeviceFunc {

UnaryDeviceFunc(AccIn &In, AccOut &Out) : In(In), Out(Out) {}

void operator()(id<1> I) const SYCL_ESIMD_KERNEL {
unsigned int Offset = I * N * sizeof(T);
void operator()(nd_item<1> ndi) const SYCL_ESIMD_KERNEL {
auto gid = ndi.get_global_id(0);
unsigned int Offset = gid * N * sizeof(T);
esimd::simd<T, N> Vx;
Vx.copy_from(In, Offset);

if (I.get(0) % 2 == 0) {
if (gid % 2 == 0) {
for (int J = 0; J < N; J++) {
Kernel<T, N, Op, AllSca> DevF{};
T Val = Vx[J];
Expand All @@ -269,13 +270,14 @@ struct BinaryDeviceFunc {
BinaryDeviceFunc(AccIn &In1, AccIn &In2, AccOut &Out)
: In1(In1), In2(In2), Out(Out) {}

void operator()(id<1> I) const SYCL_ESIMD_KERNEL {
unsigned int Offset = I * N * sizeof(T);
void operator()(nd_item<1> ndi) const SYCL_ESIMD_KERNEL {
auto gid = ndi.get_global_id(0);
unsigned int Offset = gid * N * sizeof(T);
esimd::simd<T, N> V1(In1, Offset);
esimd::simd<T, N> V2(In2, Offset);
esimd::simd<T, N> V;

if (I.get(0) % 2 == 0) {
if (gid % 2 == 0) {
int Ind = 0;
{
Kernel<T, N, Op, AllSca> DevF{};
Expand Down
2 changes: 1 addition & 1 deletion sycl/test-e2e/ESIMD/local_accessor_gather_scatter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ template <typename T, unsigned VL, unsigned STRIDE> bool test(queue q) {
q.submit([&](handler &cgh) {
auto acc = buf.template get_access<access::mode::read_write>(cgh);
auto LocalAcc = local_accessor<T, 1>(size * STRIDE, cgh);
cgh.parallel_for(glob_range, [=](id<1> i) SYCL_ESIMD_KERNEL {
cgh.parallel_for(glob_range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::esimd;
simd<T, VL> valsIn;
valsIn.copy_from(acc, 0);
Expand Down
4 changes: 2 additions & 2 deletions sycl/test-e2e/ESIMD/local_accessor_gather_scatter_rgba.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ template <typename T, unsigned VL, auto CH_MASK> bool test(queue q) {
auto OutAcc = OutBuf.template get_access<access::mode::read_write>(cgh);
auto LocalAcc = local_accessor<T, 1>(VL * NUM_RGBA_CHANNELS, cgh);

cgh.parallel_for(Range, [=](id<1> i) SYCL_ESIMD_KERNEL {
cgh.parallel_for(Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::esimd;
constexpr int numChannels = get_num_channels_enabled(CH_MASK);

Expand Down Expand Up @@ -135,7 +135,7 @@ template <typename T, unsigned VL, auto CH_MASK> bool test(queue q) {
-1;
}

uint32_t global_offset = i * VL * NUM_RGBA_CHANNELS;
uint32_t global_offset = ndi.get_global_id(0) * VL * NUM_RGBA_CHANNELS;
valsOut.copy_to(OutAcc, global_offset);
});
}).wait();
Expand Down
8 changes: 4 additions & 4 deletions sycl/test-e2e/ESIMD/lsc/atomic_smoke.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,8 @@ bool test(queue q, const Config &cfg) {
try {
auto e = q.submit([&](handler &cgh) {
cgh.parallel_for<TestID<T, N, ImplF>>(
rng, [=](id<1> ii) SYCL_ESIMD_KERNEL {
int i = ii;
rng, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
int i = ndi.get_global_id(0);
#ifndef USE_SCALAR_OFFSET
simd<Toffset, N> offsets(cfg.start_ind * sizeof(T),
cfg.stride * sizeof(T));
Expand Down Expand Up @@ -332,8 +332,8 @@ bool test(queue q, const Config &cfg) {
auto e = q.submit([&](handler &cgh) {
auto accessor = buf.template get_access<access::mode::read_write>(cgh);
cgh.parallel_for<TestID<T, N, ImplF>>(
rng, [=](id<1> ii) SYCL_ESIMD_KERNEL {
int i = ii;
rng, [=](nd_item<1> gid) SYCL_ESIMD_KERNEL {
int i = gid.get_global_id(0);
#ifndef USE_SCALAR_OFFSET
simd<Toffset, N> offsets(start * sizeof(T), stride * sizeof(T));
#else
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ template <typename T, unsigned VL, unsigned STRIDE> bool test(queue q) {
q.submit([&](handler &cgh) {
auto acc = buf.template get_access<access::mode::read_write>(cgh);
auto LocalAcc = local_accessor<T, 1>(size * STRIDE, cgh);
cgh.parallel_for(glob_range, [=](id<1> i) SYCL_ESIMD_KERNEL {
cgh.parallel_for(glob_range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::esimd;
simd<T, VL> valsIn;
valsIn.copy_from(acc, 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,8 @@ bool test_usm(queue q, const Config &cfg) {

try {
auto e = q.submit([&](handler &cgh) {
cgh.parallel_for(rng, [=](id<1> ii) SYCL_ESIMD_KERNEL {
int i = ii;
cgh.parallel_for(rng, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
int i = ndi.get_global_id(0);
simd<Toffset, N> offsets(cfg.start_ind * sizeof(T),
cfg.stride * sizeof(T));
simd_mask<N> m = 1;
Expand Down Expand Up @@ -287,8 +287,8 @@ bool test_acc(queue q, const Config &cfg) {
auto e = q.submit([&](handler &cgh) {
auto arr_acc =
arr_buf.template get_access<access::mode::read_write>(cgh);
cgh.parallel_for(rng, [=](id<1> ii) SYCL_ESIMD_KERNEL {
int i = ii;
cgh.parallel_for(rng, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
int i = ndi.get_global_id(0);
simd<Toffset, N> offsets(cfg.start_ind * sizeof(T),
cfg.stride * sizeof(T));
simd_mask<N> m = 1;
Expand Down
18 changes: 11 additions & 7 deletions sycl/test-e2e/KernelFusion/abort_fusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,16 @@ template <int Dimensions> size_t getSize(nd_range<Dimensions> r) {
return r.get_global_range().size();
}

template <int N> auto global_linear_id(sycl::nd_item<N> ndi) {
return ndi.get_global_linear_id();
}
template <int N> auto global_linear_id(sycl::item<N> i) {
return i.get_linear_id();
}

template <typename Kernel1Name, typename Kernel2Name, typename Range1,
typename Range2>
void performFusion(queue &q, Range1 R1, Range2 R2) {
using IndexTy1 = item<Range1::dimensions>;
using IndexTy2 = item<Range2::dimensions>;

int in[dataSize], tmp[dataSize], out[dataSize];

for (size_t i = 0; i < dataSize; ++i) {
Expand All @@ -51,17 +55,17 @@ void performFusion(queue &q, Range1 R1, Range2 R2) {
q.submit([&](handler &cgh) {
auto accIn = bIn.get_access(cgh);
auto accTmp = bTmp.get_access(cgh);
cgh.parallel_for<Kernel1Name>(R1, [=](IndexTy1 i) {
size_t j = i.get_linear_id();
cgh.parallel_for<Kernel1Name>(R1, [=](auto i) {
size_t j = global_linear_id(i);
accTmp[j] = accIn[j] + 5;
});
});

q.submit([&](handler &cgh) {
auto accTmp = bTmp.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<Kernel2Name>(R2, [=](IndexTy2 i) {
size_t j = i.get_linear_id();
cgh.parallel_for<Kernel2Name>(R2, [=](auto i) {
size_t j = global_linear_id(i);
accOut[j] = accTmp[j] * 2;
});
});
Expand Down
12 changes: 8 additions & 4 deletions sycl/test-e2e/KernelFusion/abort_internalization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,10 @@ void performFusion(queue &q, Internalization intKernel1,

if (localSizeKernel1 > 0) {
cgh.parallel_for<class Kernel1>(
nd_range<1>{{dataSize}, {localSizeKernel1}},
[=](id<1> i) { accTmp[i] = accIn[i] + 5; });
nd_range<1>{{dataSize}, {localSizeKernel1}}, [=](nd_item<1> ndi) {
auto i = ndi.get_global_id(0);
accTmp[i] = accIn[i] + 5;
});
} else {
cgh.parallel_for<class KernelOne>(
dataSize, [=](id<1> i) { accTmp[i] = accIn[i] + 5; });
Expand All @@ -70,8 +72,10 @@ void performFusion(queue &q, Internalization intKernel1,
auto accOut = bOut.get_access(cgh);
if (localSizeKernel2 > 0) {
cgh.parallel_for<class Kernel2>(
nd_range<1>{{dataSize}, {localSizeKernel2}},
[=](id<1> i) { accOut[i] = accTmp[i] * 2; });
nd_range<1>{{dataSize}, {localSizeKernel2}}, [=](nd_item<1> ndi) {
auto i = ndi.get_global_id(0);
accOut[i] = accTmp[i] * 2;
});
} else {
cgh.parallel_for<class KernelTwo>(
dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * 2; });
Expand Down
21 changes: 14 additions & 7 deletions sycl/test-e2e/KernelFusion/diamond_shape_local.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@ struct AddKernel {
accessor<int, 1> accIn2;
accessor<int, 1> accOut;

void operator()(id<1> i) const { accOut[i] = accIn1[i] + accIn2[i]; }
void operator()(nd_item<1> ndi) const {
auto i = ndi.get_global_id(0);
accOut[i] = accIn1[i] + accIn2[i];
}
};

int main() {
Expand Down Expand Up @@ -71,17 +74,21 @@ int main() {
auto accTmp1 = bTmp1.get_access(cgh);
auto accIn3 = bIn3.get_access(cgh);
auto accTmp2 = bTmp2.get_access(cgh);
cgh.parallel_for<class KernelOne>(
nd_range<1>{{dataSize}, {16}},
[=](id<1> i) { accTmp2[i] = accTmp1[i] * accIn3[i]; });
cgh.parallel_for<class KernelOne>(nd_range<1>{{dataSize}, {16}},
[=](nd_item<1> ndi) {
auto i = ndi.get_global_id(0);
accTmp2[i] = accTmp1[i] * accIn3[i];
});
});

q.submit([&](handler &cgh) {
auto accTmp1 = bTmp1.get_access(cgh);
auto accTmp3 = bTmp3.get_access(cgh);
cgh.parallel_for<class KernelTwo>(
nd_range<1>{{dataSize}, {16}},
[=](id<1> i) { accTmp3[i] = accTmp1[i] * 5; });
cgh.parallel_for<class KernelTwo>(nd_range<1>{{dataSize}, {16}},
[=](nd_item<1> ndi) {
auto i = ndi.get_global_id(0);
accTmp3[i] = accTmp1[i] * 5;
});
});

q.submit([&](handler &cgh) {
Expand Down
8 changes: 5 additions & 3 deletions sycl/test-e2e/KernelFusion/existing_local_accessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,11 @@ int main() {
cgh, sycl::ext::codeplay::experimental::property::promote_local{});
auto accIn3 = bIn3.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<class KernelTwo>(
nd_range<1>{{dataSize}, {16}},
[=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
cgh.parallel_for<class KernelTwo>(nd_range<1>{{dataSize}, {16}},
[=](nd_item<1> ndi) {
auto i = ndi.get_global_id(0);
accOut[i] = accTmp[i] * accIn3[i];
});
});

fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ int main() {
auto accTmp = bTmp.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_local{});
cgh.parallel_for<class KernelOne>(
nd_range<1>{{dataSize}, {4}}, [=](id<1> id) {
nd_range<1>{{dataSize}, {4}}, [=](nd_item<1> ndi) {
auto id = ndi.get_global_id();
const auto &accIn1Wrapp = accIn1[id];
const auto &accIn2Wrapp = accIn2[id];
auto &accTmpWrapp = accTmp[id];
Expand All @@ -105,7 +106,8 @@ int main() {
auto accIn3 = bIn3.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<class KernelTwo>(
nd_range<1>{{dataSize}, {4}}, [=](id<1> id) {
nd_range<1>{{dataSize}, {4}}, [=](nd_item<1> ndi) {
auto id = ndi.get_global_id();
const auto &tmpWrapp = accTmp[id];
const auto &accIn3Wrapp = accIn3[id];
auto &accOutWrapp = accOut[id];
Expand Down
16 changes: 10 additions & 6 deletions sycl/test-e2e/KernelFusion/local_internalization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,23 @@ int main() {
auto accIn2 = bIn2.get_access(cgh);
auto accTmp = bTmp.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_local{});
cgh.parallel_for<class KernelOne>(
nd_range<1>{{dataSize}, {16}},
[=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
cgh.parallel_for<class KernelOne>(nd_range<1>{{dataSize}, {16}},
[=](nd_item<1> ndi) {
auto i = ndi.get_global_id(0);
accTmp[i] = accIn1[i] + accIn2[i];
});
});

q.submit([&](handler &cgh) {
auto accTmp = bTmp.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_local{});
auto accIn3 = bIn3.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<class KernelTwo>(
nd_range<1>{{dataSize}, {16}},
[=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
cgh.parallel_for<class KernelTwo>(nd_range<1>{{dataSize}, {16}},
[=](nd_item<1> ndi) {
auto i = ndi.get_global_id(0);
accOut[i] = accTmp[i] * accIn3[i];
});
});

fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
Expand Down
8 changes: 4 additions & 4 deletions sycl/test-e2e/KernelFusion/non_unit_local_size.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ int main() {
auto accTmp = bTmp.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_local{});
cgh.parallel_for<class KernelOne>(
nd_range<1>{{128}, {8}}, [=](item<1> i) {
auto baseOffset = i.get_linear_id() * 4;
nd_range<1>{{128}, {8}}, [=](nd_item<1> ndi) {
auto baseOffset = ndi.get_global_linear_id() * 4;
for (size_t j = 0; j < 4; ++j) {
accTmp[baseOffset + j] =
accIn1[baseOffset + j] + accIn2[baseOffset + j];
Expand All @@ -57,8 +57,8 @@ int main() {
auto accIn3 = bIn3.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<class KernelTwo>(
nd_range<1>{{128}, {8}}, [=](item<1> i) {
auto baseOffset = i.get_linear_id() * 4;
nd_range<1>{{128}, {8}}, [=](nd_item<1> ndi) {
auto baseOffset = ndi.get_global_linear_id() * 4;
for (size_t j = 0; j < 4; ++j) {
accOut[baseOffset + j] =
accTmp[baseOffset + j] * accIn3[baseOffset + j];
Expand Down
Loading

0 comments on commit 3bf8a8b

Please sign in to comment.