diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 4ffd5c418138c..c26eb1ceb3204 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -4315,9 +4315,8 @@ slm_scatter(OffsetSimdViewT byte_offsets, simd vals, /// void slm_scatter( /// OffsetSimdViewT byte_offsets, simd vals, /// PropertyListT props = {}); // (slm-sc-4) -/// Loads ("gathers") elements of the type 'T' from Shared Local Memory -/// locations addressed by byte offsets \p byte_offsets, and returns the loaded -/// elements. +/// Stores ("scatters") elements of the type 'T' to Shared Local Memory +/// locations addressed by byte offsets \p byte_offsets. /// @tparam T Element type. /// @tparam N Number of elements to read. /// @tparam VS Vector size. It can also be read as the number of reads per each @@ -7913,6 +7912,196 @@ __ESIMD_API offsets + glob_offset + __ESIMD_DNS::localAccessorToOffset(acc), mask); } +/// Variant of scatter that uses local accessor as a parameter +/// template +/// void scatter(AccessorT acc, +/// simd byte_offsets, +/// simd vals, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-sc-1) + +/// template +/// void scatter(AccessorT acc, +/// simd byte_offsets, +/// simd vals, +/// PropertyListT props = {}); // (lacc-sc-2) + +/// The next two functions are similar to lacc-sc-{1,2} with the 'byte_offsets' +/// parameter represerented as 'simd_view'. + +/// template +/// void scatter(AccessorT acc, +/// OffsetSimdViewT byte_offsets, +/// simd vals, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-sc-3) + +/// template +/// void scatter(AccessorT acc, +/// OffsetSimdViewT byte_offsets, +/// simd vals, +/// PropertyListT props = {}); // (lacc-sc-4) + +/// template +/// void scatter(AccessorT acc, +/// simd byte_offsets, +/// simd vals, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-sc-1) +/// +/// Writes ("scatters") elements of the input vector to memory locations +/// addressed by the local accessor \p acc and byte offsets \p byte_offsets. +/// Access to any element's memory location can be disabled via +/// the input mask. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param acc The accessor to scatter to. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector to scatter. +/// @param mask The access mask. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +template +__ESIMD_API std::enable_if_t< + detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v> +scatter(AccessorT acc, simd byte_offsets, simd vals, + simd_mask mask, PropertyListT props = {}) { + slm_scatter(byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), + vals, mask, props); +} + +/// template +/// void scatter(AccessorT acc, +/// simd byte_offsets, +/// simd vals, +/// PropertyListT props = {}); // (lacc-sc-2) +/// +/// Writes ("scatters") elements of the input vector to memory locations +/// addressed by the local accessor \p acc and byte offsets \p byte_offsets. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param acc The accessor to scatter to. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector to scatter. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +template +__ESIMD_API std::enable_if_t< + detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v> +scatter(AccessorT acc, simd byte_offsets, simd vals, + PropertyListT props = {}) { + simd_mask Mask = 1; + scatter(acc, byte_offsets, vals, Mask, props); +} + +/// template +/// void scatter(AccessorT acc, +/// OffsetSimdViewT byte_offsets, +/// simd vals, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-sc-3) +/// +/// Writes ("scatters") elements of the input vector to memory locations +/// addressed by the local accessor \p acc and byte offsets \p byte_offsets. +/// Access to any element's memory location can be disabled via the input mask. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param acc The accessor to scatter to. +/// @param byte_offsets the vector of 32-bit offsets in bytes +/// represented as a 'simd_view' object. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector to scatter. +/// @param mask The access mask. +/// @param props The optional compile-time properties. Only 'alignment' +/// and cache hint properties are used. +template +__ESIMD_API std::enable_if_t< + detail::is_local_accessor_with_v && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd vals, + simd_mask mask, PropertyListT props = {}) { + scatter(acc, byte_offsets.read(), vals, mask, props); +} + +/// template +/// void scatter(AccessorT acc, +/// OffsetSimdViewT byte_offsets, +/// simd vals, +/// PropertyListT props = {}); // (lacc-sc-4) +/// +/// Writes ("scatters") elements of the input vector to memory locations +/// addressed by the local accessor \p acc and byte offsets \p byte_offsets. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param acc The accessor to scatter to. +/// @param byte_offsets the vector of 32-bit offsets in bytes +/// represented as a 'simd_view' object. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector to scatter. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +template +__ESIMD_API std::enable_if_t< + detail::is_local_accessor_with_v && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd vals, + PropertyListT props = {}) { + simd_mask Mask = 1; + scatter(acc, byte_offsets.read(), vals, Mask, props); +} + /// Variant of scatter that uses local accessor as a parameter /// /// Writes elements of a \ref simd object into an accessor at given offsets. @@ -7935,7 +8124,7 @@ template __ESIMD_API std::enable_if_t> scatter(AccessorTy acc, simd offsets, simd vals, - uint32_t glob_offset = 0, simd_mask mask = 1) { + uint32_t glob_offset, simd_mask mask = 1) { slm_scatter(offsets + glob_offset + __ESIMD_DNS::localAccessorToOffset(acc), vals, mask); diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp index e34f259c093ec..e85e16e03d3e1 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp @@ -796,7 +796,6 @@ bool testLACC(queue Q, uint32_t MaskStride, PropertiesT) { In[I] = esimd_test::getRandomValue(); try { - buffer InBuf(In, Size * 2); Q.submit([&](handler &CGH) { // Allocate a bit more to safely initialize it with 8-element chunks. constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T); diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index 8ea1fcf4a08ad..b3b3d498276f4 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -449,3 +449,216 @@ template bool testSLM(queue Q) { return Passed; } + +template +bool testLACC(queue Q, uint32_t MaskStride, + ScatterPropertiesT ScatterProperties) { + constexpr uint32_t Groups = 8; + constexpr uint32_t Threads = 1; + constexpr size_t Size = Groups * Threads * N; + static_assert(VS > 0 && N % VS == 0, + "Incorrect VS parameter. N must be divisible by VS."); + constexpr int NOffsets = N / VS; + using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t; + + std::cout << "Local Accessor case: T=" << esimd_test::type_name() + << ",N=" << N << ", VS=" << VS << ",UseMask=" << UseMask + << ",UseProperties=" << UseProperties << std::endl; + + sycl::range<1> GlobalRange{Groups}; + sycl::range<1> LocalRange{Threads}; + sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange}; + + T *Out = static_cast(sycl::malloc_shared(Size * sizeof(T), Q)); + for (size_t i = 0; i < Size; i++) + Out[i] = i; + + try { + Q.submit([&](handler &cgh) { + constexpr uint32_t SLMSize = N; + auto LocalAcc = local_accessor(SLMSize, cgh); + + cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { + ScatterPropertiesT Props{}; + uint16_t GlobalID = ndi.get_global_id(0); + uint16_t LocalID = ndi.get_local_id(0); + uint32_t GlobalElemOffset = GlobalID * N; + uint32_t LocalElemOffset = LocalID * N; + + simd InVec(GlobalElemOffset, 1); + + simd ByteOffsets(0, VS * sizeof(T)); + scatter(LocalAcc, ByteOffsets, InVec); + auto ByteOffsetsView = ByteOffsets.template select(); + simd Vals = gather(LocalAcc, ByteOffsets, Props); + + Vals *= 2; + + auto ValsView = Vals.template select(); + + simd_mask Pred = 0; + for (int I = 0; I < NOffsets; I++) + Pred[I] = (I % MaskStride == 0) ? 1 : 0; + if constexpr (VS > 1) { // VS > 1 requires specifying + if constexpr (UseMask) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(LocalAcc, ByteOffsets, Vals, Pred, Props); + else if (GlobalID % 4 == 1) + scatter(LocalAcc, ByteOffsetsView, Vals, Pred, + Props); + else if (GlobalID % 4 == 2) + scatter(LocalAcc, ByteOffsets, ValsView, Pred, + Props); + else if (GlobalID % 4 == 3) + scatter(LocalAcc, ByteOffsetsView, ValsView, Pred, + Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(LocalAcc, ByteOffsets, Vals, Pred); + else if (GlobalID % 4 == 1) + scatter(LocalAcc, ByteOffsetsView, Vals, Pred); + else if (GlobalID % 4 == 2) + scatter(LocalAcc, ByteOffsets, ValsView, Pred); + else if (GlobalID % 4 == 3) + scatter(LocalAcc, ByteOffsetsView, ValsView, Pred); + } + } else { // UseMask == false + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(LocalAcc, ByteOffsets, Vals, Props); + else if (GlobalID % 4 == 1) + scatter(LocalAcc, ByteOffsetsView, Vals, Props); + else if (GlobalID % 4 == 2) + scatter(LocalAcc, ByteOffsets, ValsView, Props); + else if (GlobalID % 4 == 3) + scatter(LocalAcc, ByteOffsetsView, ValsView, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(LocalAcc, ByteOffsets, Vals); + else if (GlobalID % 4 == 1) + scatter(LocalAcc, ByteOffsetsView, Vals); + else if (GlobalID % 4 == 2) + scatter(LocalAcc, ByteOffsets, ValsView); + else if (GlobalID % 4 == 3) + scatter(LocalAcc, ByteOffsetsView, ValsView); + } + } + } else { // VS == 1 + if constexpr (UseMask) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(LocalAcc, ByteOffsets, Vals, Pred, Props); + else if (GlobalID % 4 == 1) + scatter(LocalAcc, ByteOffsetsView, Vals, Pred, Props); + else if (GlobalID % 4 == 2) + scatter(LocalAcc, ByteOffsets, ValsView, Pred, Props); + else if (GlobalID % 4 == 3) + scatter(LocalAcc, ByteOffsetsView, ValsView, Pred, + Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(LocalAcc, ByteOffsets, Vals, Pred); + else if (GlobalID % 4 == 1) + scatter(LocalAcc, ByteOffsetsView, Vals, Pred); + else if (GlobalID % 4 == 2) + scatter(LocalAcc, ByteOffsets, ValsView, Pred); + else if (GlobalID % 4 == 3) + scatter(LocalAcc, ByteOffsetsView, ValsView, Pred); + } + } else { // UseMask == false + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(LocalAcc, ByteOffsets, Vals, Props); + else if (GlobalID % 4 == 1) + scatter(LocalAcc, ByteOffsetsView, Vals, Props); + else if (GlobalID % 4 == 2) + scatter(LocalAcc, ByteOffsets, ValsView, Props); + else if (GlobalID % 4 == 3) + scatter(LocalAcc, ByteOffsetsView, ValsView, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(LocalAcc, ByteOffsets, Vals); + else if (GlobalID % 4 == 1) + scatter(LocalAcc, ByteOffsetsView, Vals); + else if (GlobalID % 4 == 2) + scatter(LocalAcc, ByteOffsets, ValsView); + else if (GlobalID % 4 == 3) + scatter(LocalAcc, ByteOffsetsView, ValsView); + } + } + } + + simd OutVec = gather(LocalAcc, ByteOffsets, Props); + OutVec.copy_to(Out + GlobalElemOffset); + }); + }).wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + sycl::free(Out, Q); + return false; + } + + bool Passed = verify(Out, N, Size, VS, MaskStride, UseMask); + + sycl::free(Out, Q); + + return Passed; +} + +template bool testLACC(queue Q) { + constexpr bool CheckMask = true; + constexpr bool CheckProperties = true; + properties EmptyProps; + properties AlignElemProps{alignment}; + + bool Passed = true; + + // Test scatter() that is available on Gen12 and PVC. + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 1, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 1, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + + // // Test scatter() without passing compile-time properties argument. + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + + if constexpr (Features == TestFeatures::PVC || + Features == TestFeatures::DG2) { + properties LSCProps{alignment}; + Passed &= testLACC(Q, 2, LSCProps); + Passed &= testLACC(Q, 2, LSCProps); + Passed &= testLACC(Q, 2, LSCProps); + Passed &= testLACC(Q, 2, LSCProps); + + Passed &= testLACC(Q, 2, LSCProps); + + // Check VS > 1. GPU supports only dwords and qwords in this mode. + if constexpr (sizeof(T) >= 4) { + // TODO: This test case causes flaky fail. Enable it after the issue + // in GPU driver is fixed. + // Passed &= + // testLACC(Q, 2, + // AlignElemProps) + Passed &= + testLACC(Q, 2, AlignElemProps); + Passed &= + testLACC(Q, 2, AlignElemProps); + Passed &= + testLACC(Q, 2, AlignElemProps); + } + } // TestPVCFeatures + + return Passed; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp new file mode 100644 index 0000000000000..301392a247381 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp @@ -0,0 +1,36 @@ +//==------- scatter_lacc.cpp - DPC++ ESIMD on-device test -----------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES-INTEL-DRIVER: lin: 26816, win: 101.51086 +// Use per-kernel compilation to have more information about failing cases. +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::scatter() functions accepting local accessor +// and optional compile-time esimd::properties. +// The scatter() calls in this test do not use VS > 1 (number of loads per +// offset) to not impose using PVC features. + +#include "Inputs/scatter.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::Generic; + bool Passed = true; + + Passed &= testLACC(Q); + Passed &= testLACC(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= testLACC(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc_dg2_pvc.cpp new file mode 100644 index 0000000000000..da358621df927 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc_dg2_pvc.cpp @@ -0,0 +1,38 @@ +//==------- scatter_lacc_dg2_pvc.cpp - DPC++ ESIMD on-device test------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------===// +// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::scatter() functions accepting local accessor +// and optional compile-time esimd::properties. +// The scatter() calls in this test use VS > 1 (number of loads per +// offset) and requires DG2 or PVC. + +#include "Inputs/scatter.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::PVC; + bool Passed = true; + + Passed &= testLACC(Q); + Passed &= testLACC(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= testLACC(Q); + if (Q.get_device().has(sycl::aspect::fp64)) + Passed &= testLACC(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index 8305bd9b83b18..f9a1057a3550a 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -1282,9 +1282,71 @@ test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf, scatter(ptrf, ioffset_n16_view, usm_view); + // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}) + scatter(local_acc, ioffset_n32, usm, mask_n32); + + scatter(local_acc, ioffset_n32, usm); + + scatter(local_acc, ioffset_n32, usm, mask_n32, props_align4); + + scatter(local_acc, ioffset_n32, usm, props_align4); + + // CHECK-COUNT-8: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}) + scatter(local_acc, ioffset_n32, usm, mask_n32, props_align4); + scatter(local_acc, ioffset_n32, usm, props_align4); + + scatter(local_acc, ioffset_n32_view, usm, mask_n32, props_align4); + scatter(local_acc, ioffset_n32_view, usm, props_align4); + + scatter(local_acc, ioffset_n32, usm_view, mask_n32, props_align4); + scatter(local_acc, ioffset_n32, usm_view, props_align4); + + scatter(local_acc, ioffset_n32_view, usm_view, mask_n32, + props_align4); + scatter(local_acc, ioffset_n32_view, usm_view, props_align4); + + // VS > 1 + // CHECK-COUNT-8: call void @llvm.genx.lsc.store.slm.v16i1.v16i32.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, <32 x i32>{{[^)]+}}, i32 0) + scatter(local_acc, ioffset_n16, usm, mask_n16, props_align4); + + scatter(local_acc, ioffset_n16, usm, props_align4); + + scatter(local_acc, ioffset_n16_view, usm, mask_n16, + props_align4); + scatter(local_acc, ioffset_n16_view, usm, props_align4); + + scatter(local_acc, ioffset_n16, usm_view, mask_n16, + props_align4); + scatter(local_acc, ioffset_n16, usm_view, props_align4); + + scatter(local_acc, ioffset_n16_view, usm_view, mask_n16, + props_align4); + scatter(local_acc, ioffset_n16_view, usm_view, props_align4); + + // CHECK-COUNT-8: call void @llvm.genx.lsc.store.slm.v16i1.v16i32.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, <32 x i32>{{[^)]+}}, i32 0) + scatter(local_acc, ioffset_n16, usm, mask_n16); + + scatter(local_acc, ioffset_n16, usm); + + scatter(local_acc, ioffset_n16_view, usm, mask_n16); + + scatter(local_acc, ioffset_n16_view, usm); + + scatter(local_acc, ioffset_n16, usm_view, mask_n16); + + scatter(local_acc, ioffset_n16, usm_view); + + scatter(local_acc, ioffset_n16_view, usm_view, mask_n16); + + scatter(local_acc, ioffset_n16_view, usm_view); simd ioffset_n10(byte_offset32, 8); simd usm_n10; + // Check special case involving glbal offset and mask + // CHECK-COUNT-2: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}) + scatter(local_acc, ioffset_n32, usm, 0, 1); + scatter(local_acc, ioffset_n32, usm, 0); + // Check special case to verify that for cases when N is not power of 2 llvm // intrinsic is used // CHECK-COUNT-1: call void @llvm.masked.scatter.v10f32.v10p4(<10 x float> {{[^)]+}}, <10 x ptr addrspace(4)> {{[^)]+}}, i32 4, <10 x i1> {{[^)]+}})