diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index a23c9884fa993..aed21c8b54509 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -672,7 +672,7 @@ bool testLACC(queue Q, uint32_t MaskStride, try { Q.submit([&](handler &cgh) { - constexpr uint32_t SLMSize = N; + constexpr uint32_t SLMSize = (Threads * N + 8); auto LocalAcc = local_accessor(SLMSize, cgh); cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { @@ -682,10 +682,17 @@ bool testLACC(queue Q, uint32_t MaskStride, uint32_t GlobalElemOffset = GlobalID * N; uint32_t LocalElemOffset = LocalID * N; - simd InVec(GlobalElemOffset, 1); + if (LocalID == 0) { + for (int I = 0; I < Threads * N; I += 8) { + simd InVec(Out + GlobalElemOffset + I); + simd Offsets(I * sizeof(T), sizeof(T)); + scatter(LocalAcc, Offsets, InVec); + } + } + barrier(); - simd ByteOffsets(0, VS * sizeof(T)); - scatter(LocalAcc, ByteOffsets, InVec); + simd ByteOffsets(LocalElemOffset * sizeof(T), + VS * sizeof(T)); auto ByteOffsetsView = ByteOffsets.template select(); simd Vals = gather(LocalAcc, ByteOffsets, Props); @@ -786,8 +793,12 @@ bool testLACC(queue Q, uint32_t MaskStride, } } - simd OutVec = gather(LocalAcc, ByteOffsets, Props); - OutVec.copy_to(Out + GlobalElemOffset); + barrier(); + if (LocalID == 0) { + for (int I = 0; I < Threads * N; I++) { + Out[GlobalElemOffset + I] = LocalAcc[I]; + } + } }); }).wait(); } catch (sycl::exception const &e) {