diff --git a/llvm/lib/SYCLLowerIR/LowerESIMDVecArg.cpp b/llvm/lib/SYCLLowerIR/LowerESIMDVecArg.cpp index cb07af7ae4a8..e34e8d677d42 100644 --- a/llvm/lib/SYCLLowerIR/LowerESIMDVecArg.cpp +++ b/llvm/lib/SYCLLowerIR/LowerESIMDVecArg.cpp @@ -89,9 +89,6 @@ class ESIMDLowerVecArgPass { Function *rewriteFunc(Function &F); Type *getSimdArgPtrTyOrNull(Value *arg); void fixGlobals(Module &M); - void replaceConstExprWithGlobals(Module &M); - ConstantExpr *createNewConstantExpr(GlobalVariable *newGlobalVar, - Type *oldGlobalType, Value *old); void removeOldGlobals(); }; @@ -229,41 +226,6 @@ Function *ESIMDLowerVecArgPass::rewriteFunc(Function &F) { return NF; } -// Replace ConstantExpr if it contains old global variable. -ConstantExpr * -ESIMDLowerVecArgPass::createNewConstantExpr(GlobalVariable *NewGlobalVar, - Type *OldGlobalType, Value *Old) { - ConstantExpr *NewConstantExpr = nullptr; - - if (isa(Old)) { - NewConstantExpr = cast( - ConstantExpr::getBitCast(NewGlobalVar, OldGlobalType)); - return NewConstantExpr; - } - - auto InnerMost = createNewConstantExpr( - NewGlobalVar, OldGlobalType, cast(Old)->getOperand(0)); - - NewConstantExpr = cast( - cast(Old)->getWithOperandReplaced(0, InnerMost)); - - return NewConstantExpr; -} - -// Globals are part of ConstantExpr. This loop iterates over -// all such instances and replaces them with a new ConstantExpr -// consisting of new global vector* variable. -void ESIMDLowerVecArgPass::replaceConstExprWithGlobals(Module &M) { - for (auto &GlobalVars : OldNewGlobal) { - auto &G = *GlobalVars.first; - for (auto UseOfG : G.users()) { - auto NewGlobal = GlobalVars.second; - auto NewConstExpr = createNewConstantExpr(NewGlobal, G.getType(), UseOfG); - UseOfG->replaceAllUsesWith(NewConstExpr); - } - } -} - // This function creates new global variables of type vector* type // when old one is of simd* type. 
void ESIMDLowerVecArgPass::fixGlobals(Module &M) { @@ -288,16 +250,17 @@ void ESIMDLowerVecArgPass::fixGlobals(Module &M) { } } - replaceConstExprWithGlobals(M); - removeOldGlobals(); } // Remove old global variables from the program. void ESIMDLowerVecArgPass::removeOldGlobals() { for (auto &G : OldNewGlobal) { - G.first->removeDeadConstantUsers(); - G.first->eraseFromParent(); + auto OldGlob = G.first; + auto NewGlobal = G.second; + OldGlob->replaceAllUsesWith( + ConstantExpr::getBitCast(NewGlobal, OldGlob->getType())); + OldGlob->eraseFromParent(); } } diff --git a/llvm/test/SYCLLowerIR/esimd_global_crash.ll b/llvm/test/SYCLLowerIR/esimd_global_crash.ll new file mode 100644 index 000000000000..c25f1e494528 --- /dev/null +++ b/llvm/test/SYCLLowerIR/esimd_global_crash.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -ESIMDLowerVecArg -S | FileCheck %s +
+; This test checks that there is no compiler crash when a Global
+; is used in a simple instruction, not directly in a ConstantExpr.
+ +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown-sycldevice" + +%"class.cl::sycl::INTEL::gpu::simd" = type { <2512 x i32> } + +; CHECK: @Global = dso_local global <2512 x i32> undef, align 16384 +@Global = dso_local global %"class.cl::sycl::INTEL::gpu::simd" undef, align 16384 + +define void @no_crash(<2512 x i32> %simd_val) { +; CHECK-LABEL: @no_crash( +; CHECK-NEXT: [[CAST:%.*]] = addrspacecast %"class.cl::sycl::INTEL::gpu::simd"* bitcast (<2512 x i32>* @Global to %"class.cl::sycl::INTEL::gpu::simd"*) to %"class.cl::sycl::INTEL::gpu::simd" addrspace(4)* +; CHECK-NEXT: [[GEP:%.*]] = getelementptr %"class.cl::sycl::INTEL::gpu::simd", %"class.cl::sycl::INTEL::gpu::simd" addrspace(4)* [[CAST]], i64 0, i32 0 +; CHECK-NEXT: store <2512 x i32> [[SIMD_VAL:%.*]], <2512 x i32> addrspace(4)* [[GEP]], align 16384 +; CHECK-NEXT: ret void +; + %cast = addrspacecast %"class.cl::sycl::INTEL::gpu::simd"* @Global to %"class.cl::sycl::INTEL::gpu::simd" addrspace(4)* + %gep = getelementptr %"class.cl::sycl::INTEL::gpu::simd", %"class.cl::sycl::INTEL::gpu::simd" addrspace(4)* %cast, i64 0, i32 0 + store <2512 x i32> %simd_val, <2512 x i32> addrspace(4)* %gep, align 16384 + ret void +} diff --git a/sycl/include/CL/sycl/detail/type_traits.hpp b/sycl/include/CL/sycl/detail/type_traits.hpp index df480f58f99f..ee2a3bd9a327 100644 --- a/sycl/include/CL/sycl/detail/type_traits.hpp +++ b/sycl/include/CL/sycl/detail/type_traits.hpp @@ -207,6 +207,19 @@ template struct is_vector_arithmetic : bool_constant::value && is_arithmetic::value> {}; +// is_bool +template +struct is_scalar_bool + : bool_constant, bool>::value> {}; + +template +struct is_vector_bool + : bool_constant::value && + is_scalar_bool>::value> {}; + +template +struct is_bool : bool_constant>::value> {}; + // is_pointer template struct is_pointer_impl : std::false_type {}; diff --git a/sycl/include/CL/sycl/group.hpp 
b/sycl/include/CL/sycl/group.hpp index 9692f33b094e..52b5e61d1da7 100644 --- a/sycl/include/CL/sycl/group.hpp +++ b/sycl/include/CL/sycl/group.hpp @@ -274,58 +274,99 @@ template class group { __spirv_MemoryBarrier(__spv::Scope::Workgroup, flags); } + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest + /// with a source stride specified by \p srcStride, and returns a SYCL + /// device_event which can be used to wait on the completion of the copy. + /// Permitted types for dataT are all scalar and vector types, except boolean. template - device_event async_work_group_copy(local_ptr dest, - global_ptr src, - size_t numElements) const { + detail::enable_if_t::value, device_event> + async_work_group_copy(local_ptr dest, global_ptr src, + size_t numElements, size_t srcStride) const { using DestT = detail::ConvertToOpenCLType_t; using SrcT = detail::ConvertToOpenCLType_t; - __ocl_event_t e = OpGroupAsyncCopyGlobalToLocal( + __ocl_event_t E = OpGroupAsyncCopyGlobalToLocal( __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), - numElements, 1, 0); - return device_event(&e); + numElements, srcStride, 0); + return device_event(&E); } + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest with + /// the destination stride specified by \p destStride, and returns a SYCL + /// device_event which can be used to wait on the completion of the copy. + /// Permitted types for dataT are all scalar and vector types, except boolean. 
template - device_event async_work_group_copy(global_ptr dest, - local_ptr src, - size_t numElements) const { + detail::enable_if_t::value, device_event> + async_work_group_copy(global_ptr dest, local_ptr src, + size_t numElements, size_t destStride) const { using DestT = detail::ConvertToOpenCLType_t; using SrcT = detail::ConvertToOpenCLType_t; - __ocl_event_t e = OpGroupAsyncCopyLocalToGlobal( + __ocl_event_t E = OpGroupAsyncCopyLocalToGlobal( __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), - numElements, 1, 0); - return device_event(&e); + numElements, destStride, 0); + return device_event(&E); + } + + /// Specialization for scalar bool type. + /// Asynchronously copies a number of elements specified by \p NumElements + /// from the source pointed by \p Src to destination pointed by \p Dest + /// with a stride specified by \p Stride, and returns a SYCL device_event + /// which can be used to wait on the completion of the copy. + template + detail::enable_if_t::value, device_event> + async_work_group_copy(multi_ptr Dest, multi_ptr Src, + size_t NumElements, size_t Stride) const { + static_assert(sizeof(bool) == sizeof(uint8_t), + "Async copy to/from bool memory is not supported."); + auto DestP = + multi_ptr(reinterpret_cast(Dest.get())); + auto SrcP = + multi_ptr(reinterpret_cast(Src.get())); + return async_work_group_copy(DestP, SrcP, NumElements, Stride); + } + + /// Specialization for vector bool type. + /// Asynchronously copies a number of elements specified by \p NumElements + /// from the source pointed by \p Src to destination pointed by \p Dest + /// with a stride specified by \p Stride, and returns a SYCL device_event + /// which can be used to wait on the completion of the copy. 
+ template + detail::enable_if_t::value, device_event> + async_work_group_copy(multi_ptr Dest, multi_ptr Src, + size_t NumElements, size_t Stride) const { + static_assert(sizeof(bool) == sizeof(uint8_t), + "Async copy to/from bool memory is not supported."); + using VecT = detail::change_base_type_t; + auto DestP = multi_ptr(reinterpret_cast(Dest.get())); + auto SrcP = multi_ptr(reinterpret_cast(Src.get())); + return async_work_group_copy(DestP, SrcP, NumElements, Stride); } + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest and + /// returns a SYCL device_event which can be used to wait on the completion + /// of the copy. + /// Permitted types for dataT are all scalar and vector types. template device_event async_work_group_copy(local_ptr dest, global_ptr src, - size_t numElements, - size_t srcStride) const { - using DestT = detail::ConvertToOpenCLType_t; - using SrcT = detail::ConvertToOpenCLType_t; - - __ocl_event_t e = OpGroupAsyncCopyGlobalToLocal( - __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), - numElements, srcStride, 0); - return device_event(&e); + size_t numElements) const { + return async_work_group_copy(dest, src, numElements, 1); } + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest and + /// returns a SYCL device_event which can be used to wait on the completion + /// of the copy. + /// Permitted types for dataT are all scalar and vector types. 
template device_event async_work_group_copy(global_ptr dest, local_ptr src, - size_t numElements, - size_t destStride) const { - using DestT = detail::ConvertToOpenCLType_t; - using SrcT = detail::ConvertToOpenCLType_t; - - __ocl_event_t e = OpGroupAsyncCopyLocalToGlobal( - __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), - numElements, destStride, 0); - return device_event(&e); + size_t numElements) const { + return async_work_group_copy(dest, src, numElements, 1); } template diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 77db78115de0..370abcfb0b01 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -1589,7 +1589,7 @@ class __SYCL_EXPORT handler { // Make sure data shared_ptr points to is not released until we finish // work with it. MSharedPtrStorage.push_back(Dst); - T_Dst *RawDstPtr = Dst.get(); + typename shared_ptr_class::element_type *RawDstPtr = Dst.get(); copy(Src, RawDstPtr); } @@ -1612,7 +1612,7 @@ class __SYCL_EXPORT handler { // Make sure data shared_ptr points to is not released until we finish // work with it. MSharedPtrStorage.push_back(Src); - T_Src *RawSrcPtr = Src.get(); + typename shared_ptr_class::element_type *RawSrcPtr = Src.get(); copy(RawSrcPtr, Dst); } diff --git a/sycl/test/basic_tests/group_async_copy.cpp b/sycl/test/basic_tests/group_async_copy.cpp new file mode 100644 index 000000000000..e41b7414a19c --- /dev/null +++ b/sycl/test/basic_tests/group_async_copy.cpp @@ -0,0 +1,160 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.run +// RUN: %GPU_RUN_PLACEHOLDER %t.run +// RUN: %CPU_RUN_PLACEHOLDER %t.run +// RUN: %ACC_RUN_PLACEHOLDER %t.run +// RUN: env SYCL_DEVICE_FILTER=host %t.run + +#include +#include +#include + +using namespace cl::sycl; + +template class KernelName; + +// Define the number of work items to enqueue. 
+const size_t NElems = 32; +const size_t WorkGroupSize = 8; +const size_t NWorkGroups = NElems / WorkGroupSize; + +template void initInputBuffer(buffer &Buf, size_t Stride) { + auto Acc = Buf.template get_access(); + for (size_t I = 0; I < Buf.get_count(); I += WorkGroupSize) { + for (size_t J = 0; J < WorkGroupSize; J++) + Acc[I + J] = I + J + ((J % Stride == 0) ? 100 : 0); + } +} + +template void initOutputBuffer(buffer &Buf) { + auto Acc = Buf.template get_access(); + for (size_t I = 0; I < Buf.get_count(); I++) + Acc[I] = 0; +} + +template struct is_vec : std::false_type {}; +template struct is_vec> : std::true_type {}; + +template bool checkEqual(vec A, size_t B) { + T TB = B; + return A.s0() == TB; +} + +template bool checkEqual(vec A, size_t B) { + T TB = B; + return A.x() == TB && A.y() == TB && A.z() == TB && A.w() == TB; +} + +template +typename std::enable_if::value, bool>::type checkEqual(T A, + size_t B) { + T TB = B; + return A == TB; +} + +template std::string toString(vec A) { + std::string R("("); + return R + std::to_string(A.s0()) + ")"; +} + +template std::string toString(vec A) { + std::string R("("); + R += std::to_string(A.x()) + "," + std::to_string(A.y()) + "," + + std::to_string(A.z()) + "," + std::to_string(A.w()) + ")"; + return R; +} + +template +typename std::enable_if::value, std::string>::type toString(T A) { + return std::to_string(A); +} + +template int checkResults(buffer &OutBuf, size_t Stride) { + auto Out = OutBuf.template get_access(); + int EarlyFailout = 20; + + for (size_t I = 0; I < OutBuf.get_count(); I += WorkGroupSize) { + for (size_t J = 0; J < WorkGroupSize; J++) { + size_t ExpectedVal = (J % Stride == 0) ? 
(100 + I + J) : 0; + if (!checkEqual(Out[I + J], ExpectedVal)) { + std::cerr << std::string(typeid(T).name()) + ": Stride=" << Stride + << " : Incorrect value at index " << I + J + << " : Expected: " << toString(ExpectedVal) + << ", Computed: " << toString(Out[I + J]) << "\n"; + if (--EarlyFailout == 0) + return 1; + } + } + } + return EarlyFailout - 20; +} + +template int test(size_t Stride) { + queue Q; + + buffer InBuf(NElems); + buffer OutBuf(NElems); + + initInputBuffer(InBuf, Stride); + initOutputBuffer(OutBuf); + + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + auto Out = OutBuf.template get_access(CGH); + accessor Local( + range<1>{WorkGroupSize}, CGH); + + nd_range<1> NDR{range<1>(NElems), range<1>(WorkGroupSize)}; + CGH.parallel_for>(NDR, [=](nd_item<1> NDId) { + auto GrId = NDId.get_group_linear_id(); + auto Group = NDId.get_group(); + size_t NElemsToCopy = + WorkGroupSize / Stride + ((WorkGroupSize % Stride) ? 1 : 0); + size_t Offset = GrId * WorkGroupSize; + if (Stride == 1) { // Check the version without stride arg. + auto E = NDId.async_work_group_copy( + Local.get_pointer(), In.get_pointer() + Offset, NElemsToCopy); + E.wait(); + } else { + auto E = NDId.async_work_group_copy(Local.get_pointer(), + In.get_pointer() + Offset, + NElemsToCopy, Stride); + E.wait(); + } + + if (Stride == 1) { // Check the version without stride arg. 
+ auto E = Group.async_work_group_copy( + Out.get_pointer() + Offset, Local.get_pointer(), NElemsToCopy); + Group.wait_for(E); + } else { + auto E = Group.async_work_group_copy(Out.get_pointer() + Offset, + Local.get_pointer(), NElemsToCopy, + Stride); + Group.wait_for(E); + } + }); + }).wait(); + + return checkResults(OutBuf, Stride); +} + +int main() { + for (int Stride = 1; Stride < WorkGroupSize; Stride++) { + if (test(Stride)) + return 1; + if (test>(Stride)) + return 1; + if (test(Stride)) + return 1; + if (test(Stride)) + return 1; + if (test>(Stride)) + return 1; + if (test>(Stride)) + return 1; + if (test(Stride)) + return 1; + } + + std::cout << "Test passed.\n"; + return 0; +} diff --git a/sycl/test/basic_tests/handler/handler_mem_op.cpp b/sycl/test/basic_tests/handler/handler_mem_op.cpp index 2f9460051ed2..c0d3b1b10d4e 100644 --- a/sycl/test/basic_tests/handler/handler_mem_op.cpp +++ b/sycl/test/basic_tests/handler/handler_mem_op.cpp @@ -348,7 +348,7 @@ template void test_copy_acc_ptr() { template void test_copy_shared_ptr_acc() { const size_t Size = 10; T Data[Size] = {0}; - std::shared_ptr Values(new T[Size]()); + std::shared_ptr Values(new T[Size]()); for (size_t I = 0; I < Size; ++I) { Values.get()[I] = I; } @@ -369,7 +369,7 @@ template void test_copy_shared_ptr_acc() { template void test_copy_shared_ptr_const_acc() { constexpr size_t Size = 10; T Data[Size] = {0}; - std::shared_ptr Values(new T[Size]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + std::shared_ptr Values(new T[Size]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); { buffer Buffer(Data, range<1>(Size)); queue Queue; @@ -390,7 +390,7 @@ template void test_copy_acc_shared_ptr() { for (size_t I = 0; I < Size; ++I) { Data[I] = I; } - std::shared_ptr Values(new T[Size]()); + std::shared_ptr Values(new T[Size]()); { buffer Buffer(Data, range<1>(Size)); queue Queue;