From bd178006c87aaa57699b54548a65b105e114bc8b Mon Sep 17 00:00:00 2001 From: Buildbot for SYCL Date: Tue, 10 Sep 2024 22:00:39 +0800 Subject: [PATCH 01/19] [GHA] Uplift Linux GPU RT version to 24.31.30508.7 (#15339) Scheduled drivers uplift Co-authored-by: GitHub Actions --- devops/dependencies.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/devops/dependencies.json b/devops/dependencies.json index 5398ad7d5a383..ea84983b95eb2 100644 --- a/devops/dependencies.json +++ b/devops/dependencies.json @@ -19,9 +19,9 @@ "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "level_zero": { - "github_tag": "v1.17.39", - "version": "v1.17.39", - "url": "https://github.com/oneapi-src/level-zero/releases/tag/v1.17.39", + "github_tag": "v1.17.42", + "version": "v1.17.42", + "url": "https://github.com/oneapi-src/level-zero/releases/tag/v1.17.42", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "tbb": { From 594ae74fae19b65484230df7a8677402d18e2b9d Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Tue, 10 Sep 2024 23:00:59 +0900 Subject: [PATCH 02/19] [SYCL] Simplify arguments to computeModuleProperties (#15271) Computing information about spec consts is difficult for callers when module properties generation and splitting are separate. Simplify it by storing information in module we can use invisibly to the caller. This will be used for thinLTO, where we split early and compute module properties later. This should be basically NFC. Signed-off-by: Sarnie, Nick --- .../llvm/SYCLLowerIR/ComputeModuleRuntimeInfo.h | 4 +--- llvm/include/llvm/SYCLLowerIR/SpecConstants.h | 10 ++++++++++ llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp | 13 +++++++++---- llvm/lib/SYCLLowerIR/SpecConstants.cpp | 6 +++--- .../spec-constants/default-value/bool.ll | 1 + llvm/tools/sycl-post-link/sycl-post-link.cpp | 5 ++--- 6 files changed, 26 insertions(+), 13 deletions(-) diff --git a/llvm/include/llvm/SYCLLowerIR/ComputeModuleRuntimeInfo.h b/llvm/include/llvm/SYCLLowerIR/ComputeModuleRuntimeInfo.h index eaeecb44deb03..e7cff6c730051 100644 --- a/llvm/include/llvm/SYCLLowerIR/ComputeModuleRuntimeInfo.h +++ b/llvm/include/llvm/SYCLLowerIR/ComputeModuleRuntimeInfo.h @@ -34,9 +34,7 @@ using EntryPointSet = SetVector; PropSetRegTy computeModuleProperties(const Module &M, const EntryPointSet &EntryPoints, - const GlobalBinImageProps &GlobProps, - bool SpecConstsMet, - bool IsSpecConstantDefault); + const GlobalBinImageProps &GlobProps); std::string computeModuleSymbolTable(const Module &M, const EntryPointSet &EntryPoints); diff --git a/llvm/include/llvm/SYCLLowerIR/SpecConstants.h b/llvm/include/llvm/SYCLLowerIR/SpecConstants.h index 8bf8bdf894d07..114bf431a279a 100644 --- a/llvm/include/llvm/SYCLLowerIR/SpecConstants.h +++ b/llvm/include/llvm/SYCLLowerIR/SpecConstants.h @@ -72,6 +72,16 @@ class SpecConstantsPass : public PassInfoMixin { collectSpecConstantDefaultValuesMetadata(const Module &M, std::vector &DefaultValues); + // Name of the metadata which holds a list of all specialization constants + // (with associated information) encountered in the module + static constexpr char SPEC_CONST_MD_STRING[] = + "sycl.specialization-constants"; + + // Name of the metadata which indicates this module was proccessed with the + // default values handing mode. + static constexpr char SPEC_CONST_DEFAULT_VAL_MODULE_MD_STRING[] = + "sycl.specialization-constants-default-values-module"; + private: HandlingMode Mode; }; diff --git a/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp b/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp index 4a84786b6af9b..67ac13c569f10 100644 --- a/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp +++ b/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp @@ -138,9 +138,7 @@ uint32_t getKernelWorkGroupNumDim(const Function &Func) { PropSetRegTy computeModuleProperties(const Module &M, const EntryPointSet &EntryPoints, - const GlobalBinImageProps &GlobProps, - bool SpecConstsMet, - bool IsSpecConstantDefault) { + const GlobalBinImageProps &GlobProps) { PropSetRegTy PropSet; { @@ -152,6 +150,10 @@ PropSetRegTy computeModuleProperties(const Module &M, PropSet.add(PropSetRegTy::SYCL_DEVICE_REQUIREMENTS, computeDeviceRequirements(M, EntryPoints).asMap()); } + auto *SpecConstsMD = + M.getNamedMetadata(SpecConstantsPass::SPEC_CONST_MD_STRING); + bool SpecConstsMet = + SpecConstsMD != nullptr && SpecConstsMD->getNumOperands() != 0; if (SpecConstsMet) { // extract spec constant maps per each module SpecIDMapTy TmpSpecIDMap; @@ -369,7 +371,10 @@ PropSetRegTy computeModuleProperties(const Module &M, if (!HostPipePropertyMap.empty()) { PropSet.add(PropSetRegTy::SYCL_HOST_PIPES, HostPipePropertyMap); } - + bool IsSpecConstantDefault = + M.getNamedMetadata( + SpecConstantsPass::SPEC_CONST_DEFAULT_VAL_MODULE_MD_STRING) != + nullptr; if (IsSpecConstantDefault) PropSet.add(PropSetRegTy::SYCL_MISC_PROP, "specConstsReplacedWithDefault", 1); diff --git a/llvm/lib/SYCLLowerIR/SpecConstants.cpp b/llvm/lib/SYCLLowerIR/SpecConstants.cpp index 4f43a22e95fd9..bf8215db94028 100644 --- a/llvm/lib/SYCLLowerIR/SpecConstants.cpp +++ b/llvm/lib/SYCLLowerIR/SpecConstants.cpp @@ -47,9 +47,6 @@ constexpr char SPIRV_GET_SPEC_CONST_VAL[] = "__spirv_SpecConstant"; constexpr char SPIRV_GET_SPEC_CONST_COMPOSITE[] = "__spirv_SpecConstantComposite"; -// Name of the metadata which holds a list of all specialization constants (with -// associated information) encountered in the module -constexpr char SPEC_CONST_MD_STRING[] = "sycl.specialization-constants"; // Name of the metadata which holds a default value list of all specialization // constants encountered in the module constexpr char SPEC_CONST_DEFAULT_VAL_MD_STRING[] = @@ -1029,6 +1026,9 @@ PreservedAnalyses SpecConstantsPass::run(Module &M, for (const auto &P : DefaultsMetadata) MDDefaults->addOperand(P); + if (Mode == HandlingMode::default_values) + M.getOrInsertNamedMetadata(SPEC_CONST_DEFAULT_VAL_MODULE_MD_STRING); + return IRModified ? PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/llvm/test/tools/sycl-post-link/spec-constants/default-value/bool.ll b/llvm/test/tools/sycl-post-link/spec-constants/default-value/bool.ll index d96cfa1f333f2..914ad2fe3a306 100644 --- a/llvm/test/tools/sycl-post-link/spec-constants/default-value/bool.ll +++ b/llvm/test/tools/sycl-post-link/spec-constants/default-value/bool.ll @@ -6,6 +6,7 @@ ; CHECK: %bool1 = trunc i8 1 to i1 ; CHECK: %frombool = zext i1 %bool1 to i8 +; CHECK: !sycl.specialization-constants-default-values-module = !{} ; CHECK-LOG: sycl.specialization-constants ; CHECK-LOG:[[UNIQUE_PREFIX:[0-9a-zA-Z]+]]={0, 0, 1} ; CHECK-LOG: sycl.specialization-constants-default-values diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index d1953743f500a..00bacce06d08b 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -307,9 +307,8 @@ std::string saveModuleIR(Module &M, int I, StringRef Suff) { std::string saveModuleProperties(module_split::ModuleDesc &MD, const GlobalBinImageProps &GlobProps, int I, StringRef Suff, StringRef Target = "") { - auto PropSet = computeModuleProperties(MD.getModule(), MD.entries(), - GlobProps, MD.Props.SpecConstsMet, - MD.isSpecConstantDefault()); + auto PropSet = + computeModuleProperties(MD.getModule(), MD.entries(), GlobProps); std::string NewSuff = Suff.str(); if (!Target.empty()) { From 37d1d51ca8b1efa3858b2ddb64cba0563578e4ac Mon Sep 17 00:00:00 2001 From: Dounia Khaldi Date: Tue, 10 Sep 2024 10:12:27 -0500 Subject: [PATCH 03/19] [SYCL][E2E][Joint Matrix] update performance test to add SLM (#15229) --- .../joint_matrix_bf16_fill_k_cache_SLM.cpp | 23 ++++ .../joint_matrix_bf16_fill_k_cache_SLM.cpp | 23 ++++ .../joint_matrix_bf16_fill_k_cache_SLM.cpp | 19 +++ .../joint_matrix_bf16_fill_k_cache_SLM.cpp | 19 +++ .../joint_matrix_bf16_fill_k_cache_impl.hpp | 114 ++++++++++-------- sycl/test-e2e/Matrix/slm_utils.hpp | 70 +++++++++++ 6 files changed, 217 insertions(+), 51 deletions(-) create mode 100644 sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_SLM.cpp create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp create mode 100644 sycl/test-e2e/Matrix/slm_utils.hpp diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp new file mode 100644 index 0000000000000..bd22fa19354b1 --- /dev/null +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp @@ -0,0 +1,23 @@ +//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix, gpu + +// RUN: %{build} -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI +// RUN: %{run} %t_gpu_vnni.out + +// TODO: add row major compilation and run once Sub-group size 32 +// support becomes available in IGC for row major + +// -ffp-model=precise is added to not depend on compiler defaults. + +#include "../common.hpp" +#define SG_SZ 32 + +#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp new file mode 100644 index 0000000000000..62cef33b3beb7 --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp @@ -0,0 +1,23 @@ +//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 +// REQUIRES: aspect-ext_intel_matrix, gpu + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI +// RUN: %{run} %t_gpu_vnni.out + +// TODO: add row major compilation and run once Sub-group size 32 +// support becomes available in IGC for row major + +// -ffp-model=precise is added to not depend on compiler defaults. + +#include "../../common.hpp" +#define SG_SZ 32 + +#include "../../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_SLM.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_SLM.cpp new file mode 100644 index 0000000000000..d81e7dbd685ba --- /dev/null +++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_SLM.cpp @@ -0,0 +1,19 @@ +//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix, gpu + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI +// RUN: %{run} %t_gpu_vnni.out + +// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu.out -ffp-model=precise -DSLM +// RUN: %{run} %t_gpu.out + +// -ffp-model=precise is added to not depend on compiler defaults. + +#include "../common.hpp" +#include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp new file mode 100644 index 0000000000000..a30d6320038a8 --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp @@ -0,0 +1,19 @@ +//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp - DPC++ joint_matrix--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix, gpu + +// RUN: %{build} -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI +// RUN: %{run} %t_gpu_vnni.out + +// RUN: %{build} -o %t_gpu.out -ffp-model=precise -DSLM +// RUN: %{run} %t_gpu.out + +// -ffp-model=precise is added to not depend on compiler defaults. + +#include "common.hpp" +#include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp index 1b31a993bb179..b561bd073038a 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp @@ -9,6 +9,10 @@ #include #include +#ifdef SLM +#include "slm_utils.hpp" +#endif + // number of test iterations constexpr unsigned int testIterations = 100; // start recording time after X iterations @@ -51,6 +55,12 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { std::chrono::high_resolution_clock::now(); q.submit([&](handler &h) { +#ifdef SLM + local_accessor tileA{{MCache2, KCache2}, h}; + local_accessor tileB{ + {KCache2 / vnniFactor, NCache2 * vnniFactor}, h}; +#endif + h.parallel_for>( // cache layer#1 nd_range<2>{global, cachelocal}, // loop global @@ -60,15 +70,16 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { [[intel::reqd_sub_group_size(SG_SZ)]] #endif // SG_SZ { + // sg::load and sg::store expect decorations to be ON auto pA = address_space_cast(A); + sycl::access::decorated::yes>(A); auto pB = address_space_cast(B); + sycl::access::decorated::yes>(B); auto pC = address_space_cast(C); + sycl::access::decorated::yes>(C); auto m2 = it.get_group(0); auto n2 = it.get_group(1); auto m1 = it.get_local_id(0); @@ -112,7 +123,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { colsA, layout::row_major, syclex::properties{syclex::prefetch_hint_L1}); -#ifdef VNNI for (int p = 0; p < prefDistance; p++) joint_matrix_prefetch( sg, @@ -122,15 +132,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { (n2 * NCache2 * vnniFactor + pn1B * prefCol), colsB * vnniFactor, layout::row_major, syclex::properties{syclex::prefetch_hint_L1}); -#else // VNNI - for (int p = 0; p < prefDistance; p++) - joint_matrix_prefetch( - sg, - B + (p * KCache2 + pm1B * prefRow) * colsB + n2 * NCache2 + - pn1B * prefCol, - colsB, layout::row_major, - syclex::properties{syclex::prefetch_hint_L1}); -#endif // VNNI #endif // PREFETCH joint_matrix @@ -157,7 +158,16 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { } #endif // MANUAL_UNROLL +#ifdef SLM + constexpr unsigned int SGs = + (MCache2 / MCache1) * (NCache2 / NCache1); +#endif // SLM for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) { +#ifdef SLM + slm_read_write(pA, pB, tileA, tileB, sg, k2, m2, n2, sgSize); + it.barrier(access::fence_space::local_space); +#endif // SLM joint_matrix tA[MCache1 / TM][KCache2 / KCache1] #ifdef INIT_LIST @@ -192,6 +202,14 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { #else // MANUAL_UNROLL for (unsigned int m = 0; m < MCache1 / TM; m++) { #endif // MANUAL_UNROLL +#ifdef SLM + joint_matrix_load(sg, tA[m][k1], + tileA.template get_multi_ptr< + sycl::access::decorated::no>() + + (m1 * MCache1 + m * TM) * KCache2 + + k1 * TK, + KCache2); +#else // SLM #ifdef OOB ext::intel::experimental::matrix::joint_matrix_load_checked( sg, tA[m][k1], pA, colsA, rowsA, colsA, @@ -203,6 +221,7 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { k * TK, colsA); #endif // OOB +#endif // SLM #ifdef MANUAL_UNROLL }); // m #else // MANUAL_UNROLL @@ -213,32 +232,28 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { #else // MANUAL_UNROLL for (unsigned int n = 0; n < NCache1 / TN; n++) { #endif // MANUAL_UNROLL +#ifdef SLM + joint_matrix_load(sg, tB[n][k1], + tileB.template get_multi_ptr< + sycl::access::decorated::no>() + + (k1 * TK / vnniFactor) * + (NCache2 * vnniFactor) + + (n1 * NCache1 + n * TN) * vnniFactor, + NCache2 * vnniFactor); +#else // SLM #ifdef OOB -#ifdef VNNI ext::intel::experimental::matrix::joint_matrix_load_checked( sg, tB[n][k1], pB, colsB * vnniFactor, rowsB / vnniFactor, colsB * vnniFactor, k * TK / vnniFactor, (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor); -#else // VNNI - ext::intel::experimental::matrix::joint_matrix_load_checked( - sg, tB[n][k1], pB, colsB, rowsB, colsB, k * TK, - n2 * NCache2 + n1 * NCache1 + n * TN); - -#endif // VNNI #else // OOB -#ifdef VNNI joint_matrix_load( sg, tB[n][k1], pB + (k * TK / vnniFactor) * (colsB * vnniFactor) + (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor, colsB * vnniFactor); -#else // VNNI - joint_matrix_load(sg, tB[n][k1], - pB + (k * TK) * (colsB) + - (n2 * NCache2 + n1 * NCache1 + n * TN), - colsB); -#endif // VNNI #endif // OOB +#endif // SLM #ifdef MANUAL_UNROLL }); // n #else // MANUAL_UNROLL @@ -266,6 +281,9 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { } // m } // k1 #endif // MANUAL_UNROLL +#ifdef SLM + it.barrier(access::fence_space::local_space); +#endif // SLM #ifdef PREFETCH auto prefetch_offsetA = (m2 * MCache2 + sgId * prefRow) * colsA + (k2 + prefDistance) * prefCol; @@ -275,7 +293,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { sg, A + prefetch_offsetA, colsA, layout::row_major, syclex::properties{syclex::prefetch_hint_L1}); -#ifdef VNNI auto prefetch_offsetB = ((k2 + prefDistance) * (KCache2 / vnniFactor) + pm1B * prefRow) * @@ -287,16 +304,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { sg, B + prefetch_offsetB, colsB * vnniFactor, layout::row_major, syclex::properties{syclex::prefetch_hint_L1}); -#else // VNNI - auto prefetch_offsetB = - ((k2 + prefDistance) * KCache2 + pm1B * prefRow) * (colsB) + - (n2 * NCache2 + pn1B * prefCol); - if ((prefetch_offsetB + (prefRow * MATRIX_SIZE) + prefCol) < - (MATRIX_SIZE * MATRIX_SIZE)) - joint_matrix_prefetch( - sg, B + prefetch_offsetB, colsB, layout::row_major, - syclex::properties{syclex::prefetch_hint_L1}); -#endif // VNNI #endif // PREFETCH } // for k2 #ifdef MANUAL_UNROLL @@ -411,29 +418,33 @@ int main() { constexpr size_t NCache2 = 256; constexpr size_t KCache2 = 32; +#ifdef VNNI + constexpr unsigned int VnniFactor = 2; +#else // VNNI + constexpr unsigned int VnniFactor = 1; +#endif // VNNI + for (unsigned int i = 0; i < combinations.size(); i++) { if (combinations[i].nsize == 0) { // Intel AMX constexpr size_t NCache1 = 32; constexpr size_t KCache1 = 32; - - test(); + test(); break; } if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc constexpr size_t NCache1 = 4 * /*TN*/ 16; constexpr size_t KCache1 = 16; - - test(); + test(); #if (!defined(SG_SZ) || SG_SZ != 32) // These combination are not currently supported for subgroup size = 32 in // IGC - test(); - test(); + test(); + test(); #endif break; } @@ -442,9 +453,10 @@ int main() { constexpr size_t NCache1 = 4 * /*TN*/ 8; constexpr size_t KCache1 = 16; - test(); - // test(); + // test(); break; } diff --git a/sycl/test-e2e/Matrix/slm_utils.hpp b/sycl/test-e2e/Matrix/slm_utils.hpp new file mode 100644 index 0000000000000..28ac1264b8cc0 --- /dev/null +++ b/sycl/test-e2e/Matrix/slm_utils.hpp @@ -0,0 +1,70 @@ +//==------------------ slm_utils.hpp - DPC++ joint_matrix------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +template +inline void +slm_read_write(multi_ptr pA, + multi_ptr pB, + local_accessor tileA, + local_accessor tileB, sub_group sg, unsigned int k2, + size_t m2, size_t n2, size_t sgSize) { + // Number of elements to be loaded into SLM per WI + size_t elemsPerLoadA = KCache2 / sgSize; + for (int i = 0; i < MCache2 / SGs; i++) { + size_t GlOffsetA = + (m2 * MCache2 + sg.get_group_id() * (MCache2 / SGs) + i) * colsA + + k2 * KCache2; + size_t LocOffsetA = (sg.get_group_id() * (MCache2 / SGs) + i) * KCache2; + + if (elemsPerLoadA == 2) { + vec slmVecA = sg.load<2>(pA + GlOffsetA); + sg.store<2>(tileA.template get_multi_ptr() + + LocOffsetA, + slmVecA); + } else if (elemsPerLoadA == 4) { + vec slmVecA = sg.load<4>(pA + GlOffsetA); + sg.store<4>(tileA.template get_multi_ptr() + + LocOffsetA, + slmVecA); + } else if (elemsPerLoadA == 1) { + TOperand slmScaA = sg.load(pA + GlOffsetA); + + sg.store(tileA.template get_multi_ptr() + + LocOffsetA, + slmScaA); + } else + assert(elemsPerLoadA == 1 || elemsPerLoadA == 2 || elemsPerLoadA == 4); + } + // how much each SG will load to SLM --> has to be contiguous + // NCache2*KCache2/(SGs*SG_SIZE) = 16 + size_t elemsPerLoadB = NCache2 * KCache2 / (SGs * sgSize); + size_t sgsPerRow = (NCache2 * vnniFactor) / (elemsPerLoadB * sgSize); + size_t GlOffsetB = + (k2 * (KCache2 / vnniFactor) + (uint)(sg.get_group_id() / sgsPerRow)) * + (colsB * vnniFactor) + + n2 * NCache2 * vnniFactor + + (sg.get_group_id() % sgsPerRow) * (elemsPerLoadB * sgSize); + size_t LocOffsetB = + ((uint)(sg.get_group_id() / sgsPerRow)) * NCache2 * vnniFactor + + (sg.get_group_id() % sgsPerRow) * elemsPerLoadB * sgSize; + if (elemsPerLoadB == 16) { + vec slmVecB = sg.load<16>(pB + GlOffsetB); + + sg.store<16>(tileB.template get_multi_ptr() + + LocOffsetB, + slmVecB); + } else if (elemsPerLoadB == 8) { + vec slmVecB = sg.load<8>(pB + GlOffsetB); + + sg.store<8>(tileB.template get_multi_ptr() + + LocOffsetB, + slmVecB); + } else + assert(elemsPerLoadB == 8 || elemsPerLoadB == 16); +} From 3b0be29bd8629c84fb151d08355b848d74db9373 Mon Sep 17 00:00:00 2001 From: Chris Perkins Date: Tue, 10 Sep 2024 09:50:23 -0700 Subject: [PATCH 04/19] [SYCL] re-enable Scheduler StreamBufferDeallocation unit test (#15292) A month ago, we had a strange intermittent failure of one of the unit tests: https://github.com/intel/llvm/issues/15049 and in response to that we disabled the test. It is unclear why that might have been failing. But that was when there were some other disrupting changes, and fallout from that has since been fixed. I have retested this on Windows for 100,000 iterations, and not a single failure seen. Tested on the CI, ran successfully there too. I'd like to reenable this test with this PR. If the CI system starts failing on PR, we can just revert this PR in coming days. --- sycl/unittests/scheduler/GraphCleanup.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sycl/unittests/scheduler/GraphCleanup.cpp b/sycl/unittests/scheduler/GraphCleanup.cpp index c0e1dc136a2d8..6f628f99a84e9 100644 --- a/sycl/unittests/scheduler/GraphCleanup.cpp +++ b/sycl/unittests/scheduler/GraphCleanup.cpp @@ -307,12 +307,7 @@ struct AttachSchedulerWrapper { }; // Check that stream buffers are released alongside graph cleanup. -// https://github.com/intel/llvm/issues/15049 -#ifdef _WIN32 -TEST_F(SchedulerTest, DISABLED_StreamBufferDeallocation) { -#else TEST_F(SchedulerTest, StreamBufferDeallocation) { -#endif unittest::UrMock<> Mock; platform Plt = sycl::platform(); context Ctx{Plt}; From 40e2f623fed1df92206d78982ea72d13bb211c26 Mon Sep 17 00:00:00 2001 From: Udit Agarwal Date: Tue, 10 Sep 2024 14:38:58 -0700 Subject: [PATCH 05/19] Revert '[CI] pre-commit/aws pointed back to old image' (#15348) Reverts https://github.com/intel/llvm/pull/14074. This will allow us to use latest docker images. --- .github/workflows/sycl-linux-precommit-aws.yml | 2 +- .github/workflows/sycl-linux-precommit.yml | 1 - .github/workflows/sycl-post-commit.yml | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/sycl-linux-precommit-aws.yml b/.github/workflows/sycl-linux-precommit-aws.yml index 19c21b6eec977..07d309900d764 100644 --- a/.github/workflows/sycl-linux-precommit-aws.yml +++ b/.github/workflows/sycl-linux-precommit-aws.yml @@ -66,7 +66,7 @@ jobs: with: name: CUDA E2E runner: '["aws_cuda-${{ github.event.workflow_run.id }}-${{ github.event.workflow_run.run_attempt }}"]' - image: ghcr.io/intel/llvm/ubuntu2204_build:latest-0300ac924620a51f76c4929794637b82790f12ab + image: ghcr.io/intel/llvm/ubuntu2204_build:latest image_options: -u 1001 --gpus all --cap-add SYS_ADMIN --env NVIDIA_DISABLE_REQUIRE=1 target_devices: ext_oneapi_cuda:gpu # No idea why but that seems to work and be in sync with the main diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml index b4a38a2cef24e..460e1737b194a 100644 --- a/.github/workflows/sycl-linux-precommit.yml +++ b/.github/workflows/sycl-linux-precommit.yml @@ -46,7 +46,6 @@ jobs: build_artifact_suffix: "default" build_cache_suffix: "default" changes: ${{ needs.detect_changes.outputs.filters }} - build_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest-0300ac924620a51f76c4929794637b82790f12ab" determine_arc_tests: name: Decide which Arc tests to run diff --git a/.github/workflows/sycl-post-commit.yml b/.github/workflows/sycl-post-commit.yml index 29b29f9891a31..d67edc83a050c 100644 --- a/.github/workflows/sycl-post-commit.yml +++ b/.github/workflows/sycl-post-commit.yml @@ -60,7 +60,7 @@ jobs: reset_intel_gpu: true - name: AMD/HIP runner: '["Linux", "amdgpu"]' - image: ghcr.io/intel/llvm/ubuntu2204_build:latest-0300ac924620a51f76c4929794637b82790f12ab + image: ghcr.io/intel/llvm/ubuntu2204_build:latest image_options: -u 1001 --device=/dev/dri --device=/dev/kfd target_devices: ext_oneapi_hip:gpu reset_intel_gpu: false From 811db848c397c4e2403e37f2b02fc28e167b56d0 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Tue, 10 Sep 2024 18:22:14 -0700 Subject: [PATCH 06/19] [SYCL] Fix handling of interop events for barrier with waitlist (#15352) Currently Command::getUrEventsBlocking is responsible for preparing a waitlist of UR events for the barrier. This method used wrong assumption that if isEnqueued() returns false for the event then it doesn't have UR handle because it was not enqueued. So if there is an associated command we would enqueue it to get the desired UR handle, or we would just ignore this event if there is no associated command. Problem is that sycl::event created with interoperability constructor has isEnqueued() as false (as it is not enqueued by SYCL RT) but it has UR handle provided by user. Before this patch we just ignored such event as it doesn't have associated command and we didn't put it to the resulting list. This patch fixes this problem by handling interop events properly in this code path. --- sycl/source/detail/event_impl.hpp | 7 +++ sycl/source/detail/scheduler/commands.cpp | 9 ++-- .../barrier_waitlist_with_interop_event.cpp | 47 +++++++++++++++++++ 3 files changed, 60 insertions(+), 3 deletions(-) create mode 100644 sycl/test-e2e/Regression/barrier_waitlist_with_interop_event.cpp diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index b560d721728a6..312bb589760b7 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -329,6 +329,13 @@ class event_impl { bool isProfilingTagEvent() const noexcept { return MProfilingTagEvent; } + // Check if this event is an interoperability event. + bool isInterop() const noexcept { + // As an indication of interoperability event, we use the absence of the + // queue and command, as well as the fact that it is not in enqueued state. + return MEvent && MQueue.expired() && !MIsEnqueued && !MCommand; + } + protected: // When instrumentation is enabled emits trace event for event wait begin and // returns the telemetry event generated for the wait diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 953ad2bee0444..c5e8fc2c3a2cd 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -265,9 +265,12 @@ std::vector Command::getUrEventsBlocking( if (EventImpl->isDefaultConstructed() || EventImpl->isHost() || EventImpl->isNOP()) continue; - // In this path nullptr native event means that the command has not been - // enqueued. It may happen if async enqueue in a host task is involved. - if (!EventImpl->isEnqueued()) { + + // If command has not been enqueued then we have to enqueue it. + // It may happen if async enqueue in a host task is involved. + // Interoperability events are special cases and they are not enqueued, as + // they don't have an associated queue and command. + if (!EventImpl->isInterop() && !EventImpl->isEnqueued()) { if (!EventImpl->getCommand() || !static_cast(EventImpl->getCommand())->producesPiEvent()) continue; diff --git a/sycl/test-e2e/Regression/barrier_waitlist_with_interop_event.cpp b/sycl/test-e2e/Regression/barrier_waitlist_with_interop_event.cpp new file mode 100644 index 0000000000000..f5a54f1a67dc2 --- /dev/null +++ b/sycl/test-e2e/Regression/barrier_waitlist_with_interop_event.cpp @@ -0,0 +1,47 @@ +// REQUIRES: level_zero, level_zero_dev_kit +// RUN: %{build} %level_zero_options -o %t.out +// RUN: %{run} %t.out +// UNSUPPORTED: ze_debug + +#include +#include +#include +#include + +// Test checks the case when an interoperability event is passed as a dependency +// to the barrier. In such case, waiting for the event produced by barrier must +// guarantee completion of the interoperability event. + +using namespace sycl; + +int main() { + sycl::queue Queue; + if (!Queue.get_device().get_info()) + return 0; + + const size_t N = 1024; + int *Data = sycl::malloc_shared(N, Queue); + auto FillEvent = Queue.fill(Data, 0, N); + auto FillZeEvent = get_native(FillEvent); + + backend_input_t EventInteropInput = { + FillZeEvent}; + EventInteropInput.Ownership = sycl::ext::oneapi::level_zero::ownership::keep; + auto EventInterop = make_event( + EventInteropInput, Queue.get_context()); + + auto BarrierEvent = Queue.ext_oneapi_submit_barrier({EventInterop}); + BarrierEvent.wait(); + + if (EventInterop.get_info() != + sycl::info::event_command_status::complete) { + Queue.wait(); + sycl::free(Data, Queue); + return -1; + } + + // Free the USM memory + sycl::free(Data, Queue); + + return 0; +} From 098416abc713fef0ef83d3bee4d020c52830d674 Mon Sep 17 00:00:00 2001 From: Justin Cai Date: Wed, 11 Sep 2024 02:10:37 -0700 Subject: [PATCH 07/19] [SYCL] Fix image selection for AOT on intel_cpu_{spr, gnr} (#15208) When AOT compiling for cpu, the generic `spir64_x86_64` target is used with `-fsycl-targets`. intel/llvm#14909, functionality was added to select device images based on their `compile_target` property in the image. The selection mechanism had to consider CPU as a special case due to not having explicit targets. However, the mechanism only considered `x86_64` and not `intel_cpu_spr` or `intel_cpu_gnr`; therefore on a `intel_cpu_spr` or `intel_cpu_gnr` device, trying to launch a program compiled with `-fsycl-targets=spir64_x86_64`, device image selection would fail to find an image (and thus fail to launch any kernels). This PR updates the logic to include `intel_cpu_spr` and `intel_cpu_gnr`. Note: for tests, this functionality is checked by any test that AOT compiled for CPU and launches a kernel (includes [AOT/cpu.cpp](https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/AOT/cpu.cpp), [AOT/double.cpp](https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/AOT/double.cpp), [AOT/half.cpp](https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/AOT/half.cpp)). --- .../program_manager/program_manager.cpp | 10 ++- .../program_manager/CompileTarget.cpp | 80 +++++++++++++------ 2 files changed, 62 insertions(+), 28 deletions(-) diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 99f3c5204dc74..d6f063e5fada6 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -1321,9 +1321,15 @@ RTDeviceBinaryImage *getBinImageFromMultiMap( reinterpret_cast(&CompileTargetByteArray[0]), CompileTargetByteArray.size()); // Note: there are no explicit targets for CPUs, so on x86_64, - // so we use a spir64_x86_64 compile target image. + // intel_cpu_spr, and intel_cpu_gnr, we use a spir64_x86_64 + // compile target image. + // TODO: When dedicated targets for CPU are added, (i.e. + // -fsycl-targets=intel_cpu_spr etc.) remove this special + // handling of CPU targets. if ((ArchName == CompileTarget) || - (ArchName == "x86_64" && CompileTarget == "spir64_x86_64")) { + (CompileTarget == "spir64_x86_64" && + (ArchName == "x86_64" || ArchName == "intel_cpu_spr" || + ArchName == "intel_cpu_gnr"))) { AddImg(); } } diff --git a/sycl/unittests/program_manager/CompileTarget.cpp b/sycl/unittests/program_manager/CompileTarget.cpp index 095a4af2a996c..281530a8e43e4 100644 --- a/sycl/unittests/program_manager/CompileTarget.cpp +++ b/sycl/unittests/program_manager/CompileTarget.cpp @@ -97,20 +97,25 @@ static sycl::unittest::UrImage Img[] = { static sycl::unittest::UrImageArray ImgArray{Img}; -ur_device_handle_t MockSklDeviceHandle = - reinterpret_cast(1); -ur_device_handle_t MockPvcDeviceHandle = - reinterpret_cast(2); -ur_device_handle_t MockX86DeviceHandle = - reinterpret_cast(3); -constexpr int SklIp = 0x02400009; -constexpr int PvcIp = 0x030f0000; -constexpr int X86Ip = 0; - -ur_device_handle_t MockDevices[] = { - MockSklDeviceHandle, - MockPvcDeviceHandle, - MockX86DeviceHandle, +struct MockDeviceData { + int Ip; + ur_device_type_t DeviceType; + ur_device_handle_t getHandle() { + return reinterpret_cast(this); + } + static MockDeviceData *fromHandle(ur_device_handle_t handle) { + return reinterpret_cast(handle); + } +}; + +// IP are from IntelGPUArchitectures/IntelCPUArchitectures in +// sycl/source/detail/device_info.hpp +MockDeviceData MockDevices[] = { + {0x02400009, UR_DEVICE_TYPE_GPU}, // Skl + {0x030f0000, UR_DEVICE_TYPE_GPU}, // Pvc + {0, UR_DEVICE_TYPE_CPU}, // X86 + {8, UR_DEVICE_TYPE_CPU}, // Spr + {9, UR_DEVICE_TYPE_CPU}, // Gnr }; static ur_result_t redefinedDeviceGet(void *pParams) { @@ -123,7 +128,7 @@ static ur_result_t redefinedDeviceGet(void *pParams) { if (*params.pphDevices) { assert(*params.pNumEntries <= std::size(MockDevices)); for (uint32_t i = 0; i < *params.pNumEntries; ++i) { - (*params.pphDevices)[i] = MockDevices[i]; + (*params.pphDevices)[i] = MockDevices[i].getHandle(); } } @@ -149,27 +154,22 @@ static ur_result_t redefinedDeviceGetInfo(void *pParams) { auto params = *static_cast(pParams); if (*params.ppropName == UR_DEVICE_INFO_IP_VERSION && *params.ppPropValue) { int &ret = *static_cast(*params.ppPropValue); - if (*params.phDevice == MockSklDeviceHandle) - ret = SklIp; - if (*params.phDevice == MockPvcDeviceHandle) - ret = PvcIp; - if (*params.phDevice == MockX86DeviceHandle) - ret = X86Ip; + ret = MockDeviceData::fromHandle(*params.phDevice)->Ip; } - if (*params.ppropName == UR_DEVICE_INFO_TYPE && - *params.phDevice == MockX86DeviceHandle) { + if (*params.ppropName == UR_DEVICE_INFO_TYPE) { if (*params.ppPropValue) *static_cast(*params.ppPropValue) = - UR_DEVICE_TYPE_CPU; + MockDeviceData::fromHandle(*params.phDevice)->DeviceType; if (*params.ppPropSizeRet) - **params.ppPropSizeRet = sizeof(UR_DEVICE_TYPE_CPU); + **params.ppPropSizeRet = sizeof(ur_device_type_t); } return UR_RESULT_SUCCESS; } static ur_result_t redefinedDeviceSelectBinary(void *pParams) { auto params = *static_cast(pParams); - auto target = *params.phDevice == MockX86DeviceHandle + auto target = MockDeviceData::fromHandle(*params.phDevice)->DeviceType == + UR_DEVICE_TYPE_CPU ? UR_DEVICE_BINARY_TARGET_SPIRV64_X86_64 : UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; uint32_t fallback = *params.pNumBinaries; @@ -246,6 +246,16 @@ TEST_F(CompileTargetTest, SingleTask) { checkUsedImageWithCompileTarget("spir64_x86_64", [&]() { launchSingleTaskKernel(queue{archSelector(syclex::architecture::x86_64)}); }); + + checkUsedImageWithCompileTarget("spir64_x86_64", [&]() { + launchSingleTaskKernel( + queue{archSelector(syclex::architecture::intel_cpu_spr)}); + }); + + checkUsedImageWithCompileTarget("spir64_x86_64", [&]() { + launchSingleTaskKernel( + queue{archSelector(syclex::architecture::intel_cpu_gnr)}); + }); } void launchNDRangeKernel(queue q) { @@ -268,6 +278,16 @@ TEST_F(CompileTargetTest, NDRangeKernel) { checkUsedImageWithCompileTarget("spir64_x86_64", [&]() { launchNDRangeKernel(queue{archSelector(syclex::architecture::x86_64)}); }); + + checkUsedImageWithCompileTarget("spir64_x86_64", [&]() { + launchNDRangeKernel( + queue{archSelector(syclex::architecture::intel_cpu_spr)}); + }); + + checkUsedImageWithCompileTarget("spir64_x86_64", [&]() { + launchNDRangeKernel( + queue{archSelector(syclex::architecture::intel_cpu_gnr)}); + }); } void launchRangeKernel(queue q) { @@ -288,6 +308,14 @@ TEST_F(CompileTargetTest, RangeKernel) { checkUsedImageWithCompileTarget("spir64_x86_64", [&]() { launchRangeKernel(queue{archSelector(syclex::architecture::x86_64)}); }); + + checkUsedImageWithCompileTarget("spir64_x86_64", [&]() { + launchRangeKernel(queue{archSelector(syclex::architecture::intel_cpu_spr)}); + }); + + checkUsedImageWithCompileTarget("spir64_x86_64", [&]() { + launchRangeKernel(queue{archSelector(syclex::architecture::intel_cpu_gnr)}); + }); } TEST_F(CompileTargetTest, NoDeviceKernel) { From 729d6f62ff2e0676f01d58d256392bd94e7b71f7 Mon Sep 17 00:00:00 2001 From: Steffen Larsen Date: Wed, 11 Sep 2024 11:18:37 +0200 Subject: [PATCH 08/19] [SYCL][E2E] Fix online compiler test for accelerator (#15270) This commit fixes an issue where the online_compiler_OpenCL test would assume that the clCreateProgramWithIL function is supported by all OpenCL targets, while it was only supported with >=2.1 and had an extension in older versions. This also enables the test for accelerator. --------- Signed-off-by: Larsen, Steffen --- .../OnlineCompiler/online_compiler_L0.cpp | 6 +- .../OnlineCompiler/online_compiler_OpenCL.cpp | 81 +++++++++++++++++-- .../OnlineCompiler/online_compiler_common.hpp | 7 ++ 3 files changed, 88 insertions(+), 6 deletions(-) diff --git a/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp b/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp index fc92d8957ef31..64a902bcbb1a4 100644 --- a/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp +++ b/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp @@ -7,8 +7,8 @@ // All Level-Zero specific code is kept here and the common part that can be // re-used by other backends is kept in online_compiler_common.hpp file. -#include #include +#include #include @@ -20,6 +20,10 @@ using byte = unsigned char; #ifdef RUN_KERNELS +bool testSupported(sycl::queue &Queue) { + return Queue.get_backend() == sycl::backend::ext_oneapi_level_zero; +} + sycl::kernel getSYCLKernelWithIL(sycl::queue &Queue, const std::vector &IL) { diff --git a/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp b/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp index 8a0d67d3b02c3..161eb65478653 100644 --- a/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp +++ b/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp @@ -1,5 +1,4 @@ // REQUIRES: opencl, opencl_icd, cm-compiler -// UNSUPPORTED: accelerator // RUN: %{build} -Wno-error=deprecated-declarations -DRUN_KERNELS %opencl_lib -o %t.out // RUN: %{run} %t.out @@ -18,14 +17,86 @@ using byte = unsigned char; #ifdef RUN_KERNELS +std::tuple GetOCLVersion(sycl::device Device) { + cl_int Err; + cl_device_id ClDevice = sycl::get_native(Device); + + size_t VersionSize = 0; + Err = clGetDeviceInfo(ClDevice, CL_DEVICE_VERSION, 0, nullptr, &VersionSize); + assert(Err == CL_SUCCESS); + + std::string Version(VersionSize, '\0'); + Err = clGetDeviceInfo(ClDevice, CL_DEVICE_VERSION, VersionSize, + Version.data(), nullptr); + assert(Err == CL_SUCCESS); + + std::string_view Prefix = "OpenCL "; + size_t VersionBegin = Version.find_first_of(" "); + size_t VersionEnd = Version.find_first_of(" ", VersionBegin + 1); + size_t VersionSeparator = Version.find_first_of(".", VersionBegin + 1); + + bool HaveOCLPrefix = + std::equal(Prefix.begin(), Prefix.end(), Version.begin()); + + assert(HaveOCLPrefix && VersionBegin != std::string::npos && + VersionEnd != std::string::npos && + VersionSeparator != std::string::npos); + + std::string VersionMajor{Version.begin() + VersionBegin + 1, + Version.begin() + VersionSeparator}; + std::string VersionMinor{Version.begin() + VersionSeparator + 1, + Version.begin() + VersionEnd}; + + unsigned long OCLMajor = strtoul(VersionMajor.c_str(), nullptr, 10); + unsigned long OCLMinor = strtoul(VersionMinor.c_str(), nullptr, 10); + + assert(OCLMajor > 0 && (OCLMajor > 2 || OCLMinor <= 2) && + OCLMajor != UINT_MAX && OCLMinor != UINT_MAX); + + return std::make_tuple(OCLMajor, OCLMinor); +} + +bool testSupported(sycl::queue &Queue) { + if (Queue.get_backend() != sycl::backend::opencl) + return false; + + sycl::device Device = Queue.get_device(); + auto [OCLMajor, OCLMinor] = GetOCLVersion(Device); + + // Creating a program from IL is only supported on >=2.1 or if + // cl_khr_il_program is supported on the device. + return (OCLMajor == 2 && OCLMinor >= 1) || OCLMajor > 2 || + Device.has_extension("cl_khr_il_program"); +} + sycl::kernel getSYCLKernelWithIL(sycl::queue &Queue, const std::vector &IL) { sycl::context Context = Queue.get_context(); - cl_int Err; - cl_program ClProgram = - clCreateProgramWithIL(sycl::get_native(Context), - IL.data(), IL.size(), &Err); + cl_int Err = 0; + cl_program ClProgram = 0; + + sycl::device Device = Queue.get_device(); + auto [OCLMajor, OCLMinor] = GetOCLVersion(Device); + if ((OCLMajor == 2 && OCLMinor >= 1) || OCLMajor > 2) { + // clCreateProgramWithIL is supported if OCL version >=2.1. + ClProgram = + clCreateProgramWithIL(sycl::get_native(Context), + IL.data(), IL.size(), &Err); + } else { + // Fall back to using extension function for building IR. + using ApiFuncT = + cl_program(CL_API_CALL *)(cl_context, const void *, size_t, cl_int *); + ApiFuncT FuncPtr = + reinterpret_cast(clGetExtensionFunctionAddressForPlatform( + sycl::get_native(Context.get_platform()), + "clCreateProgramWithILKHR")); + + assert(FuncPtr != nullptr); + + ClProgram = FuncPtr(sycl::get_native(Context), + IL.data(), IL.size(), &Err); + } assert(Err == CL_SUCCESS); Err = clBuildProgram(ClProgram, 0, nullptr, nullptr, nullptr, nullptr); diff --git a/sycl/test-e2e/OnlineCompiler/online_compiler_common.hpp b/sycl/test-e2e/OnlineCompiler/online_compiler_common.hpp index 505802d11f3b1..b585126f95674 100644 --- a/sycl/test-e2e/OnlineCompiler/online_compiler_common.hpp +++ b/sycl/test-e2e/OnlineCompiler/online_compiler_common.hpp @@ -53,6 +53,13 @@ int main(int argc, char **argv) { sycl::queue Q; sycl::device Device = Q.get_device(); +#ifdef RUN_KERNELS + if (!testSupported(Q)) { + std::cout << "Building for IL is not supported. Skipping!" << std::endl; + return 0; + } +#endif + { // Compile and run a trivial OpenCL kernel. std::cout << "Test case1\n"; sycl::ext::intel::experimental::online_compiler< From 81aacfa9af9b99fb6658e4b906c509968da18e43 Mon Sep 17 00:00:00 2001 From: Georgi Mirazchiyski Date: Wed, 11 Sep 2024 14:02:24 +0100 Subject: [PATCH 09/19] [SYCL] Implement max_num_work_groups from the launch queries extension (#14333) This PR implements the `max_num_work_groups ` query from the `sycl_ext_oneapi_launch_queries` extension. Additionally, this PR introduces changes that overload `ext_oneapi_get_info` for another kernel-queue-specific query - `max_num_work_group_sync` to take extra parameters for local work-group size and dynamic local memory size (in bytes) in order to allow users to pass those runtime resource limiting factors to the query, so they are taken into account in the final group count suggestion. --- sycl/cmake/modules/FetchUnifiedRuntime.cmake | 14 +- .../sycl_ext_oneapi_launch_queries.asciidoc | 8 +- .../include/sycl/detail/info_desc_helpers.hpp | 12 + .../ext/oneapi/experimental/root_group.hpp | 9 +- ...xt_oneapi_kernel_queue_specific_traits.def | 4 + sycl/include/sycl/info/info_desc.hpp | 2 + sycl/include/sycl/kernel.hpp | 24 +- sycl/source/detail/kernel_impl.cpp | 32 +++ sycl/source/detail/kernel_impl.hpp | 90 +++++++- sycl/source/kernel.cpp | 26 ++- .../launch_queries/max_num_work_groups.cpp | 215 ++++++++++++++++++ sycl/test-e2e/GroupAlgorithm/root_group.cpp | 15 +- sycl/test/abi/sycl_symbols_linux.dump | 4 +- sycl/test/abi/sycl_symbols_windows.dump | 4 +- sycl/test/include_deps/sycl_accessor.hpp.cpp | 1 + .../include_deps/sycl_detail_core.hpp.cpp | 1 + 16 files changed, 424 insertions(+), 37 deletions(-) create mode 100644 sycl/include/sycl/info/ext_oneapi_kernel_queue_specific_traits.def create mode 100644 sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index 956c33bec68df..e569981589ac4 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -117,13 +117,13 @@ if(SYCL_UR_USE_FETCH_CONTENT) endfunction() set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit 8c9dd7e464a99ebbfb238ac2dabefc3ac77baea5 - # Merge: a99dbcee 3abe18cf - # Author: Piotr Balcer - # Date: Fri Sep 6 17:21:17 2024 +0200 - # Merge pull request #1820 from pbalcer/static-linking - # Add support for static linking of the L0 adapter - set(UNIFIED_RUNTIME_TAG 8c9dd7e464a99ebbfb238ac2dabefc3ac77baea5) + # commit eb63d1a21729f6928bb6cccc5f92856b0690aca6 + # Merge: e26bba51 45a781f4 + # Author: Omar Ahmed + # Date: Tue Sep 10 12:08:57 2024 +0100 + # Merge pull request #1796 from GeorgeWeb/georgi/ur_kernel_max_active_wgs + # [CUDA] Implement urKernelSuggestMaxCooperativeGroupCountExp for Cuda + set(UNIFIED_RUNTIME_TAG eb63d1a21729f6928bb6cccc5f92856b0690aca6) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") # Due to the use of dependentloadflag and no installer for UMF and hwloc we need diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc index 8221000502642..ee52d75b8fd21 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc @@ -204,9 +204,11 @@ otherwise it is 0. |Returns the maximum number of work-groups, when the kernel is submitted to the specified queue with the specified work-group size and the specified amount of dynamic work-group local memory (in bytes), accounting for any kernel -properties or features. If the kernel can be submitted to the specified queue -without an error, the minimum value returned by this query is 1, otherwise it -is 0. +properties or features. If the specified work-group size is 0, which is +invalid, then the implementation will throw a synchronous exception with the +`errc::invalid` error code. If the kernel can be submitted to the specified +queue without an error, the minimum value returned by this query is 1, +otherwise it is 0. |=== diff --git a/sycl/include/sycl/detail/info_desc_helpers.hpp b/sycl/include/sycl/detail/info_desc_helpers.hpp index e8bc8f76c83db..d3b4bfd977139 100644 --- a/sycl/include/sycl/detail/info_desc_helpers.hpp +++ b/sycl/include/sycl/detail/info_desc_helpers.hpp @@ -31,6 +31,8 @@ template struct is_queue_info_desc : std::false_type {}; template struct is_kernel_info_desc : std::false_type {}; template struct is_kernel_device_specific_info_desc : std::false_type {}; +template +struct is_kernel_queue_specific_info_desc : std::false_type {}; template struct is_event_info_desc : std::false_type {}; template struct is_event_profiling_info_desc : std::false_type {}; // Normally we would just use std::enable_if to limit valid get_info template @@ -134,6 +136,16 @@ struct IsKernelInfo #include #include #undef __SYCL_PARAM_TRAITS_SPEC + +#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT, PiCode) \ + template <> \ + struct is_##DescType##_info_desc \ + : std::true_type { \ + using return_type = Namespace::info::DescType::Desc::return_type; \ + }; +#include +#undef __SYCL_PARAM_TRAITS_SPEC + #define __SYCL_PARAM_TRAITS_SPEC(DescType, Desc, ReturnT, PiCode) \ template <> \ struct is_backend_info_desc : std::true_type { \ diff --git a/sycl/include/sycl/ext/oneapi/experimental/root_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/root_group.hpp index 558396bb6f9c8..b8c90683bbaaf 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/root_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/root_group.hpp @@ -24,13 +24,8 @@ namespace sycl { inline namespace _V1 { namespace ext::oneapi::experimental { -namespace info::kernel_queue_specific { -// TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension once -// #7598 is merged. -struct max_num_work_group_sync { - using return_type = size_t; -}; -} // namespace info::kernel_queue_specific +// See 'sycl/info/kernel_device_specific_traits.def' for the kernel +// device-specific properties that relate to 'root_group'. template class root_group { public: diff --git a/sycl/include/sycl/info/ext_oneapi_kernel_queue_specific_traits.def b/sycl/include/sycl/info/ext_oneapi_kernel_queue_specific_traits.def new file mode 100644 index 0000000000000..0ec11af0bb6b1 --- /dev/null +++ b/sycl/include/sycl/info/ext_oneapi_kernel_queue_specific_traits.def @@ -0,0 +1,4 @@ +// TODO: Revisit 'max_num_work_group_sync' and align it with the +// 'sycl_ext_oneapi_forward_progress' extension once #7598 is merged. +__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t,) +__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t,) diff --git a/sycl/include/sycl/info/info_desc.hpp b/sycl/include/sycl/info/info_desc.hpp index 8f6a86e5f60bb..b84f98f350d0d 100644 --- a/sycl/include/sycl/info/info_desc.hpp +++ b/sycl/include/sycl/info/info_desc.hpp @@ -247,6 +247,8 @@ struct work_item_progress_capabilities; #include #include #include +#include + #undef __SYCL_PARAM_TRAITS_SPEC #undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC } // namespace _V1 diff --git a/sycl/include/sycl/kernel.hpp b/sycl/include/sycl/kernel.hpp index 40db1d8461dde..dac7f619d745e 100644 --- a/sycl/include/sycl/kernel.hpp +++ b/sycl/include/sycl/kernel.hpp @@ -159,9 +159,29 @@ class __SYCL_EXPORT kernel : public detail::OwnerLessBase { get_info(const device &Device, const range<3> &WGSize) const; // TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension - // once #7598 is merged. + // once #7598 is merged. (regarding the 'max_num_work_group_sync' query) + + /// Query queue/launch-specific information from a kernel using the + /// info::kernel_queue_specific descriptor for a specific Queue. + /// + /// \param Queue is a valid SYCL queue. + /// \return depends on information being queried. + template + typename detail::is_kernel_queue_specific_info_desc::return_type + ext_oneapi_get_info(queue Queue) const; + + /// Query queue/launch-specific information from a kernel using the + /// info::kernel_queue_specific descriptor for a specific Queue and values. + /// max_num_work_groups is the only valid descriptor for this function. + /// + /// \param Queue is a valid SYCL queue. + /// \param WorkGroupSize is the work-group size the number of work-groups is + /// requested for. + /// \return depends on information being queried. template - typename Param::return_type ext_oneapi_get_info(const queue &q) const; + typename detail::is_kernel_queue_specific_info_desc::return_type + ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize, + size_t DynamicLocalMemorySize) const; private: /// Constructs a SYCL kernel object from a valid kernel_impl instance. diff --git a/sycl/source/detail/kernel_impl.cpp b/sycl/source/detail/kernel_impl.cpp index faf3695c04e94..50af09831f207 100644 --- a/sycl/source/detail/kernel_impl.cpp +++ b/sycl/source/detail/kernel_impl.cpp @@ -106,6 +106,38 @@ void kernel_impl::checkIfValidForNumArgsInfoQuery() const { "interoperability function or to query a device built-in kernel"); } +bool kernel_impl::exceedsOccupancyResourceLimits( + const device &Device, const range<3> &WorkGroupSize, + size_t DynamicLocalMemorySize) const { + // Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize. + // Generally, exceeding hardware resource limits will yield in an error when + // the kernel is launched. + const size_t MaxWorkGroupSize = + get_info(Device); + const size_t MaxLocalMemorySizeInBytes = + Device.get_info(); + + if (WorkGroupSize.size() > MaxWorkGroupSize) + return true; + + if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes) + return true; + + // It will be impossible to launch a kernel for Cuda when the hardware limit + // for the 32-bit registers page file size is exceeded. + if (Device.get_backend() == backend::ext_oneapi_cuda) { + const uint32_t RegsPerWorkItem = + get_info(Device); + const uint32_t MaxRegsPerWorkGroup = + Device.get_info(); + if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup) + return true; + } + + return false; +} + template <> typename info::platform::version::return_type kernel_impl::get_backend_info() const { diff --git a/sycl/source/detail/kernel_impl.hpp b/sycl/source/detail/kernel_impl.hpp index ab2950e26a856..040b5cbccf965 100644 --- a/sycl/source/detail/kernel_impl.hpp +++ b/sycl/source/detail/kernel_impl.hpp @@ -114,8 +114,26 @@ class kernel_impl { typename Param::return_type get_info(const device &Device, const range<3> &WGSize) const; + /// Query queue/launch-specific information from a kernel using the + /// info::kernel_queue_specific descriptor for a specific Queue. + /// + /// \param Queue is a valid SYCL queue. + /// \return depends on information being queried. + template + typename Param::return_type ext_oneapi_get_info(queue Queue) const; + + /// Query queue/launch-specific information from a kernel using the + /// info::kernel_queue_specific descriptor for a specific Queue and values. + /// max_num_work_groups is the only valid descriptor for this function. + /// + /// \param Queue is a valid SYCL queue. + /// \param WorkGroupSize is the work-group size the number of work-groups is + /// requested for. + /// \return depends on information being queried. template - typename Param::return_type ext_oneapi_get_info(const queue &q) const; + typename Param::return_type + ext_oneapi_get_info(queue Queue, const range<3> &MaxWorkGroupSize, + size_t DynamicLocalMemorySize) const; /// Get a constant reference to a raw kernel object. /// @@ -171,6 +189,12 @@ class kernel_impl { bool isBuiltInKernel(const device &Device) const; void checkIfValidForNumArgsInfoQuery() const; + + /// Check if the occupancy limits are exceeded for the given kernel launch + /// configuration. + bool exceedsOccupancyResourceLimits(const device &Device, + const range<3> &WorkGroupSize, + size_t DynamicLocalMemorySize) const; }; template @@ -217,20 +241,66 @@ kernel_impl::get_info(const device &Device, getPlugin()); } +namespace syclex = ext::oneapi::experimental; + template <> -inline typename ext::oneapi::experimental::info::kernel_queue_specific:: - max_num_work_group_sync::return_type +inline typename syclex::info::kernel_queue_specific::max_num_work_groups:: + return_type kernel_impl::ext_oneapi_get_info< - ext::oneapi::experimental::info::kernel_queue_specific:: - max_num_work_group_sync>(const queue &Queue) const { + syclex::info::kernel_queue_specific::max_num_work_groups>( + queue Queue, const range<3> &WorkGroupSize, + size_t DynamicLocalMemorySize) const { + if (WorkGroupSize.size() == 0) + throw exception(sycl::make_error_code(errc::invalid), + "The launch work-group size cannot be zero."); + const auto &Plugin = getPlugin(); const auto &Handle = getHandleRef(); + auto Device = Queue.get_device(); + + uint32_t GroupCount{0}; + if (auto Result = Plugin->call_nocheck< + UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>( + Handle, WorkGroupSize.size(), DynamicLocalMemorySize, &GroupCount); + Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { + // The feature is supported. Check for other errors and throw if any. + Plugin->checkUrResult(Result); + return GroupCount; + } + + // Fallback. If the backend API is unsupported, this query will return either + // 0 or 1 based on the kernel resource usage and the user-requested resources. + return exceedsOccupancyResourceLimits(Device, WorkGroupSize, + DynamicLocalMemorySize) + ? 0 + : 1; +} + +template <> +inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync:: + return_type + kernel_impl::ext_oneapi_get_info< + syclex::info::kernel_queue_specific::max_num_work_group_sync>( + queue Queue, const range<3> &WorkGroupSize, + size_t DynamicLocalMemorySize) const { + return ext_oneapi_get_info< + syclex::info::kernel_queue_specific::max_num_work_groups>( + Queue, WorkGroupSize, DynamicLocalMemorySize); +} + +template <> +inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync:: + return_type + kernel_impl::ext_oneapi_get_info< + syclex::info::kernel_queue_specific::max_num_work_group_sync>( + queue Queue) const { + auto Device = Queue.get_device(); const auto MaxWorkGroupSize = - Queue.get_device().get_info(); - uint32_t GroupCount = 0; - Plugin->call( - Handle, MaxWorkGroupSize, /* DynamicSharedMemorySize */ 0, &GroupCount); - return GroupCount; + get_info(Device); + const sycl::range<3> WorkGroupSize{MaxWorkGroupSize, 1, 1}; + return ext_oneapi_get_info< + syclex::info::kernel_queue_specific::max_num_work_group_sync>( + Queue, WorkGroupSize, /* DynamicLocalMemorySize */ 0); } } // namespace detail diff --git a/sycl/source/kernel.cpp b/sycl/source/kernel.cpp index f4ec76bcf9e7d..a4aae60bece08 100644 --- a/sycl/source/kernel.cpp +++ b/sycl/source/kernel.cpp @@ -106,16 +106,36 @@ kernel::get_info( const device &, const sycl::range<3> &) const; template -typename Param::return_type -kernel::ext_oneapi_get_info(const queue &Queue) const { +typename detail::is_kernel_queue_specific_info_desc::return_type +kernel::ext_oneapi_get_info(queue Queue) const { return impl->ext_oneapi_get_info(Queue); } +template +typename detail::is_kernel_queue_specific_info_desc::return_type +kernel::ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize, + size_t DynamicLocalMemorySize) const { + return impl->ext_oneapi_get_info(Queue, WorkGroupSize, + DynamicLocalMemorySize); +} + template __SYCL_EXPORT typename ext::oneapi::experimental::info:: kernel_queue_specific::max_num_work_group_sync::return_type kernel::ext_oneapi_get_info< ext::oneapi::experimental::info::kernel_queue_specific:: - max_num_work_group_sync>(const queue &Queue) const; + max_num_work_group_sync>(queue Queue) const; + +#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT) \ + template __SYCL_EXPORT ReturnT \ + kernel::ext_oneapi_get_info( \ + queue, const range<3> &, size_t) const; +// Not including "ext_oneapi_kernel_queue_specific_traits.def" because not all +// kernel_queue_specific queries require the above-defined get_info interface. +// clang-format off +__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t) +__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t) +// clang-format on +#undef __SYCL_PARAM_TRAITS_SPEC kernel::kernel(std::shared_ptr Impl) : impl(Impl) {} diff --git a/sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp b/sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp new file mode 100644 index 0000000000000..7b76327d015b1 --- /dev/null +++ b/sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp @@ -0,0 +1,215 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include + +#include +#include + +namespace syclex = sycl::ext::oneapi::experimental; +using namespace sycl::info::device; +using namespace sycl::info::kernel_device_specific; + +using value_type = int64_t; + +namespace kernels { + +template +using sycl_global_accessor = + sycl::accessor; + +class TestKernel { +public: + static constexpr bool HasLocalMemory{false}; + + TestKernel(sycl_global_accessor acc) : acc_{acc} {} + + void operator()(sycl::nd_item<1> item) const { + const auto gtid = item.get_global_linear_id(); + acc_[gtid] = gtid + 42; + } + +private: + sycl_global_accessor acc_; +}; + +class TestLocalMemoryKernel { +public: + static constexpr bool HasLocalMemory{true}; + + TestLocalMemoryKernel(sycl_global_accessor acc, + sycl::local_accessor loc_acc) + : acc_{acc}, loc_acc_{loc_acc} {} + + void operator()(sycl::nd_item<1> item) const { + const auto ltid = item.get_local_linear_id(); + const auto gtid = item.get_global_linear_id(); + if (ltid < loc_acc_.size()) { + loc_acc_[ltid] = ltid + 42; + item.barrier(sycl::access::fence_space::local_space); + acc_[gtid] = loc_acc_[ltid]; + } else { + acc_[gtid] = 0; + } + } + +private: + sycl_global_accessor acc_; + sycl::local_accessor loc_acc_; +}; + +} // namespace kernels + +namespace { + +template +int test_max_num_work_groups(sycl::queue &q, const sycl::device &dev) { + const auto ctx = q.get_context(); + auto bundle = sycl::get_kernel_bundle(ctx); + auto kernel = bundle.template get_kernel(); + + const size_t maxWorkGroupSize = + kernel.template get_info(dev); + const size_t NumWorkItems = maxWorkGroupSize * maxWorkGroupSize; + + size_t workGroupSize = 32; + size_t localMemorySizeInBytes{0}; + if constexpr (KernelName::HasLocalMemory) { + localMemorySizeInBytes = workGroupSize * sizeof(value_type); + } + + sycl::buffer buf{sycl::range<1>{NumWorkItems}}; + + // Tests + + // ==================== // + // Test 1 - return type // + // ==================== // + sycl::range<3> workGroupRange{workGroupSize, 1, 1}; + auto maxWGs = kernel.template ext_oneapi_get_info< + syclex::info::kernel_queue_specific::max_num_work_groups>( + q, workGroupRange, localMemorySizeInBytes); + + // Test the return type is as specified in the extension document. + static_assert(std::is_same_v, size_t>, + "max_num_work_groups query must return size_t"); + + // ===================== // + // Test 2 - return value // + // ===================== // + // We must have at least one active group if we are below resource limits. + assert(maxWGs > 0 && "max_num_work_groups query failed"); + if (maxWGs == 0) + return 1; + + // Run the kernel + auto launch_range = sycl::nd_range<1>{sycl::range<1>{NumWorkItems}, + sycl::range<1>{workGroupSize}}; + q.submit([&](sycl::handler &cgh) { + auto acc = buf.get_access(cgh); + if constexpr (KernelName::HasLocalMemory) { + sycl::local_accessor loc_acc{ + sycl::range<1>{workGroupSize}, cgh}; + cgh.parallel_for(launch_range, KernelName{acc, loc_acc}); + } else { + cgh.parallel_for(launch_range, KernelName{acc}); + } + }).wait(); + assert(sycl::host_accessor{buf}[0] == 42); + + // ========================== // + // Test 3 - use max resources // + // ========================== // + // A little over the maximum work-group size for the purpose of exceeding. + workGroupSize = maxWorkGroupSize; + workGroupRange[0] = workGroupSize; + size_t localSize = + (dev.get_info() / sizeof(value_type)); + if constexpr (KernelName::HasLocalMemory) { + localMemorySizeInBytes = localSize * sizeof(value_type); + } + maxWGs = kernel.template ext_oneapi_get_info< + syclex::info::kernel_queue_specific::max_num_work_groups>( + q, workGroupRange, localMemorySizeInBytes); + + assert(maxWGs > 0 && "max_num_work_groups query failed"); + if (maxWGs == 0) + return 1; + + launch_range = sycl::nd_range<1>{sycl::range<1>{NumWorkItems}, + sycl::range<1>{workGroupSize}}; + + q.submit([&](sycl::handler &cgh) { + auto acc = buf.get_access(cgh); + if constexpr (KernelName::HasLocalMemory) { + sycl::local_accessor loc_acc{sycl::range<1>{localSize}, + cgh}; + cgh.parallel_for(launch_range, KernelName{acc, loc_acc}); + } else { + cgh.parallel_for(launch_range, KernelName{acc}); + } + }).wait(); + assert(sycl::host_accessor{buf}[0] == 42); + + // =============================== // + // Test 4 - exceed resource limits // + // =============================== // + workGroupSize = maxWorkGroupSize + 32; + workGroupRange[0] = workGroupSize; + maxWGs = kernel.template ext_oneapi_get_info< + syclex::info::kernel_queue_specific::max_num_work_groups>( + q, workGroupRange, localMemorySizeInBytes); + // It cannot be possible to launch a kernel successfully with a configuration + // that exceeds the available resources as in the above defined workGroupSize. + // workGroupSize is larger than maxWorkGroupSize, hence maxWGs must equal 0. + if (dev.get_backend() == sycl::backend::ext_oneapi_cuda) { + assert(maxWGs == 0 && "max_num_work_groups query failed"); + if (maxWGs > 0) + return 1; + } + + // As we ensured that the 'max_num_work_groups' query correctly + // returns 0 possible work-groups, test that the kernel launch will fail. + // A configuration that defines a work-group size larger than the maximum + // possible should result in failure. + try { + launch_range = sycl::nd_range<1>{sycl::range<1>{NumWorkItems}, + sycl::range<1>{workGroupSize}}; + + q.submit([&](sycl::handler &cgh) { + auto acc = buf.get_access(cgh); + if constexpr (KernelName::HasLocalMemory) { + sycl::local_accessor loc_acc{sycl::range<1>{localSize}, + cgh}; + cgh.parallel_for(launch_range, KernelName{acc, loc_acc}); + } else { + cgh.parallel_for(launch_range, KernelName{acc}); + } + }).wait(); + } catch (const sycl::exception &e) { + // 'nd_range' error is the expected outcome from the above launch config. + if (e.code() == sycl::make_error_code(sycl::errc::nd_range)) { + return 0; + } + std::cerr << e.code() << ":\t"; + std::cerr << e.what() << std::endl; + return 1; + } + + return 0; +} + +} // namespace + +int main() { + sycl::queue q{}; + sycl::device dev = q.get_device(); + + using namespace kernels; + + int ret{0}; + ret &= test_max_num_work_groups(q, dev); + ret &= test_max_num_work_groups(q, dev); + return ret; +} diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp index d8393f35c6253..92e5d69ffcab4 100644 --- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp +++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp @@ -30,12 +30,21 @@ void testQueriesAndProperties() { const auto maxWGs = kernel.ext_oneapi_get_info< sycl::ext::oneapi::experimental::info::kernel_queue_specific:: max_num_work_group_sync>(q); + const auto wgRange = sycl::range{WorkGroupSize, 1, 1}; + const auto maxWGsWithLimits = kernel.ext_oneapi_get_info< + sycl::ext::oneapi::experimental::info::kernel_queue_specific:: + max_num_work_group_sync>(q, wgRange, wgRange.size() * sizeof(int)); const auto props = sycl::ext::oneapi::experimental::properties{ sycl::ext::oneapi::experimental::use_root_sync}; q.single_task(props, []() {}); - static_assert(std::is_same_v::type, size_t>, - "max_num_work_group_sync query must return size_t"); - assert(maxWGs >= 1 && "max_num_work_group_sync query failed"); + + static auto check_max_num_work_group_sync = [](auto Result) { + static_assert(std::is_same_v, size_t>, + "max_num_work_group_sync query must return size_t"); + assert(Result >= 1 && "max_num_work_group_sync query failed"); + }; + check_max_num_work_group_sync(maxWGs); + check_max_num_work_group_sync(maxWGsWithLimits); } void testRootGroup() { diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 4c73f43ed6ba2..ec6ec2096403f 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3915,8 +3915,10 @@ _ZNK4sycl3_V16kernel16get_backend_infoINS0_4info6device15backend_versionEEENS0_6 _ZNK4sycl3_V16kernel16get_backend_infoINS0_4info6device7versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv _ZNK4sycl3_V16kernel16get_backend_infoINS0_4info8platform7versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv _ZNK4sycl3_V16kernel17get_kernel_bundleEv -_ZNK4sycl3_V16kernel19ext_oneapi_get_infoINS0_3ext6oneapi12experimental4info21kernel_queue_specific23max_num_work_group_syncEEENT_11return_typeERKNS0_5queueE _ZNK4sycl3_V16kernel3getEv +_ZNK4sycl3_V16kernel19ext_oneapi_get_infoINS0_3ext6oneapi12experimental4info21kernel_queue_specific23max_num_work_group_syncEEENS0_6detail34is_kernel_queue_specific_info_descIT_E11return_typeENS0_5queueE +_ZNK4sycl3_V16kernel19ext_oneapi_get_infoINS0_3ext6oneapi12experimental4info21kernel_queue_specific23max_num_work_group_syncEEENS0_6detail34is_kernel_queue_specific_info_descIT_E11return_typeENS0_5queueERKNS0_5rangeILi3EEEm +_ZNK4sycl3_V16kernel19ext_oneapi_get_infoINS0_3ext6oneapi12experimental4info21kernel_queue_specific19max_num_work_groupsEEENS0_6detail34is_kernel_queue_specific_info_descIT_E11return_typeENS0_5queueERKNS0_5rangeILi3EEEm _ZNK4sycl3_V16kernel8get_infoINS0_4info22kernel_device_specific15work_group_sizeEEENS0_6detail35is_kernel_device_specific_info_descIT_E11return_typeERKNS0_6deviceE _ZNK4sycl3_V16kernel8get_infoINS0_4info22kernel_device_specific16global_work_sizeEEENS0_6detail35is_kernel_device_specific_info_descIT_E11return_typeERKNS0_6deviceE _ZNK4sycl3_V16kernel8get_infoINS0_4info22kernel_device_specific16private_mem_sizeEEENS0_6detail35is_kernel_device_specific_info_descIT_E11return_typeERKNS0_6deviceE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index e2c3643c557be..55ce460c64559 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -13,7 +13,9 @@ ??$create_sub_devices@$0BAIH@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@AEBV?$vector@_KV?$allocator@_K@std@@@4@@Z ??$create_sub_devices@$0BAII@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@W4partition_affinity_domain@info@12@@Z ??$create_sub_devices@$0BAIJ@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@XZ -??$ext_oneapi_get_info@Umax_num_work_group_sync@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KAEBVqueue@12@@Z +??$ext_oneapi_get_info@Umax_num_work_group_sync@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KVqueue@12@@Z +??$ext_oneapi_get_info@Umax_num_work_group_sync@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KVqueue@12@AEBV?$range@$02@12@_K@Z +??$ext_oneapi_get_info@Umax_num_work_groups@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KVqueue@12@AEBV?$range@$02@12@_K@Z ??$get_backend_info@Ubackend_version@device@info@_V1@sycl@@@context@_V1@sycl@@QEBA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@XZ ??$get_backend_info@Ubackend_version@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@XZ ??$get_backend_info@Ubackend_version@device@info@_V1@sycl@@@event@_V1@sycl@@QEBA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@XZ diff --git a/sycl/test/include_deps/sycl_accessor.hpp.cpp b/sycl/test/include_deps/sycl_accessor.hpp.cpp index 36e9a5ad3961e..fd2628dfb6e1c 100644 --- a/sycl/test/include_deps/sycl_accessor.hpp.cpp +++ b/sycl/test/include_deps/sycl_accessor.hpp.cpp @@ -110,6 +110,7 @@ // CHECK-NEXT: info/ext_codeplay_device_traits.def // CHECK-NEXT: info/ext_intel_device_traits.def // CHECK-NEXT: info/ext_oneapi_device_traits.def +// CHECK-NEXT: info/ext_oneapi_kernel_queue_specific_traits.def // CHECK-NEXT: info/sycl_backend_traits.def // CHECK-NEXT: platform.hpp // CHECK-NEXT: detail/string_view.hpp diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp index 6cb1be75681ee..33dc01b8a4e74 100644 --- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp +++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp @@ -111,6 +111,7 @@ // CHECK-NEXT: info/ext_codeplay_device_traits.def // CHECK-NEXT: info/ext_intel_device_traits.def // CHECK-NEXT: info/ext_oneapi_device_traits.def +// CHECK-NEXT: info/ext_oneapi_kernel_queue_specific_traits.def // CHECK-NEXT: info/sycl_backend_traits.def // CHECK-NEXT: platform.hpp // CHECK-NEXT: detail/string_view.hpp From da2ff3c159ec1fe7a96e6645644ba66bdfbd601d Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Wed, 11 Sep 2024 22:57:08 +0900 Subject: [PATCH 10/19] BasicAA: Fix assert when indexing address spaces with different sizes (#103713) (#15347) Cherry pick https://github.com/llvm/llvm-project/pull/103713 to fix AMD postcommit testing. Closes: https://github.com/intel/llvm/issues/15227 Co-authored-by: Matt Arsenault --- llvm/lib/Analysis/BasicAliasAnalysis.cpp | 3 +++ llvm/test/Analysis/BasicAA/issue103500.ll | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 llvm/test/Analysis/BasicAA/issue103500.ll diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index e474899fb548e..e318cab3531c3 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -350,6 +350,9 @@ struct CastedValue { } bool hasSameCastsAs(const CastedValue &Other) const { + if (V->getType() != Other.V->getType()) + return false; + if (ZExtBits == Other.ZExtBits && SExtBits == Other.SExtBits && TruncBits == Other.TruncBits) return true; diff --git a/llvm/test/Analysis/BasicAA/issue103500.ll b/llvm/test/Analysis/BasicAA/issue103500.ll new file mode 100644 index 0000000000000..3532e6e9479e5 --- /dev/null +++ b/llvm/test/Analysis/BasicAA/issue103500.ll @@ -0,0 +1,18 @@ +; RUN: opt -aa-pipeline=basic-aa -passes=aa-eval -print-all-alias-modref-info -disable-output %s 2>&1 | FileCheck %s + +target datalayout = "p0:64:64-p5:32:32" + +; CHECK: Function: indexing_different_sized_addrspace: 2 pointers, 0 call sites +; CHECK: MayAlias: i32* %gep.in.0, i32 addrspace(5)* %gep.in.5.1 + +define i1 @indexing_different_sized_addrspace(ptr addrspace(5) %arg, i64 %arg1, i32 %arg2) { +bb: + %arg.addrspacecast = addrspacecast ptr addrspace(5) %arg to ptr + %gep.in.5 = getelementptr i8, ptr addrspace(5) %arg, i32 16 + %gep.in.0 = getelementptr i8, ptr %arg.addrspacecast, i64 %arg1 + %gep.in.5.1 = getelementptr i8, ptr addrspace(5) %gep.in.5, i32 %arg2 + %load.0 = load i32, ptr %gep.in.0, align 4 + %load.1 = load i32, ptr addrspace(5) %gep.in.5.1, align 4 + %cmp = icmp slt i32 %load.0, %load.1 + ret i1 %cmp +} From 4f2ad4fc6dfee9f58352c18ba17e71d3164d7b3b Mon Sep 17 00:00:00 2001 From: lucyli-ca <107629053+lucyli-ca@users.noreply.github.com> Date: Wed, 11 Sep 2024 11:22:31 -0400 Subject: [PATCH 11/19] Bump cryptography to 43.0.1 in llvm/utils/git/requirements.txt (#15360) PR to bump dependency version to resolve security vulnerability found. pyca/cryptography's wheels include a statically linked copy of OpenSSL. The versions of OpenSSL included in cryptography 37.0.0-43.0.0 are vulnerable to a security issue. More details about the vulnerability itself can be found in https://openssl-library.org/news/secadv/20240903.txt. --- llvm/utils/git/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/git/requirements.txt b/llvm/utils/git/requirements.txt index 35d38df674a89..9080e76558841 100644 --- a/llvm/utils/git/requirements.txt +++ b/llvm/utils/git/requirements.txt @@ -158,7 +158,7 @@ charset-normalizer==3.3.2 \ --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 # via requests -cryptography==42.0.5 \ +cryptography==43.0.1 \ --hash=sha256:0270572b8bd2c833c3981724b8ee9747b3ec96f699a9665470018594301439ee \ --hash=sha256:111a0d8553afcf8eb02a4fea6ca4f59d48ddb34497aa8706a6cf536f1a5ec576 \ --hash=sha256:16a48c23a62a2f4a285699dba2e4ff2d1cff3115b9df052cdd976a18856d8e3d \ From 1194277e47c2a69458d3e3b8dff6f3f2b303c7b3 Mon Sep 17 00:00:00 2001 From: Georgi Mirazchiyski Date: Wed, 11 Sep 2024 17:02:50 +0100 Subject: [PATCH 12/19] Update docs regarding the kernel-fusion extension (#15356) --- sycl/doc/design/CompilerAndRuntimeDesign.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md index 52ae88a2c0ef1..1935356d0a3a8 100644 --- a/sycl/doc/design/CompilerAndRuntimeDesign.md +++ b/sycl/doc/design/CompilerAndRuntimeDesign.md @@ -776,7 +776,7 @@ Note: Kernel naming is not fully stable for now. ##### Kernel Fusion Support The [experimental kernel fusion -extension](../extensions/experimental/sycl_ext_codeplay_kernel_fusion.asciidoc) +extension](../extensions/removed/sycl_ext_codeplay_kernel_fusion.asciidoc) also supports the CUDA and HIP backends. However, as the CUBIN, PTX and AMD assembly are not suitable input formats for the [kernel fusion JIT compiler](KernelFusionJIT.md), a suitable IR has to be added as an additional device binary. From 783d2b93d7924879489943e3a8fa7d0c05f97ea5 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Thu, 12 Sep 2024 01:49:56 +0900 Subject: [PATCH 13/19] [SYCL][ClangLinkerWrapper] Support old-style objects and static archives (#15216) This PR finishes up work our intern Jason was working on [here](https://github.com/intel/llvm/pull/15156). Most of the code here is not new, it is old code that was removed [here](https://github.com/intel/llvm/commit/ece73ad61b49eaf9ecb6e2060e5f20e09e26def6). This code is not intended to be permanent or upstreamed. It's intended to be temporary to ease the work to enabling the new offload model by default. Both object files and static archives are supported and tested, but SPIR-V fat objects are not, I don't think any customers are using that anyway. Signed-off-by: Sarnie, Nick Co-authored-by: Li, Jason --- .../ClangLinkerWrapper.cpp | 139 +++++++++++++++++- .../test-e2e/NewOffloadDriver/multisource.cpp | 17 +++ 2 files changed, 154 insertions(+), 2 deletions(-) diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index eb37fa583d63a..65e25fa1e034e 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -251,8 +251,8 @@ Expected getInputBitcodeLibrary(StringRef Input) { Image.StringData["arch"] = Arch; Image.Image = std::move(*ImageOrError); - std::unique_ptr Binary = - MemoryBuffer::getMemBufferCopy(OffloadBinary::write(Image)); + std::unique_ptr Binary = MemoryBuffer::getMemBufferCopy( + OffloadBinary::write(Image), Image.Image->getBufferIdentifier()); auto NewBinaryOrErr = OffloadBinary::create(*Binary); if (!NewBinaryOrErr) return NewBinaryOrErr.takeError(); @@ -1358,6 +1358,135 @@ static Expected linkDevice(ArrayRef InputFiles, return *DeviceLinkedFile; } +static bool isStaticArchiveFile(const StringRef Filename) { + if (!llvm::sys::path::has_extension(Filename)) + // Any file with no extension should not be considered an Archive. + return false; + llvm::file_magic Magic; + llvm::identify_magic(Filename, Magic); + // Only archive files are to be considered. + // TODO: .lib check to be added + return (Magic == llvm::file_magic::archive); +} + +static Expected listSection(StringRef Filename, + const ArgList &Args) { + Expected OffloadBundlerPath = findProgram( + "clang-offload-bundler", {getMainExecutable("clang-offload-bundler")}); + if (!OffloadBundlerPath) + return OffloadBundlerPath.takeError(); + BumpPtrAllocator Alloc; + StringSaver Saver(Alloc); + + SmallVector CmdArgs; + CmdArgs.push_back(*OffloadBundlerPath); + bool IsArchive = isStaticArchiveFile(Filename); + CmdArgs.push_back(IsArchive ? "-type=aoo" : "-type=o"); + CmdArgs.push_back(Saver.save("-input=" + Filename)); + CmdArgs.push_back("-list"); + auto Output = createOutputFile("bundled-targets", "list"); + if (!Output) + return Output.takeError(); + SmallVector> Redirects{std::nullopt, *Output, + std::nullopt}; + int ErrCode = llvm::sys::ExecuteAndWait(*OffloadBundlerPath, CmdArgs, + std::nullopt, Redirects); + if (ErrCode != 0) + return createStringError(inconvertibleErrorCode(), + "Failed to list targets"); + return *Output; +} + +// This routine is used to run the clang-offload-bundler tool and unbundle +// device inputs that have been created with an older compiler where the +// device object is bundled into a host object. +static Expected unbundle(StringRef Filename, const ArgList &Args, + llvm::Triple Triple) { + Expected OffloadBundlerPath = findProgram( + "clang-offload-bundler", {getMainExecutable("clang-offload-bundler")}); + if (!OffloadBundlerPath) + return OffloadBundlerPath.takeError(); + + // Create a new file to write the unbundled file to. + auto TempFileOrErr = + createOutputFile(sys::path::filename(ExecutableName), "ir"); + if (!TempFileOrErr) + return TempFileOrErr.takeError(); + + BumpPtrAllocator Alloc; + StringSaver Saver(Alloc); + + SmallVector CmdArgs; + CmdArgs.push_back(*OffloadBundlerPath); + bool IsArchive = isStaticArchiveFile(Filename); + CmdArgs.push_back(IsArchive ? "-type=aoo" : "-type=o"); + auto *Target = Args.MakeArgString(Twine("-targets=sycl-") + Triple.str()); + CmdArgs.push_back(Target); + CmdArgs.push_back(Saver.save("-input=" + Filename)); + CmdArgs.push_back(Saver.save("-output=" + *TempFileOrErr)); + CmdArgs.push_back("-unbundle"); + CmdArgs.push_back("-allow-missing-bundles"); + if (Error Err = executeCommands(*OffloadBundlerPath, CmdArgs)) + return std::move(Err); + return *TempFileOrErr; +} + +Error extractBundledObjects(StringRef Filename, const ArgList &Args, + SmallVector &Binaries) { + auto List = listSection(Filename, Args); + if (!List) + return List.takeError(); + SmallVector TriplesInFile; + llvm::ErrorOr> TripleList = + llvm::MemoryBuffer::getFileOrSTDIN(*List, /*isText=*/true); + if (std::error_code EC = TripleList.getError()) + return createFileError(*List, EC); + (*TripleList) + ->getBuffer() + .split(TriplesInFile, '\n', /*MaxSplit=*/-1, /*KeepEmpty=*/false); + for (StringRef TripleStr : TriplesInFile) { + StringRef SYCLPrefix = "sycl-"; + if (!TripleStr.starts_with(SYCLPrefix)) + continue; + llvm::Triple Triple(TripleStr.substr(SYCLPrefix.size())); + auto UnbundledFile = unbundle(Filename, Args, Triple); + if (!UnbundledFile) + return UnbundledFile.takeError(); + if (*UnbundledFile == Filename) + continue; + + SmallVector ObjectFilePaths; + if (sycl::isStaticArchiveFile(Filename)) { + llvm::ErrorOr> ObjList = + llvm::MemoryBuffer::getFileOrSTDIN(*UnbundledFile, /*isText=*/true); + if (std::error_code EC = ObjList.getError()) + return createFileError(*UnbundledFile, EC); + (*ObjList)->getBuffer().split(ObjectFilePaths, '\n', /*MaxSplit=*/-1, + /*KeepEmpty=*/false); + } else { + ObjectFilePaths.push_back(*UnbundledFile); + } + for (StringRef ObjectFilePath : ObjectFilePaths) { + llvm::file_magic Magic; + llvm::identify_magic(ObjectFilePath, Magic); + if (Magic == file_magic::spirv_object) + return createStringError( + "SPIR-V fat objects must be generated with --offload-new-driver"); + auto Arg = Args.MakeArgString( + "sycl-" + + (Triple.isSPIROrSPIRV() ? Triple.str() + "-" : Triple.str()) + "=" + + ObjectFilePath); + auto Binary = getInputBitcodeLibrary(Arg); + + if (!Binary) + return Binary.takeError(); + + Binaries.push_back(std::move(*Binary)); + } + } + return Error::success(); +} + } // namespace sycl namespace generic { @@ -2634,8 +2763,14 @@ getDeviceInput(const ArgList &Args) { if (identify_magic(Buffer.getBuffer()) == file_magic::elf_shared_object) continue; SmallVector Binaries; + size_t OldSize = Binaries.size(); if (Error Err = extractOffloadBinaries(Buffer, Binaries)) return std::move(Err); + if (Binaries.size() == OldSize) { + if (Error Err = sycl::extractBundledObjects(*Filename, Args, Binaries)) + return std::move(Err); + } + for (auto &OffloadFile : Binaries) { if (identify_magic(Buffer.getBuffer()) == file_magic::archive && !WholeArchive) diff --git a/sycl/test-e2e/NewOffloadDriver/multisource.cpp b/sycl/test-e2e/NewOffloadDriver/multisource.cpp index 83223d4b68e09..0d7bec07654fb 100644 --- a/sycl/test-e2e/NewOffloadDriver/multisource.cpp +++ b/sycl/test-e2e/NewOffloadDriver/multisource.cpp @@ -21,6 +21,23 @@ // RUN: %clangxx -Wno-error=unused-command-line-argument -fsycl -fsycl-targets=%{sycl_triple} --offload-new-driver %t.init.o %t.calc.o %t.main.o -o %t.fat // RUN: %{run} %t.fat +// Multiple sources with kernel code with old-style objects +// Test with `--offload-new-driver` +// RUN: %{build} --no-offload-new-driver -c -o %t.init.o -DINIT_KERNEL +// RUN: %{build} --no-offload-new-driver -c -o %t.calc.o -DCALC_KERNEL +// RUN: %{build} --no-offload-new-driver -c -o %t.main.o -DMAIN_APP +// RUN: %clangxx -Wno-error=unused-command-line-argument -fsycl -fsycl-targets=%{sycl_triple} --offload-new-driver %t.init.o %t.calc.o %t.main.o -o %t.fat +// RUN: %{run} %t.fat + +// Multiple sources with kernel code with old-style objects in a static archive +// Test with `--offload-new-driver` +// RUN: %{build} --no-offload-new-driver -c -o %t.init.o -DINIT_KERNEL +// RUN: %{build} --no-offload-new-driver -c -o %t.calc.o -DCALC_KERNEL +// RUN: %{build} --no-offload-new-driver -c -o %t.main.o -DMAIN_APP +// RUN: llvm-ar r %t.a %t.init.o %t.calc.o +// RUN: %clangxx -Wno-error=unused-command-line-argument -fsycl -fsycl-targets=%{sycl_triple} --offload-new-driver %t.main.o %t.a -o %t.fat +// RUN: %{run} %t.fat + #include #include From 1b05b81c69eb11890986ae3f8938ebc28f0536f7 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Thu, 12 Sep 2024 01:56:26 +0900 Subject: [PATCH 14/19] [SYCL][NFCI] Rework spec constants metadata used for split (#15346) Addressing review feedback from https://github.com/intel/llvm/pull/15271 Signed-off-by: Sarnie, Nick --- llvm/include/llvm/SYCLLowerIR/SpecConstants.h | 5 ----- llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp | 13 +++++-------- llvm/lib/SYCLLowerIR/SpecConstants.cpp | 6 +++--- llvm/tools/sycl-post-link/sycl-post-link.cpp | 4 ++++ 4 files changed, 12 insertions(+), 16 deletions(-) diff --git a/llvm/include/llvm/SYCLLowerIR/SpecConstants.h b/llvm/include/llvm/SYCLLowerIR/SpecConstants.h index 114bf431a279a..217d9f90913ca 100644 --- a/llvm/include/llvm/SYCLLowerIR/SpecConstants.h +++ b/llvm/include/llvm/SYCLLowerIR/SpecConstants.h @@ -72,11 +72,6 @@ class SpecConstantsPass : public PassInfoMixin { collectSpecConstantDefaultValuesMetadata(const Module &M, std::vector &DefaultValues); - // Name of the metadata which holds a list of all specialization constants - // (with associated information) encountered in the module - static constexpr char SPEC_CONST_MD_STRING[] = - "sycl.specialization-constants"; - // Name of the metadata which indicates this module was proccessed with the // default values handing mode. static constexpr char SPEC_CONST_DEFAULT_VAL_MODULE_MD_STRING[] = diff --git a/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp b/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp index 67ac13c569f10..0bfcd53784af3 100644 --- a/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp +++ b/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp @@ -150,14 +150,11 @@ PropSetRegTy computeModuleProperties(const Module &M, PropSet.add(PropSetRegTy::SYCL_DEVICE_REQUIREMENTS, computeDeviceRequirements(M, EntryPoints).asMap()); } - auto *SpecConstsMD = - M.getNamedMetadata(SpecConstantsPass::SPEC_CONST_MD_STRING); - bool SpecConstsMet = - SpecConstsMD != nullptr && SpecConstsMD->getNumOperands() != 0; - if (SpecConstsMet) { - // extract spec constant maps per each module - SpecIDMapTy TmpSpecIDMap; - SpecConstantsPass::collectSpecConstantMetadata(M, TmpSpecIDMap); + + // extract spec constant maps per each module + SpecIDMapTy TmpSpecIDMap; + SpecConstantsPass::collectSpecConstantMetadata(M, TmpSpecIDMap); + if (!TmpSpecIDMap.empty()) { PropSet.add(PropSetRegTy::SYCL_SPECIALIZATION_CONSTANTS, TmpSpecIDMap); // Add property with the default values of spec constants diff --git a/llvm/lib/SYCLLowerIR/SpecConstants.cpp b/llvm/lib/SYCLLowerIR/SpecConstants.cpp index bf8215db94028..4f43a22e95fd9 100644 --- a/llvm/lib/SYCLLowerIR/SpecConstants.cpp +++ b/llvm/lib/SYCLLowerIR/SpecConstants.cpp @@ -47,6 +47,9 @@ constexpr char SPIRV_GET_SPEC_CONST_VAL[] = "__spirv_SpecConstant"; constexpr char SPIRV_GET_SPEC_CONST_COMPOSITE[] = "__spirv_SpecConstantComposite"; +// Name of the metadata which holds a list of all specialization constants (with +// associated information) encountered in the module +constexpr char SPEC_CONST_MD_STRING[] = "sycl.specialization-constants"; // Name of the metadata which holds a default value list of all specialization // constants encountered in the module constexpr char SPEC_CONST_DEFAULT_VAL_MD_STRING[] = @@ -1026,9 +1029,6 @@ PreservedAnalyses SpecConstantsPass::run(Module &M, for (const auto &P : DefaultsMetadata) MDDefaults->addOperand(P); - if (Mode == HandlingMode::default_values) - M.getOrInsertNamedMetadata(SPEC_CONST_DEFAULT_VAL_MODULE_MD_STRING); - return IRModified ? PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index 00bacce06d08b..3ea65b2492a5c 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -509,6 +509,10 @@ processSpecConstantsWithDefaultValues(const module_split::ModuleDesc &MD) { assert(NewModuleDesc->Props.SpecConstsMet && "This property should be true since the presence of SpecConsts " "has been checked before the run of the pass"); + // Add metadata to the module so we can identify it as the default value split + // later. + NewModuleDesc->getModule().getOrInsertNamedMetadata( + SpecConstantsPass::SPEC_CONST_DEFAULT_VAL_MODULE_MD_STRING); NewModuleDesc->rebuildEntryPoints(); return NewModuleDesc; } From 8d1c9f02126cab95a0b84f5429f639050c993711 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Thu, 12 Sep 2024 02:07:40 +0900 Subject: [PATCH 15/19] [SYCL] Remove device_global_static.cpp test (#15362) We don't think it provides value. Signed-off-by: Sarnie, Nick --- .../device_global_static.cpp | 36 ------------------- 1 file changed, 36 deletions(-) delete mode 100644 sycl/test/check_device_code/device_global_static.cpp diff --git a/sycl/test/check_device_code/device_global_static.cpp b/sycl/test/check_device_code/device_global_static.cpp deleted file mode 100644 index a2d7768067de6..0000000000000 --- a/sycl/test/check_device_code/device_global_static.cpp +++ /dev/null @@ -1,36 +0,0 @@ -// Tests that the llvm.compiler.used symbol, which is used to implement static -// device globals, is removed at some point in compilation. For SPIR-V this -// symbol is removed at sycl-post-link and for NVPTX/AMDGCN it is removed at -// lowering. -// -// It also checks that the symbol can be found in an object file for a given -// triple, thus validating that `llvm-strings` can successfully be used to -// check for the presence of the symbol. - -// UNSUPPORTED: windows - -// RUN: %clangxx -fsycl -fsycl-device-only %s -o %t -// RUN: llvm-strings %t | grep "llvm.compiler.used" -// RUN: %clangxx -fsycl %s -o %t -// RUN: llvm-strings %t | not grep "llvm.compiler.used" - -// RUN: %if cuda %{ %clangxx -fsycl -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda %s -o %t %} -// RUN: %if cuda %{ llvm-strings %t | grep "llvm.compiler.used" %} -// RUN: %if cuda %{ %clangxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda %s -o %t %} -// RUN: %if cuda %{ llvm-strings %t | not grep "llvm.compiler.used" %} - -// RUN: %if hip_amd %{ %clangxx -fsycl -fsycl-device-only -fsycl-targets=amd_gpu_gfx906 %s -o %t %} -// RUN: %if hip_amd %{ llvm-strings %t | grep "llvm.compiler.used" %} -// RUN: %if hip_amd %{ %clangxx -fsycl -fsycl-targets=amd_gpu_gfx906 %s -o %t %} -// RUN: %if hip_amd %{ llvm-strings %t | not grep "llvm.compiler.used" %} - -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental; - -static device_global DeviceGlobalVar; - -int main() { - sycl::queue{}.single_task([=] { volatile int ReadVal = DeviceGlobalVar; }); -} From d286ca2c1aa4377c51faf282a4d972d8b728c711 Mon Sep 17 00:00:00 2001 From: Kseniya Tikhomirova Date: Wed, 11 Sep 2024 19:20:56 +0200 Subject: [PATCH 16/19] [SYCL] Add sync for host task after barrier (#15345) PR includes the following fixes: * When submitting a command to an out-of-order queue we don't need to add the dependency from the last barrier to the scheduler if command is enqueued via UR because that means that UR backend will take care of the command from barrier. * We used to update the last barrier only if barrier which is being submitted to the sycl::queue is not enqueued to the backend which is not correct, because there might be host tasks following that barrier which have to depend on it. Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.hpp | 15 ++++++++------- sycl/unittests/scheduler/HostTaskAndBarrier.cpp | 9 ++++++--- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index e15e9bc69503e..a67fdb5e19102 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -784,18 +784,19 @@ class queue_impl { if (Type == CGType::Barrier && !Deps.UnenqueuedCmdEvents.empty()) { Handler.depends_on(Deps.UnenqueuedCmdEvents); } - if (Deps.LastBarrier) + if (Deps.LastBarrier && (Type == CGType::CodeplayHostTask || + (!Deps.LastBarrier->isEnqueued()))) Handler.depends_on(Deps.LastBarrier); + EventRet = Handler.finalize(); EventImplPtr EventRetImpl = getSyclObjImpl(EventRet); if (Type == CGType::CodeplayHostTask) Deps.UnenqueuedCmdEvents.push_back(EventRetImpl); - else if (!EventRetImpl->isEnqueued()) { - if (Type == CGType::Barrier || Type == CGType::BarrierWaitlist) { - Deps.LastBarrier = EventRetImpl; - Deps.UnenqueuedCmdEvents.clear(); - } else - Deps.UnenqueuedCmdEvents.push_back(EventRetImpl); + else if (Type == CGType::Barrier || Type == CGType::BarrierWaitlist) { + Deps.LastBarrier = EventRetImpl; + Deps.UnenqueuedCmdEvents.clear(); + } else if (!EventRetImpl->isEnqueued()) { + Deps.UnenqueuedCmdEvents.push_back(EventRetImpl); } } } diff --git a/sycl/unittests/scheduler/HostTaskAndBarrier.cpp b/sycl/unittests/scheduler/HostTaskAndBarrier.cpp index 1a2e981c2ae99..c244e7b60a0b3 100644 --- a/sycl/unittests/scheduler/HostTaskAndBarrier.cpp +++ b/sycl/unittests/scheduler/HostTaskAndBarrier.cpp @@ -196,7 +196,8 @@ TEST_F(BarrierHandlingWithHostTask, BarrierHostTaskKernel) { sycl::event HTEvent = AddTask(TestCGType::HOST_TASK); EventImplPtr HostTaskEventImpl = sycl::detail::getSyclObjImpl(HTEvent); auto HostTaskWaitList = HostTaskEventImpl->getWaitList(); - ASSERT_EQ(HostTaskWaitList.size(), 0u); + ASSERT_EQ(HostTaskWaitList.size(), 1u); + EXPECT_EQ(HostTaskWaitList[0], BarrierEventImpl); EXPECT_EQ(HostTaskEventImpl->isEnqueued(), true); sycl::event KernelEvent = AddTask(TestCGType::KERNEL_TASK); @@ -225,7 +226,8 @@ TEST_F(BarrierHandlingWithHostTask, BarrierKernelHostTask) { sycl::event HTEvent = AddTask(TestCGType::HOST_TASK); EventImplPtr HostTaskEventImpl = sycl::detail::getSyclObjImpl(HTEvent); auto HostTaskWaitList = HostTaskEventImpl->getWaitList(); - ASSERT_EQ(HostTaskWaitList.size(), 0u); + ASSERT_EQ(HostTaskWaitList.size(), 1u); + EXPECT_EQ(HostTaskWaitList[0], BarrierEventImpl); EXPECT_EQ(HostTaskEventImpl->isEnqueued(), true); MainLock.unlock(); @@ -272,7 +274,8 @@ TEST_F(BarrierHandlingWithHostTask, KernelBarrierHostTask) { sycl::event HTEvent = AddTask(TestCGType::HOST_TASK); EventImplPtr HostTaskEventImpl = sycl::detail::getSyclObjImpl(HTEvent); auto HostTaskWaitList = HostTaskEventImpl->getWaitList(); - ASSERT_EQ(HostTaskWaitList.size(), 0u); + ASSERT_EQ(HostTaskWaitList.size(), 1u); + EXPECT_EQ(HostTaskWaitList[0], BarrierEventImpl); EXPECT_EQ(HostTaskEventImpl->isEnqueued(), true); MainLock.unlock(); From 4152ed323077ea5bc8c8c61063484e9db2395cfd Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Wed, 11 Sep 2024 10:50:09 -0700 Subject: [PATCH 17/19] [SYCL][NFC] Fix InOrderQueueIsolatedDeps when run repeatedly (#15340) The test needs to reset the variable used to check when a UR call is made. --- sycl/unittests/scheduler/InOrderQueueDeps.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/unittests/scheduler/InOrderQueueDeps.cpp b/sycl/unittests/scheduler/InOrderQueueDeps.cpp index 4e53f0ed73a18..b763cbef85a7a 100644 --- a/sycl/unittests/scheduler/InOrderQueueDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueDeps.cpp @@ -107,6 +107,7 @@ TEST_F(SchedulerTest, InOrderQueueIsolatedDeps) { sycl::platform Plt = sycl::platform(); mock::getCallbacks().set_before_callback( "urEnqueueEventsWaitWithBarrier", &redefinedEnqueueEventsWaitWithBarrier); + BarrierCalled = false; context Ctx{Plt.get_devices()[0]}; queue Q1{Ctx, default_selector_v, property::queue::in_order()}; From d3d95215454af4ba8491888cc030273b32cb5c8d Mon Sep 17 00:00:00 2001 From: David Garcia Orozco Date: Wed, 11 Sep 2024 13:30:41 -0700 Subject: [PATCH 18/19] [SYCL][E2E] Fix warnings from using GNU style options with MSVC compiler driver (#15364) Due to the addition of the `-Werror` flag, these warnings were causing test failures when compiling tests with clang-cl. This patch does the following: - Add `-Wno-unused-command-line-argument` to linking run lines. This is needed because when testing the MSVC driver the `/EHsc` flag is added to all run lines that call the compiler. This flag however is only needed at the compile stage so it is reported as unused if a run line is only linking. - Add new expansion for no optimizations `%no_opt`, which is either `-O0` or `/Od` depending on the compiler driver. - Use the expansion `%cxx_std_option` in place of `-std=` - For flags that do not necessarily have an MSVC equivalent, prepend `/clang:` to the flag when using MSVC driver. --- sycl/test-e2e/AOT/fpga-aoc-archive-early2.cpp | 8 ++++---- sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp | 3 ++- sycl/test-e2e/Basic/fpga_tests/fpga_aocx.cpp | 2 +- sycl/test-e2e/Basic/fpga_tests/fpga_aocx_win.cpp | 2 +- sycl/test-e2e/Basic/group_async_copy.cpp | 2 +- sycl/test-e2e/Basic/group_async_copy_legacy.cpp | 2 +- sycl/test-e2e/Basic/group_local_id.cpp | 2 +- sycl/test-e2e/Basic/multisource.cpp | 4 ++-- sycl/test-e2e/Basic/multisource_spv_obj.cpp | 6 +++--- sycl/test-e2e/Basic/spirv_device_obj_smoke.cpp | 2 +- sycl/test-e2e/Basic/vector/byte.cpp | 2 +- sycl/test-e2e/Config/config.cpp | 2 +- sycl/test-e2e/Config/env_vars.cpp | 2 +- sycl/test-e2e/DeviceLib/separate_compile_test.cpp | 4 ++-- sycl/test-e2e/ESIMD/rotate.cpp | 4 ++-- sycl/test-e2e/ESIMD/slm_alloc.cpp | 2 +- .../test-e2e/ESIMD/slm_alloc_many_kernels_many_funcs.cpp | 2 +- sycl/test-e2e/ESIMD/slm_alloc_many_kernels_one_func.cpp | 2 +- .../GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp | 2 +- sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp | 2 +- sycl/test-e2e/NewOffloadDriver/aot-cpu.cpp | 2 +- .../test-e2e/NewOffloadDriver/spirv_device_obj_smoke.cpp | 2 +- .../is_compatible/is_compatible_with_aspects.cpp | 2 +- .../sycl-external-with-optional-features.cpp | 2 +- .../throw-exception-for-unsupported-aspect.cpp | 2 +- sycl/test-e2e/ProgramManager/uneven_kernel_split.cpp | 2 +- .../PropagateOptionsToBackend/sycl-opt-level-opencl.cpp | 4 ++-- .../Regression/2020-spec-constants-debug-info.cpp | 2 +- sycl/test-e2e/Regression/DAE-separate-compile.cpp | 2 +- sycl/test-e2e/Regression/compile_on_win_with_mdd.cpp | 2 +- .../Regression/default-constructed-local-accessor.cpp | 2 +- .../Regression/optimization_level_debug_info_intopt.cpp | 2 +- .../Regression/optimization_level_debug_info_specopt.cpp | 9 ++++++--- sycl/test-e2e/Regression/unoptimized_stream.cpp | 2 +- sycl/test-e2e/SeparateCompile/same-kernel.cpp | 2 +- .../SeparateCompile/sycl-external-within-staticlib.cpp | 4 ++-- sycl/test-e2e/SeparateCompile/sycl-external.cpp | 4 ++-- sycl/test-e2e/SeparateCompile/test.cpp | 2 +- sycl/test-e2e/lit.cfg.py | 2 ++ 39 files changed, 57 insertions(+), 51 deletions(-) diff --git a/sycl/test-e2e/AOT/fpga-aoc-archive-early2.cpp b/sycl/test-e2e/AOT/fpga-aoc-archive-early2.cpp index d23ece6677b06..08cc0e8199cfd 100644 --- a/sycl/test-e2e/AOT/fpga-aoc-archive-early2.cpp +++ b/sycl/test-e2e/AOT/fpga-aoc-archive-early2.cpp @@ -18,10 +18,10 @@ //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// // Build any image archive binaries from early archives. -// RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image %t_early_sub.a -o %t_early_image_sub.a -// RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image %t_early_add.a -o %t_early_image_add.a -// RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image %t_early_sub_x.a -o %t_early_image_sub_x.a -// RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image %t_early_add_x.a -o %t_early_image_add_x.a +// RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image -Wno-unused-command-line-argument %t_early_sub.a -o %t_early_image_sub.a +// RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image -Wno-unused-command-line-argument %t_early_add.a -o %t_early_image_add.a +// RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image -Wno-unused-command-line-argument %t_early_sub_x.a -o %t_early_image_sub_x.a +// RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image -Wno-unused-command-line-argument %t_early_add_x.a -o %t_early_image_add_x.a //////////////////////////////////////////////////////////////////////////////// // Use a variety of archive orders //////////////////////////////////////////////////////////////////////////////// diff --git a/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp b/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp index 75fe3f2fc50f9..190ca3bedf095 100644 --- a/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp +++ b/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp @@ -1,4 +1,5 @@ -// RUN: %{build} -fno-fast-math -o %t.out +// DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%} +// RUN: %{build} %{mathflags} -o %t.out // RUN: %{run} %t.out // Test new, ABI-breaking for all platforms. diff --git a/sycl/test-e2e/Basic/fpga_tests/fpga_aocx.cpp b/sycl/test-e2e/Basic/fpga_tests/fpga_aocx.cpp index 578602dccb0e2..6f2ca5f76f1ec 100644 --- a/sycl/test-e2e/Basic/fpga_tests/fpga_aocx.cpp +++ b/sycl/test-e2e/Basic/fpga_tests/fpga_aocx.cpp @@ -20,7 +20,7 @@ // AOCX with source // RUN: %clangxx -fsycl -fintelfpga %S/Inputs/fpga_host.cpp %t_image.a -o %t_aocx_src.out // AOCX with object -// RUN: %clangxx -fsycl -fintelfpga %t.o %t_image.a -o %t_aocx_obj.out +// RUN: %clangxx -fsycl -fintelfpga %t.o %t_image.a -Wno-unused-command-line-argument -o %t_aocx_obj.out // // RUN: %{run} %t_aocx_src.out // RUN: %{run} %t_aocx_obj.out diff --git a/sycl/test-e2e/Basic/fpga_tests/fpga_aocx_win.cpp b/sycl/test-e2e/Basic/fpga_tests/fpga_aocx_win.cpp index 44f768ace53df..d648eaaff46ca 100644 --- a/sycl/test-e2e/Basic/fpga_tests/fpga_aocx_win.cpp +++ b/sycl/test-e2e/Basic/fpga_tests/fpga_aocx_win.cpp @@ -21,7 +21,7 @@ // AOCX with source // RUN: %clangxx -fsycl -fintelfpga -DHOST_PART %S/Inputs/fpga_host.cpp %t_image.lib -o %t_aocx_src.out // AOCX with object -// RUN: %clangxx -fsycl -fintelfpga %t.obj %t_image.lib -o %t_aocx_obj.out +// RUN: %clangxx -fsycl -fintelfpga %t.obj %t_image.lib -Wno-unused-command-line-argument -o %t_aocx_obj.out // // RUN: %{run} %t_aocx_src.out // RUN: %{run} %t_aocx_obj.out diff --git a/sycl/test-e2e/Basic/group_async_copy.cpp b/sycl/test-e2e/Basic/group_async_copy.cpp index 0e539d9912fe8..2eceadd3e9898 100644 --- a/sycl/test-e2e/Basic/group_async_copy.cpp +++ b/sycl/test-e2e/Basic/group_async_copy.cpp @@ -1,4 +1,4 @@ -// RUN: %{build} -std=c++17 -o %t.run +// RUN: %{build} %cxx_std_optionc++17 -o %t.run // RUN: %{run} %t.run // Windows doesn't yet have full shutdown(). Skipping TC MemLeak tests. diff --git a/sycl/test-e2e/Basic/group_async_copy_legacy.cpp b/sycl/test-e2e/Basic/group_async_copy_legacy.cpp index dbdaaeada882c..ac5fac91e0cdb 100644 --- a/sycl/test-e2e/Basic/group_async_copy_legacy.cpp +++ b/sycl/test-e2e/Basic/group_async_copy_legacy.cpp @@ -1,4 +1,4 @@ -// RUN: %{build} -std=c++17 -o %t.run -Wno-deprecated-declarations +// RUN: %{build} %cxx_std_optionc++17 -o %t.run -Wno-deprecated-declarations // RUN: %{run} %t.run // Variant of group_asymc_copy.cpp using legacy multi_ptr and the corresponding diff --git a/sycl/test-e2e/Basic/group_local_id.cpp b/sycl/test-e2e/Basic/group_local_id.cpp index 83e43ef4be188..a523ea70cc8de 100644 --- a/sycl/test-e2e/Basic/group_local_id.cpp +++ b/sycl/test-e2e/Basic/group_local_id.cpp @@ -1,6 +1,6 @@ // REQUIRES: cpu -// RUN: %{build} -std=c++17 -o %t.out +// RUN: %{build} %cxx_std_optionc++17 -o %t.out // RUN: %{run} %t.out #include diff --git a/sycl/test-e2e/Basic/multisource.cpp b/sycl/test-e2e/Basic/multisource.cpp index 778fc7007e0a8..87ac6ccfe4888 100644 --- a/sycl/test-e2e/Basic/multisource.cpp +++ b/sycl/test-e2e/Basic/multisource.cpp @@ -9,14 +9,14 @@ // Separate kernel sources and host code sources // RUN: %{build} -c -o %t.kernel.o -DINIT_KERNEL -DCALC_KERNEL // RUN: %{build} -c -o %t.main.o -DMAIN_APP -// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.kernel.o %t.main.o -o %t.fat +// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.kernel.o %t.main.o -Wno-unused-command-line-argument -o %t.fat // RUN: %{run} %t.fat // Multiple sources with kernel code // RUN: %{build} -c -o %t.init.o -DINIT_KERNEL // RUN: %{build} -c -o %t.calc.o -DCALC_KERNEL // RUN: %{build} -c -o %t.main.o -DMAIN_APP -// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.init.o %t.calc.o %t.main.o -o %t.fat +// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.init.o %t.calc.o %t.main.o -Wno-unused-command-line-argument -o %t.fat // RUN: %{run} %t.fat #include diff --git a/sycl/test-e2e/Basic/multisource_spv_obj.cpp b/sycl/test-e2e/Basic/multisource_spv_obj.cpp index 46bbfceac0fb9..9756de3f24fd1 100644 --- a/sycl/test-e2e/Basic/multisource_spv_obj.cpp +++ b/sycl/test-e2e/Basic/multisource_spv_obj.cpp @@ -11,21 +11,21 @@ // Separate kernel sources and host code sources // RUN: %{build} -fsycl-device-obj=spirv -c -o %t.kernel.o -DINIT_KERNEL -DCALC_KERNEL // RUN: %{build} -fsycl-device-obj=spirv -c -o %t.main.o -DMAIN_APP -// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.kernel.o %t.main.o -o %t.fat +// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.kernel.o %t.main.o -Wno-unused-command-line-argument -o %t.fat // RUN: %{run} %t.fat // Multiple sources with kernel code // RUN: %{build} -fsycl-device-obj=spirv -c -o %t.init.o -DINIT_KERNEL // RUN: %{build} -fsycl-device-obj=spirv -c -o %t.calc.o -DCALC_KERNEL // RUN: %{build} -fsycl-device-obj=spirv -c -o %t.main.o -DMAIN_APP -// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.init.o %t.calc.o %t.main.o -o %t.fat +// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.init.o %t.calc.o %t.main.o -Wno-unused-command-line-argument -o %t.fat // RUN: %{run} %t.fat // Multiple sources with kernel code, mixed SPIR-V and LLVM-IR objects // RUN: %{build} -fsycl-device-obj=spirv -c -o %t.init.o -DINIT_KERNEL // RUN: %{build} -fsycl-device-obj=llvmir -c -o %t.calc.o -DCALC_KERNEL // RUN: %{build} -c -o %t.main.o -DMAIN_APP -// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.init.o %t.calc.o %t.main.o -o %t.fat +// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.init.o %t.calc.o %t.main.o -Wno-unused-command-line-argument -o %t.fat // RUN: %{run} %t.fat #include diff --git a/sycl/test-e2e/Basic/spirv_device_obj_smoke.cpp b/sycl/test-e2e/Basic/spirv_device_obj_smoke.cpp index 4c6f371814b69..9dd2194e9e33c 100644 --- a/sycl/test-e2e/Basic/spirv_device_obj_smoke.cpp +++ b/sycl/test-e2e/Basic/spirv_device_obj_smoke.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // RUN: %clangxx -fsycl -fsycl-device-obj=spirv -c -o %t.o %s -// RUN: %clangxx -fsycl -o %t.out %t.o +// RUN: %clangxx -fsycl -Wno-unused-command-line-argument -o %t.out %t.o // RUN: %{run} %t.out // This test verifies SPIR-V based fat objects. diff --git a/sycl/test-e2e/Basic/vector/byte.cpp b/sycl/test-e2e/Basic/vector/byte.cpp index c0f94e687559a..278a49a31b955 100644 --- a/sycl/test-e2e/Basic/vector/byte.cpp +++ b/sycl/test-e2e/Basic/vector/byte.cpp @@ -1,4 +1,4 @@ -// RUN: %{build} -std=c++17 -o %t.out +// RUN: %{build} %cxx_std_optionc++17 -o %t.out // RUN: %{run} %t.out // RUN: %if preview-breaking-changes-supported %{ %{build} -fpreview-breaking-changes -std=c++17 -o %t2.out %} diff --git a/sycl/test-e2e/Config/config.cpp b/sycl/test-e2e/Config/config.cpp index 1499a6f9908d5..1374e6e56cfde 100644 --- a/sycl/test-e2e/Config/config.cpp +++ b/sycl/test-e2e/Config/config.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// RUN: %{build} %debug_option -O0 -o %t.out +// RUN: %{build} %debug_option %no_opt -o %t.out // RUN: echo SYCL_PRINT_EXECUTION_GRAPH=always > %t.cfg // RUN: env SYCL_CONFIG_FILE_NAME=%t.cfg %t.out // RUN: cat *.dot > /dev/null diff --git a/sycl/test-e2e/Config/env_vars.cpp b/sycl/test-e2e/Config/env_vars.cpp index 00f17c508ccae..67ec5debd4986 100644 --- a/sycl/test-e2e/Config/env_vars.cpp +++ b/sycl/test-e2e/Config/env_vars.cpp @@ -1,7 +1,7 @@ // REQUIRES: opencl // Env vars are used to pass OpenCL-specific flags to PI compiling/linking. // -// RUN: %{build} -O0 -o %t.out +// RUN: %{build} %no_opt -o %t.out // // RUN: env SYCL_PROGRAM_COMPILE_OPTIONS="-g" %{run} %t.out // RUN: env SYCL_PROGRAM_APPEND_COMPILE_OPTIONS="-g" %{run} %t.out diff --git a/sycl/test-e2e/DeviceLib/separate_compile_test.cpp b/sycl/test-e2e/DeviceLib/separate_compile_test.cpp index ad137fb14e5b8..09085fe2c5ab0 100644 --- a/sycl/test-e2e/DeviceLib/separate_compile_test.cpp +++ b/sycl/test-e2e/DeviceLib/separate_compile_test.cpp @@ -4,12 +4,12 @@ // RUN: %clangxx -fsycl-device-only -Xclang -fsycl-int-header=std_complex_math_test_ihdr.h %S/std_complex_math_test.cpp -Wno-sycl-strict %{mathflags} // >> host compilation... // RUN: %clangxx -Wno-error=unused-command-line-argument -Wno-error=ignored-attributes %cxx_std_optionc++17 %include_option std_complex_math_test_ihdr.h -c %S/std_complex_math_test.cpp -o %t_host.o %sycl_options -Wno-sycl-strict %{mathflags} -// RUN: %clangxx %t_host.o %t_device.o -o %t.out %sycl_options %{mathflags} +// RUN: %clangxx %t_host.o %t_device.o -Wno-unused-command-line-argument -o %t.out %sycl_options %{mathflags} // RUN: %{run} %t.out // RUN: %clangxx -fsycl -fsycl-link %S/std_complex_math_fp64_test.cpp -o %t_fp64_device.o %{mathflags} // RUN: %clangxx -fsycl-device-only -Xclang -fsycl-int-header=std_complex_math_fp64_test_ihdr.h %S/std_complex_math_fp64_test.cpp -Wno-sycl-strict %{mathflags} // >> host compilation... // RUN: %clangxx -Wno-error=unused-command-line-argument -Wno-error=ignored-attributes %cxx_std_optionc++17 %include_option std_complex_math_fp64_test_ihdr.h -c %S/std_complex_math_fp64_test.cpp -o %t_fp64_host.o %sycl_options -Wno-sycl-strict %{mathflags} -// RUN: %clangxx %t_fp64_host.o %t_fp64_device.o -o %t_fp64.out %sycl_options %{mathflags} +// RUN: %clangxx %t_fp64_host.o %t_fp64_device.o -Wno-unused-command-line-argument -o %t_fp64.out %sycl_options %{mathflags} // RUN: %{run} %t.out diff --git a/sycl/test-e2e/ESIMD/rotate.cpp b/sycl/test-e2e/ESIMD/rotate.cpp index 484be469752e7..246304b46f4e6 100644 --- a/sycl/test-e2e/ESIMD/rotate.cpp +++ b/sycl/test-e2e/ESIMD/rotate.cpp @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// -// RUN: %{build} -fsycl-device-code-split=per_kernel -std=c++20 -o %t.out +// RUN: %{build} -fsycl-device-code-split=per_kernel %cxx_std_optionc++20 -o %t.out // RUN: %{run} %t.out -// RUN: %{build} -fsycl-device-code-split=per_kernel -std=c++20 -o %t1.out -DEXP +// RUN: %{build} -fsycl-device-code-split=per_kernel %cxx_std_optionc++20 -o %t1.out -DEXP // RUN: %{run} %t1.out // This is a basic test to validate the ror/rol functions. diff --git a/sycl/test-e2e/ESIMD/slm_alloc.cpp b/sycl/test-e2e/ESIMD/slm_alloc.cpp index 06bfa06498746..fe2ebdf23d484 100644 --- a/sycl/test-e2e/ESIMD/slm_alloc.cpp +++ b/sycl/test-e2e/ESIMD/slm_alloc.cpp @@ -8,7 +8,7 @@ // RUN: %{run} %t.2.out // Check if the test sill passes with O0 -// RUN: %{build} -O0 -o %t.3.out +// RUN: %{build} %no_opt -o %t.3.out // RUN: %{run} %t.3.out // This is end-to-end test for the slm_allocator API used together with the diff --git a/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_many_funcs.cpp b/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_many_funcs.cpp index bd0ed45866ea4..dce107b5019a4 100644 --- a/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_many_funcs.cpp +++ b/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_many_funcs.cpp @@ -11,7 +11,7 @@ // RUN: %{run} %t.2.out // Check if the test sill passes with O0 -// RUN: %{build} -O0 -o %t.3.out +// RUN: %{build} %no_opt -o %t.3.out // RUN: %{run} %t.3.out // Checks validity of SLM frame offsets in case of complex call graph with two diff --git a/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_one_func.cpp b/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_one_func.cpp index 2e744fdf88aeb..311220039db62 100644 --- a/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_one_func.cpp +++ b/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_one_func.cpp @@ -11,7 +11,7 @@ // RUN: %{run} %t.2.out // Check if the test sill passes with O0 -// RUN: %{build} -O0 -o %t.3.out +// RUN: %{build} %no_opt -o %t.3.out // RUN: %{run} %t.3.out // Check that SLM frame offset of a function foo called from two kernels Test1 diff --git a/sycl/test-e2e/GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp b/sycl/test-e2e/GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp index ed705979ac4fe..1c1483dfec33e 100644 --- a/sycl/test-e2e/GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp +++ b/sycl/test-e2e/GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp @@ -5,7 +5,7 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// RUN: %{build} -O0 -o %t_O0.out +// RUN: %{build} %no_opt -o %t_O0.out // RUN: %{run} %t_O0.out /* diff --git a/sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp b/sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp index 6f19d9f3cdf2a..a24e9e6d6afd1 100644 --- a/sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp +++ b/sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// RUN: %{build} -O0 -o %t.out +// RUN: %{build} %no_opt -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/NewOffloadDriver/aot-cpu.cpp b/sycl/test-e2e/NewOffloadDriver/aot-cpu.cpp index 09ff48d97173c..94b698d6ca6ca 100644 --- a/sycl/test-e2e/NewOffloadDriver/aot-cpu.cpp +++ b/sycl/test-e2e/NewOffloadDriver/aot-cpu.cpp @@ -6,5 +6,5 @@ // RUN: %{run} %t.out // Test -O0 with `--offload-new-driver` -// RUN: %clangxx -O0 -fsycl -fsycl-targets=spir64-x86_64 %S/Inputs/aot.cpp +// RUN: %clangxx %no_opt -fsycl -fsycl-targets=spir64-x86_64 %S/Inputs/aot.cpp // RUN: %{run} %t.out diff --git a/sycl/test-e2e/NewOffloadDriver/spirv_device_obj_smoke.cpp b/sycl/test-e2e/NewOffloadDriver/spirv_device_obj_smoke.cpp index 63351c8cab3c6..d7f0fb22ffb70 100644 --- a/sycl/test-e2e/NewOffloadDriver/spirv_device_obj_smoke.cpp +++ b/sycl/test-e2e/NewOffloadDriver/spirv_device_obj_smoke.cpp @@ -1,7 +1,7 @@ // UNSUPPORTED: cuda || hip // Test with `--offload-new-driver` // RUN: %clangxx -fsycl -fsycl-device-obj=spirv --offload-new-driver -c -o %t.o %s -// RUN: %clangxx -fsycl --offload-new-driver -o %t.out %t.o +// RUN: %clangxx -fsycl --offload-new-driver -Wno-unused-command-line-argument -o %t.out %t.o // RUN: %{run} %t.out // This test verifies SPIR-V based fat objects. diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp index 68a8c19f1de0a..7bf2c5e1edb12 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp @@ -1,7 +1,7 @@ // requires: cpu, gpu, accelerator // UNSUPPORTED: hip // FIXME: enable the test back, see intel/llvm#8146 -// RUN: %{build} -Wno-error=incorrect-sub-group-size -O0 -o %t.out +// RUN: %{build} -Wno-error=incorrect-sub-group-size %no_opt -o %t.out // RUN: %{run} %t.out #include diff --git a/sycl/test-e2e/OptionalKernelFeatures/sycl-external-with-optional-features.cpp b/sycl/test-e2e/OptionalKernelFeatures/sycl-external-with-optional-features.cpp index 3f38a4df3023d..5a04ea4ed55df 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/sycl-external-with-optional-features.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/sycl-external-with-optional-features.cpp @@ -1,6 +1,6 @@ // RUN: %{build} -DSOURCE1 -c -o %t1.o // RUN: %{build} -DSOURCE2 -c -o %t2.o -// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t1.o %t2.o -o %t.exe +// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t1.o %t2.o -Wno-unused-command-line-argument -o %t.exe // RUN: %{run} %t.exe #ifdef SOURCE1 diff --git a/sycl/test-e2e/OptionalKernelFeatures/throw-exception-for-unsupported-aspect.cpp b/sycl/test-e2e/OptionalKernelFeatures/throw-exception-for-unsupported-aspect.cpp index c9c9f54eaff98..245fb3a28d718 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/throw-exception-for-unsupported-aspect.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/throw-exception-for-unsupported-aspect.cpp @@ -1,6 +1,6 @@ // REQUIRES: cpu -// RUN: %{build} -O0 -o %t.out +// RUN: %{build} %no_opt -o %t.out // RUN: %{run} %t.out #include diff --git a/sycl/test-e2e/ProgramManager/uneven_kernel_split.cpp b/sycl/test-e2e/ProgramManager/uneven_kernel_split.cpp index e21309db20e64..6304f4bbfd5b5 100644 --- a/sycl/test-e2e/ProgramManager/uneven_kernel_split.cpp +++ b/sycl/test-e2e/ProgramManager/uneven_kernel_split.cpp @@ -4,7 +4,7 @@ // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64 -I %S/Inputs/ %S/uneven_kernel_split.cpp -c -o %t.o // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen %gpu_aot_target_opts -I %S/Inputs/ %S/Inputs/gpu_kernel1.cpp -c -o %t1.o // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen %gpu_aot_target_opts -I %S/Inputs/ %S/Inputs/gpu_kernel2.cpp -c -o %t2.o -// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen %gpu_aot_target_opts %t.o %t1.o %t2.o -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen %gpu_aot_target_opts -Wno-unused-command-line-argument %t.o %t1.o %t2.o -o %t.out // RUN: %{run} %t.out // Test require the following device image structure: cpu target device image diff --git a/sycl/test-e2e/PropagateOptionsToBackend/sycl-opt-level-opencl.cpp b/sycl/test-e2e/PropagateOptionsToBackend/sycl-opt-level-opencl.cpp index c2877438392f7..29f49ef69b74b 100644 --- a/sycl/test-e2e/PropagateOptionsToBackend/sycl-opt-level-opencl.cpp +++ b/sycl/test-e2e/PropagateOptionsToBackend/sycl-opt-level-opencl.cpp @@ -1,6 +1,6 @@ // REQUIRES: opencl -// RUN: %{build} %if cl_options %{/Od%} %else %{-O0%} -o %t0.out +// RUN: %{build} %no_opt -o %t0.out // RUN: %if !acc %{ env SYCL_UR_TRACE=2 %{run} %t0.out 2>&1 | FileCheck %s --check-prefixes=CHECKOCL0 %} // RUN: %{build} -O1 -o %t1.out // RUN: %if !acc %{ env SYCL_UR_TRACE=2 %{run} %t1.out 2>&1 | FileCheck %s --check-prefixes=CHECKOCL1 %} @@ -9,7 +9,7 @@ // RUN: %{build} -O3 -o %t3.out // RUN: %if !acc %{ env SYCL_UR_TRACE=2 %{run} %t3.out 2>&1 | FileCheck %s --check-prefixes=CHECKOCL3 %} -// RUN: %{build} -O0 -o %t.out +// RUN: %{build} %no_opt -o %t.out // RUN: %{run} %t.out // This test verifies the propagation of front-end compiler optimization diff --git a/sycl/test-e2e/Regression/2020-spec-constants-debug-info.cpp b/sycl/test-e2e/Regression/2020-spec-constants-debug-info.cpp index 99617f8ddab87..946ae9a75a0d4 100644 --- a/sycl/test-e2e/Regression/2020-spec-constants-debug-info.cpp +++ b/sycl/test-e2e/Regression/2020-spec-constants-debug-info.cpp @@ -1,5 +1,5 @@ // RUN: %{build} -g -o %t.out -// RUN: %{build} -g -O0 -o %t.out +// RUN: %{build} -g %no_opt -o %t.out // RUN: %{build} -g -O2 -o %t.out // // The idea of this test is to make sure that we can compile the following diff --git a/sycl/test-e2e/Regression/DAE-separate-compile.cpp b/sycl/test-e2e/Regression/DAE-separate-compile.cpp index a6c8ae885cb39..7bfc1028b2644 100644 --- a/sycl/test-e2e/Regression/DAE-separate-compile.cpp +++ b/sycl/test-e2e/Regression/DAE-separate-compile.cpp @@ -5,7 +5,7 @@ // The test checks that the scenario works correctly. // // RUN: %{build} -O2 -c -o %t.o -// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.o -O0 -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.o %no_opt -Wno-unused-command-line-argument -o %t.out // RUN: %{run} %t.out // Failing on HIP AMD, enable after fixed diff --git a/sycl/test-e2e/Regression/compile_on_win_with_mdd.cpp b/sycl/test-e2e/Regression/compile_on_win_with_mdd.cpp index f33000f22f269..57826c81ece2e 100644 --- a/sycl/test-e2e/Regression/compile_on_win_with_mdd.cpp +++ b/sycl/test-e2e/Regression/compile_on_win_with_mdd.cpp @@ -1,7 +1,7 @@ // REQUIRES: windows // RUN: %clangxx --driver-mode=cl -fsycl /MDd -c %s -o %t.obj -// RUN: %clangxx --driver-mode=cl -fsycl %t.obj -o %t.out +// RUN: %clangxx --driver-mode=cl -fsycl %t.obj -Wno-unused-command-line-argument -o %t.out // RUN: %{run} %t.out // The test aims to prevent regressions similar to the one which caused by diff --git a/sycl/test-e2e/Regression/default-constructed-local-accessor.cpp b/sycl/test-e2e/Regression/default-constructed-local-accessor.cpp index e136e2833a59a..e5c1df272a358 100644 --- a/sycl/test-e2e/Regression/default-constructed-local-accessor.cpp +++ b/sycl/test-e2e/Regression/default-constructed-local-accessor.cpp @@ -1,6 +1,6 @@ // -O0 is necessary; on higher levels of optimization, an error // would not occur because of dead argument elimination of the local_accessor. -// RUN: %{build} -o %t.out -O0 +// RUN: %{build} -o %t.out %no_opt // RUN: %{run} %t.out #include diff --git a/sycl/test-e2e/Regression/optimization_level_debug_info_intopt.cpp b/sycl/test-e2e/Regression/optimization_level_debug_info_intopt.cpp index 106460629b242..9091eae989310 100644 --- a/sycl/test-e2e/Regression/optimization_level_debug_info_intopt.cpp +++ b/sycl/test-e2e/Regression/optimization_level_debug_info_intopt.cpp @@ -1,4 +1,4 @@ -// RUN: %{build} %debug_option -O0 -o %t.out +// RUN: %{build} %debug_option %no_opt -o %t.out // RUN: %{build} %debug_option -O1 -o %t.out // RUN: %{build} %debug_option -O2 -o %t.out // RUN: %{build} %debug_option -O3 -o %t.out diff --git a/sycl/test-e2e/Regression/optimization_level_debug_info_specopt.cpp b/sycl/test-e2e/Regression/optimization_level_debug_info_specopt.cpp index c085d744cb075..57246c6cf4542 100644 --- a/sycl/test-e2e/Regression/optimization_level_debug_info_specopt.cpp +++ b/sycl/test-e2e/Regression/optimization_level_debug_info_specopt.cpp @@ -1,8 +1,11 @@ -// RUN: %{build} %debug_option -ffp-model=fast -o %t.out +// DEFINE: %{ffpflags} = %if cl_options %{/clang:-ffp-model=fast%} %else %{-ffp-model=fast%} +// DEFINE: %{Oz} = %if cl_options %{/clang:-Oz%} %else %{-Oz%} +// DEFINE: %{O} = %if cl_options %{/clang:-O%} %else %{-O%} +// RUN: %{build} %debug_option %{ffpflags} -o %t.out // RUN: %{build} %debug_option -Os -o %t.out -// RUN: %{build} %debug_option -Oz -o %t.out +// RUN: %{build} %debug_option %{Oz} -o %t.out // RUN: %{build} %debug_option -Og -o %t.out -// RUN: %{build} %debug_option -O -o %t.out +// RUN: %{build} %debug_option %{O} -o %t.out // NOTE: Tests that debugging information can be generated for all special-name // optimization levels. diff --git a/sycl/test-e2e/Regression/unoptimized_stream.cpp b/sycl/test-e2e/Regression/unoptimized_stream.cpp index e00d160f5d167..44f3c9bf39e80 100644 --- a/sycl/test-e2e/Regression/unoptimized_stream.cpp +++ b/sycl/test-e2e/Regression/unoptimized_stream.cpp @@ -1,4 +1,4 @@ -// RUN: %{build} -O0 -o %t.out +// RUN: %{build} %no_opt -o %t.out // RUN: %{run} %t.out #include diff --git a/sycl/test-e2e/SeparateCompile/same-kernel.cpp b/sycl/test-e2e/SeparateCompile/same-kernel.cpp index 15a2bfc9bc5c5..8bdadbe7ad62b 100644 --- a/sycl/test-e2e/SeparateCompile/same-kernel.cpp +++ b/sycl/test-e2e/SeparateCompile/same-kernel.cpp @@ -12,7 +12,7 @@ // RUN: %{build} -DB_CPP=1 -c -o %t-same-kernel-b.o // // >> ---- link the full hetero app -// RUN: %clangxx %t-same-kernel-a.o %t-same-kernel-b.o -o %t-same-kernel.exe -fsycl -fsycl-targets=%{sycl_triple} +// RUN: %clangxx %t-same-kernel-a.o %t-same-kernel-b.o -Wno-unused-command-line-argument -o %t-same-kernel.exe -fsycl -fsycl-targets=%{sycl_triple} // RUN: %{run} %t-same-kernel.exe #include diff --git a/sycl/test-e2e/SeparateCompile/sycl-external-within-staticlib.cpp b/sycl/test-e2e/SeparateCompile/sycl-external-within-staticlib.cpp index 912df03aadccd..219634f47646c 100644 --- a/sycl/test-e2e/SeparateCompile/sycl-external-within-staticlib.cpp +++ b/sycl/test-e2e/SeparateCompile/sycl-external-within-staticlib.cpp @@ -5,7 +5,7 @@ // RUN: %{build} -O3 -DSOURCE3 -c -o %t3.o // RUN: rm -f %t.a // RUN: llvm-ar crv %t.a %t1.o %t2.o -// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} -O3 %t3.o %t.a -o %t1.exe +// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} -O3 %t3.o %t.a -Wno-unused-command-line-argument -o %t1.exe // RUN: %{run} %t1.exe // Check the repacked case as it can behave differently. @@ -13,7 +13,7 @@ // RUN: echo addlib %t.a >> %t.txt // RUN: echo save >> %t.txt // RUN: cat %t.txt | llvm-ar -M -// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} -O3 %t3.o %t_repacked.a -o %t2.exe +// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} -O3 %t3.o %t_repacked.a -Wno-unused-command-line-argument -o %t2.exe // RUN: %{run} %t2.exe #include diff --git a/sycl/test-e2e/SeparateCompile/sycl-external.cpp b/sycl/test-e2e/SeparateCompile/sycl-external.cpp index d455ba7e71d3e..8b6fc12382198 100644 --- a/sycl/test-e2e/SeparateCompile/sycl-external.cpp +++ b/sycl/test-e2e/SeparateCompile/sycl-external.cpp @@ -2,14 +2,14 @@ // different object file. // RUN: %{build} -DSOURCE1 -c -o %t1.o // RUN: %{build} -DSOURCE2 -c -o %t2.o -// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t1.o %t2.o -o %t.exe +// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t1.o %t2.o -Wno-unused-command-line-argument -o %t.exe // RUN: %{run} %t.exe // // Test2 - check that kernel can call a SYCL_EXTERNAL function defined in a // static library. // RUN: rm -f %t.a // RUN: llvm-ar crv %t.a %t1.o -// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t2.o %t.a -o %t.exe +// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t2.o %t.a -Wno-unused-command-line-argument -o %t.exe // RUN: %{run} %t.exe #include diff --git a/sycl/test-e2e/SeparateCompile/test.cpp b/sycl/test-e2e/SeparateCompile/test.cpp index bd5d489ffddeb..8e38711ac974c 100644 --- a/sycl/test-e2e/SeparateCompile/test.cpp +++ b/sycl/test-e2e/SeparateCompile/test.cpp @@ -51,7 +51,7 @@ // RUN: %clangxx -Wno-error=override-module -c wrapper.bc -o wrapper.o // // >> ---- link the full hetero app -// RUN: %clangxx wrapper.o a.o b.o -o app.exe %sycl_options +// RUN: %clangxx wrapper.o a.o b.o -Wno-unused-command-line-argument -o app.exe %sycl_options // RUN: %{run} ./app.exe | FileCheck %s // CHECK: pass diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index bbef264be9c24..f7d56aaee8f8e 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -382,6 +382,7 @@ def open_check_file(file_name): config.substitutions.append(("%cxx_std_option", "/std:")) config.substitutions.append(("%fPIC", "")) config.substitutions.append(("%shared_lib", "/LD")) + config.substitutions.append(("%no_opt", "/Od")) else: config.substitutions.append( ( @@ -406,6 +407,7 @@ def open_check_file(file_name): ("%fPIC", ("" if platform.system() == "Windows" else "-fPIC")) ) config.substitutions.append(("%shared_lib", "-shared")) + config.substitutions.append(("%no_opt", "-O0")) # Check if user passed verbose-print parameter, if yes, add VERBOSE_PRINT macro if "verbose-print" in lit_config.params: From 404fb8a3f1439ecb648c17cc3989dd8926fee0e3 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Thu, 12 Sep 2024 11:20:06 +0100 Subject: [PATCH 19/19] [SYCL][UR] Bump UR tag to 2fea679 (#15350) Pull in fix from https://github.com/oneapi-src/unified-runtime/pull/2078 This should fix the post-commit E2E failures on opencl:fpga --------- Co-authored-by: Omar Ahmed --- sycl/cmake/modules/FetchUnifiedRuntime.cmake | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index e569981589ac4..fce554d30582d 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -117,13 +117,13 @@ if(SYCL_UR_USE_FETCH_CONTENT) endfunction() set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit eb63d1a21729f6928bb6cccc5f92856b0690aca6 - # Merge: e26bba51 45a781f4 + # commit 24a8299efc59c715a1c2dd180692a5e12a12283a + # Merge: eb63d1a2 2fea679d # Author: Omar Ahmed - # Date: Tue Sep 10 12:08:57 2024 +0100 - # Merge pull request #1796 from GeorgeWeb/georgi/ur_kernel_max_active_wgs - # [CUDA] Implement urKernelSuggestMaxCooperativeGroupCountExp for Cuda - set(UNIFIED_RUNTIME_TAG eb63d1a21729f6928bb6cccc5f92856b0690aca6) + # Date: Wed Sep 11 10:40:59 2024 +0100 + # Merge pull request #2078 from callumfare/callum/fix_device_extensions_fpga + # Add workaround for silently supported OpenCL extensions on Intel FPGA + set(UNIFIED_RUNTIME_TAG 24a8299efc59c715a1c2dd180692a5e12a12283a) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") # Due to the use of dependentloadflag and no installer for UMF and hwloc we need