From bd178006c87aaa57699b54548a65b105e114bc8b Mon Sep 17 00:00:00 2001
From: Buildbot for SYCL <bb-sycl@intel.com>
Date: Tue, 10 Sep 2024 22:00:39 +0800
Subject: [PATCH 01/19] [GHA] Uplift Linux GPU RT version to 24.31.30508.7
 (#15339)

Scheduled drivers uplift

Co-authored-by: GitHub Actions <actions@github.com>
---
 devops/dependencies.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/devops/dependencies.json b/devops/dependencies.json
index 5398ad7d5a383..ea84983b95eb2 100644
--- a/devops/dependencies.json
+++ b/devops/dependencies.json
@@ -19,9 +19,9 @@
       "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu"
     },
     "level_zero": {
-      "github_tag": "v1.17.39",
-      "version": "v1.17.39",
-      "url": "https://github.com/oneapi-src/level-zero/releases/tag/v1.17.39",
+      "github_tag": "v1.17.42",
+      "version": "v1.17.42",
+      "url": "https://github.com/oneapi-src/level-zero/releases/tag/v1.17.42",
       "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu"
     },
     "tbb": {

From 594ae74fae19b65484230df7a8677402d18e2b9d Mon Sep 17 00:00:00 2001
From: Nick Sarnie <sarnex@users.noreply.github.com>
Date: Tue, 10 Sep 2024 23:00:59 +0900
Subject: [PATCH 02/19] [SYCL] Simplify arguments to computeModuleProperties
 (#15271)

Computing information about spec consts is difficult for callers when
module properties generation and splitting are separate. Simplify it by
storing information in module we can use invisibly to the caller.

This will be used for thinLTO, where we split early and compute module
properties later.

This should be basically NFC.

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
---
 .../llvm/SYCLLowerIR/ComputeModuleRuntimeInfo.h     |  4 +---
 llvm/include/llvm/SYCLLowerIR/SpecConstants.h       | 10 ++++++++++
 llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp   | 13 +++++++++----
 llvm/lib/SYCLLowerIR/SpecConstants.cpp              |  6 +++---
 .../spec-constants/default-value/bool.ll            |  1 +
 llvm/tools/sycl-post-link/sycl-post-link.cpp        |  5 ++---
 6 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/llvm/include/llvm/SYCLLowerIR/ComputeModuleRuntimeInfo.h b/llvm/include/llvm/SYCLLowerIR/ComputeModuleRuntimeInfo.h
index eaeecb44deb03..e7cff6c730051 100644
--- a/llvm/include/llvm/SYCLLowerIR/ComputeModuleRuntimeInfo.h
+++ b/llvm/include/llvm/SYCLLowerIR/ComputeModuleRuntimeInfo.h
@@ -34,9 +34,7 @@ using EntryPointSet = SetVector<Function *>;
 
 PropSetRegTy computeModuleProperties(const Module &M,
                                      const EntryPointSet &EntryPoints,
-                                     const GlobalBinImageProps &GlobProps,
-                                     bool SpecConstsMet,
-                                     bool IsSpecConstantDefault);
+                                     const GlobalBinImageProps &GlobProps);
 
 std::string computeModuleSymbolTable(const Module &M,
                                      const EntryPointSet &EntryPoints);
diff --git a/llvm/include/llvm/SYCLLowerIR/SpecConstants.h b/llvm/include/llvm/SYCLLowerIR/SpecConstants.h
index 8bf8bdf894d07..114bf431a279a 100644
--- a/llvm/include/llvm/SYCLLowerIR/SpecConstants.h
+++ b/llvm/include/llvm/SYCLLowerIR/SpecConstants.h
@@ -72,6 +72,16 @@ class SpecConstantsPass : public PassInfoMixin<SpecConstantsPass> {
   collectSpecConstantDefaultValuesMetadata(const Module &M,
                                            std::vector<char> &DefaultValues);
 
+  // Name of the metadata which holds a list of all specialization constants
+  // (with associated information) encountered in the module
+  static constexpr char SPEC_CONST_MD_STRING[] =
+      "sycl.specialization-constants";
+
+  // Name of the metadata which indicates this module was proccessed with the
+  // default values handing mode.
+  static constexpr char SPEC_CONST_DEFAULT_VAL_MODULE_MD_STRING[] =
+      "sycl.specialization-constants-default-values-module";
+
 private:
   HandlingMode Mode;
 };
diff --git a/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp b/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp
index 4a84786b6af9b..67ac13c569f10 100644
--- a/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp
+++ b/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp
@@ -138,9 +138,7 @@ uint32_t getKernelWorkGroupNumDim(const Function &Func) {
 
 PropSetRegTy computeModuleProperties(const Module &M,
                                      const EntryPointSet &EntryPoints,
-                                     const GlobalBinImageProps &GlobProps,
-                                     bool SpecConstsMet,
-                                     bool IsSpecConstantDefault) {
+                                     const GlobalBinImageProps &GlobProps) {
 
   PropSetRegTy PropSet;
   {
@@ -152,6 +150,10 @@ PropSetRegTy computeModuleProperties(const Module &M,
     PropSet.add(PropSetRegTy::SYCL_DEVICE_REQUIREMENTS,
                 computeDeviceRequirements(M, EntryPoints).asMap());
   }
+  auto *SpecConstsMD =
+      M.getNamedMetadata(SpecConstantsPass::SPEC_CONST_MD_STRING);
+  bool SpecConstsMet =
+      SpecConstsMD != nullptr && SpecConstsMD->getNumOperands() != 0;
   if (SpecConstsMet) {
     // extract spec constant maps per each module
     SpecIDMapTy TmpSpecIDMap;
@@ -369,7 +371,10 @@ PropSetRegTy computeModuleProperties(const Module &M,
   if (!HostPipePropertyMap.empty()) {
     PropSet.add(PropSetRegTy::SYCL_HOST_PIPES, HostPipePropertyMap);
   }
-
+  bool IsSpecConstantDefault =
+      M.getNamedMetadata(
+          SpecConstantsPass::SPEC_CONST_DEFAULT_VAL_MODULE_MD_STRING) !=
+      nullptr;
   if (IsSpecConstantDefault)
     PropSet.add(PropSetRegTy::SYCL_MISC_PROP, "specConstsReplacedWithDefault",
                 1);
diff --git a/llvm/lib/SYCLLowerIR/SpecConstants.cpp b/llvm/lib/SYCLLowerIR/SpecConstants.cpp
index 4f43a22e95fd9..bf8215db94028 100644
--- a/llvm/lib/SYCLLowerIR/SpecConstants.cpp
+++ b/llvm/lib/SYCLLowerIR/SpecConstants.cpp
@@ -47,9 +47,6 @@ constexpr char SPIRV_GET_SPEC_CONST_VAL[] = "__spirv_SpecConstant";
 constexpr char SPIRV_GET_SPEC_CONST_COMPOSITE[] =
     "__spirv_SpecConstantComposite";
 
-// Name of the metadata which holds a list of all specialization constants (with
-// associated information) encountered in the module
-constexpr char SPEC_CONST_MD_STRING[] = "sycl.specialization-constants";
 // Name of the metadata which holds a default value list of all specialization
 // constants encountered in the module
 constexpr char SPEC_CONST_DEFAULT_VAL_MD_STRING[] =
@@ -1029,6 +1026,9 @@ PreservedAnalyses SpecConstantsPass::run(Module &M,
   for (const auto &P : DefaultsMetadata)
     MDDefaults->addOperand(P);
 
+  if (Mode == HandlingMode::default_values)
+    M.getOrInsertNamedMetadata(SPEC_CONST_DEFAULT_VAL_MODULE_MD_STRING);
+
   return IRModified ? PreservedAnalyses::none() : PreservedAnalyses::all();
 }
 
diff --git a/llvm/test/tools/sycl-post-link/spec-constants/default-value/bool.ll b/llvm/test/tools/sycl-post-link/spec-constants/default-value/bool.ll
index d96cfa1f333f2..914ad2fe3a306 100644
--- a/llvm/test/tools/sycl-post-link/spec-constants/default-value/bool.ll
+++ b/llvm/test/tools/sycl-post-link/spec-constants/default-value/bool.ll
@@ -6,6 +6,7 @@
 
 ; CHECK: %bool1 = trunc i8 1 to i1
 ; CHECK: %frombool = zext i1 %bool1 to i8
+; CHECK: !sycl.specialization-constants-default-values-module = !{}
 ; CHECK-LOG: sycl.specialization-constants
 ; CHECK-LOG:[[UNIQUE_PREFIX:[0-9a-zA-Z]+]]={0, 0, 1}
 ; CHECK-LOG: sycl.specialization-constants-default-values
diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp
index d1953743f500a..00bacce06d08b 100644
--- a/llvm/tools/sycl-post-link/sycl-post-link.cpp
+++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp
@@ -307,9 +307,8 @@ std::string saveModuleIR(Module &M, int I, StringRef Suff) {
 std::string saveModuleProperties(module_split::ModuleDesc &MD,
                                  const GlobalBinImageProps &GlobProps, int I,
                                  StringRef Suff, StringRef Target = "") {
-  auto PropSet = computeModuleProperties(MD.getModule(), MD.entries(),
-                                         GlobProps, MD.Props.SpecConstsMet,
-                                         MD.isSpecConstantDefault());
+  auto PropSet =
+      computeModuleProperties(MD.getModule(), MD.entries(), GlobProps);
 
   std::string NewSuff = Suff.str();
   if (!Target.empty()) {

From 37d1d51ca8b1efa3858b2ddb64cba0563578e4ac Mon Sep 17 00:00:00 2001
From: Dounia Khaldi <dounia.khaldi@intel.com>
Date: Tue, 10 Sep 2024 10:12:27 -0500
Subject: [PATCH 03/19] [SYCL][E2E][Joint Matrix] update performance test to
 add SLM (#15229)

---
 .../joint_matrix_bf16_fill_k_cache_SLM.cpp    |  23 ++++
 .../joint_matrix_bf16_fill_k_cache_SLM.cpp    |  23 ++++
 .../joint_matrix_bf16_fill_k_cache_SLM.cpp    |  19 +++
 .../joint_matrix_bf16_fill_k_cache_SLM.cpp    |  19 +++
 .../joint_matrix_bf16_fill_k_cache_impl.hpp   | 114 ++++++++++--------
 sycl/test-e2e/Matrix/slm_utils.hpp            |  70 +++++++++++
 6 files changed, 217 insertions(+), 51 deletions(-)
 create mode 100644 sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
 create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
 create mode 100644 sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
 create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
 create mode 100644 sycl/test-e2e/Matrix/slm_utils.hpp

diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
new file mode 100644
index 0000000000000..bd22fa19354b1
--- /dev/null
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
@@ -0,0 +1,23 @@
+//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp  - DPC++ joint_matrix--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
+// REQUIRES: aspect-ext_intel_matrix, gpu
+
+// RUN: %{build} -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
+// RUN: %{run} %t_gpu_vnni.out
+
+// TODO: add row major compilation and run once Sub-group size 32
+// support becomes available in IGC for row major
+
+// -ffp-model=precise is added to not depend on compiler defaults.
+
+#include "../common.hpp"
+#define SG_SZ 32
+
+#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
new file mode 100644
index 0000000000000..62cef33b3beb7
--- /dev/null
+++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/SG32/joint_matrix_bf16_fill_k_cache_SLM.cpp
@@ -0,0 +1,23 @@
+//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp  - DPC++ joint_matrix--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
+// REQUIRES: aspect-ext_intel_matrix, gpu
+
+// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
+// RUN: %{run} %t_gpu_vnni.out
+
+// TODO: add row major compilation and run once Sub-group size 32
+// support becomes available in IGC for row major
+
+// -ffp-model=precise is added to not depend on compiler defaults.
+
+#include "../../common.hpp"
+#define SG_SZ 32
+
+#include "../../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_SLM.cpp b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
new file mode 100644
index 0000000000000..d81e7dbd685ba
--- /dev/null
+++ b/sycl/test-e2e/Matrix/SPVCooperativeMatrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
@@ -0,0 +1,19 @@
+//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp  - DPC++ joint_matrix--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: aspect-ext_intel_matrix, gpu
+
+// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
+// RUN: %{run} %t_gpu_vnni.out
+
+// RUN: %{build} -D__SPIRV_USE_COOPERATIVE_MATRIX -o %t_gpu.out -ffp-model=precise -DSLM
+// RUN: %{run} %t_gpu.out
+
+// -ffp-model=precise is added to not depend on compiler defaults.
+
+#include "../common.hpp"
+#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
new file mode 100644
index 0000000000000..a30d6320038a8
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
@@ -0,0 +1,19 @@
+//==--- joint_matrix_bf16_fill_k_cache_SLM.cpp  - DPC++ joint_matrix--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: aspect-ext_intel_matrix, gpu
+
+// RUN: %{build} -o %t_gpu_vnni.out -ffp-model=precise -DSLM -DVNNI
+// RUN: %{run} %t_gpu_vnni.out
+
+// RUN: %{build} -o %t_gpu.out -ffp-model=precise -DSLM
+// RUN: %{run} %t_gpu.out
+
+// -ffp-model=precise is added to not depend on compiler defaults.
+
+#include "common.hpp"
+#include "joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
index 1b31a993bb179..b561bd073038a 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
@@ -9,6 +9,10 @@
 #include <random>
 #include <sycl/usm.hpp>
 
+#ifdef SLM
+#include "slm_utils.hpp"
+#endif
+
 // number of test iterations
 constexpr unsigned int testIterations = 100;
 // start recording time after X iterations
@@ -51,6 +55,12 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
       std::chrono::high_resolution_clock::now();
 
   q.submit([&](handler &h) {
+#ifdef SLM
+    local_accessor<TOperand, 2> tileA{{MCache2, KCache2}, h};
+    local_accessor<TOperand, 2> tileB{
+        {KCache2 / vnniFactor, NCache2 * vnniFactor}, h};
+#endif
+
     h.parallel_for<MatMul<TM, TN, TK>>( // cache layer#1
         nd_range<2>{global, cachelocal},
         // loop global
@@ -60,15 +70,16 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
             [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif // SG_SZ
         {
+          // sg::load and sg::store expect decorations to be ON
           auto pA =
               address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::no>(A);
+                                 sycl::access::decorated::yes>(A);
           auto pB =
               address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::no>(B);
+                                 sycl::access::decorated::yes>(B);
           auto pC =
               address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::no>(C);
+                                 sycl::access::decorated::yes>(C);
           auto m2 = it.get_group(0);
           auto n2 = it.get_group(1);
           auto m1 = it.get_local_id(0);
@@ -112,7 +123,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
                 colsA, layout::row_major,
                 syclex::properties{syclex::prefetch_hint_L1});
 
-#ifdef VNNI
           for (int p = 0; p < prefDistance; p++)
             joint_matrix_prefetch<prefRow, prefCol>(
                 sg,
@@ -122,15 +132,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
                     (n2 * NCache2 * vnniFactor + pn1B * prefCol),
                 colsB * vnniFactor, layout::row_major,
                 syclex::properties{syclex::prefetch_hint_L1});
-#else  // VNNI
-          for (int p = 0; p < prefDistance; p++)
-            joint_matrix_prefetch<prefRow, prefCol>(
-                sg,
-                B + (p * KCache2 + pm1B * prefRow) * colsB + n2 * NCache2 +
-                    pn1B * prefCol,
-                colsB, layout::row_major,
-                syclex::properties{syclex::prefetch_hint_L1});
-#endif // VNNI
 #endif // PREFETCH
 
           joint_matrix<sub_group, TResult, use::accumulator, TM, TN>
@@ -157,7 +158,16 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
           }
 #endif // MANUAL_UNROLL
 
+#ifdef SLM
+          constexpr unsigned int SGs =
+              (MCache2 / MCache1) * (NCache2 / NCache1);
+#endif // SLM
           for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) {
+#ifdef SLM
+            slm_read_write<colsA, colsB, MCache2, NCache2, KCache2, vnniFactor,
+                           SGs>(pA, pB, tileA, tileB, sg, k2, m2, n2, sgSize);
+            it.barrier(access::fence_space::local_space);
+#endif // SLM
             joint_matrix<sub_group, TOperand, use::a, TM, TK, layout::row_major>
                 tA[MCache1 / TM][KCache2 / KCache1]
 #ifdef INIT_LIST
@@ -192,6 +202,14 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
 #else  // MANUAL_UNROLL
               for (unsigned int m = 0; m < MCache1 / TM; m++) {
 #endif // MANUAL_UNROLL
+#ifdef SLM
+                joint_matrix_load(sg, tA[m][k1],
+                                  tileA.template get_multi_ptr<
+                                      sycl::access::decorated::no>() +
+                                      (m1 * MCache1 + m * TM) * KCache2 +
+                                      k1 * TK,
+                                  KCache2);
+#else // SLM
 #ifdef OOB
                 ext::intel::experimental::matrix::joint_matrix_load_checked(
                     sg, tA[m][k1], pA, colsA, rowsA, colsA,
@@ -203,6 +221,7 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
                         k * TK,
                     colsA);
 #endif // OOB
+#endif // SLM
 #ifdef MANUAL_UNROLL
               }); // m
 #else             // MANUAL_UNROLL
@@ -213,32 +232,28 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
 #else  // MANUAL_UNROLL
               for (unsigned int n = 0; n < NCache1 / TN; n++) {
 #endif // MANUAL_UNROLL
+#ifdef SLM
+                joint_matrix_load(sg, tB[n][k1],
+                                  tileB.template get_multi_ptr<
+                                      sycl::access::decorated::no>() +
+                                      (k1 * TK / vnniFactor) *
+                                          (NCache2 * vnniFactor) +
+                                      (n1 * NCache1 + n * TN) * vnniFactor,
+                                  NCache2 * vnniFactor);
+#else // SLM
 #ifdef OOB
-#ifdef VNNI
                 ext::intel::experimental::matrix::joint_matrix_load_checked(
                     sg, tB[n][k1], pB, colsB * vnniFactor, rowsB / vnniFactor,
                     colsB * vnniFactor, k * TK / vnniFactor,
                     (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor);
-#else // VNNI
-                ext::intel::experimental::matrix::joint_matrix_load_checked(
-                    sg, tB[n][k1], pB, colsB, rowsB, colsB, k * TK,
-                    n2 * NCache2 + n1 * NCache1 + n * TN);
-
-#endif // VNNI
 #else  // OOB
-#ifdef VNNI
                 joint_matrix_load(
                     sg, tB[n][k1],
                     pB + (k * TK / vnniFactor) * (colsB * vnniFactor) +
                         (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor,
                     colsB * vnniFactor);
-#else  // VNNI
-                joint_matrix_load(sg, tB[n][k1],
-                                  pB + (k * TK) * (colsB) +
-                                      (n2 * NCache2 + n1 * NCache1 + n * TN),
-                                  colsB);
-#endif // VNNI
 #endif // OOB
+#endif // SLM
 #ifdef MANUAL_UNROLL
               }); // n
 #else             // MANUAL_UNROLL
@@ -266,6 +281,9 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
               } // m
             } // k1
 #endif              // MANUAL_UNROLL
+#ifdef SLM
+            it.barrier(access::fence_space::local_space);
+#endif // SLM
 #ifdef PREFETCH
             auto prefetch_offsetA = (m2 * MCache2 + sgId * prefRow) * colsA +
                                     (k2 + prefDistance) * prefCol;
@@ -275,7 +293,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
                   sg, A + prefetch_offsetA, colsA, layout::row_major,
                   syclex::properties{syclex::prefetch_hint_L1});
 
-#ifdef VNNI
             auto prefetch_offsetB =
                 ((k2 + prefDistance) * (KCache2 / vnniFactor) +
                  pm1B * prefRow) *
@@ -287,16 +304,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
                   sg, B + prefetch_offsetB, colsB * vnniFactor,
                   layout::row_major,
                   syclex::properties{syclex::prefetch_hint_L1});
-#else  // VNNI
-            auto prefetch_offsetB =
-                ((k2 + prefDistance) * KCache2 + pm1B * prefRow) * (colsB) +
-                (n2 * NCache2 + pn1B * prefCol);
-            if ((prefetch_offsetB + (prefRow * MATRIX_SIZE) + prefCol) <
-                (MATRIX_SIZE * MATRIX_SIZE))
-              joint_matrix_prefetch<prefRow, prefCol>(
-                  sg, B + prefetch_offsetB, colsB, layout::row_major,
-                  syclex::properties{syclex::prefetch_hint_L1});
-#endif // VNNI
 #endif // PREFETCH
           } // for k2
 #ifdef MANUAL_UNROLL
@@ -411,29 +418,33 @@ int main() {
   constexpr size_t NCache2 = 256;
   constexpr size_t KCache2 = 32;
 
+#ifdef VNNI
+  constexpr unsigned int VnniFactor = 2;
+#else  // VNNI
+  constexpr unsigned int VnniFactor = 1;
+#endif // VNNI
+
   for (unsigned int i = 0; i < combinations.size(); i++) {
     if (combinations[i].nsize == 0) { // Intel AMX
       constexpr size_t NCache1 = 32;
       constexpr size_t KCache1 = 32;
-
-      test<bfloat16, float, 2, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32, MCache1,
-           NCache1, KCache1, MCache2, NCache2, KCache2>();
+      test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
       break;
     }
 
     if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
       constexpr size_t NCache1 = 4 * /*TN*/ 16;
       constexpr size_t KCache1 = 16;
-
-      test<bfloat16, float, 2, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1, NCache1,
-           KCache1, MCache2, NCache2, KCache2>();
+      test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1,
+           NCache1, KCache1, MCache2, NCache2, KCache2>();
 #if (!defined(SG_SZ) || SG_SZ != 32)
       // These combination are not currently supported for subgroup size = 32 in
       // IGC
-      test<bfloat16, float, 2, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16, MCache1,
-           NCache1, KCache1, MCache2, NCache2, KCache2>();
-      test<bfloat16, float, 2, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16, MCache1,
-           NCache1, KCache1, MCache2, NCache2, KCache2>();
+      test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
+      test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
 #endif
       break;
     }
@@ -442,9 +453,10 @@ int main() {
       constexpr size_t NCache1 = 4 * /*TN*/ 8;
       constexpr size_t KCache1 = 16;
 
-      test<bfloat16, float, 2, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1, NCache1,
-           KCache1, MCache2, NCache2, KCache2>();
-      // test<bfloat16, float, 2, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16, MCache1,
+      test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1,
+           NCache1, KCache1, MCache2, NCache2, KCache2>();
+      // test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16,
+      // MCache1,
       //      NCache1, KCache1, MCache2, NCache2, KCache2>();
       break;
     }
diff --git a/sycl/test-e2e/Matrix/slm_utils.hpp b/sycl/test-e2e/Matrix/slm_utils.hpp
new file mode 100644
index 0000000000000..28ac1264b8cc0
--- /dev/null
+++ b/sycl/test-e2e/Matrix/slm_utils.hpp
@@ -0,0 +1,70 @@
+//==------------------ slm_utils.hpp  - DPC++ joint_matrix------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+template <unsigned int colsA, unsigned int colsB, unsigned int MCache2,
+          unsigned int NCache2, unsigned int KCache2, unsigned int vnniFactor,
+          size_t SGs, typename TOperand, access::address_space space>
+inline void
+slm_read_write(multi_ptr<TOperand, space, access::decorated::yes> pA,
+               multi_ptr<TOperand, space, access::decorated::yes> pB,
+               local_accessor<TOperand, 2> tileA,
+               local_accessor<TOperand, 2> tileB, sub_group sg, unsigned int k2,
+               size_t m2, size_t n2, size_t sgSize) {
+  // Number of elements to be loaded into SLM per WI
+  size_t elemsPerLoadA = KCache2 / sgSize;
+  for (int i = 0; i < MCache2 / SGs; i++) {
+    size_t GlOffsetA =
+        (m2 * MCache2 + sg.get_group_id() * (MCache2 / SGs) + i) * colsA +
+        k2 * KCache2;
+    size_t LocOffsetA = (sg.get_group_id() * (MCache2 / SGs) + i) * KCache2;
+
+    if (elemsPerLoadA == 2) {
+      vec<TOperand, 2> slmVecA = sg.load<2>(pA + GlOffsetA);
+      sg.store<2>(tileA.template get_multi_ptr<sycl::access::decorated::no>() +
+                      LocOffsetA,
+                  slmVecA);
+    } else if (elemsPerLoadA == 4) {
+      vec<TOperand, 4> slmVecA = sg.load<4>(pA + GlOffsetA);
+      sg.store<4>(tileA.template get_multi_ptr<sycl::access::decorated::no>() +
+                      LocOffsetA,
+                  slmVecA);
+    } else if (elemsPerLoadA == 1) {
+      TOperand slmScaA = sg.load(pA + GlOffsetA);
+
+      sg.store(tileA.template get_multi_ptr<sycl::access::decorated::no>() +
+                   LocOffsetA,
+               slmScaA);
+    } else
+      assert(elemsPerLoadA == 1 || elemsPerLoadA == 2 || elemsPerLoadA == 4);
+  }
+  // how much each SG will load to SLM --> has to be contiguous
+  // NCache2*KCache2/(SGs*SG_SIZE) = 16
+  size_t elemsPerLoadB = NCache2 * KCache2 / (SGs * sgSize);
+  size_t sgsPerRow = (NCache2 * vnniFactor) / (elemsPerLoadB * sgSize);
+  size_t GlOffsetB =
+      (k2 * (KCache2 / vnniFactor) + (uint)(sg.get_group_id() / sgsPerRow)) *
+          (colsB * vnniFactor) +
+      n2 * NCache2 * vnniFactor +
+      (sg.get_group_id() % sgsPerRow) * (elemsPerLoadB * sgSize);
+  size_t LocOffsetB =
+      ((uint)(sg.get_group_id() / sgsPerRow)) * NCache2 * vnniFactor +
+      (sg.get_group_id() % sgsPerRow) * elemsPerLoadB * sgSize;
+  if (elemsPerLoadB == 16) {
+    vec<TOperand, 16> slmVecB = sg.load<16>(pB + GlOffsetB);
+
+    sg.store<16>(tileB.template get_multi_ptr<sycl::access::decorated::no>() +
+                     LocOffsetB,
+                 slmVecB);
+  } else if (elemsPerLoadB == 8) {
+    vec<TOperand, 8> slmVecB = sg.load<8>(pB + GlOffsetB);
+
+    sg.store<8>(tileB.template get_multi_ptr<sycl::access::decorated::no>() +
+                    LocOffsetB,
+                slmVecB);
+  } else
+    assert(elemsPerLoadB == 8 || elemsPerLoadB == 16);
+}

From 3b0be29bd8629c84fb151d08355b848d74db9373 Mon Sep 17 00:00:00 2001
From: Chris Perkins <chris.perkins@intel.com>
Date: Tue, 10 Sep 2024 09:50:23 -0700
Subject: [PATCH 04/19] [SYCL] re-enable Scheduler StreamBufferDeallocation
 unit test (#15292)

A month ago, we had a strange intermittent failure of one of the unit
tests: https://github.com/intel/llvm/issues/15049 and in response to
that we disabled the test. It is unclear why that might have been
failing.

But that was when there were some other disrupting changes, and fallout
from that has since been fixed.


I have retested this on Windows for 100,000 iterations, and not a single
failure seen. Tested on the CI, ran successfully there too.

I'd like to reenable this test with this PR. If the CI system starts
failing on PR, we can just revert this PR in coming days.
---
 sycl/unittests/scheduler/GraphCleanup.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/sycl/unittests/scheduler/GraphCleanup.cpp b/sycl/unittests/scheduler/GraphCleanup.cpp
index c0e1dc136a2d8..6f628f99a84e9 100644
--- a/sycl/unittests/scheduler/GraphCleanup.cpp
+++ b/sycl/unittests/scheduler/GraphCleanup.cpp
@@ -307,12 +307,7 @@ struct AttachSchedulerWrapper {
 };
 
 // Check that stream buffers are released alongside graph cleanup.
-// https://github.com/intel/llvm/issues/15049
-#ifdef _WIN32
-TEST_F(SchedulerTest, DISABLED_StreamBufferDeallocation) {
-#else
 TEST_F(SchedulerTest, StreamBufferDeallocation) {
-#endif
   unittest::UrMock<> Mock;
   platform Plt = sycl::platform();
   context Ctx{Plt};

From 40e2f623fed1df92206d78982ea72d13bb211c26 Mon Sep 17 00:00:00 2001
From: Udit Agarwal <udit.agarwal@intel.com>
Date: Tue, 10 Sep 2024 14:38:58 -0700
Subject: [PATCH 05/19] Revert '[CI] pre-commit/aws pointed back to old image'
 (#15348)

Reverts https://github.com/intel/llvm/pull/14074. This will allow us to
use latest docker images.
---
 .github/workflows/sycl-linux-precommit-aws.yml | 2 +-
 .github/workflows/sycl-linux-precommit.yml     | 1 -
 .github/workflows/sycl-post-commit.yml         | 2 +-
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/sycl-linux-precommit-aws.yml b/.github/workflows/sycl-linux-precommit-aws.yml
index 19c21b6eec977..07d309900d764 100644
--- a/.github/workflows/sycl-linux-precommit-aws.yml
+++ b/.github/workflows/sycl-linux-precommit-aws.yml
@@ -66,7 +66,7 @@ jobs:
     with:
       name: CUDA E2E
       runner: '["aws_cuda-${{ github.event.workflow_run.id }}-${{ github.event.workflow_run.run_attempt }}"]'
-      image: ghcr.io/intel/llvm/ubuntu2204_build:latest-0300ac924620a51f76c4929794637b82790f12ab
+      image: ghcr.io/intel/llvm/ubuntu2204_build:latest
       image_options: -u 1001 --gpus all --cap-add SYS_ADMIN --env NVIDIA_DISABLE_REQUIRE=1
       target_devices: ext_oneapi_cuda:gpu
       # No idea why but that seems to work and be in sync with the main
diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml
index b4a38a2cef24e..460e1737b194a 100644
--- a/.github/workflows/sycl-linux-precommit.yml
+++ b/.github/workflows/sycl-linux-precommit.yml
@@ -46,7 +46,6 @@ jobs:
       build_artifact_suffix: "default"
       build_cache_suffix: "default"
       changes: ${{ needs.detect_changes.outputs.filters }}
-      build_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest-0300ac924620a51f76c4929794637b82790f12ab"
 
   determine_arc_tests:
     name: Decide which Arc tests to run
diff --git a/.github/workflows/sycl-post-commit.yml b/.github/workflows/sycl-post-commit.yml
index 29b29f9891a31..d67edc83a050c 100644
--- a/.github/workflows/sycl-post-commit.yml
+++ b/.github/workflows/sycl-post-commit.yml
@@ -60,7 +60,7 @@ jobs:
             reset_intel_gpu: true
           - name: AMD/HIP
             runner: '["Linux", "amdgpu"]'
-            image: ghcr.io/intel/llvm/ubuntu2204_build:latest-0300ac924620a51f76c4929794637b82790f12ab
+            image: ghcr.io/intel/llvm/ubuntu2204_build:latest
             image_options: -u 1001 --device=/dev/dri --device=/dev/kfd
             target_devices: ext_oneapi_hip:gpu
             reset_intel_gpu: false

From 811db848c397c4e2403e37f2b02fc28e167b56d0 Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Tue, 10 Sep 2024 18:22:14 -0700
Subject: [PATCH 06/19] [SYCL] Fix handling of interop events for barrier with
 waitlist (#15352)

Currently Command::getUrEventsBlocking is responsible for preparing a
waitlist of UR events for the barrier.
This method used wrong assumption that if isEnqueued() returns false for
the event then it doesn't have UR handle because it was not enqueued. So
if there is an associated command we would enqueue it to get the desired
UR handle, or we would just ignore this event if there is no associated
command.

Problem is that sycl::event created with interoperability constructor
has isEnqueued() as false (as it is not enqueued by SYCL RT) but it has
UR handle provided by user.
Before this patch we just ignored such event as it doesn't have
associated command and we didn't put it to the resulting list.

This patch fixes this problem by handling interop events properly in
this code path.
---
 sycl/source/detail/event_impl.hpp             |  7 +++
 sycl/source/detail/scheduler/commands.cpp     |  9 ++--
 .../barrier_waitlist_with_interop_event.cpp   | 47 +++++++++++++++++++
 3 files changed, 60 insertions(+), 3 deletions(-)
 create mode 100644 sycl/test-e2e/Regression/barrier_waitlist_with_interop_event.cpp

diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp
index b560d721728a6..312bb589760b7 100644
--- a/sycl/source/detail/event_impl.hpp
+++ b/sycl/source/detail/event_impl.hpp
@@ -329,6 +329,13 @@ class event_impl {
 
   bool isProfilingTagEvent() const noexcept { return MProfilingTagEvent; }
 
+  // Check if this event is an interoperability event.
+  bool isInterop() const noexcept {
+    // As an indication of interoperability event, we use the absence of the
+    // queue and command, as well as the fact that it is not in enqueued state.
+    return MEvent && MQueue.expired() && !MIsEnqueued && !MCommand;
+  }
+
 protected:
   // When instrumentation is enabled emits trace event for event wait begin and
   // returns the telemetry event generated for the wait
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 953ad2bee0444..c5e8fc2c3a2cd 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -265,9 +265,12 @@ std::vector<ur_event_handle_t> Command::getUrEventsBlocking(
     if (EventImpl->isDefaultConstructed() || EventImpl->isHost() ||
         EventImpl->isNOP())
       continue;
-    // In this path nullptr native event means that the command has not been
-    // enqueued. It may happen if async enqueue in a host task is involved.
-    if (!EventImpl->isEnqueued()) {
+
+    // If command has not been enqueued then we have to enqueue it.
+    // It may happen if async enqueue in a host task is involved.
+    // Interoperability events are special cases and they are not enqueued, as
+    // they don't have an associated queue and command.
+    if (!EventImpl->isInterop() && !EventImpl->isEnqueued()) {
       if (!EventImpl->getCommand() ||
           !static_cast<Command *>(EventImpl->getCommand())->producesPiEvent())
         continue;
diff --git a/sycl/test-e2e/Regression/barrier_waitlist_with_interop_event.cpp b/sycl/test-e2e/Regression/barrier_waitlist_with_interop_event.cpp
new file mode 100644
index 0000000000000..f5a54f1a67dc2
--- /dev/null
+++ b/sycl/test-e2e/Regression/barrier_waitlist_with_interop_event.cpp
@@ -0,0 +1,47 @@
+// REQUIRES: level_zero, level_zero_dev_kit
+// RUN: %{build} %level_zero_options -o %t.out
+// RUN: %{run} %t.out
+// UNSUPPORTED: ze_debug
+
+#include <level_zero/ze_api.h>
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/oneapi/backend/level_zero.hpp>
+#include <sycl/usm.hpp>
+
+// Test checks the case when an interoperability event is passed as a dependency
+// to the barrier. In such case, waiting for the event produced by barrier must
+// guarantee completion of the interoperability event.
+
+using namespace sycl;
+
+int main() {
+  sycl::queue Queue;
+  if (!Queue.get_device().get_info<info::device::usm_shared_allocations>())
+    return 0;
+
+  const size_t N = 1024;
+  int *Data = sycl::malloc_shared<int>(N, Queue);
+  auto FillEvent = Queue.fill(Data, 0, N);
+  auto FillZeEvent = get_native<backend::ext_oneapi_level_zero>(FillEvent);
+
+  backend_input_t<backend::ext_oneapi_level_zero, event> EventInteropInput = {
+      FillZeEvent};
+  EventInteropInput.Ownership = sycl::ext::oneapi::level_zero::ownership::keep;
+  auto EventInterop = make_event<backend::ext_oneapi_level_zero>(
+      EventInteropInput, Queue.get_context());
+
+  auto BarrierEvent = Queue.ext_oneapi_submit_barrier({EventInterop});
+  BarrierEvent.wait();
+
+  if (EventInterop.get_info<sycl::info::event::command_execution_status>() !=
+      sycl::info::event_command_status::complete) {
+    Queue.wait();
+    sycl::free(Data, Queue);
+    return -1;
+  }
+
+  // Free the USM memory
+  sycl::free(Data, Queue);
+
+  return 0;
+}

From 098416abc713fef0ef83d3bee4d020c52830d674 Mon Sep 17 00:00:00 2001
From: Justin Cai <justin.cai@intel.com>
Date: Wed, 11 Sep 2024 02:10:37 -0700
Subject: [PATCH 07/19] [SYCL] Fix image selection for AOT on intel_cpu_{spr,
 gnr} (#15208)

When AOT compiling for cpu, the generic `spir64_x86_64` target is used
with `-fsycl-targets`. intel/llvm#14909,
functionality was added to select device images based on their
`compile_target` property in the image. The selection mechanism had to
consider CPU as a special case due to not having explicit targets.
However, the mechanism only considered `x86_64` and not `intel_cpu_spr`
or `intel_cpu_gnr`; therefore on a `intel_cpu_spr` or `intel_cpu_gnr`
device, trying to launch a program compiled with
`-fsycl-targets=spir64_x86_64`, device image selection would fail to
find an image (and thus fail to launch any kernels).

This PR updates the logic to include `intel_cpu_spr` and
`intel_cpu_gnr`. Note: for tests, this functionality is checked by any
test that AOT compiled for CPU and launches a kernel (includes
[AOT/cpu.cpp](https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/AOT/cpu.cpp),
[AOT/double.cpp](https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/AOT/double.cpp),
[AOT/half.cpp](https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/AOT/half.cpp)).
---
 .../program_manager/program_manager.cpp       | 10 ++-
 .../program_manager/CompileTarget.cpp         | 80 +++++++++++++------
 2 files changed, 62 insertions(+), 28 deletions(-)

diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index 99f3c5204dc74..d6f063e5fada6 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -1321,9 +1321,15 @@ RTDeviceBinaryImage *getBinImageFromMultiMap(
         reinterpret_cast<const char *>(&CompileTargetByteArray[0]),
         CompileTargetByteArray.size());
     // Note: there are no explicit targets for CPUs, so on x86_64,
-    // so we use a spir64_x86_64 compile target image.
+    // intel_cpu_spr, and intel_cpu_gnr, we use a spir64_x86_64
+    // compile target image.
+    // TODO: When dedicated targets for CPU are added, (i.e.
+    // -fsycl-targets=intel_cpu_spr etc.) remove this special
+    // handling of CPU targets.
     if ((ArchName == CompileTarget) ||
-        (ArchName == "x86_64" && CompileTarget == "spir64_x86_64")) {
+        (CompileTarget == "spir64_x86_64" &&
+         (ArchName == "x86_64" || ArchName == "intel_cpu_spr" ||
+          ArchName == "intel_cpu_gnr"))) {
       AddImg();
     }
   }
diff --git a/sycl/unittests/program_manager/CompileTarget.cpp b/sycl/unittests/program_manager/CompileTarget.cpp
index 095a4af2a996c..281530a8e43e4 100644
--- a/sycl/unittests/program_manager/CompileTarget.cpp
+++ b/sycl/unittests/program_manager/CompileTarget.cpp
@@ -97,20 +97,25 @@ static sycl::unittest::UrImage Img[] = {
 
 static sycl::unittest::UrImageArray<std::size(Img)> ImgArray{Img};
 
-ur_device_handle_t MockSklDeviceHandle =
-    reinterpret_cast<ur_device_handle_t>(1);
-ur_device_handle_t MockPvcDeviceHandle =
-    reinterpret_cast<ur_device_handle_t>(2);
-ur_device_handle_t MockX86DeviceHandle =
-    reinterpret_cast<ur_device_handle_t>(3);
-constexpr int SklIp = 0x02400009;
-constexpr int PvcIp = 0x030f0000;
-constexpr int X86Ip = 0;
-
-ur_device_handle_t MockDevices[] = {
-    MockSklDeviceHandle,
-    MockPvcDeviceHandle,
-    MockX86DeviceHandle,
+struct MockDeviceData {
+  int Ip;
+  ur_device_type_t DeviceType;
+  ur_device_handle_t getHandle() {
+    return reinterpret_cast<ur_device_handle_t>(this);
+  }
+  static MockDeviceData *fromHandle(ur_device_handle_t handle) {
+    return reinterpret_cast<MockDeviceData *>(handle);
+  }
+};
+
+// IP are from IntelGPUArchitectures/IntelCPUArchitectures in
+// sycl/source/detail/device_info.hpp
+MockDeviceData MockDevices[] = {
+    {0x02400009, UR_DEVICE_TYPE_GPU}, // Skl
+    {0x030f0000, UR_DEVICE_TYPE_GPU}, // Pvc
+    {0, UR_DEVICE_TYPE_CPU},          // X86
+    {8, UR_DEVICE_TYPE_CPU},          // Spr
+    {9, UR_DEVICE_TYPE_CPU},          // Gnr
 };
 
 static ur_result_t redefinedDeviceGet(void *pParams) {
@@ -123,7 +128,7 @@ static ur_result_t redefinedDeviceGet(void *pParams) {
   if (*params.pphDevices) {
     assert(*params.pNumEntries <= std::size(MockDevices));
     for (uint32_t i = 0; i < *params.pNumEntries; ++i) {
-      (*params.pphDevices)[i] = MockDevices[i];
+      (*params.pphDevices)[i] = MockDevices[i].getHandle();
     }
   }
 
@@ -149,27 +154,22 @@ static ur_result_t redefinedDeviceGetInfo(void *pParams) {
   auto params = *static_cast<ur_device_get_info_params_t *>(pParams);
   if (*params.ppropName == UR_DEVICE_INFO_IP_VERSION && *params.ppPropValue) {
     int &ret = *static_cast<int *>(*params.ppPropValue);
-    if (*params.phDevice == MockSklDeviceHandle)
-      ret = SklIp;
-    if (*params.phDevice == MockPvcDeviceHandle)
-      ret = PvcIp;
-    if (*params.phDevice == MockX86DeviceHandle)
-      ret = X86Ip;
+    ret = MockDeviceData::fromHandle(*params.phDevice)->Ip;
   }
-  if (*params.ppropName == UR_DEVICE_INFO_TYPE &&
-      *params.phDevice == MockX86DeviceHandle) {
+  if (*params.ppropName == UR_DEVICE_INFO_TYPE) {
     if (*params.ppPropValue)
       *static_cast<ur_device_type_t *>(*params.ppPropValue) =
-          UR_DEVICE_TYPE_CPU;
+          MockDeviceData::fromHandle(*params.phDevice)->DeviceType;
     if (*params.ppPropSizeRet)
-      **params.ppPropSizeRet = sizeof(UR_DEVICE_TYPE_CPU);
+      **params.ppPropSizeRet = sizeof(ur_device_type_t);
   }
   return UR_RESULT_SUCCESS;
 }
 
 static ur_result_t redefinedDeviceSelectBinary(void *pParams) {
   auto params = *static_cast<ur_device_select_binary_params_t *>(pParams);
-  auto target = *params.phDevice == MockX86DeviceHandle
+  auto target = MockDeviceData::fromHandle(*params.phDevice)->DeviceType ==
+                        UR_DEVICE_TYPE_CPU
                     ? UR_DEVICE_BINARY_TARGET_SPIRV64_X86_64
                     : UR_DEVICE_BINARY_TARGET_SPIRV64_GEN;
   uint32_t fallback = *params.pNumBinaries;
@@ -246,6 +246,16 @@ TEST_F(CompileTargetTest, SingleTask) {
   checkUsedImageWithCompileTarget("spir64_x86_64", [&]() {
     launchSingleTaskKernel(queue{archSelector(syclex::architecture::x86_64)});
   });
+
+  checkUsedImageWithCompileTarget("spir64_x86_64", [&]() {
+    launchSingleTaskKernel(
+        queue{archSelector(syclex::architecture::intel_cpu_spr)});
+  });
+
+  checkUsedImageWithCompileTarget("spir64_x86_64", [&]() {
+    launchSingleTaskKernel(
+        queue{archSelector(syclex::architecture::intel_cpu_gnr)});
+  });
 }
 
 void launchNDRangeKernel(queue q) {
@@ -268,6 +278,16 @@ TEST_F(CompileTargetTest, NDRangeKernel) {
   checkUsedImageWithCompileTarget("spir64_x86_64", [&]() {
     launchNDRangeKernel(queue{archSelector(syclex::architecture::x86_64)});
   });
+
+  checkUsedImageWithCompileTarget("spir64_x86_64", [&]() {
+    launchNDRangeKernel(
+        queue{archSelector(syclex::architecture::intel_cpu_spr)});
+  });
+
+  checkUsedImageWithCompileTarget("spir64_x86_64", [&]() {
+    launchNDRangeKernel(
+        queue{archSelector(syclex::architecture::intel_cpu_gnr)});
+  });
 }
 
 void launchRangeKernel(queue q) {
@@ -288,6 +308,14 @@ TEST_F(CompileTargetTest, RangeKernel) {
   checkUsedImageWithCompileTarget("spir64_x86_64", [&]() {
     launchRangeKernel(queue{archSelector(syclex::architecture::x86_64)});
   });
+
+  checkUsedImageWithCompileTarget("spir64_x86_64", [&]() {
+    launchRangeKernel(queue{archSelector(syclex::architecture::intel_cpu_spr)});
+  });
+
+  checkUsedImageWithCompileTarget("spir64_x86_64", [&]() {
+    launchRangeKernel(queue{archSelector(syclex::architecture::intel_cpu_gnr)});
+  });
 }
 
 TEST_F(CompileTargetTest, NoDeviceKernel) {

From 729d6f62ff2e0676f01d58d256392bd94e7b71f7 Mon Sep 17 00:00:00 2001
From: Steffen Larsen <steffen.larsen@intel.com>
Date: Wed, 11 Sep 2024 11:18:37 +0200
Subject: [PATCH 08/19] [SYCL][E2E] Fix online compiler test for accelerator
 (#15270)

This commit fixes an issue where the online_compiler_OpenCL test would
assume that the clCreateProgramWithIL function is supported by all
OpenCL targets, while it was only supported with >=2.1 and had an
extension in older versions. This also enables the test for accelerator.

---------

Signed-off-by: Larsen, Steffen <steffen.larsen@intel.com>
---
 .../OnlineCompiler/online_compiler_L0.cpp     |  6 +-
 .../OnlineCompiler/online_compiler_OpenCL.cpp | 81 +++++++++++++++++--
 .../OnlineCompiler/online_compiler_common.hpp |  7 ++
 3 files changed, 88 insertions(+), 6 deletions(-)

diff --git a/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp b/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp
index fc92d8957ef31..64a902bcbb1a4 100644
--- a/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp
+++ b/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp
@@ -7,8 +7,8 @@
 // All Level-Zero specific code is kept here and the common part that can be
 // re-used by other backends is kept in online_compiler_common.hpp file.
 
-#include <sycl/ext/intel/experimental/online_compiler.hpp>
 #include <sycl/detail/core.hpp>
+#include <sycl/ext/intel/experimental/online_compiler.hpp>
 
 #include <vector>
 
@@ -20,6 +20,10 @@
 using byte = unsigned char;
 
 #ifdef RUN_KERNELS
+bool testSupported(sycl::queue &Queue) {
+  return Queue.get_backend() == sycl::backend::ext_oneapi_level_zero;
+}
+
 sycl::kernel getSYCLKernelWithIL(sycl::queue &Queue,
                                  const std::vector<byte> &IL) {
 
diff --git a/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp b/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp
index 8a0d67d3b02c3..161eb65478653 100644
--- a/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp
+++ b/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp
@@ -1,5 +1,4 @@
 // REQUIRES: opencl, opencl_icd, cm-compiler
-// UNSUPPORTED: accelerator
 
 // RUN: %{build} -Wno-error=deprecated-declarations -DRUN_KERNELS %opencl_lib -o %t.out
 // RUN: %{run} %t.out
@@ -18,14 +17,86 @@
 using byte = unsigned char;
 
 #ifdef RUN_KERNELS
+std::tuple<unsigned long, unsigned long> GetOCLVersion(sycl::device Device) {
+  cl_int Err;
+  cl_device_id ClDevice = sycl::get_native<sycl::backend::opencl>(Device);
+
+  size_t VersionSize = 0;
+  Err = clGetDeviceInfo(ClDevice, CL_DEVICE_VERSION, 0, nullptr, &VersionSize);
+  assert(Err == CL_SUCCESS);
+
+  std::string Version(VersionSize, '\0');
+  Err = clGetDeviceInfo(ClDevice, CL_DEVICE_VERSION, VersionSize,
+                        Version.data(), nullptr);
+  assert(Err == CL_SUCCESS);
+
+  std::string_view Prefix = "OpenCL ";
+  size_t VersionBegin = Version.find_first_of(" ");
+  size_t VersionEnd = Version.find_first_of(" ", VersionBegin + 1);
+  size_t VersionSeparator = Version.find_first_of(".", VersionBegin + 1);
+
+  bool HaveOCLPrefix =
+      std::equal(Prefix.begin(), Prefix.end(), Version.begin());
+
+  assert(HaveOCLPrefix && VersionBegin != std::string::npos &&
+         VersionEnd != std::string::npos &&
+         VersionSeparator != std::string::npos);
+
+  std::string VersionMajor{Version.begin() + VersionBegin + 1,
+                           Version.begin() + VersionSeparator};
+  std::string VersionMinor{Version.begin() + VersionSeparator + 1,
+                           Version.begin() + VersionEnd};
+
+  unsigned long OCLMajor = strtoul(VersionMajor.c_str(), nullptr, 10);
+  unsigned long OCLMinor = strtoul(VersionMinor.c_str(), nullptr, 10);
+
+  assert(OCLMajor > 0 && (OCLMajor > 2 || OCLMinor <= 2) &&
+         OCLMajor != UINT_MAX && OCLMinor != UINT_MAX);
+
+  return std::make_tuple(OCLMajor, OCLMinor);
+}
+
+bool testSupported(sycl::queue &Queue) {
+  if (Queue.get_backend() != sycl::backend::opencl)
+    return false;
+
+  sycl::device Device = Queue.get_device();
+  auto [OCLMajor, OCLMinor] = GetOCLVersion(Device);
+
+  // Creating a program from IL is only supported on >=2.1 or if
+  // cl_khr_il_program is supported on the device.
+  return (OCLMajor == 2 && OCLMinor >= 1) || OCLMajor > 2 ||
+         Device.has_extension("cl_khr_il_program");
+}
+
 sycl::kernel getSYCLKernelWithIL(sycl::queue &Queue,
                                  const std::vector<byte> &IL) {
   sycl::context Context = Queue.get_context();
 
-  cl_int Err;
-  cl_program ClProgram =
-      clCreateProgramWithIL(sycl::get_native<sycl::backend::opencl>(Context),
-                            IL.data(), IL.size(), &Err);
+  cl_int Err = 0;
+  cl_program ClProgram = 0;
+
+  sycl::device Device = Queue.get_device();
+  auto [OCLMajor, OCLMinor] = GetOCLVersion(Device);
+  if ((OCLMajor == 2 && OCLMinor >= 1) || OCLMajor > 2) {
+    // clCreateProgramWithIL is supported if OCL version >=2.1.
+    ClProgram =
+        clCreateProgramWithIL(sycl::get_native<sycl::backend::opencl>(Context),
+                              IL.data(), IL.size(), &Err);
+  } else {
+    // Fall back to using extension function for building IR.
+    using ApiFuncT =
+        cl_program(CL_API_CALL *)(cl_context, const void *, size_t, cl_int *);
+    ApiFuncT FuncPtr =
+        reinterpret_cast<ApiFuncT>(clGetExtensionFunctionAddressForPlatform(
+            sycl::get_native<sycl::backend::opencl>(Context.get_platform()),
+            "clCreateProgramWithILKHR"));
+
+    assert(FuncPtr != nullptr);
+
+    ClProgram = FuncPtr(sycl::get_native<sycl::backend::opencl>(Context),
+                        IL.data(), IL.size(), &Err);
+  }
   assert(Err == CL_SUCCESS);
 
   Err = clBuildProgram(ClProgram, 0, nullptr, nullptr, nullptr, nullptr);
diff --git a/sycl/test-e2e/OnlineCompiler/online_compiler_common.hpp b/sycl/test-e2e/OnlineCompiler/online_compiler_common.hpp
index 505802d11f3b1..b585126f95674 100644
--- a/sycl/test-e2e/OnlineCompiler/online_compiler_common.hpp
+++ b/sycl/test-e2e/OnlineCompiler/online_compiler_common.hpp
@@ -53,6 +53,13 @@ int main(int argc, char **argv) {
   sycl::queue Q;
   sycl::device Device = Q.get_device();
 
+#ifdef RUN_KERNELS
+  if (!testSupported(Q)) {
+    std::cout << "Building for IL is not supported. Skipping!" << std::endl;
+    return 0;
+  }
+#endif
+
   { // Compile and run a trivial OpenCL kernel.
     std::cout << "Test case1\n";
     sycl::ext::intel::experimental::online_compiler<

From 81aacfa9af9b99fb6658e4b906c509968da18e43 Mon Sep 17 00:00:00 2001
From: Georgi Mirazchiyski <georgi.mirazchiyski@codeplay.com>
Date: Wed, 11 Sep 2024 14:02:24 +0100
Subject: [PATCH 09/19] [SYCL] Implement max_num_work_groups from the launch
 queries extension (#14333)

This PR implements the `max_num_work_groups ` query from the
`sycl_ext_oneapi_launch_queries` extension.

Additionally, this PR introduces changes that overload
`ext_oneapi_get_info` for another kernel-queue-specific query -
`max_num_work_group_sync` to take extra parameters for local work-group
size and dynamic local memory size (in bytes) in order to allow users to
pass those runtime resource limiting factors to the query, so they are
taken into account in the final group count suggestion.
---
 sycl/cmake/modules/FetchUnifiedRuntime.cmake  |  14 +-
 .../sycl_ext_oneapi_launch_queries.asciidoc   |   8 +-
 .../include/sycl/detail/info_desc_helpers.hpp |  12 +
 .../ext/oneapi/experimental/root_group.hpp    |   9 +-
 ...xt_oneapi_kernel_queue_specific_traits.def |   4 +
 sycl/include/sycl/info/info_desc.hpp          |   2 +
 sycl/include/sycl/kernel.hpp                  |  24 +-
 sycl/source/detail/kernel_impl.cpp            |  32 +++
 sycl/source/detail/kernel_impl.hpp            |  90 +++++++-
 sycl/source/kernel.cpp                        |  26 ++-
 .../launch_queries/max_num_work_groups.cpp    | 215 ++++++++++++++++++
 sycl/test-e2e/GroupAlgorithm/root_group.cpp   |  15 +-
 sycl/test/abi/sycl_symbols_linux.dump         |   4 +-
 sycl/test/abi/sycl_symbols_windows.dump       |   4 +-
 sycl/test/include_deps/sycl_accessor.hpp.cpp  |   1 +
 .../include_deps/sycl_detail_core.hpp.cpp     |   1 +
 16 files changed, 424 insertions(+), 37 deletions(-)
 create mode 100644 sycl/include/sycl/info/ext_oneapi_kernel_queue_specific_traits.def
 create mode 100644 sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp

diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake
index 956c33bec68df..e569981589ac4 100644
--- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake
+++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake
@@ -117,13 +117,13 @@ if(SYCL_UR_USE_FETCH_CONTENT)
   endfunction()
 
   set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
-  # commit 8c9dd7e464a99ebbfb238ac2dabefc3ac77baea5
-  # Merge: a99dbcee 3abe18cf
-  # Author: Piotr Balcer <piotr.balcer@intel.com>
-  # Date:   Fri Sep 6 17:21:17 2024 +0200
-  #   Merge pull request #1820 from pbalcer/static-linking
-  #   Add support for static linking of the L0 adapter
-  set(UNIFIED_RUNTIME_TAG 8c9dd7e464a99ebbfb238ac2dabefc3ac77baea5)
+  # commit eb63d1a21729f6928bb6cccc5f92856b0690aca6
+  # Merge: e26bba51 45a781f4
+  # Author: Omar Ahmed <omar.ahmed@codeplay.com>
+  # Date:   Tue Sep 10 12:08:57 2024 +0100
+  #   Merge pull request #1796 from GeorgeWeb/georgi/ur_kernel_max_active_wgs
+  #   [CUDA] Implement urKernelSuggestMaxCooperativeGroupCountExp for Cuda
+  set(UNIFIED_RUNTIME_TAG eb63d1a21729f6928bb6cccc5f92856b0690aca6)
 
   set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
   # Due to the use of dependentloadflag and no installer for UMF and hwloc we need
diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc
index 8221000502642..ee52d75b8fd21 100644
--- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc
+++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc
@@ -204,9 +204,11 @@ otherwise it is 0.
 |Returns the maximum number of work-groups, when the kernel is submitted to the
 specified queue with the specified work-group size and the specified amount of
 dynamic work-group local memory (in bytes), accounting for any kernel
-properties or features. If the kernel can be submitted to the specified queue
-without an error, the minimum value returned by this query is 1, otherwise it
-is 0.
+properties or features. If the specified work-group size is 0, which is
+invalid, then the implementation will throw a synchronous exception with the
+`errc::invalid` error code. If the kernel can be submitted to the specified
+queue without an error, the minimum value returned by this query is 1,
+otherwise it is 0.
 
 |===
 
diff --git a/sycl/include/sycl/detail/info_desc_helpers.hpp b/sycl/include/sycl/detail/info_desc_helpers.hpp
index e8bc8f76c83db..d3b4bfd977139 100644
--- a/sycl/include/sycl/detail/info_desc_helpers.hpp
+++ b/sycl/include/sycl/detail/info_desc_helpers.hpp
@@ -31,6 +31,8 @@ template <typename T> struct is_queue_info_desc : std::false_type {};
 template <typename T> struct is_kernel_info_desc : std::false_type {};
 template <typename T>
 struct is_kernel_device_specific_info_desc : std::false_type {};
+template <typename T>
+struct is_kernel_queue_specific_info_desc : std::false_type {};
 template <typename T> struct is_event_info_desc : std::false_type {};
 template <typename T> struct is_event_profiling_info_desc : std::false_type {};
 // Normally we would just use std::enable_if to limit valid get_info template
@@ -134,6 +136,16 @@ struct IsKernelInfo<info::kernel_device_specific::ext_codeplay_num_regs>
 #include <sycl/info/ext_intel_device_traits.def>
 #include <sycl/info/ext_oneapi_device_traits.def>
 #undef __SYCL_PARAM_TRAITS_SPEC
+
+#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT, PiCode)   \
+  template <>                                                                  \
+  struct is_##DescType##_info_desc<Namespace::info::DescType::Desc>            \
+      : std::true_type {                                                       \
+    using return_type = Namespace::info::DescType::Desc::return_type;          \
+  };
+#include <sycl/info/ext_oneapi_kernel_queue_specific_traits.def>
+#undef __SYCL_PARAM_TRAITS_SPEC
+
 #define __SYCL_PARAM_TRAITS_SPEC(DescType, Desc, ReturnT, PiCode)              \
   template <>                                                                  \
   struct is_backend_info_desc<info::DescType::Desc> : std::true_type {         \
diff --git a/sycl/include/sycl/ext/oneapi/experimental/root_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/root_group.hpp
index 558396bb6f9c8..b8c90683bbaaf 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/root_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/root_group.hpp
@@ -24,13 +24,8 @@ namespace sycl {
 inline namespace _V1 {
 namespace ext::oneapi::experimental {
 
-namespace info::kernel_queue_specific {
-// TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension once
-// #7598 is merged.
-struct max_num_work_group_sync {
-  using return_type = size_t;
-};
-} // namespace info::kernel_queue_specific
+// See 'sycl/info/kernel_device_specific_traits.def' for the kernel
+// device-specific properties that relate to 'root_group'.
 
 template <int Dimensions> class root_group {
 public:
diff --git a/sycl/include/sycl/info/ext_oneapi_kernel_queue_specific_traits.def b/sycl/include/sycl/info/ext_oneapi_kernel_queue_specific_traits.def
new file mode 100644
index 0000000000000..0ec11af0bb6b1
--- /dev/null
+++ b/sycl/include/sycl/info/ext_oneapi_kernel_queue_specific_traits.def
@@ -0,0 +1,4 @@
+// TODO: Revisit 'max_num_work_group_sync' and align it with the
+// 'sycl_ext_oneapi_forward_progress' extension once #7598 is merged.
+__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t,)
+__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t,)
diff --git a/sycl/include/sycl/info/info_desc.hpp b/sycl/include/sycl/info/info_desc.hpp
index 8f6a86e5f60bb..b84f98f350d0d 100644
--- a/sycl/include/sycl/info/info_desc.hpp
+++ b/sycl/include/sycl/info/info_desc.hpp
@@ -247,6 +247,8 @@ struct work_item_progress_capabilities;
 #include <sycl/info/ext_codeplay_device_traits.def>
 #include <sycl/info/ext_intel_device_traits.def>
 #include <sycl/info/ext_oneapi_device_traits.def>
+#include <sycl/info/ext_oneapi_kernel_queue_specific_traits.def>
+
 #undef __SYCL_PARAM_TRAITS_SPEC
 #undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC
 } // namespace _V1
diff --git a/sycl/include/sycl/kernel.hpp b/sycl/include/sycl/kernel.hpp
index 40db1d8461dde..dac7f619d745e 100644
--- a/sycl/include/sycl/kernel.hpp
+++ b/sycl/include/sycl/kernel.hpp
@@ -159,9 +159,29 @@ class __SYCL_EXPORT kernel : public detail::OwnerLessBase<kernel> {
       get_info(const device &Device, const range<3> &WGSize) const;
 
   // TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension
-  // once #7598 is merged.
+  // once #7598 is merged. (regarding the 'max_num_work_group_sync' query)
+
+  /// Query queue/launch-specific information from a kernel using the
+  /// info::kernel_queue_specific descriptor for a specific Queue.
+  ///
+  /// \param Queue is a valid SYCL queue.
+  /// \return depends on information being queried.
+  template <typename Param>
+  typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+  ext_oneapi_get_info(queue Queue) const;
+
+  /// Query queue/launch-specific information from a kernel using the
+  /// info::kernel_queue_specific descriptor for a specific Queue and values.
+  /// max_num_work_groups is the only valid descriptor for this function.
+  ///
+  /// \param Queue is a valid SYCL queue.
+  /// \param WorkGroupSize is the work-group size the number of work-groups is
+  /// requested for.
+  /// \return depends on information being queried.
   template <typename Param>
-  typename Param::return_type ext_oneapi_get_info(const queue &q) const;
+  typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+  ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
+                      size_t DynamicLocalMemorySize) const;
 
 private:
   /// Constructs a SYCL kernel object from a valid kernel_impl instance.
diff --git a/sycl/source/detail/kernel_impl.cpp b/sycl/source/detail/kernel_impl.cpp
index faf3695c04e94..50af09831f207 100644
--- a/sycl/source/detail/kernel_impl.cpp
+++ b/sycl/source/detail/kernel_impl.cpp
@@ -106,6 +106,38 @@ void kernel_impl::checkIfValidForNumArgsInfoQuery() const {
       "interoperability function or to query a device built-in kernel");
 }
 
+bool kernel_impl::exceedsOccupancyResourceLimits(
+    const device &Device, const range<3> &WorkGroupSize,
+    size_t DynamicLocalMemorySize) const {
+  // Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize.
+  // Generally, exceeding hardware resource limits will yield in an error when
+  // the kernel is launched.
+  const size_t MaxWorkGroupSize =
+      get_info<info::kernel_device_specific::work_group_size>(Device);
+  const size_t MaxLocalMemorySizeInBytes =
+      Device.get_info<info::device::local_mem_size>();
+
+  if (WorkGroupSize.size() > MaxWorkGroupSize)
+    return true;
+
+  if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes)
+    return true;
+
+  // It will be impossible to launch a kernel for Cuda when the hardware limit
+  // for the 32-bit registers page file size is exceeded.
+  if (Device.get_backend() == backend::ext_oneapi_cuda) {
+    const uint32_t RegsPerWorkItem =
+        get_info<info::kernel_device_specific::ext_codeplay_num_regs>(Device);
+    const uint32_t MaxRegsPerWorkGroup =
+        Device.get_info<ext::codeplay::experimental::info::device::
+                            max_registers_per_work_group>();
+    if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup)
+      return true;
+  }
+
+  return false;
+}
+
 template <>
 typename info::platform::version::return_type
 kernel_impl::get_backend_info<info::platform::version>() const {
diff --git a/sycl/source/detail/kernel_impl.hpp b/sycl/source/detail/kernel_impl.hpp
index ab2950e26a856..040b5cbccf965 100644
--- a/sycl/source/detail/kernel_impl.hpp
+++ b/sycl/source/detail/kernel_impl.hpp
@@ -114,8 +114,26 @@ class kernel_impl {
   typename Param::return_type get_info(const device &Device,
                                        const range<3> &WGSize) const;
 
+  /// Query queue/launch-specific information from a kernel using the
+  /// info::kernel_queue_specific descriptor for a specific Queue.
+  ///
+  /// \param Queue is a valid SYCL queue.
+  /// \return depends on information being queried.
+  template <typename Param>
+  typename Param::return_type ext_oneapi_get_info(queue Queue) const;
+
+  /// Query queue/launch-specific information from a kernel using the
+  /// info::kernel_queue_specific descriptor for a specific Queue and values.
+  /// max_num_work_groups is the only valid descriptor for this function.
+  ///
+  /// \param Queue is a valid SYCL queue.
+  /// \param WorkGroupSize is the work-group size the number of work-groups is
+  /// requested for.
+  /// \return depends on information being queried.
   template <typename Param>
-  typename Param::return_type ext_oneapi_get_info(const queue &q) const;
+  typename Param::return_type
+  ext_oneapi_get_info(queue Queue, const range<3> &MaxWorkGroupSize,
+                      size_t DynamicLocalMemorySize) const;
 
   /// Get a constant reference to a raw kernel object.
   ///
@@ -171,6 +189,12 @@ class kernel_impl {
 
   bool isBuiltInKernel(const device &Device) const;
   void checkIfValidForNumArgsInfoQuery() const;
+
+  /// Check if the occupancy limits are exceeded for the given kernel launch
+  /// configuration.
+  bool exceedsOccupancyResourceLimits(const device &Device,
+                                      const range<3> &WorkGroupSize,
+                                      size_t DynamicLocalMemorySize) const;
 };
 
 template <typename Param>
@@ -217,20 +241,66 @@ kernel_impl::get_info(const device &Device,
       getPlugin());
 }
 
+namespace syclex = ext::oneapi::experimental;
+
 template <>
-inline typename ext::oneapi::experimental::info::kernel_queue_specific::
-    max_num_work_group_sync::return_type
+inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
+    return_type
     kernel_impl::ext_oneapi_get_info<
-        ext::oneapi::experimental::info::kernel_queue_specific::
-            max_num_work_group_sync>(const queue &Queue) const {
+        syclex::info::kernel_queue_specific::max_num_work_groups>(
+        queue Queue, const range<3> &WorkGroupSize,
+        size_t DynamicLocalMemorySize) const {
+  if (WorkGroupSize.size() == 0)
+    throw exception(sycl::make_error_code(errc::invalid),
+                    "The launch work-group size cannot be zero.");
+
   const auto &Plugin = getPlugin();
   const auto &Handle = getHandleRef();
+  auto Device = Queue.get_device();
+
+  uint32_t GroupCount{0};
+  if (auto Result = Plugin->call_nocheck<
+                    UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
+          Handle, WorkGroupSize.size(), DynamicLocalMemorySize, &GroupCount);
+      Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
+    // The feature is supported. Check for other errors and throw if any.
+    Plugin->checkUrResult(Result);
+    return GroupCount;
+  }
+
+  // Fallback. If the backend API is unsupported, this query will return either
+  // 0 or 1 based on the kernel resource usage and the user-requested resources.
+  return exceedsOccupancyResourceLimits(Device, WorkGroupSize,
+                                        DynamicLocalMemorySize)
+             ? 0
+             : 1;
+}
+
+template <>
+inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
+    return_type
+    kernel_impl::ext_oneapi_get_info<
+        syclex::info::kernel_queue_specific::max_num_work_group_sync>(
+        queue Queue, const range<3> &WorkGroupSize,
+        size_t DynamicLocalMemorySize) const {
+  return ext_oneapi_get_info<
+      syclex::info::kernel_queue_specific::max_num_work_groups>(
+      Queue, WorkGroupSize, DynamicLocalMemorySize);
+}
+
+template <>
+inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
+    return_type
+    kernel_impl::ext_oneapi_get_info<
+        syclex::info::kernel_queue_specific::max_num_work_group_sync>(
+        queue Queue) const {
+  auto Device = Queue.get_device();
   const auto MaxWorkGroupSize =
-      Queue.get_device().get_info<info::device::max_work_group_size>();
-  uint32_t GroupCount = 0;
-  Plugin->call<UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
-      Handle, MaxWorkGroupSize, /* DynamicSharedMemorySize */ 0, &GroupCount);
-  return GroupCount;
+      get_info<info::kernel_device_specific::work_group_size>(Device);
+  const sycl::range<3> WorkGroupSize{MaxWorkGroupSize, 1, 1};
+  return ext_oneapi_get_info<
+      syclex::info::kernel_queue_specific::max_num_work_group_sync>(
+      Queue, WorkGroupSize, /* DynamicLocalMemorySize */ 0);
 }
 
 } // namespace detail
diff --git a/sycl/source/kernel.cpp b/sycl/source/kernel.cpp
index f4ec76bcf9e7d..a4aae60bece08 100644
--- a/sycl/source/kernel.cpp
+++ b/sycl/source/kernel.cpp
@@ -106,16 +106,36 @@ kernel::get_info<info::kernel_device_specific::max_sub_group_size>(
     const device &, const sycl::range<3> &) const;
 
 template <typename Param>
-typename Param::return_type
-kernel::ext_oneapi_get_info(const queue &Queue) const {
+typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+kernel::ext_oneapi_get_info(queue Queue) const {
   return impl->ext_oneapi_get_info<Param>(Queue);
 }
 
+template <typename Param>
+typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+kernel::ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
+                            size_t DynamicLocalMemorySize) const {
+  return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
+                                          DynamicLocalMemorySize);
+}
+
 template __SYCL_EXPORT typename ext::oneapi::experimental::info::
     kernel_queue_specific::max_num_work_group_sync::return_type
     kernel::ext_oneapi_get_info<
         ext::oneapi::experimental::info::kernel_queue_specific::
-            max_num_work_group_sync>(const queue &Queue) const;
+            max_num_work_group_sync>(queue Queue) const;
+
+#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT)           \
+  template __SYCL_EXPORT ReturnT                                               \
+  kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>(                \
+      queue, const range<3> &, size_t) const;
+// Not including "ext_oneapi_kernel_queue_specific_traits.def" because not all
+// kernel_queue_specific queries require the above-defined get_info interface.
+// clang-format off
+__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t)
+__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t)
+// clang-format on
+#undef __SYCL_PARAM_TRAITS_SPEC
 
 kernel::kernel(std::shared_ptr<detail::kernel_impl> Impl) : impl(Impl) {}
 
diff --git a/sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp b/sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp
new file mode 100644
index 0000000000000..7b76327d015b1
--- /dev/null
+++ b/sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp
@@ -0,0 +1,215 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+#include <sycl/detail/core.hpp>
+
+#include <cassert>
+#include <cstdint>
+
+namespace syclex = sycl::ext::oneapi::experimental;
+using namespace sycl::info::device;
+using namespace sycl::info::kernel_device_specific;
+
+using value_type = int64_t;
+
+namespace kernels {
+
+template <class T, size_t Dim>
+using sycl_global_accessor =
+    sycl::accessor<T, Dim, sycl::access::mode::read_write,
+                   sycl::access::target::global_buffer>;
+
+class TestKernel {
+public:
+  static constexpr bool HasLocalMemory{false};
+
+  TestKernel(sycl_global_accessor<value_type, 1> acc) : acc_{acc} {}
+
+  void operator()(sycl::nd_item<1> item) const {
+    const auto gtid = item.get_global_linear_id();
+    acc_[gtid] = gtid + 42;
+  }
+
+private:
+  sycl_global_accessor<value_type, 1> acc_;
+};
+
+class TestLocalMemoryKernel {
+public:
+  static constexpr bool HasLocalMemory{true};
+
+  TestLocalMemoryKernel(sycl_global_accessor<value_type, 1> acc,
+                        sycl::local_accessor<value_type, 1> loc_acc)
+      : acc_{acc}, loc_acc_{loc_acc} {}
+
+  void operator()(sycl::nd_item<1> item) const {
+    const auto ltid = item.get_local_linear_id();
+    const auto gtid = item.get_global_linear_id();
+    if (ltid < loc_acc_.size()) {
+      loc_acc_[ltid] = ltid + 42;
+      item.barrier(sycl::access::fence_space::local_space);
+      acc_[gtid] = loc_acc_[ltid];
+    } else {
+      acc_[gtid] = 0;
+    }
+  }
+
+private:
+  sycl_global_accessor<value_type, 1> acc_;
+  sycl::local_accessor<value_type, 1> loc_acc_;
+};
+
+} // namespace kernels
+
+namespace {
+
+template <class KernelName>
+int test_max_num_work_groups(sycl::queue &q, const sycl::device &dev) {
+  const auto ctx = q.get_context();
+  auto bundle = sycl::get_kernel_bundle<sycl::bundle_state::executable>(ctx);
+  auto kernel = bundle.template get_kernel<KernelName>();
+
+  const size_t maxWorkGroupSize =
+      kernel.template get_info<work_group_size>(dev);
+  const size_t NumWorkItems = maxWorkGroupSize * maxWorkGroupSize;
+
+  size_t workGroupSize = 32;
+  size_t localMemorySizeInBytes{0};
+  if constexpr (KernelName::HasLocalMemory) {
+    localMemorySizeInBytes = workGroupSize * sizeof(value_type);
+  }
+
+  sycl::buffer<value_type, 1> buf{sycl::range<1>{NumWorkItems}};
+
+  // Tests
+
+  // ==================== //
+  // Test 1 - return type //
+  // ==================== //
+  sycl::range<3> workGroupRange{workGroupSize, 1, 1};
+  auto maxWGs = kernel.template ext_oneapi_get_info<
+      syclex::info::kernel_queue_specific::max_num_work_groups>(
+      q, workGroupRange, localMemorySizeInBytes);
+
+  // Test the return type is as specified in the extension document.
+  static_assert(std::is_same_v<std::remove_cv_t<decltype(maxWGs)>, size_t>,
+                "max_num_work_groups query must return size_t");
+
+  // ===================== //
+  // Test 2 - return value //
+  // ===================== //
+  // We must have at least one active group if we are below resource limits.
+  assert(maxWGs > 0 && "max_num_work_groups query failed");
+  if (maxWGs == 0)
+    return 1;
+
+  // Run the kernel
+  auto launch_range = sycl::nd_range<1>{sycl::range<1>{NumWorkItems},
+                                        sycl::range<1>{workGroupSize}};
+  q.submit([&](sycl::handler &cgh) {
+     auto acc = buf.get_access<sycl::access::mode::read_write>(cgh);
+     if constexpr (KernelName::HasLocalMemory) {
+       sycl::local_accessor<value_type, 1> loc_acc{
+           sycl::range<1>{workGroupSize}, cgh};
+       cgh.parallel_for(launch_range, KernelName{acc, loc_acc});
+     } else {
+       cgh.parallel_for(launch_range, KernelName{acc});
+     }
+   }).wait();
+  assert(sycl::host_accessor{buf}[0] == 42);
+
+  // ========================== //
+  // Test 3 - use max resources //
+  // ========================== //
+  // A little over the maximum work-group size for the purpose of exceeding.
+  workGroupSize = maxWorkGroupSize;
+  workGroupRange[0] = workGroupSize;
+  size_t localSize =
+      (dev.get_info<sycl::info::device::local_mem_size>() / sizeof(value_type));
+  if constexpr (KernelName::HasLocalMemory) {
+    localMemorySizeInBytes = localSize * sizeof(value_type);
+  }
+  maxWGs = kernel.template ext_oneapi_get_info<
+      syclex::info::kernel_queue_specific::max_num_work_groups>(
+      q, workGroupRange, localMemorySizeInBytes);
+
+  assert(maxWGs > 0 && "max_num_work_groups query failed");
+  if (maxWGs == 0)
+    return 1;
+
+  launch_range = sycl::nd_range<1>{sycl::range<1>{NumWorkItems},
+                                   sycl::range<1>{workGroupSize}};
+
+  q.submit([&](sycl::handler &cgh) {
+     auto acc = buf.get_access<sycl::access::mode::read_write>(cgh);
+     if constexpr (KernelName::HasLocalMemory) {
+       sycl::local_accessor<value_type, 1> loc_acc{sycl::range<1>{localSize},
+                                                   cgh};
+       cgh.parallel_for(launch_range, KernelName{acc, loc_acc});
+     } else {
+       cgh.parallel_for(launch_range, KernelName{acc});
+     }
+   }).wait();
+  assert(sycl::host_accessor{buf}[0] == 42);
+
+  // =============================== //
+  // Test 4 - exceed resource limits //
+  // =============================== //
+  workGroupSize = maxWorkGroupSize + 32;
+  workGroupRange[0] = workGroupSize;
+  maxWGs = kernel.template ext_oneapi_get_info<
+      syclex::info::kernel_queue_specific::max_num_work_groups>(
+      q, workGroupRange, localMemorySizeInBytes);
+  // It cannot be possible to launch a kernel successfully with a configuration
+  // that exceeds the available resources as in the above defined workGroupSize.
+  // workGroupSize is larger than maxWorkGroupSize, hence maxWGs must equal 0.
+  if (dev.get_backend() == sycl::backend::ext_oneapi_cuda) {
+    assert(maxWGs == 0 && "max_num_work_groups query failed");
+    if (maxWGs > 0)
+      return 1;
+  }
+
+  // As we ensured that the 'max_num_work_groups' query correctly
+  // returns 0 possible work-groups, test that the kernel launch will fail.
+  // A configuration that defines a work-group size larger than the maximum
+  // possible should result in failure.
+  try {
+    launch_range = sycl::nd_range<1>{sycl::range<1>{NumWorkItems},
+                                     sycl::range<1>{workGroupSize}};
+
+    q.submit([&](sycl::handler &cgh) {
+       auto acc = buf.get_access<sycl::access::mode::read_write>(cgh);
+       if constexpr (KernelName::HasLocalMemory) {
+         sycl::local_accessor<value_type, 1> loc_acc{sycl::range<1>{localSize},
+                                                     cgh};
+         cgh.parallel_for(launch_range, KernelName{acc, loc_acc});
+       } else {
+         cgh.parallel_for(launch_range, KernelName{acc});
+       }
+     }).wait();
+  } catch (const sycl::exception &e) {
+    // 'nd_range' error is the expected outcome from the above launch config.
+    if (e.code() == sycl::make_error_code(sycl::errc::nd_range)) {
+      return 0;
+    }
+    std::cerr << e.code() << ":\t";
+    std::cerr << e.what() << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
+
+} // namespace
+
+int main() {
+  sycl::queue q{};
+  sycl::device dev = q.get_device();
+
+  using namespace kernels;
+
+  int ret{0};
+  ret &= test_max_num_work_groups<TestKernel>(q, dev);
+  ret &= test_max_num_work_groups<TestLocalMemoryKernel>(q, dev);
+  return ret;
+}
diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
index d8393f35c6253..92e5d69ffcab4 100644
--- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
@@ -30,12 +30,21 @@ void testQueriesAndProperties() {
   const auto maxWGs = kernel.ext_oneapi_get_info<
       sycl::ext::oneapi::experimental::info::kernel_queue_specific::
           max_num_work_group_sync>(q);
+  const auto wgRange = sycl::range{WorkGroupSize, 1, 1};
+  const auto maxWGsWithLimits = kernel.ext_oneapi_get_info<
+      sycl::ext::oneapi::experimental::info::kernel_queue_specific::
+          max_num_work_group_sync>(q, wgRange, wgRange.size() * sizeof(int));
   const auto props = sycl::ext::oneapi::experimental::properties{
       sycl::ext::oneapi::experimental::use_root_sync};
   q.single_task<class QueryKernel>(props, []() {});
-  static_assert(std::is_same_v<std::remove_cv<decltype(maxWGs)>::type, size_t>,
-                "max_num_work_group_sync query must return size_t");
-  assert(maxWGs >= 1 && "max_num_work_group_sync query failed");
+
+  static auto check_max_num_work_group_sync = [](auto Result) {
+    static_assert(std::is_same_v<std::remove_cv_t<decltype(Result)>, size_t>,
+                  "max_num_work_group_sync query must return size_t");
+    assert(Result >= 1 && "max_num_work_group_sync query failed");
+  };
+  check_max_num_work_group_sync(maxWGs);
+  check_max_num_work_group_sync(maxWGsWithLimits);
 }
 
 void testRootGroup() {
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index 4c73f43ed6ba2..ec6ec2096403f 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -3915,8 +3915,10 @@ _ZNK4sycl3_V16kernel16get_backend_infoINS0_4info6device15backend_versionEEENS0_6
 _ZNK4sycl3_V16kernel16get_backend_infoINS0_4info6device7versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv
 _ZNK4sycl3_V16kernel16get_backend_infoINS0_4info8platform7versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv
 _ZNK4sycl3_V16kernel17get_kernel_bundleEv
-_ZNK4sycl3_V16kernel19ext_oneapi_get_infoINS0_3ext6oneapi12experimental4info21kernel_queue_specific23max_num_work_group_syncEEENT_11return_typeERKNS0_5queueE
 _ZNK4sycl3_V16kernel3getEv
+_ZNK4sycl3_V16kernel19ext_oneapi_get_infoINS0_3ext6oneapi12experimental4info21kernel_queue_specific23max_num_work_group_syncEEENS0_6detail34is_kernel_queue_specific_info_descIT_E11return_typeENS0_5queueE
+_ZNK4sycl3_V16kernel19ext_oneapi_get_infoINS0_3ext6oneapi12experimental4info21kernel_queue_specific23max_num_work_group_syncEEENS0_6detail34is_kernel_queue_specific_info_descIT_E11return_typeENS0_5queueERKNS0_5rangeILi3EEEm
+_ZNK4sycl3_V16kernel19ext_oneapi_get_infoINS0_3ext6oneapi12experimental4info21kernel_queue_specific19max_num_work_groupsEEENS0_6detail34is_kernel_queue_specific_info_descIT_E11return_typeENS0_5queueERKNS0_5rangeILi3EEEm
 _ZNK4sycl3_V16kernel8get_infoINS0_4info22kernel_device_specific15work_group_sizeEEENS0_6detail35is_kernel_device_specific_info_descIT_E11return_typeERKNS0_6deviceE
 _ZNK4sycl3_V16kernel8get_infoINS0_4info22kernel_device_specific16global_work_sizeEEENS0_6detail35is_kernel_device_specific_info_descIT_E11return_typeERKNS0_6deviceE
 _ZNK4sycl3_V16kernel8get_infoINS0_4info22kernel_device_specific16private_mem_sizeEEENS0_6detail35is_kernel_device_specific_info_descIT_E11return_typeERKNS0_6deviceE
diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump
index e2c3643c557be..55ce460c64559 100644
--- a/sycl/test/abi/sycl_symbols_windows.dump
+++ b/sycl/test/abi/sycl_symbols_windows.dump
@@ -13,7 +13,9 @@
 ??$create_sub_devices@$0BAIH@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@AEBV?$vector@_KV?$allocator@_K@std@@@4@@Z
 ??$create_sub_devices@$0BAII@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@W4partition_affinity_domain@info@12@@Z
 ??$create_sub_devices@$0BAIJ@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@XZ
-??$ext_oneapi_get_info@Umax_num_work_group_sync@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KAEBVqueue@12@@Z
+??$ext_oneapi_get_info@Umax_num_work_group_sync@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KVqueue@12@@Z
+??$ext_oneapi_get_info@Umax_num_work_group_sync@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KVqueue@12@AEBV?$range@$02@12@_K@Z
+??$ext_oneapi_get_info@Umax_num_work_groups@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KVqueue@12@AEBV?$range@$02@12@_K@Z
 ??$get_backend_info@Ubackend_version@device@info@_V1@sycl@@@context@_V1@sycl@@QEBA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@XZ
 ??$get_backend_info@Ubackend_version@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@XZ
 ??$get_backend_info@Ubackend_version@device@info@_V1@sycl@@@event@_V1@sycl@@QEBA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@XZ
diff --git a/sycl/test/include_deps/sycl_accessor.hpp.cpp b/sycl/test/include_deps/sycl_accessor.hpp.cpp
index 36e9a5ad3961e..fd2628dfb6e1c 100644
--- a/sycl/test/include_deps/sycl_accessor.hpp.cpp
+++ b/sycl/test/include_deps/sycl_accessor.hpp.cpp
@@ -110,6 +110,7 @@
 // CHECK-NEXT: info/ext_codeplay_device_traits.def
 // CHECK-NEXT: info/ext_intel_device_traits.def
 // CHECK-NEXT: info/ext_oneapi_device_traits.def
+// CHECK-NEXT: info/ext_oneapi_kernel_queue_specific_traits.def
 // CHECK-NEXT: info/sycl_backend_traits.def
 // CHECK-NEXT: platform.hpp
 // CHECK-NEXT: detail/string_view.hpp
diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp
index 6cb1be75681ee..33dc01b8a4e74 100644
--- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp
+++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp
@@ -111,6 +111,7 @@
 // CHECK-NEXT: info/ext_codeplay_device_traits.def
 // CHECK-NEXT: info/ext_intel_device_traits.def
 // CHECK-NEXT: info/ext_oneapi_device_traits.def
+// CHECK-NEXT: info/ext_oneapi_kernel_queue_specific_traits.def
 // CHECK-NEXT: info/sycl_backend_traits.def
 // CHECK-NEXT: platform.hpp
 // CHECK-NEXT: detail/string_view.hpp

From da2ff3c159ec1fe7a96e6645644ba66bdfbd601d Mon Sep 17 00:00:00 2001
From: Nick Sarnie <sarnex@users.noreply.github.com>
Date: Wed, 11 Sep 2024 22:57:08 +0900
Subject: [PATCH 10/19] BasicAA: Fix assert when indexing address spaces with
 different sizes (#103713) (#15347)

Cherry pick https://github.com/llvm/llvm-project/pull/103713 to fix AMD
postcommit testing.

Closes: https://github.com/intel/llvm/issues/15227

Co-authored-by: Matt Arsenault <Matthew.Arsenault@amd.com>
---
 llvm/lib/Analysis/BasicAliasAnalysis.cpp  |  3 +++
 llvm/test/Analysis/BasicAA/issue103500.ll | 18 ++++++++++++++++++
 2 files changed, 21 insertions(+)
 create mode 100644 llvm/test/Analysis/BasicAA/issue103500.ll

diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index e474899fb548e..e318cab3531c3 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -350,6 +350,9 @@ struct CastedValue {
   }
 
   bool hasSameCastsAs(const CastedValue &Other) const {
+    if (V->getType() != Other.V->getType())
+      return false;
+
     if (ZExtBits == Other.ZExtBits && SExtBits == Other.SExtBits &&
         TruncBits == Other.TruncBits)
       return true;
diff --git a/llvm/test/Analysis/BasicAA/issue103500.ll b/llvm/test/Analysis/BasicAA/issue103500.ll
new file mode 100644
index 0000000000000..3532e6e9479e5
--- /dev/null
+++ b/llvm/test/Analysis/BasicAA/issue103500.ll
@@ -0,0 +1,18 @@
+; RUN: opt -aa-pipeline=basic-aa -passes=aa-eval -print-all-alias-modref-info -disable-output %s 2>&1 | FileCheck %s
+
+target datalayout = "p0:64:64-p5:32:32"
+
+; CHECK: Function: indexing_different_sized_addrspace: 2 pointers, 0 call sites
+; CHECK: MayAlias:	i32* %gep.in.0, i32 addrspace(5)* %gep.in.5.1
+
+define i1 @indexing_different_sized_addrspace(ptr addrspace(5) %arg, i64 %arg1, i32 %arg2) {
+bb:
+  %arg.addrspacecast = addrspacecast ptr addrspace(5) %arg to ptr
+  %gep.in.5 = getelementptr i8, ptr addrspace(5) %arg, i32 16
+  %gep.in.0 = getelementptr i8, ptr %arg.addrspacecast, i64 %arg1
+  %gep.in.5.1 = getelementptr i8, ptr addrspace(5) %gep.in.5, i32 %arg2
+  %load.0 = load i32, ptr %gep.in.0, align 4
+  %load.1 = load i32, ptr addrspace(5) %gep.in.5.1, align 4
+  %cmp = icmp slt i32 %load.0, %load.1
+  ret i1 %cmp
+}

From 4f2ad4fc6dfee9f58352c18ba17e71d3164d7b3b Mon Sep 17 00:00:00 2001
From: lucyli-ca <107629053+lucyli-ca@users.noreply.github.com>
Date: Wed, 11 Sep 2024 11:22:31 -0400
Subject: [PATCH 11/19] Bump cryptography to 43.0.1 in
 llvm/utils/git/requirements.txt (#15360)

PR to bump dependency version to resolve security vulnerability found.

pyca/cryptography's wheels include a statically linked copy of OpenSSL.
The versions of OpenSSL included in cryptography 37.0.0-43.0.0 are
vulnerable to a security issue. More details about the vulnerability
itself can be found in
https://openssl-library.org/news/secadv/20240903.txt.
---
 llvm/utils/git/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/git/requirements.txt b/llvm/utils/git/requirements.txt
index 35d38df674a89..9080e76558841 100644
--- a/llvm/utils/git/requirements.txt
+++ b/llvm/utils/git/requirements.txt
@@ -158,7 +158,7 @@ charset-normalizer==3.3.2 \
     --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \
     --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561
     # via requests
-cryptography==42.0.5 \
+cryptography==43.0.1 \
     --hash=sha256:0270572b8bd2c833c3981724b8ee9747b3ec96f699a9665470018594301439ee \
     --hash=sha256:111a0d8553afcf8eb02a4fea6ca4f59d48ddb34497aa8706a6cf536f1a5ec576 \
     --hash=sha256:16a48c23a62a2f4a285699dba2e4ff2d1cff3115b9df052cdd976a18856d8e3d \

From 1194277e47c2a69458d3e3b8dff6f3f2b303c7b3 Mon Sep 17 00:00:00 2001
From: Georgi Mirazchiyski <georgi.mirazchiyski@codeplay.com>
Date: Wed, 11 Sep 2024 17:02:50 +0100
Subject: [PATCH 12/19] Update docs regarding the kernel-fusion extension
 (#15356)

---
 sycl/doc/design/CompilerAndRuntimeDesign.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md
index 52ae88a2c0ef1..1935356d0a3a8 100644
--- a/sycl/doc/design/CompilerAndRuntimeDesign.md
+++ b/sycl/doc/design/CompilerAndRuntimeDesign.md
@@ -776,7 +776,7 @@ Note: Kernel naming is not fully stable for now.
 ##### Kernel Fusion Support
 
 The [experimental kernel fusion
-extension](../extensions/experimental/sycl_ext_codeplay_kernel_fusion.asciidoc)
+extension](../extensions/removed/sycl_ext_codeplay_kernel_fusion.asciidoc)
 also supports the CUDA and HIP backends. However, as the CUBIN, PTX and AMD assembly
 are not suitable input formats for the [kernel fusion JIT compiler](KernelFusionJIT.md), a
 suitable IR has to be added as an additional device binary.

From 783d2b93d7924879489943e3a8fa7d0c05f97ea5 Mon Sep 17 00:00:00 2001
From: Nick Sarnie <sarnex@users.noreply.github.com>
Date: Thu, 12 Sep 2024 01:49:56 +0900
Subject: [PATCH 13/19] [SYCL][ClangLinkerWrapper] Support old-style objects
 and static archives (#15216)

This PR finishes up work our intern Jason was working on
[here](https://github.com/intel/llvm/pull/15156).

Most of the code here is not new, it is old code that was removed
[here](https://github.com/intel/llvm/commit/ece73ad61b49eaf9ecb6e2060e5f20e09e26def6).

This code is not intended to be permanent or upstreamed. It's intended
to be temporary to ease the work to enabling the new offload model by
default.

Both object files and static archives are supported and tested, but
SPIR-V fat objects are not, I don't think any customers are using that
anyway.

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
Co-authored-by: Li, Jason <jason1.li@intel.com>
---
 .../ClangLinkerWrapper.cpp                    | 139 +++++++++++++++++-
 .../test-e2e/NewOffloadDriver/multisource.cpp |  17 +++
 2 files changed, 154 insertions(+), 2 deletions(-)

diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index eb37fa583d63a..65e25fa1e034e 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -251,8 +251,8 @@ Expected<OffloadFile> getInputBitcodeLibrary(StringRef Input) {
   Image.StringData["arch"] = Arch;
   Image.Image = std::move(*ImageOrError);
 
-  std::unique_ptr<MemoryBuffer> Binary =
-      MemoryBuffer::getMemBufferCopy(OffloadBinary::write(Image));
+  std::unique_ptr<MemoryBuffer> Binary = MemoryBuffer::getMemBufferCopy(
+      OffloadBinary::write(Image), Image.Image->getBufferIdentifier());
   auto NewBinaryOrErr = OffloadBinary::create(*Binary);
   if (!NewBinaryOrErr)
     return NewBinaryOrErr.takeError();
@@ -1358,6 +1358,135 @@ static Expected<StringRef> linkDevice(ArrayRef<StringRef> InputFiles,
   return *DeviceLinkedFile;
 }
 
+static bool isStaticArchiveFile(const StringRef Filename) {
+  if (!llvm::sys::path::has_extension(Filename))
+    // Any file with no extension should not be considered an Archive.
+    return false;
+  llvm::file_magic Magic;
+  llvm::identify_magic(Filename, Magic);
+  // Only archive files are to be considered.
+  // TODO: .lib check to be added
+  return (Magic == llvm::file_magic::archive);
+}
+
+static Expected<StringRef> listSection(StringRef Filename,
+                                       const ArgList &Args) {
+  Expected<std::string> OffloadBundlerPath = findProgram(
+      "clang-offload-bundler", {getMainExecutable("clang-offload-bundler")});
+  if (!OffloadBundlerPath)
+    return OffloadBundlerPath.takeError();
+  BumpPtrAllocator Alloc;
+  StringSaver Saver(Alloc);
+
+  SmallVector<StringRef, 8> CmdArgs;
+  CmdArgs.push_back(*OffloadBundlerPath);
+  bool IsArchive = isStaticArchiveFile(Filename);
+  CmdArgs.push_back(IsArchive ? "-type=aoo" : "-type=o");
+  CmdArgs.push_back(Saver.save("-input=" + Filename));
+  CmdArgs.push_back("-list");
+  auto Output = createOutputFile("bundled-targets", "list");
+  if (!Output)
+    return Output.takeError();
+  SmallVector<std::optional<StringRef>> Redirects{std::nullopt, *Output,
+                                                  std::nullopt};
+  int ErrCode = llvm::sys::ExecuteAndWait(*OffloadBundlerPath, CmdArgs,
+                                          std::nullopt, Redirects);
+  if (ErrCode != 0)
+    return createStringError(inconvertibleErrorCode(),
+                             "Failed to list targets");
+  return *Output;
+}
+
+// This routine is used to run the clang-offload-bundler tool and unbundle
+// device inputs that have been created with an older compiler where the
+// device object is bundled into a host object.
+static Expected<StringRef> unbundle(StringRef Filename, const ArgList &Args,
+                                    llvm::Triple Triple) {
+  Expected<std::string> OffloadBundlerPath = findProgram(
+      "clang-offload-bundler", {getMainExecutable("clang-offload-bundler")});
+  if (!OffloadBundlerPath)
+    return OffloadBundlerPath.takeError();
+
+  // Create a new file to write the unbundled file to.
+  auto TempFileOrErr =
+      createOutputFile(sys::path::filename(ExecutableName), "ir");
+  if (!TempFileOrErr)
+    return TempFileOrErr.takeError();
+
+  BumpPtrAllocator Alloc;
+  StringSaver Saver(Alloc);
+
+  SmallVector<StringRef, 8> CmdArgs;
+  CmdArgs.push_back(*OffloadBundlerPath);
+  bool IsArchive = isStaticArchiveFile(Filename);
+  CmdArgs.push_back(IsArchive ? "-type=aoo" : "-type=o");
+  auto *Target = Args.MakeArgString(Twine("-targets=sycl-") + Triple.str());
+  CmdArgs.push_back(Target);
+  CmdArgs.push_back(Saver.save("-input=" + Filename));
+  CmdArgs.push_back(Saver.save("-output=" + *TempFileOrErr));
+  CmdArgs.push_back("-unbundle");
+  CmdArgs.push_back("-allow-missing-bundles");
+  if (Error Err = executeCommands(*OffloadBundlerPath, CmdArgs))
+    return std::move(Err);
+  return *TempFileOrErr;
+}
+
+Error extractBundledObjects(StringRef Filename, const ArgList &Args,
+                            SmallVector<OffloadFile> &Binaries) {
+  auto List = listSection(Filename, Args);
+  if (!List)
+    return List.takeError();
+  SmallVector<StringRef> TriplesInFile;
+  llvm::ErrorOr<std::unique_ptr<MemoryBuffer>> TripleList =
+      llvm::MemoryBuffer::getFileOrSTDIN(*List, /*isText=*/true);
+  if (std::error_code EC = TripleList.getError())
+    return createFileError(*List, EC);
+  (*TripleList)
+      ->getBuffer()
+      .split(TriplesInFile, '\n', /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+  for (StringRef TripleStr : TriplesInFile) {
+    StringRef SYCLPrefix = "sycl-";
+    if (!TripleStr.starts_with(SYCLPrefix))
+      continue;
+    llvm::Triple Triple(TripleStr.substr(SYCLPrefix.size()));
+    auto UnbundledFile = unbundle(Filename, Args, Triple);
+    if (!UnbundledFile)
+      return UnbundledFile.takeError();
+    if (*UnbundledFile == Filename)
+      continue;
+
+    SmallVector<StringRef> ObjectFilePaths;
+    if (sycl::isStaticArchiveFile(Filename)) {
+      llvm::ErrorOr<std::unique_ptr<MemoryBuffer>> ObjList =
+          llvm::MemoryBuffer::getFileOrSTDIN(*UnbundledFile, /*isText=*/true);
+      if (std::error_code EC = ObjList.getError())
+        return createFileError(*UnbundledFile, EC);
+      (*ObjList)->getBuffer().split(ObjectFilePaths, '\n', /*MaxSplit=*/-1,
+                                    /*KeepEmpty=*/false);
+    } else {
+      ObjectFilePaths.push_back(*UnbundledFile);
+    }
+    for (StringRef ObjectFilePath : ObjectFilePaths) {
+      llvm::file_magic Magic;
+      llvm::identify_magic(ObjectFilePath, Magic);
+      if (Magic == file_magic::spirv_object)
+        return createStringError(
+            "SPIR-V fat objects must be generated with --offload-new-driver");
+      auto Arg = Args.MakeArgString(
+          "sycl-" +
+          (Triple.isSPIROrSPIRV() ? Triple.str() + "-" : Triple.str()) + "=" +
+          ObjectFilePath);
+      auto Binary = getInputBitcodeLibrary(Arg);
+
+      if (!Binary)
+        return Binary.takeError();
+
+      Binaries.push_back(std::move(*Binary));
+    }
+  }
+  return Error::success();
+}
+
 } // namespace sycl
 
 namespace generic {
@@ -2634,8 +2763,14 @@ getDeviceInput(const ArgList &Args) {
     if (identify_magic(Buffer.getBuffer()) == file_magic::elf_shared_object)
       continue;
     SmallVector<OffloadFile> Binaries;
+    size_t OldSize = Binaries.size();
     if (Error Err = extractOffloadBinaries(Buffer, Binaries))
       return std::move(Err);
+    if (Binaries.size() == OldSize) {
+      if (Error Err = sycl::extractBundledObjects(*Filename, Args, Binaries))
+        return std::move(Err);
+    }
+
     for (auto &OffloadFile : Binaries) {
       if (identify_magic(Buffer.getBuffer()) == file_magic::archive &&
           !WholeArchive)
diff --git a/sycl/test-e2e/NewOffloadDriver/multisource.cpp b/sycl/test-e2e/NewOffloadDriver/multisource.cpp
index 83223d4b68e09..0d7bec07654fb 100644
--- a/sycl/test-e2e/NewOffloadDriver/multisource.cpp
+++ b/sycl/test-e2e/NewOffloadDriver/multisource.cpp
@@ -21,6 +21,23 @@
 // RUN: %clangxx -Wno-error=unused-command-line-argument -fsycl -fsycl-targets=%{sycl_triple} --offload-new-driver %t.init.o %t.calc.o %t.main.o -o %t.fat
 // RUN: %{run} %t.fat
 
+// Multiple sources with kernel code with old-style objects
+// Test with `--offload-new-driver`
+// RUN: %{build} --no-offload-new-driver -c -o %t.init.o -DINIT_KERNEL
+// RUN: %{build} --no-offload-new-driver -c -o %t.calc.o -DCALC_KERNEL
+// RUN: %{build} --no-offload-new-driver -c -o %t.main.o -DMAIN_APP
+// RUN: %clangxx -Wno-error=unused-command-line-argument -fsycl -fsycl-targets=%{sycl_triple} --offload-new-driver %t.init.o %t.calc.o %t.main.o -o %t.fat
+// RUN: %{run} %t.fat
+
+// Multiple sources with kernel code with old-style objects in a static archive
+// Test with `--offload-new-driver`
+// RUN: %{build} --no-offload-new-driver -c -o %t.init.o -DINIT_KERNEL
+// RUN: %{build} --no-offload-new-driver -c -o %t.calc.o -DCALC_KERNEL
+// RUN: %{build} --no-offload-new-driver -c -o %t.main.o -DMAIN_APP
+// RUN: llvm-ar r %t.a %t.init.o %t.calc.o
+// RUN: %clangxx -Wno-error=unused-command-line-argument -fsycl -fsycl-targets=%{sycl_triple} --offload-new-driver %t.main.o %t.a -o %t.fat
+// RUN: %{run} %t.fat
+
 #include <sycl/detail/core.hpp>
 
 #include <iostream>

From 1b05b81c69eb11890986ae3f8938ebc28f0536f7 Mon Sep 17 00:00:00 2001
From: Nick Sarnie <sarnex@users.noreply.github.com>
Date: Thu, 12 Sep 2024 01:56:26 +0900
Subject: [PATCH 14/19] [SYCL][NFCI] Rework spec constants metadata used for
 split (#15346)

Addressing review feedback from https://github.com/intel/llvm/pull/15271

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
---
 llvm/include/llvm/SYCLLowerIR/SpecConstants.h     |  5 -----
 llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp | 13 +++++--------
 llvm/lib/SYCLLowerIR/SpecConstants.cpp            |  6 +++---
 llvm/tools/sycl-post-link/sycl-post-link.cpp      |  4 ++++
 4 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/SYCLLowerIR/SpecConstants.h b/llvm/include/llvm/SYCLLowerIR/SpecConstants.h
index 114bf431a279a..217d9f90913ca 100644
--- a/llvm/include/llvm/SYCLLowerIR/SpecConstants.h
+++ b/llvm/include/llvm/SYCLLowerIR/SpecConstants.h
@@ -72,11 +72,6 @@ class SpecConstantsPass : public PassInfoMixin<SpecConstantsPass> {
   collectSpecConstantDefaultValuesMetadata(const Module &M,
                                            std::vector<char> &DefaultValues);
 
-  // Name of the metadata which holds a list of all specialization constants
-  // (with associated information) encountered in the module
-  static constexpr char SPEC_CONST_MD_STRING[] =
-      "sycl.specialization-constants";
-
   // Name of the metadata which indicates this module was proccessed with the
   // default values handing mode.
   static constexpr char SPEC_CONST_DEFAULT_VAL_MODULE_MD_STRING[] =
diff --git a/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp b/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp
index 67ac13c569f10..0bfcd53784af3 100644
--- a/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp
+++ b/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp
@@ -150,14 +150,11 @@ PropSetRegTy computeModuleProperties(const Module &M,
     PropSet.add(PropSetRegTy::SYCL_DEVICE_REQUIREMENTS,
                 computeDeviceRequirements(M, EntryPoints).asMap());
   }
-  auto *SpecConstsMD =
-      M.getNamedMetadata(SpecConstantsPass::SPEC_CONST_MD_STRING);
-  bool SpecConstsMet =
-      SpecConstsMD != nullptr && SpecConstsMD->getNumOperands() != 0;
-  if (SpecConstsMet) {
-    // extract spec constant maps per each module
-    SpecIDMapTy TmpSpecIDMap;
-    SpecConstantsPass::collectSpecConstantMetadata(M, TmpSpecIDMap);
+
+  // extract spec constant maps per each module
+  SpecIDMapTy TmpSpecIDMap;
+  SpecConstantsPass::collectSpecConstantMetadata(M, TmpSpecIDMap);
+  if (!TmpSpecIDMap.empty()) {
     PropSet.add(PropSetRegTy::SYCL_SPECIALIZATION_CONSTANTS, TmpSpecIDMap);
 
     // Add property with the default values of spec constants
diff --git a/llvm/lib/SYCLLowerIR/SpecConstants.cpp b/llvm/lib/SYCLLowerIR/SpecConstants.cpp
index bf8215db94028..4f43a22e95fd9 100644
--- a/llvm/lib/SYCLLowerIR/SpecConstants.cpp
+++ b/llvm/lib/SYCLLowerIR/SpecConstants.cpp
@@ -47,6 +47,9 @@ constexpr char SPIRV_GET_SPEC_CONST_VAL[] = "__spirv_SpecConstant";
 constexpr char SPIRV_GET_SPEC_CONST_COMPOSITE[] =
     "__spirv_SpecConstantComposite";
 
+// Name of the metadata which holds a list of all specialization constants (with
+// associated information) encountered in the module
+constexpr char SPEC_CONST_MD_STRING[] = "sycl.specialization-constants";
 // Name of the metadata which holds a default value list of all specialization
 // constants encountered in the module
 constexpr char SPEC_CONST_DEFAULT_VAL_MD_STRING[] =
@@ -1026,9 +1029,6 @@ PreservedAnalyses SpecConstantsPass::run(Module &M,
   for (const auto &P : DefaultsMetadata)
     MDDefaults->addOperand(P);
 
-  if (Mode == HandlingMode::default_values)
-    M.getOrInsertNamedMetadata(SPEC_CONST_DEFAULT_VAL_MODULE_MD_STRING);
-
   return IRModified ? PreservedAnalyses::none() : PreservedAnalyses::all();
 }
 
diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp
index 00bacce06d08b..3ea65b2492a5c 100644
--- a/llvm/tools/sycl-post-link/sycl-post-link.cpp
+++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp
@@ -509,6 +509,10 @@ processSpecConstantsWithDefaultValues(const module_split::ModuleDesc &MD) {
   assert(NewModuleDesc->Props.SpecConstsMet &&
          "This property should be true since the presence of SpecConsts "
          "has been checked before the run of the pass");
+  // Add metadata to the module so we can identify it as the default value split
+  // later.
+  NewModuleDesc->getModule().getOrInsertNamedMetadata(
+      SpecConstantsPass::SPEC_CONST_DEFAULT_VAL_MODULE_MD_STRING);
   NewModuleDesc->rebuildEntryPoints();
   return NewModuleDesc;
 }

From 8d1c9f02126cab95a0b84f5429f639050c993711 Mon Sep 17 00:00:00 2001
From: Nick Sarnie <sarnex@users.noreply.github.com>
Date: Thu, 12 Sep 2024 02:07:40 +0900
Subject: [PATCH 15/19] [SYCL] Remove device_global_static.cpp test  (#15362)

We don't think it provides value.

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
---
 .../device_global_static.cpp                  | 36 -------------------
 1 file changed, 36 deletions(-)
 delete mode 100644 sycl/test/check_device_code/device_global_static.cpp

diff --git a/sycl/test/check_device_code/device_global_static.cpp b/sycl/test/check_device_code/device_global_static.cpp
deleted file mode 100644
index a2d7768067de6..0000000000000
--- a/sycl/test/check_device_code/device_global_static.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-// Tests that the llvm.compiler.used symbol, which is used to implement static
-// device globals, is removed at some point in compilation. For SPIR-V this
-// symbol is removed at sycl-post-link and for NVPTX/AMDGCN it is removed at
-// lowering.
-//
-// It also checks that the symbol can be found in an object file for a given
-// triple, thus validating that `llvm-strings` can successfully be used to
-// check for the presence of the symbol.
-
-// UNSUPPORTED: windows
-
-// RUN: %clangxx -fsycl -fsycl-device-only %s -o %t
-// RUN: llvm-strings %t | grep "llvm.compiler.used"
-// RUN: %clangxx -fsycl %s -o %t
-// RUN: llvm-strings %t | not grep "llvm.compiler.used"
-
-// RUN: %if cuda %{ %clangxx -fsycl -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda %s -o %t %}
-// RUN: %if cuda %{ llvm-strings %t | grep "llvm.compiler.used" %}
-// RUN: %if cuda %{ %clangxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda %s -o %t %}
-// RUN: %if cuda %{ llvm-strings %t | not grep "llvm.compiler.used" %}
-
-// RUN: %if hip_amd %{ %clangxx -fsycl -fsycl-device-only -fsycl-targets=amd_gpu_gfx906 %s -o %t %}
-// RUN: %if hip_amd %{ llvm-strings %t | grep "llvm.compiler.used" %}
-// RUN: %if hip_amd %{ %clangxx -fsycl -fsycl-targets=amd_gpu_gfx906 %s -o %t %}
-// RUN: %if hip_amd %{ llvm-strings %t | not grep "llvm.compiler.used" %}
-
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental;
-
-static device_global<int> DeviceGlobalVar;
-
-int main() {
-  sycl::queue{}.single_task([=] { volatile int ReadVal = DeviceGlobalVar; });
-}

From d286ca2c1aa4377c51faf282a4d972d8b728c711 Mon Sep 17 00:00:00 2001
From: Kseniya Tikhomirova <kseniya.tikhomirova@intel.com>
Date: Wed, 11 Sep 2024 19:20:56 +0200
Subject: [PATCH 16/19] [SYCL] Add sync for host task after barrier (#15345)

PR includes the following fixes:
* When submitting a command to an out-of-order queue we don't need to
add the dependency from the last barrier to the scheduler if command is
enqueued via UR because that means that UR backend will take care of the
command from barrier.

* We used to update the last barrier only if barrier which is being
submitted to the sycl::queue is not enqueued to the backend which is not
correct, because there might be host tasks following that barrier which
have to depend on it.

Signed-off-by: Tikhomirova, Kseniya <kseniya.tikhomirova@intel.com>
---
 sycl/source/detail/queue_impl.hpp               | 15 ++++++++-------
 sycl/unittests/scheduler/HostTaskAndBarrier.cpp |  9 ++++++---
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index e15e9bc69503e..a67fdb5e19102 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -784,18 +784,19 @@ class queue_impl {
       if (Type == CGType::Barrier && !Deps.UnenqueuedCmdEvents.empty()) {
         Handler.depends_on(Deps.UnenqueuedCmdEvents);
       }
-      if (Deps.LastBarrier)
+      if (Deps.LastBarrier && (Type == CGType::CodeplayHostTask ||
+                               (!Deps.LastBarrier->isEnqueued())))
         Handler.depends_on(Deps.LastBarrier);
+
       EventRet = Handler.finalize();
       EventImplPtr EventRetImpl = getSyclObjImpl(EventRet);
       if (Type == CGType::CodeplayHostTask)
         Deps.UnenqueuedCmdEvents.push_back(EventRetImpl);
-      else if (!EventRetImpl->isEnqueued()) {
-        if (Type == CGType::Barrier || Type == CGType::BarrierWaitlist) {
-          Deps.LastBarrier = EventRetImpl;
-          Deps.UnenqueuedCmdEvents.clear();
-        } else
-          Deps.UnenqueuedCmdEvents.push_back(EventRetImpl);
+      else if (Type == CGType::Barrier || Type == CGType::BarrierWaitlist) {
+        Deps.LastBarrier = EventRetImpl;
+        Deps.UnenqueuedCmdEvents.clear();
+      } else if (!EventRetImpl->isEnqueued()) {
+        Deps.UnenqueuedCmdEvents.push_back(EventRetImpl);
       }
     }
   }
diff --git a/sycl/unittests/scheduler/HostTaskAndBarrier.cpp b/sycl/unittests/scheduler/HostTaskAndBarrier.cpp
index 1a2e981c2ae99..c244e7b60a0b3 100644
--- a/sycl/unittests/scheduler/HostTaskAndBarrier.cpp
+++ b/sycl/unittests/scheduler/HostTaskAndBarrier.cpp
@@ -196,7 +196,8 @@ TEST_F(BarrierHandlingWithHostTask, BarrierHostTaskKernel) {
   sycl::event HTEvent = AddTask(TestCGType::HOST_TASK);
   EventImplPtr HostTaskEventImpl = sycl::detail::getSyclObjImpl(HTEvent);
   auto HostTaskWaitList = HostTaskEventImpl->getWaitList();
-  ASSERT_EQ(HostTaskWaitList.size(), 0u);
+  ASSERT_EQ(HostTaskWaitList.size(), 1u);
+  EXPECT_EQ(HostTaskWaitList[0], BarrierEventImpl);
   EXPECT_EQ(HostTaskEventImpl->isEnqueued(), true);
 
   sycl::event KernelEvent = AddTask(TestCGType::KERNEL_TASK);
@@ -225,7 +226,8 @@ TEST_F(BarrierHandlingWithHostTask, BarrierKernelHostTask) {
   sycl::event HTEvent = AddTask(TestCGType::HOST_TASK);
   EventImplPtr HostTaskEventImpl = sycl::detail::getSyclObjImpl(HTEvent);
   auto HostTaskWaitList = HostTaskEventImpl->getWaitList();
-  ASSERT_EQ(HostTaskWaitList.size(), 0u);
+  ASSERT_EQ(HostTaskWaitList.size(), 1u);
+  EXPECT_EQ(HostTaskWaitList[0], BarrierEventImpl);
   EXPECT_EQ(HostTaskEventImpl->isEnqueued(), true);
 
   MainLock.unlock();
@@ -272,7 +274,8 @@ TEST_F(BarrierHandlingWithHostTask, KernelBarrierHostTask) {
   sycl::event HTEvent = AddTask(TestCGType::HOST_TASK);
   EventImplPtr HostTaskEventImpl = sycl::detail::getSyclObjImpl(HTEvent);
   auto HostTaskWaitList = HostTaskEventImpl->getWaitList();
-  ASSERT_EQ(HostTaskWaitList.size(), 0u);
+  ASSERT_EQ(HostTaskWaitList.size(), 1u);
+  EXPECT_EQ(HostTaskWaitList[0], BarrierEventImpl);
   EXPECT_EQ(HostTaskEventImpl->isEnqueued(), true);
 
   MainLock.unlock();

From 4152ed323077ea5bc8c8c61063484e9db2395cfd Mon Sep 17 00:00:00 2001
From: Sergey Semenov <sergey.semenov@intel.com>
Date: Wed, 11 Sep 2024 10:50:09 -0700
Subject: [PATCH 17/19] [SYCL][NFC] Fix InOrderQueueIsolatedDeps when run
 repeatedly (#15340)

The test needs to reset the variable used to check when a UR call is
made.
---
 sycl/unittests/scheduler/InOrderQueueDeps.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sycl/unittests/scheduler/InOrderQueueDeps.cpp b/sycl/unittests/scheduler/InOrderQueueDeps.cpp
index 4e53f0ed73a18..b763cbef85a7a 100644
--- a/sycl/unittests/scheduler/InOrderQueueDeps.cpp
+++ b/sycl/unittests/scheduler/InOrderQueueDeps.cpp
@@ -107,6 +107,7 @@ TEST_F(SchedulerTest, InOrderQueueIsolatedDeps) {
   sycl::platform Plt = sycl::platform();
   mock::getCallbacks().set_before_callback(
       "urEnqueueEventsWaitWithBarrier", &redefinedEnqueueEventsWaitWithBarrier);
+  BarrierCalled = false;
 
   context Ctx{Plt.get_devices()[0]};
   queue Q1{Ctx, default_selector_v, property::queue::in_order()};

From d3d95215454af4ba8491888cc030273b32cb5c8d Mon Sep 17 00:00:00 2001
From: David Garcia Orozco <david.garcia.orozco@intel.com>
Date: Wed, 11 Sep 2024 13:30:41 -0700
Subject: [PATCH 18/19] [SYCL][E2E] Fix warnings from using GNU style options
 with MSVC compiler driver (#15364)

Due to the addition of the `-Werror` flag, these warnings were causing
test failures when compiling tests with clang-cl. This patch does the
following:

- Add `-Wno-unused-command-line-argument` to linking run lines. This is
needed because when testing the MSVC driver the `/EHsc` flag is added to
all run lines that call the compiler. This flag however is only needed
at the compile stage so it is reported as unused if a run line is only
linking.
- Add new expansion for no optimizations `%no_opt`, which is either
`-O0` or `/Od` depending on the compiler driver.
- Use the expansion `%cxx_std_option` in place of `-std=`
- For flags that do not necessarily have an MSVC equivalent, prepend
`/clang:` to the flag when using MSVC driver.
---
 sycl/test-e2e/AOT/fpga-aoc-archive-early2.cpp            | 8 ++++----
 sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp         | 3 ++-
 sycl/test-e2e/Basic/fpga_tests/fpga_aocx.cpp             | 2 +-
 sycl/test-e2e/Basic/fpga_tests/fpga_aocx_win.cpp         | 2 +-
 sycl/test-e2e/Basic/group_async_copy.cpp                 | 2 +-
 sycl/test-e2e/Basic/group_async_copy_legacy.cpp          | 2 +-
 sycl/test-e2e/Basic/group_local_id.cpp                   | 2 +-
 sycl/test-e2e/Basic/multisource.cpp                      | 4 ++--
 sycl/test-e2e/Basic/multisource_spv_obj.cpp              | 6 +++---
 sycl/test-e2e/Basic/spirv_device_obj_smoke.cpp           | 2 +-
 sycl/test-e2e/Basic/vector/byte.cpp                      | 2 +-
 sycl/test-e2e/Config/config.cpp                          | 2 +-
 sycl/test-e2e/Config/env_vars.cpp                        | 2 +-
 sycl/test-e2e/DeviceLib/separate_compile_test.cpp        | 4 ++--
 sycl/test-e2e/ESIMD/rotate.cpp                           | 4 ++--
 sycl/test-e2e/ESIMD/slm_alloc.cpp                        | 2 +-
 .../test-e2e/ESIMD/slm_alloc_many_kernels_many_funcs.cpp | 2 +-
 sycl/test-e2e/ESIMD/slm_alloc_many_kernels_one_func.cpp  | 2 +-
 .../GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp   | 2 +-
 sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp            | 2 +-
 sycl/test-e2e/NewOffloadDriver/aot-cpu.cpp               | 2 +-
 .../test-e2e/NewOffloadDriver/spirv_device_obj_smoke.cpp | 2 +-
 .../is_compatible/is_compatible_with_aspects.cpp         | 2 +-
 .../sycl-external-with-optional-features.cpp             | 2 +-
 .../throw-exception-for-unsupported-aspect.cpp           | 2 +-
 sycl/test-e2e/ProgramManager/uneven_kernel_split.cpp     | 2 +-
 .../PropagateOptionsToBackend/sycl-opt-level-opencl.cpp  | 4 ++--
 .../Regression/2020-spec-constants-debug-info.cpp        | 2 +-
 sycl/test-e2e/Regression/DAE-separate-compile.cpp        | 2 +-
 sycl/test-e2e/Regression/compile_on_win_with_mdd.cpp     | 2 +-
 .../Regression/default-constructed-local-accessor.cpp    | 2 +-
 .../Regression/optimization_level_debug_info_intopt.cpp  | 2 +-
 .../Regression/optimization_level_debug_info_specopt.cpp | 9 ++++++---
 sycl/test-e2e/Regression/unoptimized_stream.cpp          | 2 +-
 sycl/test-e2e/SeparateCompile/same-kernel.cpp            | 2 +-
 .../SeparateCompile/sycl-external-within-staticlib.cpp   | 4 ++--
 sycl/test-e2e/SeparateCompile/sycl-external.cpp          | 4 ++--
 sycl/test-e2e/SeparateCompile/test.cpp                   | 2 +-
 sycl/test-e2e/lit.cfg.py                                 | 2 ++
 39 files changed, 57 insertions(+), 51 deletions(-)

diff --git a/sycl/test-e2e/AOT/fpga-aoc-archive-early2.cpp b/sycl/test-e2e/AOT/fpga-aoc-archive-early2.cpp
index d23ece6677b06..08cc0e8199cfd 100644
--- a/sycl/test-e2e/AOT/fpga-aoc-archive-early2.cpp
+++ b/sycl/test-e2e/AOT/fpga-aoc-archive-early2.cpp
@@ -18,10 +18,10 @@
 ////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////
 // Build any image archive binaries from early archives.
-// RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image %t_early_sub.a   -o %t_early_image_sub.a
-// RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image %t_early_add.a   -o %t_early_image_add.a
-// RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image %t_early_sub_x.a -o %t_early_image_sub_x.a
-// RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image %t_early_add_x.a -o %t_early_image_add_x.a
+// RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image -Wno-unused-command-line-argument %t_early_sub.a   -o %t_early_image_sub.a
+// RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image -Wno-unused-command-line-argument %t_early_add.a   -o %t_early_image_add.a
+// RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image -Wno-unused-command-line-argument %t_early_sub_x.a -o %t_early_image_sub_x.a
+// RUN: %clangxx -fintelfpga -fsycl -fsycl-link=image -Wno-unused-command-line-argument %t_early_add_x.a -o %t_early_image_add_x.a
 ////////////////////////////////////////////////////////////////////////////////
 // Use a variety of archive orders
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp b/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp
index 75fe3f2fc50f9..190ca3bedf095 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp
@@ -1,4 +1,5 @@
-// RUN: %{build} -fno-fast-math -o %t.out
+// DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%}
+// RUN: %{build} %{mathflags} -o %t.out
 // RUN: %{run} %t.out
 
 // Test new, ABI-breaking for all platforms.
diff --git a/sycl/test-e2e/Basic/fpga_tests/fpga_aocx.cpp b/sycl/test-e2e/Basic/fpga_tests/fpga_aocx.cpp
index 578602dccb0e2..6f2ca5f76f1ec 100644
--- a/sycl/test-e2e/Basic/fpga_tests/fpga_aocx.cpp
+++ b/sycl/test-e2e/Basic/fpga_tests/fpga_aocx.cpp
@@ -20,7 +20,7 @@
 // AOCX with source
 // RUN: %clangxx -fsycl -fintelfpga %S/Inputs/fpga_host.cpp %t_image.a -o %t_aocx_src.out
 // AOCX with object
-// RUN: %clangxx -fsycl -fintelfpga %t.o %t_image.a -o %t_aocx_obj.out
+// RUN: %clangxx -fsycl -fintelfpga %t.o %t_image.a -Wno-unused-command-line-argument -o %t_aocx_obj.out
 //
 // RUN: %{run} %t_aocx_src.out
 // RUN: %{run} %t_aocx_obj.out
diff --git a/sycl/test-e2e/Basic/fpga_tests/fpga_aocx_win.cpp b/sycl/test-e2e/Basic/fpga_tests/fpga_aocx_win.cpp
index 44f768ace53df..d648eaaff46ca 100644
--- a/sycl/test-e2e/Basic/fpga_tests/fpga_aocx_win.cpp
+++ b/sycl/test-e2e/Basic/fpga_tests/fpga_aocx_win.cpp
@@ -21,7 +21,7 @@
 // AOCX with source
 // RUN: %clangxx -fsycl -fintelfpga -DHOST_PART %S/Inputs/fpga_host.cpp %t_image.lib -o %t_aocx_src.out
 // AOCX with object
-// RUN: %clangxx -fsycl -fintelfpga %t.obj %t_image.lib -o %t_aocx_obj.out
+// RUN: %clangxx -fsycl -fintelfpga %t.obj %t_image.lib -Wno-unused-command-line-argument -o %t_aocx_obj.out
 //
 // RUN: %{run} %t_aocx_src.out
 // RUN: %{run} %t_aocx_obj.out
diff --git a/sycl/test-e2e/Basic/group_async_copy.cpp b/sycl/test-e2e/Basic/group_async_copy.cpp
index 0e539d9912fe8..2eceadd3e9898 100644
--- a/sycl/test-e2e/Basic/group_async_copy.cpp
+++ b/sycl/test-e2e/Basic/group_async_copy.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -std=c++17 -o %t.run
+// RUN: %{build} %cxx_std_optionc++17 -o %t.run
 // RUN: %{run} %t.run
 
 // Windows doesn't yet have full shutdown(). Skipping TC MemLeak tests.
diff --git a/sycl/test-e2e/Basic/group_async_copy_legacy.cpp b/sycl/test-e2e/Basic/group_async_copy_legacy.cpp
index dbdaaeada882c..ac5fac91e0cdb 100644
--- a/sycl/test-e2e/Basic/group_async_copy_legacy.cpp
+++ b/sycl/test-e2e/Basic/group_async_copy_legacy.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -std=c++17 -o %t.run -Wno-deprecated-declarations
+// RUN: %{build} %cxx_std_optionc++17 -o %t.run -Wno-deprecated-declarations
 // RUN: %{run} %t.run
 
 // Variant of group_asymc_copy.cpp using legacy multi_ptr and the corresponding
diff --git a/sycl/test-e2e/Basic/group_local_id.cpp b/sycl/test-e2e/Basic/group_local_id.cpp
index 83e43ef4be188..a523ea70cc8de 100644
--- a/sycl/test-e2e/Basic/group_local_id.cpp
+++ b/sycl/test-e2e/Basic/group_local_id.cpp
@@ -1,6 +1,6 @@
 // REQUIRES: cpu
 
-// RUN: %{build} -std=c++17 -o %t.out
+// RUN: %{build} %cxx_std_optionc++17 -o %t.out
 // RUN: %{run} %t.out
 
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/Basic/multisource.cpp b/sycl/test-e2e/Basic/multisource.cpp
index 778fc7007e0a8..87ac6ccfe4888 100644
--- a/sycl/test-e2e/Basic/multisource.cpp
+++ b/sycl/test-e2e/Basic/multisource.cpp
@@ -9,14 +9,14 @@
 // Separate kernel sources and host code sources
 // RUN: %{build} -c -o %t.kernel.o -DINIT_KERNEL -DCALC_KERNEL
 // RUN: %{build} -c -o %t.main.o -DMAIN_APP
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.kernel.o %t.main.o -o %t.fat
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.kernel.o %t.main.o -Wno-unused-command-line-argument -o %t.fat
 // RUN: %{run} %t.fat
 
 // Multiple sources with kernel code
 // RUN: %{build} -c -o %t.init.o -DINIT_KERNEL
 // RUN: %{build} -c -o %t.calc.o -DCALC_KERNEL
 // RUN: %{build} -c -o %t.main.o -DMAIN_APP
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.init.o %t.calc.o %t.main.o -o %t.fat
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.init.o %t.calc.o %t.main.o -Wno-unused-command-line-argument -o %t.fat
 // RUN: %{run} %t.fat
 
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/Basic/multisource_spv_obj.cpp b/sycl/test-e2e/Basic/multisource_spv_obj.cpp
index 46bbfceac0fb9..9756de3f24fd1 100644
--- a/sycl/test-e2e/Basic/multisource_spv_obj.cpp
+++ b/sycl/test-e2e/Basic/multisource_spv_obj.cpp
@@ -11,21 +11,21 @@
 // Separate kernel sources and host code sources
 // RUN: %{build} -fsycl-device-obj=spirv -c -o %t.kernel.o -DINIT_KERNEL -DCALC_KERNEL
 // RUN: %{build} -fsycl-device-obj=spirv -c -o %t.main.o -DMAIN_APP
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.kernel.o %t.main.o -o %t.fat
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.kernel.o %t.main.o -Wno-unused-command-line-argument -o %t.fat
 // RUN: %{run} %t.fat
 
 // Multiple sources with kernel code
 // RUN: %{build} -fsycl-device-obj=spirv -c -o %t.init.o -DINIT_KERNEL
 // RUN: %{build} -fsycl-device-obj=spirv -c -o %t.calc.o -DCALC_KERNEL
 // RUN: %{build} -fsycl-device-obj=spirv -c -o %t.main.o -DMAIN_APP
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.init.o %t.calc.o %t.main.o -o %t.fat
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.init.o %t.calc.o %t.main.o -Wno-unused-command-line-argument -o %t.fat
 // RUN: %{run} %t.fat
 
 // Multiple sources with kernel code, mixed SPIR-V and LLVM-IR objects
 // RUN: %{build} -fsycl-device-obj=spirv -c -o %t.init.o -DINIT_KERNEL
 // RUN: %{build} -fsycl-device-obj=llvmir -c -o %t.calc.o -DCALC_KERNEL
 // RUN: %{build} -c -o %t.main.o -DMAIN_APP
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.init.o %t.calc.o %t.main.o -o %t.fat
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.init.o %t.calc.o %t.main.o -Wno-unused-command-line-argument -o %t.fat
 // RUN: %{run} %t.fat
 
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/Basic/spirv_device_obj_smoke.cpp b/sycl/test-e2e/Basic/spirv_device_obj_smoke.cpp
index 4c6f371814b69..9dd2194e9e33c 100644
--- a/sycl/test-e2e/Basic/spirv_device_obj_smoke.cpp
+++ b/sycl/test-e2e/Basic/spirv_device_obj_smoke.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED: cuda || hip
 // RUN: %clangxx -fsycl -fsycl-device-obj=spirv -c -o %t.o %s
-// RUN: %clangxx -fsycl -o %t.out %t.o
+// RUN: %clangxx -fsycl -Wno-unused-command-line-argument -o %t.out %t.o
 // RUN: %{run} %t.out
 
 // This test verifies SPIR-V based fat objects.
diff --git a/sycl/test-e2e/Basic/vector/byte.cpp b/sycl/test-e2e/Basic/vector/byte.cpp
index c0f94e687559a..278a49a31b955 100644
--- a/sycl/test-e2e/Basic/vector/byte.cpp
+++ b/sycl/test-e2e/Basic/vector/byte.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -std=c++17 -o %t.out
+// RUN: %{build} %cxx_std_optionc++17 -o %t.out
 // RUN: %{run} %t.out
 
 // RUN: %if preview-breaking-changes-supported %{ %{build} -fpreview-breaking-changes -std=c++17 -o %t2.out %}
diff --git a/sycl/test-e2e/Config/config.cpp b/sycl/test-e2e/Config/config.cpp
index 1499a6f9908d5..1374e6e56cfde 100644
--- a/sycl/test-e2e/Config/config.cpp
+++ b/sycl/test-e2e/Config/config.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// RUN: %{build} %debug_option -O0 -o %t.out
+// RUN: %{build} %debug_option %no_opt -o %t.out
 // RUN: echo SYCL_PRINT_EXECUTION_GRAPH=always > %t.cfg
 // RUN: env SYCL_CONFIG_FILE_NAME=%t.cfg %t.out
 // RUN: cat *.dot > /dev/null
diff --git a/sycl/test-e2e/Config/env_vars.cpp b/sycl/test-e2e/Config/env_vars.cpp
index 00f17c508ccae..67ec5debd4986 100644
--- a/sycl/test-e2e/Config/env_vars.cpp
+++ b/sycl/test-e2e/Config/env_vars.cpp
@@ -1,7 +1,7 @@
 // REQUIRES: opencl
 // Env vars are used to pass OpenCL-specific flags to PI compiling/linking.
 //
-// RUN: %{build} -O0 -o %t.out
+// RUN: %{build} %no_opt -o %t.out
 //
 // RUN: env SYCL_PROGRAM_COMPILE_OPTIONS="-g" %{run} %t.out
 // RUN: env SYCL_PROGRAM_APPEND_COMPILE_OPTIONS="-g" %{run} %t.out
diff --git a/sycl/test-e2e/DeviceLib/separate_compile_test.cpp b/sycl/test-e2e/DeviceLib/separate_compile_test.cpp
index ad137fb14e5b8..09085fe2c5ab0 100644
--- a/sycl/test-e2e/DeviceLib/separate_compile_test.cpp
+++ b/sycl/test-e2e/DeviceLib/separate_compile_test.cpp
@@ -4,12 +4,12 @@
 // RUN: %clangxx -fsycl-device-only -Xclang -fsycl-int-header=std_complex_math_test_ihdr.h %S/std_complex_math_test.cpp -Wno-sycl-strict %{mathflags}
 // >> host compilation...
 // RUN: %clangxx -Wno-error=unused-command-line-argument -Wno-error=ignored-attributes %cxx_std_optionc++17 %include_option std_complex_math_test_ihdr.h -c %S/std_complex_math_test.cpp -o %t_host.o %sycl_options -Wno-sycl-strict %{mathflags}
-// RUN: %clangxx %t_host.o %t_device.o -o %t.out %sycl_options %{mathflags}
+// RUN: %clangxx %t_host.o %t_device.o -Wno-unused-command-line-argument -o %t.out %sycl_options %{mathflags}
 // RUN: %{run} %t.out
 
 // RUN: %clangxx -fsycl -fsycl-link  %S/std_complex_math_fp64_test.cpp -o %t_fp64_device.o %{mathflags}
 // RUN: %clangxx -fsycl-device-only -Xclang -fsycl-int-header=std_complex_math_fp64_test_ihdr.h %S/std_complex_math_fp64_test.cpp -Wno-sycl-strict %{mathflags}
 // >> host compilation...
 // RUN: %clangxx -Wno-error=unused-command-line-argument -Wno-error=ignored-attributes %cxx_std_optionc++17 %include_option std_complex_math_fp64_test_ihdr.h -c %S/std_complex_math_fp64_test.cpp -o %t_fp64_host.o %sycl_options -Wno-sycl-strict %{mathflags}
-// RUN: %clangxx %t_fp64_host.o %t_fp64_device.o -o %t_fp64.out %sycl_options %{mathflags}
+// RUN: %clangxx %t_fp64_host.o %t_fp64_device.o -Wno-unused-command-line-argument -o %t_fp64.out %sycl_options %{mathflags}
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/ESIMD/rotate.cpp b/sycl/test-e2e/ESIMD/rotate.cpp
index 484be469752e7..246304b46f4e6 100644
--- a/sycl/test-e2e/ESIMD/rotate.cpp
+++ b/sycl/test-e2e/ESIMD/rotate.cpp
@@ -6,9 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-// RUN: %{build} -fsycl-device-code-split=per_kernel -std=c++20 -o %t.out
+// RUN: %{build} -fsycl-device-code-split=per_kernel %cxx_std_optionc++20 -o %t.out
 // RUN: %{run} %t.out
-// RUN: %{build} -fsycl-device-code-split=per_kernel -std=c++20 -o %t1.out -DEXP
+// RUN: %{build} -fsycl-device-code-split=per_kernel %cxx_std_optionc++20 -o %t1.out -DEXP
 // RUN: %{run} %t1.out
 
 // This is a basic test to validate the ror/rol functions.
diff --git a/sycl/test-e2e/ESIMD/slm_alloc.cpp b/sycl/test-e2e/ESIMD/slm_alloc.cpp
index 06bfa06498746..fe2ebdf23d484 100644
--- a/sycl/test-e2e/ESIMD/slm_alloc.cpp
+++ b/sycl/test-e2e/ESIMD/slm_alloc.cpp
@@ -8,7 +8,7 @@
 // RUN: %{run} %t.2.out
 
 // Check if the test sill passes with O0
-// RUN: %{build} -O0 -o %t.3.out
+// RUN: %{build} %no_opt -o %t.3.out
 // RUN: %{run} %t.3.out
 
 // This is end-to-end test for the slm_allocator API used together with the
diff --git a/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_many_funcs.cpp b/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_many_funcs.cpp
index bd0ed45866ea4..dce107b5019a4 100644
--- a/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_many_funcs.cpp
+++ b/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_many_funcs.cpp
@@ -11,7 +11,7 @@
 // RUN: %{run} %t.2.out
 
 // Check if the test sill passes with O0
-// RUN: %{build} -O0 -o %t.3.out
+// RUN: %{build} %no_opt -o %t.3.out
 // RUN: %{run} %t.3.out
 
 // Checks validity of SLM frame offsets in case of complex call graph with two
diff --git a/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_one_func.cpp b/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_one_func.cpp
index 2e744fdf88aeb..311220039db62 100644
--- a/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_one_func.cpp
+++ b/sycl/test-e2e/ESIMD/slm_alloc_many_kernels_one_func.cpp
@@ -11,7 +11,7 @@
 // RUN: %{run} %t.2.out
 
 // Check if the test sill passes with O0
-// RUN: %{build} -O0 -o %t.3.out
+// RUN: %{build} %no_opt -o %t.3.out
 // RUN: %{run} %t.3.out
 
 // Check that SLM frame offset of a function foo called from two kernels Test1
diff --git a/sycl/test-e2e/GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp b/sycl/test-e2e/GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp
index ed705979ac4fe..1c1483dfec33e 100644
--- a/sycl/test-e2e/GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp
@@ -5,7 +5,7 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-// RUN: %{build} -O0 -o %t_O0.out
+// RUN: %{build} %no_opt -o %t_O0.out
 // RUN: %{run} %t_O0.out
 
 /*
diff --git a/sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp b/sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp
index 6f19d9f3cdf2a..a24e9e6d6afd1 100644
--- a/sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp
+++ b/sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// RUN: %{build} -O0 -o %t.out
+// RUN: %{build} %no_opt -o %t.out
 
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/NewOffloadDriver/aot-cpu.cpp b/sycl/test-e2e/NewOffloadDriver/aot-cpu.cpp
index 09ff48d97173c..94b698d6ca6ca 100644
--- a/sycl/test-e2e/NewOffloadDriver/aot-cpu.cpp
+++ b/sycl/test-e2e/NewOffloadDriver/aot-cpu.cpp
@@ -6,5 +6,5 @@
 // RUN: %{run} %t.out
 
 // Test -O0 with `--offload-new-driver`
-// RUN: %clangxx -O0 -fsycl -fsycl-targets=spir64-x86_64 %S/Inputs/aot.cpp
+// RUN: %clangxx %no_opt -fsycl -fsycl-targets=spir64-x86_64 %S/Inputs/aot.cpp
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/NewOffloadDriver/spirv_device_obj_smoke.cpp b/sycl/test-e2e/NewOffloadDriver/spirv_device_obj_smoke.cpp
index 63351c8cab3c6..d7f0fb22ffb70 100644
--- a/sycl/test-e2e/NewOffloadDriver/spirv_device_obj_smoke.cpp
+++ b/sycl/test-e2e/NewOffloadDriver/spirv_device_obj_smoke.cpp
@@ -1,7 +1,7 @@
 // UNSUPPORTED: cuda || hip
 // Test with `--offload-new-driver`
 // RUN: %clangxx -fsycl -fsycl-device-obj=spirv --offload-new-driver -c -o %t.o %s
-// RUN: %clangxx -fsycl --offload-new-driver -o %t.out %t.o
+// RUN: %clangxx -fsycl --offload-new-driver -Wno-unused-command-line-argument -o %t.out %t.o
 // RUN: %{run} %t.out
 
 // This test verifies SPIR-V based fat objects.
diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp
index 68a8c19f1de0a..7bf2c5e1edb12 100644
--- a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp
+++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp
@@ -1,7 +1,7 @@
 // requires: cpu, gpu, accelerator
 // UNSUPPORTED: hip
 // FIXME: enable the test back, see intel/llvm#8146
-// RUN: %{build} -Wno-error=incorrect-sub-group-size -O0 -o %t.out
+// RUN: %{build} -Wno-error=incorrect-sub-group-size %no_opt -o %t.out
 // RUN: %{run} %t.out
 
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/OptionalKernelFeatures/sycl-external-with-optional-features.cpp b/sycl/test-e2e/OptionalKernelFeatures/sycl-external-with-optional-features.cpp
index 3f38a4df3023d..5a04ea4ed55df 100644
--- a/sycl/test-e2e/OptionalKernelFeatures/sycl-external-with-optional-features.cpp
+++ b/sycl/test-e2e/OptionalKernelFeatures/sycl-external-with-optional-features.cpp
@@ -1,6 +1,6 @@
 // RUN: %{build} -DSOURCE1 -c -o %t1.o
 // RUN: %{build} -DSOURCE2 -c -o %t2.o
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t1.o %t2.o -o %t.exe
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t1.o %t2.o -Wno-unused-command-line-argument -o %t.exe
 // RUN: %{run} %t.exe
 
 #ifdef SOURCE1
diff --git a/sycl/test-e2e/OptionalKernelFeatures/throw-exception-for-unsupported-aspect.cpp b/sycl/test-e2e/OptionalKernelFeatures/throw-exception-for-unsupported-aspect.cpp
index c9c9f54eaff98..245fb3a28d718 100644
--- a/sycl/test-e2e/OptionalKernelFeatures/throw-exception-for-unsupported-aspect.cpp
+++ b/sycl/test-e2e/OptionalKernelFeatures/throw-exception-for-unsupported-aspect.cpp
@@ -1,6 +1,6 @@
 // REQUIRES: cpu
 
-// RUN: %{build} -O0 -o %t.out
+// RUN: %{build} %no_opt -o %t.out
 // RUN: %{run} %t.out
 
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/ProgramManager/uneven_kernel_split.cpp b/sycl/test-e2e/ProgramManager/uneven_kernel_split.cpp
index e21309db20e64..6304f4bbfd5b5 100644
--- a/sycl/test-e2e/ProgramManager/uneven_kernel_split.cpp
+++ b/sycl/test-e2e/ProgramManager/uneven_kernel_split.cpp
@@ -4,7 +4,7 @@
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64 -I %S/Inputs/ %S/uneven_kernel_split.cpp -c -o %t.o
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen %gpu_aot_target_opts -I %S/Inputs/ %S/Inputs/gpu_kernel1.cpp -c -o %t1.o
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen %gpu_aot_target_opts -I %S/Inputs/ %S/Inputs/gpu_kernel2.cpp -c -o %t2.o
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen %gpu_aot_target_opts %t.o %t1.o %t2.o -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen %gpu_aot_target_opts -Wno-unused-command-line-argument %t.o %t1.o %t2.o -o %t.out
 // RUN: %{run} %t.out
 
 // Test require the following device image structure: cpu target device image
diff --git a/sycl/test-e2e/PropagateOptionsToBackend/sycl-opt-level-opencl.cpp b/sycl/test-e2e/PropagateOptionsToBackend/sycl-opt-level-opencl.cpp
index c2877438392f7..29f49ef69b74b 100644
--- a/sycl/test-e2e/PropagateOptionsToBackend/sycl-opt-level-opencl.cpp
+++ b/sycl/test-e2e/PropagateOptionsToBackend/sycl-opt-level-opencl.cpp
@@ -1,6 +1,6 @@
 // REQUIRES: opencl
 
-// RUN: %{build} %if cl_options %{/Od%} %else %{-O0%} -o %t0.out
+// RUN: %{build} %no_opt -o %t0.out
 // RUN: %if !acc %{ env SYCL_UR_TRACE=2 %{run} %t0.out 2>&1 | FileCheck %s --check-prefixes=CHECKOCL0 %}
 // RUN: %{build} -O1 -o %t1.out
 // RUN: %if !acc %{ env SYCL_UR_TRACE=2 %{run} %t1.out 2>&1 | FileCheck %s --check-prefixes=CHECKOCL1 %}
@@ -9,7 +9,7 @@
 // RUN: %{build} -O3 -o %t3.out
 // RUN: %if !acc %{ env SYCL_UR_TRACE=2 %{run} %t3.out 2>&1 | FileCheck %s --check-prefixes=CHECKOCL3 %}
 
-// RUN: %{build} -O0 -o %t.out
+// RUN: %{build} %no_opt -o %t.out
 // RUN: %{run} %t.out
 
 // This test verifies the propagation of front-end compiler optimization
diff --git a/sycl/test-e2e/Regression/2020-spec-constants-debug-info.cpp b/sycl/test-e2e/Regression/2020-spec-constants-debug-info.cpp
index 99617f8ddab87..946ae9a75a0d4 100644
--- a/sycl/test-e2e/Regression/2020-spec-constants-debug-info.cpp
+++ b/sycl/test-e2e/Regression/2020-spec-constants-debug-info.cpp
@@ -1,5 +1,5 @@
 // RUN: %{build} -g -o %t.out
-// RUN: %{build} -g -O0 -o %t.out
+// RUN: %{build} -g %no_opt -o %t.out
 // RUN: %{build} -g -O2 -o %t.out
 //
 // The idea of this test is to make sure that we can compile the following
diff --git a/sycl/test-e2e/Regression/DAE-separate-compile.cpp b/sycl/test-e2e/Regression/DAE-separate-compile.cpp
index a6c8ae885cb39..7bfc1028b2644 100644
--- a/sycl/test-e2e/Regression/DAE-separate-compile.cpp
+++ b/sycl/test-e2e/Regression/DAE-separate-compile.cpp
@@ -5,7 +5,7 @@
 // The test checks that the scenario works correctly.
 //
 // RUN: %{build} -O2 -c -o %t.o
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.o -O0 -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.o %no_opt -Wno-unused-command-line-argument -o %t.out
 // RUN: %{run} %t.out
 
 // Failing on HIP AMD, enable after fixed
diff --git a/sycl/test-e2e/Regression/compile_on_win_with_mdd.cpp b/sycl/test-e2e/Regression/compile_on_win_with_mdd.cpp
index f33000f22f269..57826c81ece2e 100644
--- a/sycl/test-e2e/Regression/compile_on_win_with_mdd.cpp
+++ b/sycl/test-e2e/Regression/compile_on_win_with_mdd.cpp
@@ -1,7 +1,7 @@
 // REQUIRES: windows
 
 // RUN: %clangxx --driver-mode=cl -fsycl /MDd -c %s -o %t.obj
-// RUN: %clangxx --driver-mode=cl -fsycl %t.obj -o %t.out
+// RUN: %clangxx --driver-mode=cl -fsycl %t.obj -Wno-unused-command-line-argument -o %t.out
 // RUN: %{run} %t.out
 
 // The test aims to prevent regressions similar to the one which caused by
diff --git a/sycl/test-e2e/Regression/default-constructed-local-accessor.cpp b/sycl/test-e2e/Regression/default-constructed-local-accessor.cpp
index e136e2833a59a..e5c1df272a358 100644
--- a/sycl/test-e2e/Regression/default-constructed-local-accessor.cpp
+++ b/sycl/test-e2e/Regression/default-constructed-local-accessor.cpp
@@ -1,6 +1,6 @@
 // -O0 is necessary; on higher levels of optimization, an error
 // would not occur because of dead argument elimination of the local_accessor.
-// RUN: %{build} -o %t.out -O0
+// RUN: %{build} -o %t.out %no_opt
 // RUN: %{run} %t.out
 
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/Regression/optimization_level_debug_info_intopt.cpp b/sycl/test-e2e/Regression/optimization_level_debug_info_intopt.cpp
index 106460629b242..9091eae989310 100644
--- a/sycl/test-e2e/Regression/optimization_level_debug_info_intopt.cpp
+++ b/sycl/test-e2e/Regression/optimization_level_debug_info_intopt.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} %debug_option -O0 -o %t.out
+// RUN: %{build} %debug_option %no_opt -o %t.out
 // RUN: %{build} %debug_option -O1 -o %t.out
 // RUN: %{build} %debug_option -O2 -o %t.out
 // RUN: %{build} %debug_option -O3 -o %t.out
diff --git a/sycl/test-e2e/Regression/optimization_level_debug_info_specopt.cpp b/sycl/test-e2e/Regression/optimization_level_debug_info_specopt.cpp
index c085d744cb075..57246c6cf4542 100644
--- a/sycl/test-e2e/Regression/optimization_level_debug_info_specopt.cpp
+++ b/sycl/test-e2e/Regression/optimization_level_debug_info_specopt.cpp
@@ -1,8 +1,11 @@
-// RUN: %{build} %debug_option -ffp-model=fast -o %t.out
+// DEFINE: %{ffpflags} = %if cl_options %{/clang:-ffp-model=fast%} %else %{-ffp-model=fast%}
+// DEFINE: %{Oz} = %if cl_options %{/clang:-Oz%} %else %{-Oz%}
+// DEFINE: %{O} = %if cl_options %{/clang:-O%} %else %{-O%}
+// RUN: %{build} %debug_option %{ffpflags} -o %t.out
 // RUN: %{build} %debug_option -Os -o %t.out
-// RUN: %{build} %debug_option -Oz -o %t.out
+// RUN: %{build} %debug_option %{Oz} -o %t.out
 // RUN: %{build} %debug_option -Og -o %t.out
-// RUN: %{build} %debug_option -O -o %t.out
+// RUN: %{build} %debug_option %{O} -o %t.out
 
 // NOTE: Tests that debugging information can be generated for all special-name
 // optimization levels.
diff --git a/sycl/test-e2e/Regression/unoptimized_stream.cpp b/sycl/test-e2e/Regression/unoptimized_stream.cpp
index e00d160f5d167..44f3c9bf39e80 100644
--- a/sycl/test-e2e/Regression/unoptimized_stream.cpp
+++ b/sycl/test-e2e/Regression/unoptimized_stream.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -O0 -o %t.out
+// RUN: %{build} %no_opt -o %t.out
 // RUN: %{run} %t.out
 
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/SeparateCompile/same-kernel.cpp b/sycl/test-e2e/SeparateCompile/same-kernel.cpp
index 15a2bfc9bc5c5..8bdadbe7ad62b 100644
--- a/sycl/test-e2e/SeparateCompile/same-kernel.cpp
+++ b/sycl/test-e2e/SeparateCompile/same-kernel.cpp
@@ -12,7 +12,7 @@
 // RUN: %{build} -DB_CPP=1 -c -o %t-same-kernel-b.o
 //
 // >> ---- link the full hetero app
-// RUN: %clangxx %t-same-kernel-a.o %t-same-kernel-b.o -o %t-same-kernel.exe -fsycl -fsycl-targets=%{sycl_triple}
+// RUN: %clangxx %t-same-kernel-a.o %t-same-kernel-b.o -Wno-unused-command-line-argument -o %t-same-kernel.exe -fsycl -fsycl-targets=%{sycl_triple}
 // RUN: %{run} %t-same-kernel.exe
 
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/SeparateCompile/sycl-external-within-staticlib.cpp b/sycl/test-e2e/SeparateCompile/sycl-external-within-staticlib.cpp
index 912df03aadccd..219634f47646c 100644
--- a/sycl/test-e2e/SeparateCompile/sycl-external-within-staticlib.cpp
+++ b/sycl/test-e2e/SeparateCompile/sycl-external-within-staticlib.cpp
@@ -5,7 +5,7 @@
 // RUN: %{build} -O3 -DSOURCE3 -c -o %t3.o
 // RUN: rm -f %t.a
 // RUN: llvm-ar crv %t.a %t1.o %t2.o
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} -O3 %t3.o %t.a -o %t1.exe
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} -O3 %t3.o %t.a -Wno-unused-command-line-argument -o %t1.exe
 // RUN: %{run} %t1.exe
 
 // Check the repacked case as it can behave differently.
@@ -13,7 +13,7 @@
 // RUN: echo addlib %t.a >> %t.txt
 // RUN: echo save >> %t.txt
 // RUN: cat %t.txt | llvm-ar -M
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} -O3 %t3.o %t_repacked.a -o %t2.exe
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} -O3 %t3.o %t_repacked.a -Wno-unused-command-line-argument -o %t2.exe
 // RUN: %{run} %t2.exe
 
 #include <iostream>
diff --git a/sycl/test-e2e/SeparateCompile/sycl-external.cpp b/sycl/test-e2e/SeparateCompile/sycl-external.cpp
index d455ba7e71d3e..8b6fc12382198 100644
--- a/sycl/test-e2e/SeparateCompile/sycl-external.cpp
+++ b/sycl/test-e2e/SeparateCompile/sycl-external.cpp
@@ -2,14 +2,14 @@
 // different object file.
 // RUN: %{build} -DSOURCE1 -c -o %t1.o
 // RUN: %{build} -DSOURCE2 -c -o %t2.o
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t1.o %t2.o -o %t.exe
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t1.o %t2.o -Wno-unused-command-line-argument -o %t.exe
 // RUN: %{run} %t.exe
 //
 // Test2 - check that kernel can call a SYCL_EXTERNAL function defined in a
 // static library.
 // RUN: rm -f %t.a
 // RUN: llvm-ar crv %t.a %t1.o
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t2.o %t.a -o %t.exe
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t2.o %t.a -Wno-unused-command-line-argument -o %t.exe
 // RUN: %{run} %t.exe
 
 #include <iostream>
diff --git a/sycl/test-e2e/SeparateCompile/test.cpp b/sycl/test-e2e/SeparateCompile/test.cpp
index bd5d489ffddeb..8e38711ac974c 100644
--- a/sycl/test-e2e/SeparateCompile/test.cpp
+++ b/sycl/test-e2e/SeparateCompile/test.cpp
@@ -51,7 +51,7 @@
 // RUN: %clangxx -Wno-error=override-module -c wrapper.bc -o wrapper.o
 //
 // >> ---- link the full hetero app
-// RUN: %clangxx wrapper.o a.o b.o -o app.exe %sycl_options
+// RUN: %clangxx wrapper.o a.o b.o -Wno-unused-command-line-argument -o app.exe %sycl_options
 // RUN: %{run} ./app.exe | FileCheck %s
 // CHECK: pass
 
diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py
index bbef264be9c24..f7d56aaee8f8e 100644
--- a/sycl/test-e2e/lit.cfg.py
+++ b/sycl/test-e2e/lit.cfg.py
@@ -382,6 +382,7 @@ def open_check_file(file_name):
     config.substitutions.append(("%cxx_std_option", "/std:"))
     config.substitutions.append(("%fPIC", ""))
     config.substitutions.append(("%shared_lib", "/LD"))
+    config.substitutions.append(("%no_opt", "/Od"))
 else:
     config.substitutions.append(
         (
@@ -406,6 +407,7 @@ def open_check_file(file_name):
         ("%fPIC", ("" if platform.system() == "Windows" else "-fPIC"))
     )
     config.substitutions.append(("%shared_lib", "-shared"))
+    config.substitutions.append(("%no_opt", "-O0"))
 
 # Check if user passed verbose-print parameter, if yes, add VERBOSE_PRINT macro
 if "verbose-print" in lit_config.params:

From 404fb8a3f1439ecb648c17cc3989dd8926fee0e3 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum@codeplay.com>
Date: Thu, 12 Sep 2024 11:20:06 +0100
Subject: [PATCH 19/19] [SYCL][UR] Bump UR tag to 2fea679 (#15350)

Pull in fix from https://github.com/oneapi-src/unified-runtime/pull/2078

This should fix the post-commit E2E failures on opencl:fpga

---------

Co-authored-by: Omar Ahmed <omarpiratee2010@gmail.com>
---
 sycl/cmake/modules/FetchUnifiedRuntime.cmake | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake
index e569981589ac4..fce554d30582d 100644
--- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake
+++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake
@@ -117,13 +117,13 @@ if(SYCL_UR_USE_FETCH_CONTENT)
   endfunction()
 
   set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
-  # commit eb63d1a21729f6928bb6cccc5f92856b0690aca6
-  # Merge: e26bba51 45a781f4
+  # commit 24a8299efc59c715a1c2dd180692a5e12a12283a
+  # Merge: eb63d1a2 2fea679d
   # Author: Omar Ahmed <omar.ahmed@codeplay.com>
-  # Date:   Tue Sep 10 12:08:57 2024 +0100
-  #   Merge pull request #1796 from GeorgeWeb/georgi/ur_kernel_max_active_wgs
-  #   [CUDA] Implement urKernelSuggestMaxCooperativeGroupCountExp for Cuda
-  set(UNIFIED_RUNTIME_TAG eb63d1a21729f6928bb6cccc5f92856b0690aca6)
+  # Date:   Wed Sep 11 10:40:59 2024 +0100
+  #     Merge pull request #2078 from callumfare/callum/fix_device_extensions_fpga
+  #     Add workaround for silently supported OpenCL extensions on Intel FPGA
+  set(UNIFIED_RUNTIME_TAG 24a8299efc59c715a1c2dd180692a5e12a12283a)
 
   set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
   # Due to the use of dependentloadflag and no installer for UMF and hwloc we need