From c49ef6be05f69b29f4fe7222f4d4c30b067f63dc Mon Sep 17 00:00:00 2001
From: Hugh Delaney <hugh.delaney@codeplay.com>
Date: Mon, 9 Sep 2024 18:47:41 +0100
Subject: [PATCH 01/10] [HIP] Don't run failing device global test on AMD
 (#15327)

TODO: investigate why.
---
 sycl/test-e2e/DeviceGlobal/device_global_static.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sycl/test-e2e/DeviceGlobal/device_global_static.cpp b/sycl/test-e2e/DeviceGlobal/device_global_static.cpp
index 68c0fc1ede7cb..d5ce36beca0aa 100644
--- a/sycl/test-e2e/DeviceGlobal/device_global_static.cpp
+++ b/sycl/test-e2e/DeviceGlobal/device_global_static.cpp
@@ -3,7 +3,10 @@
 //
 // The OpenCL GPU backends do not currently support device_global backend
 // calls.
-// UNSUPPORTED: opencl && gpu
+//
+// UNSUPPORTED: hip_amd, opencl && gpu
+//
+// For HIP see https://github.com/intel/llvm/issues/15329
 //
 // Tests static device_global access through device kernels.
 

From e7935c691c86f82f57ced3864a2b8e94904e1ccb Mon Sep 17 00:00:00 2001
From: Chris Perkins <chris.perkins@intel.com>
Date: Mon, 9 Sep 2024 13:55:58 -0700
Subject: [PATCH 02/10] [SYCL] minor changes to kernel_compiler tests. 
 (#15320)

minor changes to ensure tests are and are not run on appropriate devices
---
 sycl/test-e2e/KernelCompiler/kernel_compiler_opencl.cpp | 6 +-----
 sycl/test-e2e/KernelCompiler/kernel_compiler_sycl.cpp   | 4 +---
 sycl/test-e2e/KernelCompiler/multi_device.cpp           | 2 ++
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/sycl/test-e2e/KernelCompiler/kernel_compiler_opencl.cpp b/sycl/test-e2e/KernelCompiler/kernel_compiler_opencl.cpp
index d0be7afe4bdec..cfe2824ec0564 100644
--- a/sycl/test-e2e/KernelCompiler/kernel_compiler_opencl.cpp
+++ b/sycl/test-e2e/KernelCompiler/kernel_compiler_opencl.cpp
@@ -7,15 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 // REQUIRES: (opencl || level_zero)
+// UNSUPPORTED: accelerator
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-// the new spec for the kernel_compiler opens the door to supporting several
-// different source languages. But, initially, OpenCL Kernels are the only ones
-// supported. This test is limited to that (thus the cm-compiler requirement)
-// but in the future it may need to broken out into other tests.
-
 #include <sycl/detail/core.hpp>
 
 auto constexpr CLSource = R"===(
diff --git a/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl.cpp b/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl.cpp
index 7ef7eb712777e..511f713b7c95c 100644
--- a/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl.cpp
+++ b/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl.cpp
@@ -7,9 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 // REQUIRES: (opencl || level_zero)
-
-// https://github.com/intel/llvm/issues/14826
-// XFAIL: arch-intel_gpu_pvc && igc-dev
+// UNSUPPORTED: accelerator
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/KernelCompiler/multi_device.cpp b/sycl/test-e2e/KernelCompiler/multi_device.cpp
index 69183ff2d8bd9..1fae9f7626a97 100644
--- a/sycl/test-e2e/KernelCompiler/multi_device.cpp
+++ b/sycl/test-e2e/KernelCompiler/multi_device.cpp
@@ -1,4 +1,6 @@
 // REQUIRES: (opencl || level_zero)
+// UNSUPPORTED: accelerator
+
 // RUN: %{build} -o %t.out
 // RUN: env NEOReadDebugKeys=1 CreateMultipleRootDevices=3 %{run} %t.out
 

From eaa0b7da2a25f0718a9b7caa1329dc4fc0631119 Mon Sep 17 00:00:00 2001
From: Udit Agarwal <udit.agarwal@intel.com>
Date: Mon, 9 Sep 2024 16:04:33 -0700
Subject: [PATCH 03/10] [CI] Install `libzstd-dev` on docker and OSX (#15304)

Required for testing https://github.com/intel/llvm/pull/15124 (device
image compression).
I've already installed `zstd` release v1.5.6 on our Windows runners.
---
 .github/workflows/sycl-macos-build-and-test.yml | 2 +-
 devops/scripts/install_build_tools.sh           | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/sycl-macos-build-and-test.yml b/.github/workflows/sycl-macos-build-and-test.yml
index 723f0c90ac65b..20615ab78a42b 100644
--- a/.github/workflows/sycl-macos-build-and-test.yml
+++ b/.github/workflows/sycl-macos-build-and-test.yml
@@ -30,7 +30,7 @@ jobs:
       CCACHE_MAXSIZE: ${{ inputs.build_cache_size }}
     steps:
     - name: Install dependencies
-      run: brew install ccache ninja hwloc
+      run: brew install ccache ninja hwloc zstd
     - uses: actions/checkout@v4
       with:
         ref: ${{ inputs.build_ref }}
diff --git a/devops/scripts/install_build_tools.sh b/devops/scripts/install_build_tools.sh
index 4e0dcdc57cdc0..9c4b9d56999de 100755
--- a/devops/scripts/install_build_tools.sh
+++ b/devops/scripts/install_build_tools.sh
@@ -23,7 +23,8 @@ apt update && apt install -yqq \
       unzip \
       jq \
       curl \
-      libhwloc-dev
+      libhwloc-dev \
+      libzstd-dev
 
 pip3 install psutil
 

From 114236ffe20c2ec85a288cff6a7aeb0f7aca43ea Mon Sep 17 00:00:00 2001
From: Udit Agarwal <udit.agarwal@intel.com>
Date: Mon, 9 Sep 2024 16:04:50 -0700
Subject: [PATCH 04/10] [CI][Container] Add sycl user to the render group
 (#15324)

Render group is required for the sycl user to access the PVC card in the
docker container.

https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers
---
 devops/containers/ubuntu2204_base.Dockerfile | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/devops/containers/ubuntu2204_base.Dockerfile b/devops/containers/ubuntu2204_base.Dockerfile
index 543e600b09f7b..07bb343cae93c 100644
--- a/devops/containers/ubuntu2204_base.Dockerfile
+++ b/devops/containers/ubuntu2204_base.Dockerfile
@@ -16,6 +16,11 @@ RUN groupadd -g 1001 sycl && useradd sycl -u 1001 -g 1001 -m -s /bin/bash
 # Add sycl user to video/irc groups so that it can access GPU
 RUN usermod -aG video sycl
 RUN usermod -aG irc sycl
+
+# group 109 is required for sycl user to access PVC card.
+RUN groupadd -g 109 render
+RUN usermod -aG render sycl
+
 # Allow sycl user to run as sudo
 RUN echo "sycl  ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
 

From fbb1fb0010ec128c93148461c951dccf4c01b5b7 Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Mon, 9 Sep 2024 16:33:23 -0700
Subject: [PATCH 05/10] [SYCL] Take into account
 UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY error code in Program Manager (#15335)

Currently if program manager faces one of the errors -
UR_RESULT_ERROR_OUT_OF_RESOURCES or UR_RESULT_ERROR_OUT_OF_HOST_MEMORY -
during the program building/linking then it will clear the cache and
make another attempt.
This PR adds the following changes:
* Additionally take into account UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
error which is also possible in addition to aforementioned error codes.
* Parameterize the existing unit test by error code (which allows to
avoid excessive code duplication) and add
UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY value to testing.
---
 sycl/source/detail/kernel_program_cache.hpp   |   3 +-
 .../program_manager/program_manager.cpp       |   6 +-
 .../kernel-and-program/OutOfResources.cpp     | 140 ++----------------
 3 files changed, 22 insertions(+), 127 deletions(-)

diff --git a/sycl/source/detail/kernel_program_cache.hpp b/sycl/source/detail/kernel_program_cache.hpp
index bc800b034179d..44dfd84751afd 100644
--- a/sycl/source/detail/kernel_program_cache.hpp
+++ b/sycl/source/detail/kernel_program_cache.hpp
@@ -329,7 +329,8 @@ class KernelProgramCache {
         BuildResult->Error.Code = detail::get_ur_error(Ex);
         if (Ex.code() == errc::memory_allocation ||
             BuildResult->Error.Code == UR_RESULT_ERROR_OUT_OF_RESOURCES ||
-            BuildResult->Error.Code == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY) {
+            BuildResult->Error.Code == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY ||
+            BuildResult->Error.Code == UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) {
           reset();
           BuildResult->updateAndNotify(BuildState::BS_Initial);
           continue;
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index fb30de4695499..99f3c5204dc74 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -1605,7 +1605,8 @@ ProgramManager::ProgramPtr ProgramManager::build(
   };
   ur_result_t Error = doLink();
   if (Error == UR_RESULT_ERROR_OUT_OF_RESOURCES ||
-      Error == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY) {
+      Error == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY ||
+      Error == UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) {
     Context->getKernelProgramCache().reset();
     Error = doLink();
   }
@@ -2427,7 +2428,8 @@ ProgramManager::link(const device_image_plain &DeviceImage,
   };
   ur_result_t Error = doLink();
   if (Error == UR_RESULT_ERROR_OUT_OF_RESOURCES ||
-      Error == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY) {
+      Error == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY ||
+      Error == UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) {
     ContextImpl->getKernelProgramCache().reset();
     Error = doLink();
   }
diff --git a/sycl/unittests/kernel-and-program/OutOfResources.cpp b/sycl/unittests/kernel-and-program/OutOfResources.cpp
index c249e6599ec5a..b0b6e877ebe77 100644
--- a/sycl/unittests/kernel-and-program/OutOfResources.cpp
+++ b/sycl/unittests/kernel-and-program/OutOfResources.cpp
@@ -35,28 +35,24 @@ static sycl::unittest::UrImageArray<2> ImgArray{Img};
 
 static int nProgramCreate = 0;
 static volatile bool outOfResourcesToggle = false;
-static volatile bool outOfHostMemoryToggle = false;
+static volatile ur_result_t ErrorCode = UR_RESULT_SUCCESS;
 
 static ur_result_t redefinedProgramCreateWithIL(void *) {
   ++nProgramCreate;
   if (outOfResourcesToggle) {
     outOfResourcesToggle = false;
-    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
+    return ErrorCode;
   }
   return UR_RESULT_SUCCESS;
 }
 
-static ur_result_t redefinedProgramCreateWithILOutOfHostMemory(void *) {
-  ++nProgramCreate;
-  if (outOfHostMemoryToggle) {
-    outOfHostMemoryToggle = false;
-    return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
-  }
-  return UR_RESULT_SUCCESS;
-}
+// Parameterized test fixture
+class OutOfResourcesTestSuite : public ::testing::TestWithParam<ur_result_t> {};
 
-TEST(OutOfResourcesTest, urProgramCreate) {
+TEST_P(OutOfResourcesTestSuite, urProgramCreate) {
+  nProgramCreate = 0;
   sycl::unittest::UrMock<> Mock;
+  ErrorCode = GetParam();
   mock::getCallbacks().set_before_callback("urProgramCreateWithIL",
                                            &redefinedProgramCreateWithIL);
 
@@ -116,92 +112,21 @@ TEST(OutOfResourcesTest, urProgramCreate) {
   }
 }
 
-TEST(OutOfHostMemoryTest, urProgramCreate) {
-  // Reset to zero.
-  nProgramCreate = 0;
-
-  sycl::unittest::UrMock<> Mock;
-  mock::getCallbacks().set_before_callback(
-      "urProgramCreateWithIL", &redefinedProgramCreateWithILOutOfHostMemory);
-
-  sycl::platform Plt{sycl::platform()};
-  sycl::context Ctx{Plt};
-  auto CtxImpl = detail::getSyclObjImpl(Ctx);
-  queue q(Ctx, default_selector_v);
-
-  int runningTotal = 0;
-  // Cache is empty, so one urProgramCreateWithIL call.
-  q.single_task<class OutOfResourcesKernel1>([] {});
-  EXPECT_EQ(nProgramCreate, runningTotal += 1);
-
-  // Now, we make the next urProgramCreateWithIL call fail with
-  // UR_RESULT_ERROR_OUT_OF_HOST_MEMORY. The caching mechanism should catch
-  // this, clear the cache, and retry the urProgramCreateWithIL.
-  outOfHostMemoryToggle = true;
-  q.single_task<class OutOfResourcesKernel2>([] {});
-  EXPECT_FALSE(outOfHostMemoryToggle);
-  EXPECT_EQ(nProgramCreate, runningTotal += 2);
-  {
-    detail::KernelProgramCache::ProgramCache &Cache =
-        CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
-    EXPECT_EQ(Cache.size(), 1U) << "Expected 1 program in the cache";
-  }
-
-  // The next urProgramCreateWithIL call will fail with
-  // UR_RESULT_ERROR_OUT_OF_HOST_MEMORY. But OutOfResourcesKernel2 is in the
-  // cache, so we expect no new urProgramCreateWithIL calls.
-  outOfHostMemoryToggle = true;
-  q.single_task<class OutOfResourcesKernel2>([] {});
-  EXPECT_TRUE(outOfHostMemoryToggle);
-  EXPECT_EQ(nProgramCreate, runningTotal);
-
-  // OutOfResourcesKernel1 is not in the cache, so we have to
-  // build it. From what we set before, this call will fail,
-  // the cache will clear out, and will try again.
-  q.single_task<class OutOfResourcesKernel1>([] {});
-  EXPECT_FALSE(outOfHostMemoryToggle);
-  EXPECT_EQ(nProgramCreate, runningTotal += 2);
-  {
-    detail::KernelProgramCache::ProgramCache &Cache =
-        CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
-    EXPECT_EQ(Cache.size(), 1U) << "Expected 1 program in the cache";
-  }
-
-  // Finally, OutOfResourcesKernel1 will be in the cache, but
-  // OutOfResourceKenel2 will not, so one more urProgramCreateWithIL.
-  // Toggle is not set, so this should succeed.
-  q.single_task<class OutOfResourcesKernel1>([] {});
-  q.single_task<class OutOfResourcesKernel2>([] {});
-  EXPECT_EQ(nProgramCreate, runningTotal += 1);
-  {
-    detail::KernelProgramCache::ProgramCache &Cache =
-        CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
-    EXPECT_EQ(Cache.size(), 2U) << "Expected 2 program in the cache";
-  }
-}
-
 static int nProgramLink = 0;
 
 static ur_result_t redefinedProgramLink(void *) {
   ++nProgramLink;
   if (outOfResourcesToggle) {
     outOfResourcesToggle = false;
-    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
-  }
-  return UR_RESULT_SUCCESS;
-}
-
-static ur_result_t redefinedProgramLinkOutOfHostMemory(void *) {
-  ++nProgramLink;
-  if (outOfHostMemoryToggle) {
-    outOfHostMemoryToggle = false;
-    return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+    return ErrorCode;
   }
   return UR_RESULT_SUCCESS;
 }
 
-TEST(OutOfResourcesTest, urProgramLink) {
+TEST_P(OutOfResourcesTestSuite, urProgramLink) {
+  nProgramLink = 0;
   sycl::unittest::UrMock<> Mock;
+  ErrorCode = GetParam();
   mock::getCallbacks().set_before_callback("urProgramLinkExp",
                                            &redefinedProgramLink);
 
@@ -236,41 +161,8 @@ TEST(OutOfResourcesTest, urProgramLink) {
   }
 }
 
-TEST(OutOfHostMemoryTest, urProgramLink) {
-  // Reset to zero.
-  nProgramLink = 0;
-
-  sycl::unittest::UrMock<> Mock;
-  mock::getCallbacks().set_before_callback(
-      "urProgramLinkExp", &redefinedProgramLinkOutOfHostMemory);
-
-  sycl::platform Plt{sycl::platform()};
-  sycl::context Ctx{Plt};
-  auto CtxImpl = detail::getSyclObjImpl(Ctx);
-  queue q(Ctx, default_selector_v);
-  // Put some programs in the cache
-  q.single_task<class OutOfResourcesKernel1>([] {});
-  q.single_task<class OutOfResourcesKernel2>([] {});
-  {
-    detail::KernelProgramCache::ProgramCache &Cache =
-        CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
-    EXPECT_EQ(Cache.size(), 2U) << "Expect 2 programs in the cache";
-  }
-
-  auto b1 = sycl::get_kernel_bundle<OutOfResourcesKernel1,
-                                    sycl::bundle_state::object>(Ctx);
-  auto b2 = sycl::get_kernel_bundle<OutOfResourcesKernel2,
-                                    sycl::bundle_state::object>(Ctx);
-  outOfHostMemoryToggle = true;
-  EXPECT_EQ(nProgramLink, 0);
-  auto b3 = sycl::link({b1, b2});
-  EXPECT_FALSE(outOfHostMemoryToggle);
-  // one restart due to out of resources, one link per each of b1 and b2.
-  EXPECT_EQ(nProgramLink, 3);
-  // no programs should be in the cache due to out of resources.
-  {
-    detail::KernelProgramCache::ProgramCache &Cache =
-        CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
-    EXPECT_EQ(Cache.size(), 0u) << "Expect no programs in the cache";
-  }
-}
+INSTANTIATE_TEST_SUITE_P(
+    OutOfResourcesParameterizedRun, OutOfResourcesTestSuite,
+    ::testing::Values(UR_RESULT_ERROR_OUT_OF_RESOURCES,
+                      UR_RESULT_ERROR_OUT_OF_HOST_MEMORY,
+                      UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY));

From 178a42c57398257ce0c3189ee24d4954785b6dc6 Mon Sep 17 00:00:00 2001
From: Ian Li <ian.li@intel.com>
Date: Tue, 10 Sep 2024 03:30:28 -0400
Subject: [PATCH 06/10] [SYCL] Initialize uninitialized handler_impl fields
 (#15323)

In the same vein as https://github.com/intel/llvm/pull/15237, this PR
fixes additional uninitialized values recently discovered by Coverity.
Similar to the resolution discussed in #15237, I have
default-initialized integer values that are defined later on to 0
instead of another more complex solution.

Additionally, since I had set `MExternalSempahore` in `handler_impl` to
`nullptr`, I added null checks where `MExternalSemaphore` is ultimately
returned to ensure `nullptr` doesn't actually get passed into the UR.
This is not necessarily necessary, but without this check Coverity would
probably generate another hit because of it.
---
 sycl/source/detail/cg.hpp           |  6 ++++++
 sycl/source/detail/handler_impl.hpp | 12 ++++++------
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/sycl/source/detail/cg.hpp b/sycl/source/detail/cg.hpp
index 42bd088427dd9..c4ae7d87f4403 100644
--- a/sycl/source/detail/cg.hpp
+++ b/sycl/source/detail/cg.hpp
@@ -624,6 +624,8 @@ class CGSemaphoreWait : public CG {
         MExternalSemaphore(ExternalSemaphore), MWaitValue(WaitValue) {}
 
   ur_exp_external_semaphore_handle_t getExternalSemaphore() const {
+    assert(MExternalSemaphore != nullptr &&
+           "MExternalSemaphore has not been defined yet.");
     return MExternalSemaphore;
   }
   std::optional<uint64_t> getWaitValue() const { return MWaitValue; }
@@ -643,6 +645,10 @@ class CGSemaphoreSignal : public CG {
         MExternalSemaphore(ExternalSemaphore), MSignalValue(SignalValue) {}
 
   ur_exp_external_semaphore_handle_t getExternalSemaphore() const {
+    if (MExternalSemaphore == nullptr)
+      throw exception(make_error_code(errc::runtime),
+                      "getExternalSemaphore(): MExternalSemaphore has not been "
+                      "defined yet.");
     return MExternalSemaphore;
   }
   std::optional<uint64_t> getSignalValue() const { return MSignalValue; }
diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp
index a306d3c69b498..37a697a57bc2b 100644
--- a/sycl/source/detail/handler_impl.hpp
+++ b/sycl/source/detail/handler_impl.hpp
@@ -90,13 +90,13 @@ class handler_impl {
 
   std::shared_ptr<detail::kernel_bundle_impl> MKernelBundle;
 
-  ur_usm_advice_flags_t MAdvice;
+  ur_usm_advice_flags_t MAdvice = 0;
 
   // 2D memory operation information.
-  size_t MSrcPitch;
-  size_t MDstPitch;
-  size_t MWidth;
-  size_t MHeight;
+  size_t MSrcPitch = 0;
+  size_t MDstPitch = 0;
+  size_t MWidth = 0;
+  size_t MHeight = 0;
 
   /// Offset into a device_global for copy operations.
   size_t MOffset = 0;
@@ -134,7 +134,7 @@ class handler_impl {
   ur_rect_region_t MCopyExtent = {};
 
   // Extra information for semaphore interoperability
-  ur_exp_external_semaphore_handle_t MExternalSemaphore;
+  ur_exp_external_semaphore_handle_t MExternalSemaphore = nullptr;
   std::optional<uint64_t> MWaitValue;
   std::optional<uint64_t> MSignalValue;
 

From 21365ca35e401411a3bf8e47994f59c3def4613a Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he@intel.com>
Date: Tue, 10 Sep 2024 07:31:01 +0000
Subject: [PATCH 07/10] [SYCL][E2E LIT] Remove append_path=True from
 config.extra_environment handling (#15279)

When append_path is True, value is normalized to lowercase on Windows at

https://github.com/llvm-mirror/llvm/blob/2c4ca6832fa6b306ee6a7010bfb80a3f2596f824/utils/lit/lit/llvm/config.py#L139
For env value that isn't a path, the normalization is unexpected.

For env value that is a path, it doesn't make sense to append_path
unconditionally. Test driver should append path when necessary before
setting a env.
---
 sycl/test-e2e/lit.cfg.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py
index 8315ac4620ce2..bbef264be9c24 100644
--- a/sycl/test-e2e/lit.cfg.py
+++ b/sycl/test-e2e/lit.cfg.py
@@ -122,7 +122,7 @@
     for env_pair in config.extra_environment.split(","):
         [var, val] = env_pair.split("=", 1)
         if val:
-            llvm_config.with_environment(var, val, append_path=True)
+            llvm_config.with_environment(var, val)
             lit_config.note("\t" + var + "=" + val)
         else:
             lit_config.note("\tUnset " + var)

From ad494e9dd3a7d9a305821f693edc76645bf2ce30 Mon Sep 17 00:00:00 2001
From: Colin Davidson <colin.davidson@codeplay.com>
Date: Tue, 10 Sep 2024 09:34:03 +0100
Subject: [PATCH 08/10] [SYCL][NATIVECPU] Fix local scope module variables for
 native cpu (#15280)

Although local scope variables inside the kernel are less common in
SYCL, they can happen with hierarchical.

This fixes the problem by adding a pass to replace the local scope
variables which start life as globals with a struct which is allocated
on the stack.

Additionally, this required updating of the code which renames and
removes kernel based on wrappers and vecz success. To simplify this we
run the OCK utility pass TransferKernelMetadata which adds metadata to
store the original kernel name. This in turn simplifies this code
significantly.

Note this fixes fails in kernel/kernel_attributes_wg_hint.cpp SYCL CTS
for native cpu, which is being tested locally.
---
 .../PipelineSYCLNativeCPU.cpp                 |  4 ++
 .../PrepareSYCLNativeCPU.cpp                  | 48 +++++--------------
 .../native_cpu/local_module_scope.cpp         | 39 +++++++++++++++
 3 files changed, 54 insertions(+), 37 deletions(-)
 create mode 100644 sycl/test/check_device_code/native_cpu/local_module_scope.cpp

diff --git a/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp
index 581c2f4866c9c..1454c10fc4200 100644
--- a/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp
@@ -21,7 +21,9 @@
 #include "compiler/utils/builtin_info.h"
 #include "compiler/utils/define_mux_builtins_pass.h"
 #include "compiler/utils/device_info.h"
+#include "compiler/utils/encode_kernel_metadata_pass.h"
 #include "compiler/utils/prepare_barriers_pass.h"
+#include "compiler/utils/replace_local_module_scope_variables_pass.h"
 #include "compiler/utils/sub_group_analysis.h"
 #include "compiler/utils/work_item_loops_pass.h"
 #include "vecz/pass.h"
@@ -60,6 +62,7 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
     OptimizationLevel OptLevel) {
   MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass());
 #ifdef NATIVECPU_USE_OCK
+  MPM.addPass(compiler::utils::TransferKernelMetadataPass());
   // Always enable vectorizer, unless explictly disabled or -O0 is set.
   if (OptLevel != OptimizationLevel::O0 && !SYCLNativeCPUNoVecz) {
     MAM.registerPass([] { return vecz::TargetInfoAnalysis(); });
@@ -87,6 +90,7 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
   MAM.registerPass([] { return compiler::utils::SubgroupAnalysis(); });
   MPM.addPass(compiler::utils::PrepareBarriersPass());
   MPM.addPass(compiler::utils::WorkItemLoopsPass(Opts));
+  MPM.addPass(compiler::utils::ReplaceLocalModuleScopeVariablesPass());
   MPM.addPass(AlwaysInlinerPass());
 #endif
   MPM.addPass(PrepareSYCLNativeCPUPass());
diff --git a/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp
index c5625217bdfd1..b3888db8a7b50 100644
--- a/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp
@@ -338,47 +338,21 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
     SmallSet<Function *, 5> RemovableFuncs;
     SmallVector<Function *, 5> WrapperFuncs;
 
-    // Retrieve the wrapper functions created by the WorkItemLoop pass.
     for (auto &OldF : OldKernels) {
-      std::optional<compiler::utils::LinkMetadataResult> VeczR =
-          compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
-      if (VeczR && VeczR.value().first) {
-        WrapperFuncs.push_back(OldF);
-      } else {
-        auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
-        if (Name != OldF->getName()) {
-          WrapperFuncs.push_back(OldF);
-        }
-      }
-    }
-
-    for (auto &OldF : WrapperFuncs) {
       // If vectorization occurred, at this point we have a wrapper function
-      // that runs the vectorized kernel and peels using the scalar kernel. We
-      // make it so this wrapper steals the original kernel name.
-      std::optional<compiler::utils::LinkMetadataResult> VeczR =
-          compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
-      if (VeczR && VeczR.value().first) {
-        auto ScalarF = VeczR.value().first;
-        OldF->takeName(ScalarF);
-        if (ScalarF->use_empty())
-          RemovableFuncs.insert(ScalarF);
-      } else {
-        // The WorkItemLoops pass created a wrapper function for the original
-        // kernel. If we have a kernel named foo(), the wrapper will be called
-        // foo-wrapper(), and will have the original kernel name retrieved by
-        // getBaseFnNameOrFnName. We set the name of the wrapper function
-        // to the original kernel name and add the original kernel to the
-        // list of functions that can be removed from the module.
-        auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
-        Function *OrigF = M.getFunction(Name);
+      // that runs the vectorized kernel and peels using the scalar kernel.
+      // There may also be a wrapper for local variables replacement. We make it
+      // so this wrapper steals the original kernel name. Otherwise we will have
+      // a wrapper function from the work item loops. In this case we also steal
+      // the original kernel name.
+      auto Name = compiler::utils::getOrigFnName(*OldF);
+      Function *OrigF = M.getFunction(Name);
+      if (Name != OldF->getName()) {
         if (OrigF != nullptr) {
-          // The original kernel is inlined by the WorkItemLoops
-          // pass if it contained barriers or group collectives, otherwise
-          // we don't want to (and can't) remove it.
-          if (OrigF->use_empty())
-            RemovableFuncs.insert(OrigF);
           OldF->takeName(OrigF);
+          if (OrigF->use_empty()) {
+            RemovableFuncs.insert(OrigF);
+          }
         } else {
           OldF->setName(Name);
         }
diff --git a/sycl/test/check_device_code/native_cpu/local_module_scope.cpp b/sycl/test/check_device_code/native_cpu/local_module_scope.cpp
new file mode 100644
index 0000000000000..bb1ea27a115bd
--- /dev/null
+++ b/sycl/test/check_device_code/native_cpu/local_module_scope.cpp
@@ -0,0 +1,39 @@
+// REQUIRES: native_cpu_ock
+
+// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -mllvm -sycl-native-cpu-no-vecz -mllvm -sycl-native-dump-device-ir %s | FileCheck %s
+
+// Check that local types structure is created and placed on the stack
+// We also check that the attribute mux-orig-fn is created as this is needed to
+// find the original function after this pass is run
+
+// CHECK: %localVarTypes = type { ptr addrspace(1) }
+// CHECK: define void @_ZTS4TestILi1ELi4EiE.NativeCPUKernel{{.*}} #[[ATTR:[0-9]*]]
+// CHECK: alloca %localVarTypes
+// CHECK: attributes #[[ATTR]] = {{.*}} "mux-orig-fn"="_ZTS4TestILi1ELi4EiE"
+
+#include "sycl.hpp"
+
+template <int dims, int size, typename T = int> struct Test;
+
+int main() {
+  sycl::queue queue;
+
+  constexpr int dims = 1;
+  constexpr int size = 4;
+
+  std::array<int, size> data;
+
+  const auto range = sycl::range<dims>(size);
+  const auto range_wg = sycl::range<dims>(1);
+  {
+    sycl::buffer<int, dims> buf(data.data(), range);
+
+    queue.submit([&](sycl::handler &cgh) {
+      auto acc = sycl::accessor(buf, cgh, sycl::write_only);
+      cgh.parallel_for_work_group<Test<dims, size>>(
+          range, range_wg, [=](auto group) { acc[group.get_group_id()] = 42; });
+    });
+    queue.wait_and_throw();
+  }
+  return 0;
+}

From c87f6b216181846fd3614abcfcebdf92a26c0bb1 Mon Sep 17 00:00:00 2001
From: Sergey Semenov <sergey.semenov@intel.com>
Date: Tue, 10 Sep 2024 03:01:21 -0700
Subject: [PATCH 09/10] [SYCL] Use built-ins to retrieve kernel information
 (#15070)

Using built-ins is going to be the preferred way to fetch kernel
information, while integration headers are still going to be used for
cases where built-ins are unavailable (i.e., different host compiler).

Additionally, switch to the new entry point attribute when using the
built-ins.
---
 sycl/include/sycl/detail/kernel_desc.hpp | 104 +++++++++++++++++++-
 sycl/include/sycl/handler.hpp            | 115 ++++++++++++++---------
 sycl/include/sycl/kernel_bundle.hpp      |   4 +-
 sycl/include/sycl/queue.hpp              |  38 +++-----
 sycl/source/handler.cpp                  |  23 +++--
 sycl/test/abi/sycl_symbols_linux.dump    |   1 +
 sycl/test/abi/sycl_symbols_windows.dump  |   2 +-
 7 files changed, 207 insertions(+), 80 deletions(-)

diff --git a/sycl/include/sycl/detail/kernel_desc.hpp b/sycl/include/sycl/detail/kernel_desc.hpp
index 85519c3388efd..1049c4d78aadd 100644
--- a/sycl/include/sycl/detail/kernel_desc.hpp
+++ b/sycl/include/sycl/detail/kernel_desc.hpp
@@ -10,10 +10,26 @@
 
 // FIXME: include export.hpp because integration header emitted by the compiler
 // uses the macro defined in this header, but it doesn't explicitly include it.
+#include <sycl/detail/defines_elementary.hpp>
 #include <sycl/detail/export.hpp>
-
 // This header file must not include any standard C++ header files.
 
+#ifndef __INTEL_SYCL_USE_INTEGRATION_HEADERS
+#if __has_builtin(__builtin_sycl_kernel_name)
+static_assert(__has_builtin(__builtin_sycl_kernel_param_count) &&
+              __has_builtin(__builtin_sycl_kernel_name) &&
+              __has_builtin(__builtin_sycl_kernel_param_access_target) &&
+              __has_builtin(__builtin_sycl_kernel_param_size) &&
+              __has_builtin(__builtin_sycl_kernel_param_offset) &&
+              __has_builtin(__builtin_sycl_kernel_file_name) &&
+              __has_builtin(__builtin_sycl_kernel_function_name) &&
+              __has_builtin(__builtin_sycl_kernel_line_number) &&
+              __has_builtin(__builtin_sycl_kernel_column_number));
+#else
+#define __INTEL_SYCL_USE_INTEGRATION_HEADERS 1
+#endif
+#endif
+
 namespace sycl {
 inline namespace _V1 {
 namespace detail {
@@ -151,6 +167,92 @@ template <class KernelNameType> struct KernelInfo {
 };
 #endif //__SYCL_UNNAMED_LAMBDA__
 
+// Built-ins accept an object due to lacking infrastructure support for
+// accepting types. The kernel name type itself isn't used because it might be
+// incomplete, cv-qualified, or not default constructible. Passing an object
+// also allows future extension for SYCL kernels defined as free functions.
+template <typename KNT> struct KernelIdentity {
+  using type = KNT;
+};
+
+template <typename KernelNameType> constexpr unsigned getKernelNumParams() {
+#ifndef __INTEL_SYCL_USE_INTEGRATION_HEADERS
+  return __builtin_sycl_kernel_param_count(KernelIdentity<KernelNameType>());
+#else
+  return KernelInfo<KernelNameType>::getNumParams();
+#endif
+}
+
+template <typename KernelNameType>
+kernel_param_desc_t getKernelParamDesc(int Idx) {
+#ifndef __INTEL_SYCL_USE_INTEGRATION_HEADERS
+  kernel_param_desc_t ParamDesc;
+  ParamDesc.kind =
+      __builtin_sycl_kernel_param_kind(KernelIdentity<KernelNameType>(), Idx);
+  ParamDesc.info = ParamDesc.kind == kernel_param_kind_t::kind_accessor
+                       ? __builtin_sycl_kernel_param_access_target(
+                             KernelIdentity<KernelNameType>(), Idx)
+                       : __builtin_sycl_kernel_param_size(
+                             KernelIdentity<KernelNameType>(), Idx);
+  ParamDesc.offset =
+      __builtin_sycl_kernel_param_offset(KernelIdentity<KernelNameType>(), Idx);
+  return ParamDesc;
+#else
+  return KernelInfo<KernelNameType>::getParamDesc(Idx);
+#endif
+}
+
+template <typename KernelNameType> constexpr const char *getKernelName() {
+#ifndef __INTEL_SYCL_USE_INTEGRATION_HEADERS
+  return __builtin_sycl_kernel_name(KernelIdentity<KernelNameType>());
+#else
+  return KernelInfo<KernelNameType>::getName();
+#endif
+}
+
+template <typename KernelNameType> constexpr bool isKernelESIMD() {
+  // TODO Needs a builtin counterpart
+  return KernelInfo<KernelNameType>::isESIMD();
+}
+
+template <typename KernelNameType> constexpr const char *getKernelFileName() {
+#ifndef __INTEL_SYCL_USE_INTEGRATION_HEADERS
+  return __builtin_sycl_kernel_file_name(KernelIdentity<KernelNameType>());
+#else
+  return KernelInfo<KernelNameType>::getFileName();
+#endif
+}
+
+template <typename KernelNameType>
+constexpr const char *getKernelFunctionName() {
+#ifndef __INTEL_SYCL_USE_INTEGRATION_HEADERS
+  return __builtin_sycl_kernel_function_name(KernelIdentity<KernelNameType>());
+#else
+  return KernelInfo<KernelNameType>::getFunctionName();
+#endif
+}
+
+template <typename KernelNameType> constexpr unsigned getKernelLineNumber() {
+#ifndef __INTEL_SYCL_USE_INTEGRATION_HEADERS
+  return __builtin_sycl_kernel_line_number(KernelIdentity<KernelNameType>());
+#else
+  return KernelInfo<KernelNameType>::getLineNumber();
+#endif
+}
+
+template <typename KernelNameType> constexpr unsigned getKernelColumnNumber() {
+#ifndef __INTEL_SYCL_USE_INTEGRATION_HEADERS
+  return __builtin_sycl_kernel_column_number(KernelIdentity<KernelNameType>());
+#else
+  return KernelInfo<KernelNameType>::getColumnNumber();
+#endif
+}
+
+template <typename KernelNameType> constexpr int64_t getKernelSize() {
+  // TODO needs a builtin counterpart, but is currently only used for checking
+  // cases with external host compiler, which use integration headers.
+  return KernelInfo<KernelNameType>::getKernelSize();
+}
 } // namespace detail
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp
index 6f2e9f9fc19b7..6181a41e6ef8c 100644
--- a/sycl/include/sycl/handler.hpp
+++ b/sycl/include/sycl/handler.hpp
@@ -429,6 +429,17 @@ template <int Dims> bool range_size_fits_in_size_t(const range<Dims> &r) {
   }
   return true;
 }
+
+template <typename KernelNameType>
+std::vector<kernel_param_desc_t> getKernelParamDescs() {
+  std::vector<kernel_param_desc_t> Result;
+  int NumParams = getKernelNumParams<KernelNameType>();
+  Result.reserve(NumParams);
+  for (int I = 0; I < NumParams; ++I) {
+    Result.push_back(getKernelParamDesc<KernelNameType>(I));
+  }
+  return Result;
+}
 } // namespace detail
 
 /// Command group handler class.
@@ -528,14 +539,12 @@ class __SYCL_EXPORT handler {
   void throwOnLocalAccessorMisuse() const {
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
-    using KI = sycl::detail::KernelInfo<NameT>;
-
-    auto *KernelArgs = &KI::getParamDesc(0);
-
-    for (unsigned I = 0; I < KI::getNumParams(); ++I) {
-      const detail::kernel_param_kind_t &Kind = KernelArgs[I].kind;
+    for (unsigned I = 0; I < detail::getKernelNumParams<NameT>(); ++I) {
+      const detail::kernel_param_desc_t ParamDesc =
+          detail::getKernelParamDesc<NameT>(I);
+      const detail::kernel_param_kind_t &Kind = ParamDesc.kind;
       const access::target AccTarget =
-          static_cast<access::target>(KernelArgs[I].info & AccessTargetMask);
+          static_cast<access::target>(ParamDesc.info & AccessTargetMask);
       if ((Kind == detail::kernel_param_kind_t::kind_accessor) &&
           (AccTarget == target::local))
         throw sycl::exception(
@@ -546,8 +555,12 @@ class __SYCL_EXPORT handler {
     }
   }
 
-  /// Extracts and prepares kernel arguments from the lambda using integration
-  /// header.
+  /// Extracts and prepares kernel arguments from the lambda using information
+  /// from the built-ins or integration header.
+  void extractArgsAndReqsFromLambda(
+      char *LambdaPtr,
+      const std::vector<detail::kernel_param_desc_t> &ParamDescs, bool IsESIMD);
+  // TODO Unused, remove during ABI breaking window
   void
   extractArgsAndReqsFromLambda(char *LambdaPtr, size_t KernelArgsNum,
                                const detail::kernel_param_desc_t *KernelArgs,
@@ -570,7 +583,7 @@ class __SYCL_EXPORT handler {
     // kernel. Else it is necessary use set_atg(s) for resolve the order and
     // values of arguments for the kernel.
     assert(MKernel && "MKernel is not initialized");
-    const std::string LambdaName = detail::KernelInfo<LambdaNameT>::getName();
+    const std::string LambdaName = detail::getKernelName<LambdaNameT>();
     detail::string KernelName = getKernelName();
     return KernelName == LambdaName;
   }
@@ -885,21 +898,22 @@ class __SYCL_EXPORT handler {
   ///
   /// \param KernelName is the name of the SYCL kernel to check that the used
   ///                   kernel bundle contains.
-  void verifyUsedKernelBundle(const std::string &KernelName) {
-    verifyUsedKernelBundleInternal(detail::string_view{KernelName});
+  template <typename KernelNameT> void verifyUsedKernelBundle() {
+    verifyUsedKernelBundleInternal(
+        detail::string_view{detail::getKernelName<KernelNameT>()});
   }
   void verifyUsedKernelBundleInternal(detail::string_view KernelName);
 
   /// Stores lambda to the template-free object
   ///
   /// Also initializes kernel name, list of arguments and requirements using
-  /// information from the integration header.
+  /// information from the integration header/built-ins.
   ///
-  /// \param KernelFunc is a SYCL kernel function.
+  /// \param KernelFunc is a SYCL kernel function
+  /// \param ParamDescs is the vector of kernel parameter descriptors.
   template <typename KernelName, typename KernelType, int Dims,
             typename LambdaArgType>
   void StoreLambda(KernelType KernelFunc) {
-    using KI = detail::KernelInfo<KernelName>;
     constexpr bool IsCallableWithKernelHandler =
         detail::KernelLambdaHasKernelHandlerArgT<KernelType,
                                                  LambdaArgType>::value;
@@ -908,13 +922,18 @@ class __SYCL_EXPORT handler {
         ResetHostKernel<KernelType, LambdaArgType, Dims>(KernelFunc);
 
     constexpr bool KernelHasName =
-        KI::getName() != nullptr && KI::getName()[0] != '\0';
+        detail::getKernelName<KernelName>() != nullptr &&
+        detail::getKernelName<KernelName>()[0] != '\0';
 
     // Some host compilers may have different captures from Clang. Currently
     // there is no stable way of handling this when extracting the captures, so
     // a static assert is made to fail for incompatible kernel lambdas.
+
+    // TODO remove the ifdef once the kernel size builtin is supported.
+#ifdef __INTEL_SYCL_USE_INTEGRATION_HEADERS
     static_assert(
-        !KernelHasName || sizeof(KernelFunc) == KI::getKernelSize(),
+        !KernelHasName ||
+            sizeof(KernelFunc) == detail::getKernelSize<KernelName>(),
         "Unexpected kernel lambda size. This can be caused by an "
         "external host compiler producing a lambda with an "
         "unexpected layout. This is a limitation of the compiler."
@@ -925,16 +944,16 @@ class __SYCL_EXPORT handler {
         "In case of MSVC, passing "
         "-fsycl-host-compiler-options='/std:c++latest' "
         "might also help.");
-
+#endif
     // Empty name indicates that the compilation happens without integration
     // header, so don't perform things that require it.
     if (KernelHasName) {
       // TODO support ESIMD in no-integration-header case too.
       clearArgs();
       extractArgsAndReqsFromLambda(reinterpret_cast<char *>(KernelPtr),
-                                   KI::getNumParams(), &KI::getParamDesc(0),
-                                   KI::isESIMD());
-      MKernelName = KI::getName();
+                                   detail::getKernelParamDescs<KernelName>(),
+                                   detail::isKernelESIMD<KernelName>());
+      MKernelName = detail::getKernelName<KernelName>();
     } else {
       // In case w/o the integration header it is necessary to process
       // accessors from the list(which are associated with this handler) as
@@ -1031,7 +1050,6 @@ class __SYCL_EXPORT handler {
       typename KernelName,
       typename PropertiesT = ext::oneapi::experimental::empty_properties_t>
   void processProperties(PropertiesT Props) {
-    using KI = detail::KernelInfo<KernelName>;
     static_assert(
         ext::oneapi::experimental::is_property_list<PropertiesT>::value,
         "Template type is not a property list.");
@@ -1040,7 +1058,7 @@ class __SYCL_EXPORT handler {
             sycl::ext::intel::experimental::fp_control_key>() ||
             (PropertiesT::template has_property<
                  sycl::ext::intel::experimental::fp_control_key>() &&
-             KI::isESIMD()),
+             detail::isKernelESIMD<KernelName>()),
         "Floating point control property is supported for ESIMD kernels only.");
     static_assert(
         !PropertiesT::template has_property<
@@ -1334,8 +1352,7 @@ class __SYCL_EXPORT handler {
     //       conflicts they should be included in the name.
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
-
-    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
+    verifyUsedKernelBundle<NameT>();
 
     // Range rounding can be disabled by the user.
     // Range rounding is not done on the host device.
@@ -1417,7 +1434,7 @@ class __SYCL_EXPORT handler {
     //       conflicts they should be included in the name.
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
-    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
+    verifyUsedKernelBundle<NameT>();
     using LambdaArgType =
         sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
     static_assert(
@@ -1507,7 +1524,7 @@ class __SYCL_EXPORT handler {
     //       conflicts they should be included in the name.
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
-    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
+    verifyUsedKernelBundle<NameT>();
     using LambdaArgType =
         sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
     (void)NumWorkGroups;
@@ -1548,7 +1565,7 @@ class __SYCL_EXPORT handler {
     //       conflicts they should be included in the name.
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
-    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
+    verifyUsedKernelBundle<NameT>();
     using LambdaArgType =
         sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
     (void)NumWorkGroups;
@@ -1568,10 +1585,14 @@ class __SYCL_EXPORT handler {
   }
 
 #ifdef SYCL_LANGUAGE_VERSION
+#ifndef __INTEL_SYCL_USE_INTEGRATION_HEADERS
+#define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel_entry_point(KernelName)]]
+#else
 #define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel]]
+#endif // __INTEL_SYCL_USE_INTEGRATION_HEADERS
 #else
 #define __SYCL_KERNEL_ATTR__
-#endif
+#endif // SYCL_LANGUAGE_VERSION
 
   // NOTE: the name of this function - "kernel_single_task" - is used by the
   // Front End to determine kernel invocation kind.
@@ -1583,7 +1604,9 @@ class __SYCL_EXPORT handler {
       nullptr,
       ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
 #endif
-  __SYCL_KERNEL_ATTR__ void kernel_single_task(_KERNELFUNCPARAM(KernelFunc)) {
+
+  __SYCL_KERNEL_ATTR__ static void
+  kernel_single_task(_KERNELFUNCPARAM(KernelFunc)) {
 #ifdef __SYCL_DEVICE_ONLY__
     KernelFunc();
 #else
@@ -1601,8 +1624,8 @@ class __SYCL_EXPORT handler {
       nullptr,
       ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
 #endif
-  __SYCL_KERNEL_ATTR__ void kernel_single_task(_KERNELFUNCPARAM(KernelFunc),
-                                               kernel_handler KH) {
+  __SYCL_KERNEL_ATTR__ static void
+  kernel_single_task(_KERNELFUNCPARAM(KernelFunc), kernel_handler KH) {
 #ifdef __SYCL_DEVICE_ONLY__
     KernelFunc(KH);
 #else
@@ -1620,7 +1643,8 @@ class __SYCL_EXPORT handler {
       ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
       ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
 #endif
-  __SYCL_KERNEL_ATTR__ void kernel_parallel_for(_KERNELFUNCPARAM(KernelFunc)) {
+  __SYCL_KERNEL_ATTR__ static void
+  kernel_parallel_for(_KERNELFUNCPARAM(KernelFunc)) {
 #ifdef __SYCL_DEVICE_ONLY__
     KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
 #else
@@ -1637,8 +1661,8 @@ class __SYCL_EXPORT handler {
       ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
       ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
 #endif
-  __SYCL_KERNEL_ATTR__ void kernel_parallel_for(_KERNELFUNCPARAM(KernelFunc),
-                                                kernel_handler KH) {
+  __SYCL_KERNEL_ATTR__ static void
+  kernel_parallel_for(_KERNELFUNCPARAM(KernelFunc), kernel_handler KH) {
 #ifdef __SYCL_DEVICE_ONLY__
     KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
 #else
@@ -1656,7 +1680,7 @@ class __SYCL_EXPORT handler {
       ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
       ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
 #endif
-  __SYCL_KERNEL_ATTR__ void
+  __SYCL_KERNEL_ATTR__ static void
   kernel_parallel_for_work_group(_KERNELFUNCPARAM(KernelFunc)) {
 #ifdef __SYCL_DEVICE_ONLY__
     KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
@@ -1674,7 +1698,7 @@ class __SYCL_EXPORT handler {
       ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
       ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
 #endif
-  __SYCL_KERNEL_ATTR__ void
+  __SYCL_KERNEL_ATTR__ static void
   kernel_parallel_for_work_group(_KERNELFUNCPARAM(KernelFunc),
                                  kernel_handler KH) {
 #ifdef __SYCL_DEVICE_ONLY__
@@ -1822,7 +1846,8 @@ class __SYCL_EXPORT handler {
     //       conflicts they should be included in the name.
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
-    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
+
+    verifyUsedKernelBundle<NameT>();
     kernel_single_task_wrapper<NameT, KernelType, PropertiesT>(KernelFunc);
 #ifndef __SYCL_DEVICE_ONLY__
     // No need to check if range is out of INT_MAX limits as it's compile-time
@@ -2118,7 +2143,7 @@ class __SYCL_EXPORT handler {
     throwIfActionIsCreated();
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
-    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
+    verifyUsedKernelBundle<NameT>();
     using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
     using TransformedArgType = std::conditional_t<
         std::is_integral<LambdaArgType>::value && Dims == 1, item<Dims>,
@@ -2259,7 +2284,7 @@ class __SYCL_EXPORT handler {
     setHandlerKernelBundle(Kernel);
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
-    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
+    verifyUsedKernelBundle<NameT>();
     (void)Kernel;
     kernel_single_task<NameT>(KernelFunc);
 #ifndef __SYCL_DEVICE_ONLY__
@@ -2294,7 +2319,7 @@ class __SYCL_EXPORT handler {
     setHandlerKernelBundle(Kernel);
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
-    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
+    verifyUsedKernelBundle<NameT>();
     using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
     (void)Kernel;
     (void)NumWorkItems;
@@ -2333,7 +2358,7 @@ class __SYCL_EXPORT handler {
     setHandlerKernelBundle(Kernel);
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
-    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
+    verifyUsedKernelBundle<NameT>();
     using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
     (void)Kernel;
     (void)NumWorkItems;
@@ -2372,7 +2397,7 @@ class __SYCL_EXPORT handler {
     setHandlerKernelBundle(Kernel);
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
-    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
+    verifyUsedKernelBundle<NameT>();
     using LambdaArgType =
         sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
     (void)Kernel;
@@ -2415,7 +2440,7 @@ class __SYCL_EXPORT handler {
     setHandlerKernelBundle(Kernel);
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
-    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
+    verifyUsedKernelBundle<NameT>();
     using LambdaArgType =
         sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
     (void)Kernel;
@@ -2455,7 +2480,7 @@ class __SYCL_EXPORT handler {
     setHandlerKernelBundle(Kernel);
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
-    verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
+    verifyUsedKernelBundle<NameT>();
     using LambdaArgType =
         sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
     (void)Kernel;
diff --git a/sycl/include/sycl/kernel_bundle.hpp b/sycl/include/sycl/kernel_bundle.hpp
index 036bb6a3afe6a..1237bc0651b40 100644
--- a/sycl/include/sycl/kernel_bundle.hpp
+++ b/sycl/include/sycl/kernel_bundle.hpp
@@ -496,8 +496,8 @@ __SYCL_EXPORT kernel_id get_kernel_id_impl(string_view KernelName);
 template <typename KernelName> kernel_id get_kernel_id() {
   // FIXME: This must fail at link-time if KernelName not in any available
   // translation units.
-  using KI = sycl::detail::KernelInfo<KernelName>;
-  return detail::get_kernel_id_impl(detail::string_view{KI::getName()});
+  return detail::get_kernel_id_impl(
+      detail::string_view{detail::getKernelName<KernelName>()});
 }
 
 /// \returns a vector with all kernel_id's defined in the application
diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp
index f07e09db7a8b3..39f69046ad2aa 100644
--- a/sycl/include/sycl/queue.hpp
+++ b/sycl/include/sycl/queue.hpp
@@ -2357,10 +2357,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
           ext::oneapi::experimental::is_property_list<PropertiesT>::value,
       event>
   parallel_for(nd_range<Dims> Range, PropertiesT Properties, RestT &&...Rest) {
-    using KI = sycl::detail::KernelInfo<KernelName>;
-    constexpr detail::code_location CodeLoc(
-        KI::getFileName(), KI::getFunctionName(), KI::getLineNumber(),
-        KI::getColumnNumber());
+    constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
     detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
     return submit(
         [&](handler &CGH) {
@@ -2393,10 +2390,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
   template <typename KernelName = detail::auto_name, int Dims,
             typename... RestT>
   event parallel_for(nd_range<Dims> Range, event DepEvent, RestT &&...Rest) {
-    using KI = sycl::detail::KernelInfo<KernelName>;
-    constexpr detail::code_location CodeLoc(
-        KI::getFileName(), KI::getFunctionName(), KI::getLineNumber(),
-        KI::getColumnNumber());
+    constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
     detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
     return submit(
         [&](handler &CGH) {
@@ -2418,10 +2412,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
             typename... RestT>
   event parallel_for(nd_range<Dims> Range, const std::vector<event> &DepEvents,
                      RestT &&...Rest) {
-    using KI = sycl::detail::KernelInfo<KernelName>;
-    constexpr detail::code_location CodeLoc(
-        KI::getFileName(), KI::getFunctionName(), KI::getLineNumber(),
-        KI::getColumnNumber());
+    constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
     detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
     return submit(
         [&](handler &CGH) {
@@ -2754,10 +2745,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
       event>
   parallel_for_impl(range<Dims> Range, PropertiesT Properties,
                     RestT &&...Rest) {
-    using KI = sycl::detail::KernelInfo<KernelName>;
-    constexpr detail::code_location CodeLoc(
-        KI::getFileName(), KI::getFunctionName(), KI::getLineNumber(),
-        KI::getColumnNumber());
+    constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
     detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
     return submit(
         [&](handler &CGH) {
@@ -2791,10 +2779,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
       ext::oneapi::experimental::is_property_list<PropertiesT>::value, event>
   parallel_for_impl(range<Dims> Range, event DepEvent, PropertiesT Properties,
                     RestT &&...Rest) {
-    using KI = sycl::detail::KernelInfo<KernelName>;
-    constexpr detail::code_location CodeLoc(
-        KI::getFileName(), KI::getFunctionName(), KI::getLineNumber(),
-        KI::getColumnNumber());
+    constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
     detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
     return submit(
         [&](handler &CGH) {
@@ -2831,10 +2816,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
       ext::oneapi::experimental::is_property_list<PropertiesT>::value, event>
   parallel_for_impl(range<Dims> Range, const std::vector<event> &DepEvents,
                     PropertiesT Properties, RestT &&...Rest) {
-    using KI = sycl::detail::KernelInfo<KernelName>;
-    constexpr detail::code_location CodeLoc(
-        KI::getFileName(), KI::getFunctionName(), KI::getLineNumber(),
-        KI::getColumnNumber());
+    constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
     detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
     return submit(
         [&](handler &CGH) {
@@ -2869,6 +2851,14 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
                                size_t Offset,
                                const std::vector<event> &DepEvents);
   const property_list &getPropList() const;
+
+  template <typename KernelName>
+  static constexpr detail::code_location getCodeLocation() {
+    return {detail::getKernelFileName<KernelName>(),
+            detail::getKernelFunctionName<KernelName>(),
+            detail::getKernelLineNumber<KernelName>(),
+            detail::getKernelColumnNumber<KernelName>()};
+  }
 };
 
 } // namespace _V1
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index db50068328854..50e7d007e537e 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -842,16 +842,16 @@ void handler::extractArgsAndReqs() {
 }
 
 void handler::extractArgsAndReqsFromLambda(
-    char *LambdaPtr, size_t KernelArgsNum,
-    const detail::kernel_param_desc_t *KernelArgs, bool IsESIMD) {
+    char *LambdaPtr, const std::vector<detail::kernel_param_desc_t> &ParamDescs,
+    bool IsESIMD) {
   const bool IsKernelCreatedFromSource = false;
   size_t IndexShift = 0;
-  impl->MArgs.reserve(MaxNumAdditionalArgs * KernelArgsNum);
+  impl->MArgs.reserve(MaxNumAdditionalArgs * ParamDescs.size());
 
-  for (size_t I = 0; I < KernelArgsNum; ++I) {
-    void *Ptr = LambdaPtr + KernelArgs[I].offset;
-    const detail::kernel_param_kind_t &Kind = KernelArgs[I].kind;
-    const int &Size = KernelArgs[I].info;
+  for (size_t I = 0; I < ParamDescs.size(); ++I) {
+    void *Ptr = LambdaPtr + ParamDescs[I].offset;
+    const detail::kernel_param_kind_t &Kind = ParamDescs[I].kind;
+    const int &Size = ParamDescs[I].info;
     if (Kind == detail::kernel_param_kind_t::kind_accessor) {
       // For args kind of accessor Size is information about accessor.
       // The first 11 bits of Size encodes the accessor target.
@@ -875,6 +875,15 @@ void handler::extractArgsAndReqsFromLambda(
   }
 }
 
+// TODO Unused, remove during ABI breaking window
+void handler::extractArgsAndReqsFromLambda(
+    char *LambdaPtr, size_t KernelArgsNum,
+    const detail::kernel_param_desc_t *KernelArgs, bool IsESIMD) {
+  std::vector<detail::kernel_param_desc_t> ParamDescs(
+      KernelArgs, KernelArgs + KernelArgsNum);
+  extractArgsAndReqsFromLambda(LambdaPtr, ParamDescs, IsESIMD);
+}
+
 // Calling methods of kernel_impl requires knowledge of class layout.
 // As this is impossible in header, there's a function that calls necessary
 // method inside the library and returns the result.
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index d9c1b61f7f691..4c73f43ed6ba2 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -3514,6 +3514,7 @@ _ZN4sycl3_V17handler26setNDRangeDescriptorPaddedENS0_5rangeILi3EEES3_NS0_2idILi3
 _ZN4sycl3_V17handler26setNDRangeDescriptorPaddedENS0_5rangeILi3EEEbi
 _ZN4sycl3_V17handler27addLifetimeSharedPtrStorageESt10shared_ptrIKvE
 _ZN4sycl3_V17handler27computeFallbackKernelBoundsEmm
+_ZN4sycl3_V17handler28extractArgsAndReqsFromLambdaEPcRKSt6vectorINS0_6detail19kernel_param_desc_tESaIS5_EEb
 _ZN4sycl3_V17handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tEb
 _ZN4sycl3_V17handler28memcpyToHostOnlyDeviceGlobalEPKvS3_mbmm
 _ZN4sycl3_V17handler28setArgsToAssociatedAccessorsEv
diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump
index 6f55e82a9151a..e2c3643c557be 100644
--- a/sycl/test/abi/sycl_symbols_windows.dump
+++ b/sycl/test/abi/sycl_symbols_windows.dump
@@ -3862,6 +3862,7 @@
 ?ext_oneapi_wait_external_semaphore@queue@_V1@sycl@@QEAA?AVevent@23@Uexternal_semaphore@experimental@oneapi@ext@23@_KAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z
 ?ext_oneapi_wait_external_semaphore@queue@_V1@sycl@@QEAA?AVevent@23@Uexternal_semaphore@experimental@oneapi@ext@23@_KV423@AEBUcode_location@detail@23@@Z
 ?extractArgsAndReqs@handler@_V1@sycl@@AEAAXXZ
+?extractArgsAndReqsFromLambda@handler@_V1@sycl@@AEAAXPEADAEBV?$vector@Ukernel_param_desc_t@detail@_V1@sycl@@V?$allocator@Ukernel_param_desc_t@detail@_V1@sycl@@@std@@@std@@_N@Z
 ?extractArgsAndReqsFromLambda@handler@_V1@sycl@@AEAAXPEAD_KPEBUkernel_param_desc_t@detail@23@_N@Z
 ?fill_impl@handler@_V1@sycl@@AEAAXPEAXPEBX_K2@Z
 ?finalize@handler@_V1@sycl@@AEAA?AVevent@23@XZ
@@ -4269,7 +4270,6 @@
 ?updateValue@dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@IEAAXPEBVraw_kernel_arg@34567@_K@Z
 ?use_kernel_bundle@handler@_V1@sycl@@QEAAXAEBV?$kernel_bundle@$01@23@@Z
 ?verifyDeviceHasProgressGuarantee@handler@_V1@sycl@@AEAAXW4forward_progress_guarantee@experimental@oneapi@ext@23@W4execution_scope@56723@1@Z
-?verifyUsedKernelBundle@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z
 ?verifyUsedKernelBundleInternal@handler@_V1@sycl@@AEAAXVstring_view@detail@23@@Z
 ?wait@event@_V1@sycl@@QEAAXXZ
 ?wait@event@_V1@sycl@@SAXAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@@Z

From ed2128d8678b56524790161193e93c10271d96c4 Mon Sep 17 00:00:00 2001
From: Sergey Semenov <sergey.semenov@intel.com>
Date: Tue, 10 Sep 2024 03:01:48 -0700
Subject: [PATCH 10/10] [SYCL]{NFC] Add thread library linking to
 multithread_write_accessor (#15331)

---
 sycl/test-e2e/Regression/multithread_write_accessor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/test-e2e/Regression/multithread_write_accessor.cpp b/sycl/test-e2e/Regression/multithread_write_accessor.cpp
index d3228a2d2f49a..c8ef2c534d57a 100644
--- a/sycl/test-e2e/Regression/multithread_write_accessor.cpp
+++ b/sycl/test-e2e/Regression/multithread_write_accessor.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -o %t.out
+// RUN: %{build} -o %t.out %threads_lib
 // RUN: %{run} %t.out
 #include <sycl/detail/core.hpp>