From 8a231d192ccd52c1d63077f91ef19603af1455dd Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Thu, 5 Dec 2024 07:09:06 -0800
Subject: [PATCH 01/22] Add XFAIL trackers for InvokeSimd/Spec tuple tests

---
 sycl/test-e2e/InvokeSimd/Spec/tuple.cpp        | 1 +
 sycl/test-e2e/InvokeSimd/Spec/tuple_return.cpp | 1 +
 sycl/test-e2e/InvokeSimd/Spec/tuple_vadd.cpp   | 1 +
 3 files changed, 3 insertions(+)

diff --git a/sycl/test-e2e/InvokeSimd/Spec/tuple.cpp b/sycl/test-e2e/InvokeSimd/Spec/tuple.cpp
index 6c9d869b3684a..cf4f1e4d5ec75 100644
--- a/sycl/test-e2e/InvokeSimd/Spec/tuple.cpp
+++ b/sycl/test-e2e/InvokeSimd/Spec/tuple.cpp
@@ -1,5 +1,6 @@
 // TODO: enable when Jira ticket resolved
 // XFAIL: *
+// XFAIL-TRACKER: https://jira.devtools.intel.com/browse/GSD-4509
 //
 // Check that full compilation works:
 // RUN: %{build} -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t.out
diff --git a/sycl/test-e2e/InvokeSimd/Spec/tuple_return.cpp b/sycl/test-e2e/InvokeSimd/Spec/tuple_return.cpp
index bdc5fe9a7e19b..8c2910a6eb5f3 100644
--- a/sycl/test-e2e/InvokeSimd/Spec/tuple_return.cpp
+++ b/sycl/test-e2e/InvokeSimd/Spec/tuple_return.cpp
@@ -1,5 +1,6 @@
 // TODO: enable when Jira ticket resolved
 // XFAIL: *
+// XFAIL-TRACKER: https://jira.devtools.intel.com/browse/GSD-4509
 //
 // Check that full compilation works:
 // RUN: %{build} -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t.out
diff --git a/sycl/test-e2e/InvokeSimd/Spec/tuple_vadd.cpp b/sycl/test-e2e/InvokeSimd/Spec/tuple_vadd.cpp
index 38f654ea769b3..6a6b1fc16f383 100644
--- a/sycl/test-e2e/InvokeSimd/Spec/tuple_vadd.cpp
+++ b/sycl/test-e2e/InvokeSimd/Spec/tuple_vadd.cpp
@@ -1,5 +1,6 @@
 // TODO: enable when Jira ticket resolved
 // XFAIL: *
+// XFAIL-TRACKER: https://jira.devtools.intel.com/browse/GSD-4509
 //
 // Check that full compilation works:
 // RUN: %{build} -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t.out

From abd8fd3f12a03d4de94550e33fb221d83e139840 Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Thu, 5 Dec 2024 07:10:52 -0800
Subject: [PATCH 02/22] Add XFAIL trackers for Basic, Reduction and other e2e tests

---
 sycl/test-e2e/Basic/buffer/reinterpret.cpp                     | 1 +
 sycl/test-e2e/Basic/queue/queue.cpp                            | 2 ++
 sycl/test-e2e/Basic/queue/release.cpp                          | 1 +
 sycl/test-e2e/Basic/span.cpp                                   | 1 +
 sycl/test-e2e/Basic/stream/auto_flush.cpp                      | 2 ++
 sycl/test-e2e/DeprecatedFeatures/queue_old_interop.cpp         | 2 ++
 sycl/test-e2e/DeviceCodeSplit/split-per-kernel.cpp             | 1 +
 sycl/test-e2e/DeviceCodeSplit/split-per-source-main.cpp        | 1 +
 sycl/test-e2e/GroupAlgorithm/root_group.cpp                    | 2 ++
 sycl/test-e2e/GroupLocalMemory/group_local_memory.cpp          | 1 +
 sycl/test-e2e/GroupLocalMemory/no_early_opt.cpp                | 1 +
 .../InvokeSimd/Feature/ImplicitSubgroup/invoke_simd_struct.cpp | 1 +
 sycl/test-e2e/InvokeSimd/Feature/invoke_simd_struct.cpp        | 1 +
 sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp      | 1 +
 sycl/test-e2e/Reduction/reduction_nd_conditional.cpp           | 3 ++-
 sycl/test-e2e/Reduction/reduction_nd_dw.cpp                    | 1 +
 sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp            | 3 ++-
 sycl/test-e2e/Reduction/reduction_nd_ext_half.cpp              | 1 +
 sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp        | 1 +
 sycl/test-e2e/Reduction/reduction_nd_rw.cpp                    | 1 +
 sycl/test-e2e/Reduction/reduction_range_queue_shortcut.cpp     | 1 +
 sycl/test-e2e/Reduction/reduction_range_usm_dw.cpp             | 1 +
 sycl/test-e2e/Reduction/reduction_span_pack.cpp                | 1 +
 sycl/test-e2e/Reduction/reduction_usm.cpp                      | 1 +
 sycl/test-e2e/Reduction/reduction_usm_dw.cpp                   | 1 +
 25 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/sycl/test-e2e/Basic/buffer/reinterpret.cpp b/sycl/test-e2e/Basic/buffer/reinterpret.cpp
index 691af19ff34f8..7efa1f61030ea 100644
--- a/sycl/test-e2e/Basic/buffer/reinterpret.cpp
+++ b/sycl/test-e2e/Basic/buffer/reinterpret.cpp
@@ -2,6 +2,7 @@
 // RUN: %{run} %t.out
 //
 // XFAIL: level_zero&&gpu
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14430
 
 //==---------- reinterpret.cpp --- SYCL buffer reinterpret basic test ------==//
 //
diff --git a/sycl/test-e2e/Basic/queue/queue.cpp b/sycl/test-e2e/Basic/queue/queue.cpp
index bde85310cc06c..94b628197a3d3 100644
--- a/sycl/test-e2e/Basic/queue/queue.cpp
+++ b/sycl/test-e2e/Basic/queue/queue.cpp
@@ -2,6 +2,8 @@
 // RUN: %{run} %t.out
 //
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16197
+
 //==--------------- queue.cpp - SYCL queue test ----------------------------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
diff --git a/sycl/test-e2e/Basic/queue/release.cpp b/sycl/test-e2e/Basic/queue/release.cpp
index d241b742a0216..b041871fb31a3 100644
--- a/sycl/test-e2e/Basic/queue/release.cpp
+++ b/sycl/test-e2e/Basic/queue/release.cpp
@@ -2,6 +2,7 @@
 // RUN: env SYCL_UR_TRACE=2 %{run} %t.out | FileCheck %s %if !windows %{--check-prefixes=CHECK-RELEASE%}
 //
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16197
 
 #include <sycl/detail/core.hpp>
 int main() {
diff --git a/sycl/test-e2e/Basic/span.cpp b/sycl/test-e2e/Basic/span.cpp
index 17c84359e8140..e258997a10fb8 100644
--- a/sycl/test-e2e/Basic/span.cpp
+++ b/sycl/test-e2e/Basic/span.cpp
@@ -3,6 +3,7 @@
 //
 // Fails to release USM pointer on HIP for NVIDIA
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14404
 // REQUIRES: aspect-usm_shared_allocations
 #include <numeric>
 
diff --git a/sycl/test-e2e/Basic/stream/auto_flush.cpp b/sycl/test-e2e/Basic/stream/auto_flush.cpp
index d25cf943a6f3a..ddc5b8e19e0f6 100644
--- a/sycl/test-e2e/Basic/stream/auto_flush.cpp
+++ b/sycl/test-e2e/Basic/stream/auto_flush.cpp
@@ -2,6 +2,8 @@
 // RUN: %{run} %t.out %if !gpu || linux %{ | FileCheck %s %}
 //
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16198
+
 //==-------------- copy.cpp - SYCL stream obect auto flushing test ---------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
diff --git a/sycl/test-e2e/DeprecatedFeatures/queue_old_interop.cpp b/sycl/test-e2e/DeprecatedFeatures/queue_old_interop.cpp
index 98f36dd106bae..ee93969e8a547 100644
--- a/sycl/test-e2e/DeprecatedFeatures/queue_old_interop.cpp
+++ b/sycl/test-e2e/DeprecatedFeatures/queue_old_interop.cpp
@@ -4,6 +4,8 @@
 // hip_nvidia has problems constructing queues due to `No device of requested
 // type available`.
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16199
+
 //==-------- queue_old_interop.cpp - SYCL queue OpenCL interop test --------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
diff --git a/sycl/test-e2e/DeviceCodeSplit/split-per-kernel.cpp b/sycl/test-e2e/DeviceCodeSplit/split-per-kernel.cpp
index 8b32a880a6470..b1999b7bbf4e2 100644
--- a/sycl/test-e2e/DeviceCodeSplit/split-per-kernel.cpp
+++ b/sycl/test-e2e/DeviceCodeSplit/split-per-kernel.cpp
@@ -3,6 +3,7 @@
 // RUN: %{run} %t.out
 //
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16201
 
 #include <sycl/detail/core.hpp>
 #include <sycl/kernel_bundle.hpp>
diff --git a/sycl/test-e2e/DeviceCodeSplit/split-per-source-main.cpp b/sycl/test-e2e/DeviceCodeSplit/split-per-source-main.cpp
index 4f9063c18e784..50f8db4a060b9 100644
--- a/sycl/test-e2e/DeviceCodeSplit/split-per-source-main.cpp
+++ b/sycl/test-e2e/DeviceCodeSplit/split-per-source-main.cpp
@@ -3,6 +3,7 @@
 // RUN: %{run} %t.out
 //
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16201
 
 #include "Inputs/split-per-source.h"
 
diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
index fd467f14800e8..fc76171e36fb7 100644
--- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
@@ -1,5 +1,7 @@
 // Fails with opencl non-cpu, enable when fixed.
 // XFAIL: (opencl && !cpu && !accelerator)
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14641
+
 // RUN: %{build} -I . -o %t.out %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/GroupLocalMemory/group_local_memory.cpp b/sycl/test-e2e/GroupLocalMemory/group_local_memory.cpp
index 7356f94a69ff8..5fcd1de9b2ca5 100644
--- a/sycl/test-e2e/GroupLocalMemory/group_local_memory.cpp
+++ b/sycl/test-e2e/GroupLocalMemory/group_local_memory.cpp
@@ -2,6 +2,7 @@
 // RUN: %{run} %t.out
 //
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16204
 
 #include <sycl/detail/core.hpp>
 
diff --git a/sycl/test-e2e/GroupLocalMemory/no_early_opt.cpp b/sycl/test-e2e/GroupLocalMemory/no_early_opt.cpp
index 1091cf90b6dca..2f9822138a2fe 100644
--- a/sycl/test-e2e/GroupLocalMemory/no_early_opt.cpp
+++ b/sycl/test-e2e/GroupLocalMemory/no_early_opt.cpp
@@ -2,6 +2,7 @@
 // RUN: %{run} %t.out
 //
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16204
 
 // The test checks that multiple calls to the same template instantiation of a
 // group local memory function result in separate allocations, even with device
diff --git a/sycl/test-e2e/InvokeSimd/Feature/ImplicitSubgroup/invoke_simd_struct.cpp b/sycl/test-e2e/InvokeSimd/Feature/ImplicitSubgroup/invoke_simd_struct.cpp
index 74b82dfc85ddc..fafd570f4bab2 100644
--- a/sycl/test-e2e/InvokeSimd/Feature/ImplicitSubgroup/invoke_simd_struct.cpp
+++ b/sycl/test-e2e/InvokeSimd/Feature/ImplicitSubgroup/invoke_simd_struct.cpp
@@ -1,6 +1,7 @@
 // TODO: Passing/returning structures via invoke_simd() API is not implemented
 // in GPU driver yet. Enable the test when GPU RT supports it.
 // XFAIL: gpu && run-mode
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14543
 //
 // RUN: %{build} -DIMPL_SUBGROUP -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t.out
 // RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out
diff --git a/sycl/test-e2e/InvokeSimd/Feature/invoke_simd_struct.cpp b/sycl/test-e2e/InvokeSimd/Feature/invoke_simd_struct.cpp
index 014027460301c..961a91b83b4f7 100644
--- a/sycl/test-e2e/InvokeSimd/Feature/invoke_simd_struct.cpp
+++ b/sycl/test-e2e/InvokeSimd/Feature/invoke_simd_struct.cpp
@@ -1,6 +1,7 @@
 // TODO: Passing/returning structures via invoke_simd() API is not implemented
 // in GPU driver yet. Enable the test when GPU RT supports it.
 // XFAIL: gpu, run-mode
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14543
 //
 // RUN: %{build} -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t.out
 // RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out
diff --git a/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp b/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp
index 5c2a9edc4682c..b22b380465a9b 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp
@@ -3,6 +3,7 @@
 
 // Group algorithms are not supported on NVidia.
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 
 // This test only checks that the method queue::parallel_for() accepting
 // reduction, can be properly translated into queue::submit + parallel_for().
diff --git a/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp b/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp
index b8ab5f0952e13..76bbccb358176 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp
@@ -5,7 +5,8 @@
 // parallel_for with reduction requires work group size not bigger than 1` on
 // Nvidia.
 // XFAIL: hip_nvidia
-
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
+ 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
 
diff --git a/sycl/test-e2e/Reduction/reduction_nd_dw.cpp b/sycl/test-e2e/Reduction/reduction_nd_dw.cpp
index 7bfab8d98cdf1..093c97d050c41 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_dw.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_dw.cpp
@@ -3,6 +3,7 @@
 //
 // Group algorithms are not supported on Nvidia.
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp b/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp
index a4ebe61e84ed0..746ea97148cee 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp
@@ -6,7 +6,8 @@
 // work group size not bigger than 1` on Nvidia.
 
 // XFAIL: hip_nvidia
-
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
+ 
 // This test performs basic checks of parallel_for(nd_range, reduction, func)
 // used with 'double' type.
 
diff --git a/sycl/test-e2e/Reduction/reduction_nd_ext_half.cpp b/sycl/test-e2e/Reduction/reduction_nd_ext_half.cpp
index 03b9ebe7ca423..cc82325754984 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_ext_half.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_ext_half.cpp
@@ -6,6 +6,7 @@
 // `The implementation handling parallel_for with reduction requires
 // work group size not bigger than 1`.
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp b/sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp
index 68f1ee3397576..bbefd1dbe51fb 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp
@@ -3,6 +3,7 @@
 
 // Group algorithms are not supported on NVidia.
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Reduction/reduction_nd_rw.cpp b/sycl/test-e2e/Reduction/reduction_nd_rw.cpp
index 901188866a41a..2b26b135582d9 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_rw.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_rw.cpp
@@ -3,6 +3,7 @@
 //
 // `Group algorithms are not supported on host device.` on Nvidia.
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Reduction/reduction_range_queue_shortcut.cpp b/sycl/test-e2e/Reduction/reduction_range_queue_shortcut.cpp
index f06aab2f517a1..81d6026a2f9e0 100644
--- a/sycl/test-e2e/Reduction/reduction_range_queue_shortcut.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_queue_shortcut.cpp
@@ -3,6 +3,7 @@
 
 // Group algorithms are not supported on NVidia.
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Reduction/reduction_range_usm_dw.cpp b/sycl/test-e2e/Reduction/reduction_range_usm_dw.cpp
index 2d1cae039550a..4a6ee054f4539 100644
--- a/sycl/test-e2e/Reduction/reduction_range_usm_dw.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_usm_dw.cpp
@@ -4,6 +4,7 @@
 // Error message `Group algorithms are not
 // supported on host device.` on Nvidia.
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Reduction/reduction_span_pack.cpp b/sycl/test-e2e/Reduction/reduction_span_pack.cpp
index e6e5252c57577..ec55e26f781a8 100644
--- a/sycl/test-e2e/Reduction/reduction_span_pack.cpp
+++ b/sycl/test-e2e/Reduction/reduction_span_pack.cpp
@@ -3,6 +3,7 @@
 //
 // `Group algorithms are not supported on host device.` on Nvidia.
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Reduction/reduction_usm.cpp b/sycl/test-e2e/Reduction/reduction_usm.cpp
index 9a27956982117..353de97907cf0 100644
--- a/sycl/test-e2e/Reduction/reduction_usm.cpp
+++ b/sycl/test-e2e/Reduction/reduction_usm.cpp
@@ -3,6 +3,7 @@
 //
 // `Group algorithms are not supported on host device.` on Nvidia.
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Reduction/reduction_usm_dw.cpp b/sycl/test-e2e/Reduction/reduction_usm_dw.cpp
index 5b36fcba18e56..56e07083587cd 100644
--- a/sycl/test-e2e/Reduction/reduction_usm_dw.cpp
+++ b/sycl/test-e2e/Reduction/reduction_usm_dw.cpp
@@ -3,6 +3,7 @@
 
 // `Group algorithms are not supported on host device` on Nvidia.
 // XFAIL: hip_nvidia
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows

From 5edd0ad19797c0a0a3ae3b2e9319341460499c8a Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Thu, 5 Dec 2024 07:26:50 -0800
Subject: [PATCH 03/22] Add XFAIL trackers for several more tests

---
 sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple.cpp       | 1 +
 .../test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_return.cpp | 1 +
 sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_vadd.cpp  | 1 +
 sycl/test-e2e/Reduction/reduction_nd_conditional.cpp           | 3 +--
 sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp            | 3 +--
 5 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple.cpp b/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple.cpp
index 34fe12ebb70e9..daada8ecf343d 100644
--- a/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple.cpp
+++ b/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple.cpp
@@ -1,5 +1,6 @@
 // TODO: enable when Jira ticket resolved
 // XFAIL: *
+// XFAIL-TRACKER: https://jira.devtools.intel.com/browse/GSD-4509
 //
 // Check that full compilation works:
 // RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../tuple.cpp -o %t.out
diff --git a/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_return.cpp b/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_return.cpp
index 90c3468b4151f..270c109150a2c 100644
--- a/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_return.cpp
+++ b/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_return.cpp
@@ -1,5 +1,6 @@
 // TODO: enable when Jira ticket resolved
 // XFAIL: *
+// XFAIL-TRACKER: https://jira.devtools.intel.com/browse/GSD-4509
 //
 // Check that full compilation works:
 // RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../tuple_return.cpp -o %t.out
diff --git a/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_vadd.cpp b/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_vadd.cpp
index 7d2bed1426cd8..89828d6a76000 100644
--- a/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_vadd.cpp
+++ b/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_vadd.cpp
@@ -1,5 +1,6 @@
 // TODO: enable when Jira ticket resolved
 // XFAIL: *
+// XFAIL-TRACKER: https://jira.devtools.intel.com/browse/GSD-4509
 //
 // Check that full compilation works:
 // RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../tuple_vadd.cpp -o %t.out
diff --git a/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp b/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp
index 76bbccb358176..0611367572373 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp
@@ -5,8 +5,7 @@
 // parallel_for with reduction requires work group size not bigger than 1` on
 // Nvidia.
 // XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
- 
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
 
diff --git a/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp b/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp
index 746ea97148cee..74b1ffe425419 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp
@@ -6,8 +6,7 @@
 // work group size not bigger than 1` on Nvidia.
 
 // XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
- 
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973 
 // This test performs basic checks of parallel_for(nd_range, reduction, func)
 // used with 'double' type.
 

From dda50ca91e1e64c1bd90f70a5a35b6b2f78a2fdc Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Thu, 5 Dec 2024 07:35:17 -0800
Subject: [PATCH 04/22] Remove trailing whitespace from XFAIL-TRACKER lines

---
 sycl/test-e2e/Reduction/reduction_nd_conditional.cpp | 2 +-
 sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp b/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp
index 0611367572373..c5fcbac3ff29b 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp
@@ -5,7 +5,7 @@
 // parallel_for with reduction requires work group size not bigger than 1` on
 // Nvidia.
 // XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973 
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
 
diff --git a/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp b/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp
index 74b1ffe425419..18c9e7bc88db4 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp
@@ -6,7 +6,7 @@
 // work group size not bigger than 1` on Nvidia.
 
 // XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973 
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 // This test performs basic checks of parallel_for(nd_range, reduction, func)
 // used with 'double' type.
 

From 87856a72381b1caea69e7286b21dad7a2cbedfd0 Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Thu, 5 Dec 2024 07:43:57 -0800
Subject: [PATCH 05/22] Reduce number of tests without XFAIL tracker

---
 sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp b/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp
index c867ce1f4f420..c42505b01f3b5 100644
--- a/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp
+++ b/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp
@@ -51,7 +51,7 @@
 // tests to match the required format and in that case you should just update
 // (i.e. reduce) the number and the list below.
 //
-// NUMBER-OF-XFAIL-WITHOUT-TRACKER: 77
+// NUMBER-OF-XFAIL-WITHOUT-TRACKER: 46
 //
 // List of improperly XFAIL-ed tests.
 // Remove the CHECK once the test has been properly XFAIL-ed.

From 8d1047365fcb007e2471e684b277dd18feaab3ca Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Thu, 5 Dec 2024 08:16:36 -0800
Subject: [PATCH 06/22] Fix wrong tracker in span.cpp test

---
 sycl/test-e2e/Basic/span.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/test-e2e/Basic/span.cpp b/sycl/test-e2e/Basic/span.cpp
index e258997a10fb8..735b705f18fc1 100644
--- a/sycl/test-e2e/Basic/span.cpp
+++ b/sycl/test-e2e/Basic/span.cpp
@@ -3,7 +3,7 @@
 //
 // Fails to release USM pointer on HIP for NVIDIA
 // XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14404
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14432
 // REQUIRES: aspect-usm_shared_allocations
 #include <numeric>
 

From 0f2daf2ac2576a25e188c03eaf281d7e9cc7aacb Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Thu, 5 Dec 2024 08:21:14 -0800
Subject: [PATCH 07/22] Delete tests failing because of GSD-4509 issue

---
 .../Spec/ImplicitSubgroup/tuple.cpp           |  16 --
 .../Spec/ImplicitSubgroup/tuple_return.cpp    |  16 --
 .../Spec/ImplicitSubgroup/tuple_vadd.cpp      |  16 --
 sycl/test-e2e/InvokeSimd/Spec/tuple.cpp       | 148 -----------------
 .../test-e2e/InvokeSimd/Spec/tuple_return.cpp | 150 -----------------
 sycl/test-e2e/InvokeSimd/Spec/tuple_vadd.cpp  | 152 ------------------
 6 files changed, 498 deletions(-)
 delete mode 100644 sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple.cpp
 delete mode 100644 sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_return.cpp
 delete mode 100644 sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_vadd.cpp
 delete mode 100644 sycl/test-e2e/InvokeSimd/Spec/tuple.cpp
 delete mode 100644 sycl/test-e2e/InvokeSimd/Spec/tuple_return.cpp
 delete mode 100644 sycl/test-e2e/InvokeSimd/Spec/tuple_vadd.cpp

diff --git a/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple.cpp b/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple.cpp
deleted file mode 100644
index daada8ecf343d..0000000000000
--- a/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-// TODO: enable when Jira ticket resolved
-// XFAIL: *
-// XFAIL-TRACKER: https://jira.devtools.intel.com/browse/GSD-4509
-//
-// Check that full compilation works:
-// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../tuple.cpp -o %t.out
-// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out
-//
-// VISALTO enable run
-// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out
-
-/*
- * This tests is the same as InvokeSimd/spec/tuple.cpp, but compiles without
- * optional subgroup attribute specified and intended to check that compiler is
- * able to choose subgroup size correctly.
- */
diff --git a/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_return.cpp b/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_return.cpp
deleted file mode 100644
index 270c109150a2c..0000000000000
--- a/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_return.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-// TODO: enable when Jira ticket resolved
-// XFAIL: *
-// XFAIL-TRACKER: https://jira.devtools.intel.com/browse/GSD-4509
-//
-// Check that full compilation works:
-// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../tuple_return.cpp -o %t.out
-// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out
-//
-// VISALTO enable run
-// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out
-
-/*
- * This tests is the same as InvokeSimd/spec/tuple_return.cpp, but compiles
- * without optional subgroup attribute specified and intended to check that
- * compiler is able to choose subgroup size correctly.
- */
diff --git a/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_vadd.cpp b/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_vadd.cpp
deleted file mode 100644
index 89828d6a76000..0000000000000
--- a/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_vadd.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-// TODO: enable when Jira ticket resolved
-// XFAIL: *
-// XFAIL-TRACKER: https://jira.devtools.intel.com/browse/GSD-4509
-//
-// Check that full compilation works:
-// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../tuple_vadd.cpp -o %t.out
-// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out
-//
-// VISALTO enable run
-// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out
-
-/*
- * This tests is the same as InvokeSimd/spec/tuple_vadd.cpp, but compiles
- * without optional subgroup attribute specified and intended to check that
- * compiler is able to choose subgroup size correctly.
- */
diff --git a/sycl/test-e2e/InvokeSimd/Spec/tuple.cpp b/sycl/test-e2e/InvokeSimd/Spec/tuple.cpp
deleted file mode 100644
index cf4f1e4d5ec75..0000000000000
--- a/sycl/test-e2e/InvokeSimd/Spec/tuple.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-// TODO: enable when Jira ticket resolved
-// XFAIL: *
-// XFAIL-TRACKER: https://jira.devtools.intel.com/browse/GSD-4509
-//
-// Check that full compilation works:
-// RUN: %{build} -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t.out
-// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out
-//
-// VISALTO enable run
-// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out
-
-// Tests invoke_simd support in the compiler/headers
-/* Test case description:
- * ----------------------
- * This is a minimal test case to test invoke_simd support for tuples,
- * as defined in the invoke_simd spec.
- *
- * This test case simply creates a scalar tuple<float, int> per work-item
- * which gets implicitly vectorized into a
- * tuple<simd<float, VL>, simd<int, VL>>. Then, inside the ESIMD function,
- * we simply get the first tuple element (simd<float, VL>) and return it.
- *
- * This test also runs with all types of VISA link time optimizations enabled.
- */
-
-#include <sycl/detail/core.hpp>
-#include <sycl/ext/intel/esimd.hpp>
-#include <sycl/ext/oneapi/experimental/invoke_simd.hpp>
-#include <sycl/usm.hpp>
-
-#include <functional>
-#include <iostream>
-#include <type_traits>
-
-#include <tuple>
-
-/* Subgroup size attribute is optional
- * In case it is absent compiler decides what subgroup size to use
- */
-#ifdef IMPL_SUBGROUP
-#define SUBGROUP_ATTR
-#else
-#define SUBGROUP_ATTR [[sycl::reqd_sub_group_size(VL)]]
-#endif
-
-using namespace sycl::ext::oneapi::experimental;
-namespace esimd = sycl::ext::intel::esimd;
-constexpr int VL = 16;
-
-__attribute__((always_inline)) esimd::simd<float, VL>
-ESIMD_CALLEE(std::tuple<esimd::simd<float, VL>, esimd::simd<int, VL>> tup,
-             esimd::simd<float, VL> a) SYCL_ESIMD_FUNCTION {
-  esimd::simd<float, VL> float_vector = std::get<0>(tup);
-  esimd::simd<int, VL> int_vector = std::get<1>(tup);
-  return float_vector;
-}
-
-[[intel::device_indirectly_callable]] SYCL_EXTERNAL
-    simd<float, VL> __regcall SIMD_CALLEE(
-        std::tuple<simd<float, VL>, simd<int, VL>> tup,
-        simd<float, VL> a) SYCL_ESIMD_FUNCTION;
-
-using namespace sycl;
-
-int main(void) {
-  constexpr unsigned Size = 1024;
-  constexpr unsigned GroupSize = 4 * VL;
-
-  auto q = queue{gpu_selector_v};
-  auto dev = q.get_device();
-  std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
-            << "\n";
-  auto ctxt = q.get_context();
-
-  float *A =
-      static_cast<float *>(malloc_shared(Size * sizeof(float), dev, ctxt));
-  float *C =
-      static_cast<float *>(malloc_shared(Size * sizeof(float), dev, ctxt));
-
-  int *D = static_cast<int *>(malloc_shared(Size * sizeof(int), dev, ctxt));
-
-  for (unsigned i = 0; i < Size; ++i) {
-    A[i] = i;
-    C[i] = -1;
-    D[i] = 1;
-  }
-
-  sycl::range<1> GlobalRange{Size};
-  // Number of workitems in each workgroup.
-  sycl::range<1> LocalRange{GroupSize};
-
-  sycl::nd_range<1> Range(GlobalRange, LocalRange);
-
-  try {
-    auto e = q.submit([&](handler &cgh) {
-      cgh.parallel_for<class Test>(Range, [=](nd_item<1> ndi) SUBGROUP_ATTR {
-        sub_group sg = ndi.get_sub_group();
-        group<1> g = ndi.get_group();
-        uint32_t i =
-            sg.get_group_linear_id() * VL + g.get_group_linear_id() * GroupSize;
-        uint32_t wi_id = i + sg.get_local_id();
-
-        std::tuple<float, int> tup(A[wi_id], D[wi_id]);
-        float res = invoke_simd(sg, SIMD_CALLEE, tup, A[wi_id]);
-        C[wi_id] = res;
-      });
-    });
-    e.wait();
-  } catch (sycl::exception const &e) {
-    sycl::free(A, q);
-    sycl::free(C, q);
-    sycl::free(D, q);
-
-    std::cout << "SYCL exception caught: " << e.what() << '\n';
-    return e.code().value();
-  }
-
-  int err_cnt = 0;
-
-  for (unsigned i = 0; i < Size; ++i) {
-    if (A[i] != C[i]) {
-      if (++err_cnt < 10) {
-        std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
-                  << "\n";
-      }
-    }
-  }
-  if (err_cnt > 0) {
-    std::cout << "  pass rate: "
-              << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% ("
-              << (Size - err_cnt) << "/" << Size << ")\n";
-  }
-
-  sycl::free(A, q);
-  sycl::free(C, q);
-  sycl::free(D, q);
-
-  std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
-  return err_cnt > 0 ? 1 : 0;
-}
-
-[[intel::device_indirectly_callable]] SYCL_EXTERNAL
-    simd<float, VL> __regcall SIMD_CALLEE(
-        std::tuple<simd<float, VL>, simd<int, VL>> tup,
-        simd<float, VL> a) SYCL_ESIMD_FUNCTION {
-  esimd::simd<float, VL> res = ESIMD_CALLEE(tup, a);
-  return res;
-}
diff --git a/sycl/test-e2e/InvokeSimd/Spec/tuple_return.cpp b/sycl/test-e2e/InvokeSimd/Spec/tuple_return.cpp
deleted file mode 100644
index 8c2910a6eb5f3..0000000000000
--- a/sycl/test-e2e/InvokeSimd/Spec/tuple_return.cpp
+++ /dev/null
@@ -1,150 +0,0 @@
-// TODO: enable when Jira ticket resolved
-// XFAIL: *
-// XFAIL-TRACKER: https://jira.devtools.intel.com/browse/GSD-4509
-//
-// Check that full compilation works:
-// RUN: %{build} -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t.out
-// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out
-//
-// VISALTO enable run
-// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out
-
-// Tests invoke_simd support in the compiler/headers
-/* Test case purpose:
- * ----------------------
- * To test returning a tuple from invoke_simd.
- *
- * Test case description:
- * ----------------------
- * This test case performs a vector add of A and B by passing A[wi_id] and
- * B[wi_id] to an invoke_simd callee which simply combines these into a
- * tuple and returns it. Then, the indivual values a and b are gotten
- * back out of the tuple, added together, and stored in C[wi_id].
- *
- * This test also runs with all types of VISA link time optimizations enabled.
- */
-
-#include <sycl/detail/core.hpp>
-#include <sycl/ext/intel/esimd.hpp>
-#include <sycl/ext/oneapi/experimental/invoke_simd.hpp>
-#include <sycl/usm.hpp>
-
-#include <functional>
-#include <iostream>
-#include <type_traits>
-
-#include <tuple>
-
-/* Subgroup size attribute is optional
- * In case it is absent compiler decides what subgroup size to use
- */
-#ifdef IMPL_SUBGROUP
-#define SUBGROUP_ATTR
-#else
-#define SUBGROUP_ATTR [[sycl::reqd_sub_group_size(VL)]]
-#endif
-
-using namespace sycl::ext::oneapi::experimental;
-namespace esimd = sycl::ext::intel::esimd;
-constexpr int VL = 16;
-
-__attribute__((always_inline))
-std::tuple<esimd::simd<float, VL>, esimd::simd<float, VL>>
-ESIMD_CALLEE(esimd::simd<float, VL> va,
-             esimd::simd<float, VL> vb) SYCL_ESIMD_FUNCTION {
-  std::tuple<esimd::simd<float, VL>, esimd::simd<float, VL>> tup(va, vb);
-  return tup;
-}
-
-[[intel::device_indirectly_callable]] SYCL_EXTERNAL
-    std::tuple<simd<float, VL>, simd<float, VL>> __regcall SIMD_CALLEE(
-        simd<float, VL> va, simd<float, VL> vb) SYCL_ESIMD_FUNCTION;
-
-using namespace sycl;
-
-int main(void) {
-  constexpr unsigned Size = 1024;
-  constexpr unsigned GroupSize = 4 * VL;
-
-  auto q = queue{gpu_selector_v};
-  auto dev = q.get_device();
-  std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
-            << "\n";
-  auto ctxt = q.get_context();
-
-  float *A =
-      static_cast<float *>(malloc_shared(Size * sizeof(float), dev, ctxt));
-  float *B =
-      static_cast<float *>(malloc_shared(Size * sizeof(float), dev, ctxt));
-  float *C =
-      static_cast<float *>(malloc_shared(Size * sizeof(float), dev, ctxt));
-
-  for (unsigned i = 0; i < Size; ++i) {
-    A[i] = B[i] = i;
-    C[i] = -1;
-  }
-
-  sycl::range<1> GlobalRange{Size};
-  // Number of workitems in each workgroup.
-  sycl::range<1> LocalRange{GroupSize};
-
-  sycl::nd_range<1> Range(GlobalRange, LocalRange);
-
-  try {
-    auto e = q.submit([&](handler &cgh) {
-      cgh.parallel_for<class Test>(Range, [=](nd_item<1> ndi) SUBGROUP_ATTR {
-        sub_group sg = ndi.get_sub_group();
-        group<1> g = ndi.get_group();
-        uint32_t i =
-            sg.get_group_linear_id() * VL + g.get_group_linear_id() * GroupSize;
-        uint32_t wi_id = i + sg.get_local_id();
-
-        std::tuple<float, float> tup =
-            invoke_simd(sg, SIMD_CALLEE, A[wi_id], B[wi_id]);
-        float a = std::get<0>(tup);
-        float b = std::get<1>(tup);
-        float res = a + b;
-        C[wi_id] = res;
-      });
-    });
-    e.wait();
-  } catch (sycl::exception const &e) {
-    sycl::free(A, q);
-    sycl::free(B, q);
-    sycl::free(C, q);
-
-    std::cout << "SYCL exception caught: " << e.what() << '\n';
-    return e.code().value();
-  }
-
-  int err_cnt = 0;
-
-  for (unsigned i = 0; i < Size; ++i) {
-    if (A[i] + B[i] != C[i]) {
-      if (++err_cnt < 10) {
-        std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
-                  << " + " << B[i] << "\n";
-      }
-    }
-  }
-  if (err_cnt > 0) {
-    std::cout << "  pass rate: "
-              << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% ("
-              << (Size - err_cnt) << "/" << Size << ")\n";
-  }
-
-  sycl::free(A, q);
-  sycl::free(B, q);
-  sycl::free(C, q);
-
-  std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
-  return err_cnt > 0 ? 1 : 0;
-}
-
-[[intel::device_indirectly_callable]] SYCL_EXTERNAL
-    std::tuple<simd<float, VL>, simd<float, VL>> __regcall SIMD_CALLEE(
-        simd<float, VL> va, simd<float, VL> vb) SYCL_ESIMD_FUNCTION {
-  std::tuple<esimd::simd<float, VL>, esimd::simd<float, VL>> res =
-      ESIMD_CALLEE(va, vb);
-  return res;
-}
diff --git a/sycl/test-e2e/InvokeSimd/Spec/tuple_vadd.cpp b/sycl/test-e2e/InvokeSimd/Spec/tuple_vadd.cpp
deleted file mode 100644
index 6a6b1fc16f383..0000000000000
--- a/sycl/test-e2e/InvokeSimd/Spec/tuple_vadd.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-// TODO: enable when Jira ticket resolved
-// XFAIL: *
-// XFAIL-TRACKER: https://jira.devtools.intel.com/browse/GSD-4509
-//
-// Check that full compilation works:
-// RUN: %{build} -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t.out
-// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out
-//
-// VISALTO enable run
-// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out
-
-// Tests invoke_simd support in the compiler/headers
-/* Test case purpose:
- * ------------------
- * To test invoke_simd support for tuples, as defined in the invoke_simd spec.
- *
- * Test case description:
- * ----------------------
- * This test case performs a vector addition of 2 vectors of float, a and b.
- * Instead of passing in 2 separate simd<float, VL> to the SIMD and ESIMD
- * functions, we pass in a single tuple<simd<float, VL>, simd<float, VL>> and
- * then get and add these tuple elements,
- *
- * Implementation notes:
- * --------------------
- * I've included an equivalent set of regular (non-tuple) vadd functions to
- * verify that the overall logic of the program is correct. Switch the
- * invoke_simd() call to see that the regular vadd works correctly.
- *
- * This test also runs with all types of VISA link time optimizations enabled.
- */
-
-#include <sycl/detail/core.hpp>
-#include <sycl/ext/intel/esimd.hpp>
-#include <sycl/ext/oneapi/experimental/invoke_simd.hpp>
-#include <sycl/usm.hpp>
-
-#include <functional>
-#include <iostream>
-#include <type_traits>
-
-#include <tuple>
-
-/* Subgroup size attribute is optional
- * In case it is absent compiler decides what subgroup size to use
- */
-#ifdef IMPL_SUBGROUP
-#define SUBGROUP_ATTR
-#else
-#define SUBGROUP_ATTR [[sycl::reqd_sub_group_size(VL)]]
-#endif
-
-using namespace sycl::ext::oneapi::experimental;
-namespace esimd = sycl::ext::intel::esimd;
-constexpr int VL = 16;
-
-__attribute__((always_inline)) esimd::simd<float, VL> ESIMD_CALLEE_TUPLE_VADD(
-    std::tuple<esimd::simd<float, VL>, esimd::simd<float, VL>> tup)
-    SYCL_ESIMD_FUNCTION {
-  esimd::simd<float, VL> va = std::get<0>(tup);
-  esimd::simd<float, VL> vb = std::get<1>(tup);
-  return va + vb;
-}
-
-[[intel::device_indirectly_callable]] SYCL_EXTERNAL
-    simd<float, VL> __regcall SIMD_CALLEE_TUPLE_VADD(
-        std::tuple<simd<float, VL>, simd<float, VL>> tup) SYCL_ESIMD_FUNCTION;
-
-using namespace sycl;
-
-int main(void) {
-  constexpr unsigned Size = 1024;
-  constexpr unsigned GroupSize = 4 * VL;
-
-  auto q = queue{gpu_selector_v};
-  auto dev = q.get_device();
-  std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
-            << "\n";
-  auto ctxt = q.get_context();
-
-  float *A =
-      static_cast<float *>(malloc_shared(Size * sizeof(float), dev, ctxt));
-  float *B =
-      static_cast<float *>(malloc_shared(Size * sizeof(float), dev, ctxt));
-  float *C =
-      static_cast<float *>(malloc_shared(Size * sizeof(float), dev, ctxt));
-
-  for (unsigned i = 0; i < Size; ++i) {
-    A[i] = B[i] = i;
-    C[i] = -1;
-  }
-
-  sycl::range<1> GlobalRange{Size};
-  // Number of workitems in each workgroup.
-  sycl::range<1> LocalRange{GroupSize};
-
-  sycl::nd_range<1> Range(GlobalRange, LocalRange);
-
-  try {
-    auto e = q.submit([&](handler &cgh) {
-      cgh.parallel_for<class Test>(Range, [=](nd_item<1> ndi) SUBGROUP_ATTR {
-        sub_group sg = ndi.get_sub_group();
-        group<1> g = ndi.get_group();
-        uint32_t i =
-            sg.get_group_linear_id() * VL + g.get_group_linear_id() * GroupSize;
-        uint32_t wi_id = i + sg.get_local_id();
-
-        std::tuple<float, float> tup(A[wi_id], B[wi_id]);
-        float res = invoke_simd(sg, SIMD_CALLEE_TUPLE_VADD, tup);
-        C[wi_id] = res;
-      });
-    });
-    e.wait();
-  } catch (sycl::exception const &e) {
-    sycl::free(A, q);
-    sycl::free(B, q);
-    sycl::free(C, q);
-
-    std::cout << "SYCL exception caught: " << e.what() << '\n';
-    return e.code().value();
-  }
-
-  int err_cnt = 0;
-
-  for (unsigned i = 0; i < Size; ++i) {
-    if (A[i] + B[i] != C[i]) {
-      if (++err_cnt < 10) {
-        std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
-                  << " + " << B[i] << "\n";
-      }
-    }
-  }
-  if (err_cnt > 0) {
-    std::cout << "  pass rate: "
-              << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% ("
-              << (Size - err_cnt) << "/" << Size << ")\n";
-  }
-
-  sycl::free(A, q);
-  sycl::free(B, q);
-  sycl::free(C, q);
-
-  std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
-  return err_cnt > 0 ? 1 : 0;
-}
-
-[[intel::device_indirectly_callable]] SYCL_EXTERNAL
-    simd<float, VL> __regcall SIMD_CALLEE_TUPLE_VADD(
-        std::tuple<simd<float, VL>, simd<float, VL>> tup) SYCL_ESIMD_FUNCTION {
-  esimd::simd<float, VL> res = ESIMD_CALLEE_TUPLE_VADD(tup);
-  return res;
-}

From 7bd59220529aec218877f154bf5b88ba6011553f Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Fri, 6 Dec 2024 07:44:11 -0800
Subject: [PATCH 08/22] Remove XFAILs for hip nvidia platform

---
 sycl/test-e2e/Basic/queue/queue.cpp                        | 3 ---
 sycl/test-e2e/Basic/span.cpp                               | 3 ---
 sycl/test-e2e/Basic/stream/auto_flush.cpp                  | 3 ---
 sycl/test-e2e/DeprecatedFeatures/queue_old_interop.cpp     | 5 -----
 sycl/test-e2e/DeviceCodeSplit/split-per-kernel.cpp         | 3 ---
 sycl/test-e2e/DeviceCodeSplit/split-per-source-main.cpp    | 3 ---
 sycl/test-e2e/GroupLocalMemory/group_local_memory.cpp      | 3 ---
 sycl/test-e2e/GroupLocalMemory/no_early_opt.cpp            | 3 ---
 sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp  | 3 ---
 sycl/test-e2e/Reduction/reduction_nd_conditional.cpp       | 5 -----
 sycl/test-e2e/Reduction/reduction_nd_dw.cpp                | 4 ----
 sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp        | 6 ------
 sycl/test-e2e/Reduction/reduction_nd_ext_half.cpp          | 6 ------
 sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp    | 3 ---
 sycl/test-e2e/Reduction/reduction_nd_rw.cpp                | 4 ----
 sycl/test-e2e/Reduction/reduction_range_queue_shortcut.cpp | 4 ----
 sycl/test-e2e/Reduction/reduction_range_usm_dw.cpp         | 5 -----
 sycl/test-e2e/Reduction/reduction_span_pack.cpp            | 4 ----
 sycl/test-e2e/Reduction/reduction_usm.cpp                  | 4 ----
 sycl/test-e2e/Reduction/reduction_usm_dw.cpp               | 4 ----
 sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp               | 2 --
 sycl/test-e2e/Scheduler/MemObjRemapping.cpp                | 3 +--
 sycl/test-e2e/Scheduler/MultipleDevices.cpp                | 2 --
 sycl/test-e2e/Scheduler/ReleaseResourcesTest.cpp           | 2 --
 sycl/test-e2e/Tracing/buffer_printers.cpp                  | 2 --
 25 files changed, 1 insertion(+), 88 deletions(-)

diff --git a/sycl/test-e2e/Basic/queue/queue.cpp b/sycl/test-e2e/Basic/queue/queue.cpp
index 94b628197a3d3..20697acfbee89 100644
--- a/sycl/test-e2e/Basic/queue/queue.cpp
+++ b/sycl/test-e2e/Basic/queue/queue.cpp
@@ -1,8 +1,5 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-//
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16197
 
 //==--------------- queue.cpp - SYCL queue test ----------------------------==//
 //
diff --git a/sycl/test-e2e/Basic/span.cpp b/sycl/test-e2e/Basic/span.cpp
index 735b705f18fc1..2a112ba4740d5 100644
--- a/sycl/test-e2e/Basic/span.cpp
+++ b/sycl/test-e2e/Basic/span.cpp
@@ -1,9 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 //
-// Fails to release USM pointer on HIP for NVIDIA
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14432
 // REQUIRES: aspect-usm_shared_allocations
 #include <numeric>
 
diff --git a/sycl/test-e2e/Basic/stream/auto_flush.cpp b/sycl/test-e2e/Basic/stream/auto_flush.cpp
index ddc5b8e19e0f6..f346eca5b72cf 100644
--- a/sycl/test-e2e/Basic/stream/auto_flush.cpp
+++ b/sycl/test-e2e/Basic/stream/auto_flush.cpp
@@ -1,8 +1,5 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out %if !gpu || linux %{ | FileCheck %s %}
-//
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16198
 
 //==-------------- copy.cpp - SYCL stream obect auto flushing test ---------==//
 //
diff --git a/sycl/test-e2e/DeprecatedFeatures/queue_old_interop.cpp b/sycl/test-e2e/DeprecatedFeatures/queue_old_interop.cpp
index ee93969e8a547..ab9059ce98976 100644
--- a/sycl/test-e2e/DeprecatedFeatures/queue_old_interop.cpp
+++ b/sycl/test-e2e/DeprecatedFeatures/queue_old_interop.cpp
@@ -1,10 +1,5 @@
 // RUN: %{build} -D__SYCL_INTERNAL_API -o %t.out
 // RUN: %{run-unfiltered-devices} %t.out
-//
-// hip_nvidia has problems constructing queues due to `No device of requested
-// type available`.
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16199
 
 //==-------- queue_old_interop.cpp - SYCL queue OpenCL interop test --------==//
 //
diff --git a/sycl/test-e2e/DeviceCodeSplit/split-per-kernel.cpp b/sycl/test-e2e/DeviceCodeSplit/split-per-kernel.cpp
index b1999b7bbf4e2..15677a64ea5e5 100644
--- a/sycl/test-e2e/DeviceCodeSplit/split-per-kernel.cpp
+++ b/sycl/test-e2e/DeviceCodeSplit/split-per-kernel.cpp
@@ -1,9 +1,6 @@
 // RUN: %{build} -Wno-error=unused-command-line-argument -fsycl-device-code-split=per_kernel -o %t.out \
 // RUN: -fsycl-dead-args-optimization
 // RUN: %{run} %t.out
-//
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16201
 
 #include <sycl/detail/core.hpp>
 #include <sycl/kernel_bundle.hpp>
diff --git a/sycl/test-e2e/DeviceCodeSplit/split-per-source-main.cpp b/sycl/test-e2e/DeviceCodeSplit/split-per-source-main.cpp
index 50f8db4a060b9..29c6102f71284 100644
--- a/sycl/test-e2e/DeviceCodeSplit/split-per-source-main.cpp
+++ b/sycl/test-e2e/DeviceCodeSplit/split-per-source-main.cpp
@@ -1,9 +1,6 @@
 // RUN: %{build} -Wno-error=unused-command-line-argument -fsycl-device-code-split=per_source -I %S/Inputs -o %t.out %S/Inputs/split-per-source-second-file.cpp \
 // RUN: -fsycl-dead-args-optimization
 // RUN: %{run} %t.out
-//
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16201
 
 #include "Inputs/split-per-source.h"
 
diff --git a/sycl/test-e2e/GroupLocalMemory/group_local_memory.cpp b/sycl/test-e2e/GroupLocalMemory/group_local_memory.cpp
index 5fcd1de9b2ca5..da0f3881ffc59 100644
--- a/sycl/test-e2e/GroupLocalMemory/group_local_memory.cpp
+++ b/sycl/test-e2e/GroupLocalMemory/group_local_memory.cpp
@@ -1,8 +1,5 @@
 // RUN: %{build} -Wno-error=deprecated-declarations -o %t.out
 // RUN: %{run} %t.out
-//
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16204
 
 #include <sycl/detail/core.hpp>
 
diff --git a/sycl/test-e2e/GroupLocalMemory/no_early_opt.cpp b/sycl/test-e2e/GroupLocalMemory/no_early_opt.cpp
index 2f9822138a2fe..b0b3a0e8a8309 100644
--- a/sycl/test-e2e/GroupLocalMemory/no_early_opt.cpp
+++ b/sycl/test-e2e/GroupLocalMemory/no_early_opt.cpp
@@ -1,8 +1,5 @@
 // RUN: %{build} -Wno-error=deprecated-declarations -fno-sycl-early-optimizations -o %t.out
 // RUN: %{run} %t.out
-//
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16204
 
 // The test checks that multiple calls to the same template instantiation of a
 // group local memory function result in separate allocations, even with device
diff --git a/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp b/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp
index b22b380465a9b..f9ff2f64140c7 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp
@@ -1,9 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-// Group algorithms are not supported on NVidia.
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 
 // This test only checks that the method queue::parallel_for() accepting
 // reduction, can be properly translated into queue::submit + parallel_for().
diff --git a/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp b/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp
index c5fcbac3ff29b..6e71cc363ad98 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp
@@ -1,11 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 //
-// Error message `The implementation handling
-// parallel_for with reduction requires work group size not bigger than 1` on
-// Nvidia.
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
 
diff --git a/sycl/test-e2e/Reduction/reduction_nd_dw.cpp b/sycl/test-e2e/Reduction/reduction_nd_dw.cpp
index 093c97d050c41..dd45a974b0950 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_dw.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_dw.cpp
@@ -1,9 +1,5 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-//
-// Group algorithms are not supported on Nvidia.
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp b/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp
index 18c9e7bc88db4..7e7b9fac72ecf 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp
@@ -1,12 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-//
-// Error
-// message `The implementation handling parallel_for with reduction requires
-// work group size not bigger than 1` on Nvidia.
 
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 // This test performs basic checks of parallel_for(nd_range, reduction, func)
 // used with 'double' type.
 
diff --git a/sycl/test-e2e/Reduction/reduction_nd_ext_half.cpp b/sycl/test-e2e/Reduction/reduction_nd_ext_half.cpp
index cc82325754984..8277360d39059 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_ext_half.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_ext_half.cpp
@@ -1,12 +1,6 @@
 // REQUIRES: aspect-fp16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-//
-// Error message on Nvidia:
-// `The implementation handling parallel_for with reduction requires
-// work group size not bigger than 1`.
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp b/sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp
index bbefd1dbe51fb..6a2c062ba9ffa 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp
@@ -1,9 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-// Group algorithms are not supported on NVidia.
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Reduction/reduction_nd_rw.cpp b/sycl/test-e2e/Reduction/reduction_nd_rw.cpp
index 2b26b135582d9..d329033446191 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_rw.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_rw.cpp
@@ -1,9 +1,5 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-//
-// `Group algorithms are not supported on host device.` on Nvidia.
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Reduction/reduction_range_queue_shortcut.cpp b/sycl/test-e2e/Reduction/reduction_range_queue_shortcut.cpp
index 81d6026a2f9e0..1d94e6faaacef 100644
--- a/sycl/test-e2e/Reduction/reduction_range_queue_shortcut.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_queue_shortcut.cpp
@@ -1,10 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-// Group algorithms are not supported on NVidia.
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
-
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
 
diff --git a/sycl/test-e2e/Reduction/reduction_range_usm_dw.cpp b/sycl/test-e2e/Reduction/reduction_range_usm_dw.cpp
index 4a6ee054f4539..9c9f366f6a588 100644
--- a/sycl/test-e2e/Reduction/reduction_range_usm_dw.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_usm_dw.cpp
@@ -1,10 +1,5 @@
 // RUN: %{build} -o %t.out %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %}
 // RUN: %{run} %t.out
-//
-// Error message `Group algorithms are not
-// supported on host device.` on Nvidia.
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Reduction/reduction_span_pack.cpp b/sycl/test-e2e/Reduction/reduction_span_pack.cpp
index ec55e26f781a8..023e78fe5e85d 100644
--- a/sycl/test-e2e/Reduction/reduction_span_pack.cpp
+++ b/sycl/test-e2e/Reduction/reduction_span_pack.cpp
@@ -1,9 +1,5 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-//
-// `Group algorithms are not supported on host device.` on Nvidia.
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Reduction/reduction_usm.cpp b/sycl/test-e2e/Reduction/reduction_usm.cpp
index 353de97907cf0..24dd84f66236f 100644
--- a/sycl/test-e2e/Reduction/reduction_usm.cpp
+++ b/sycl/test-e2e/Reduction/reduction_usm.cpp
@@ -1,9 +1,5 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-//
-// `Group algorithms are not supported on host device.` on Nvidia.
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Reduction/reduction_usm_dw.cpp b/sycl/test-e2e/Reduction/reduction_usm_dw.cpp
index 56e07083587cd..6d00451a0701e 100644
--- a/sycl/test-e2e/Reduction/reduction_usm_dw.cpp
+++ b/sycl/test-e2e/Reduction/reduction_usm_dw.cpp
@@ -1,10 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-// `Group algorithms are not supported on host device` on Nvidia.
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14973
-
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
 
diff --git a/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp b/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp
index e1309ee0edb1a..0c0e1750805f2 100644
--- a/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp
+++ b/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp
@@ -1,7 +1,5 @@
 // RUN: %{build} -o %t.out
 // RUN: env SYCL_UR_TRACE=2 %{run} %t.out 2>&1 | FileCheck %s
-//
-// XFAIL: hip_nvidia
 
 // The tested functionality is disabled with Level Zero until it is supported by
 // the adapter.
diff --git a/sycl/test-e2e/Scheduler/MemObjRemapping.cpp b/sycl/test-e2e/Scheduler/MemObjRemapping.cpp
index f4de8a56217cd..55181dfbcc565 100644
--- a/sycl/test-e2e/Scheduler/MemObjRemapping.cpp
+++ b/sycl/test-e2e/Scheduler/MemObjRemapping.cpp
@@ -1,7 +1,6 @@
 // RUN: %{build} -Wno-error=deprecated-declarations -o %t.out
 // RUN: env SYCL_HOST_UNIFIED_MEMORY=1 SYCL_UR_TRACE=2 %{run} %t.out 2>&1 | FileCheck %s
-//
-// XFAIL: hip_nvidia
+
 #include <cassert>
 #include <cstddef>
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/Scheduler/MultipleDevices.cpp b/sycl/test-e2e/Scheduler/MultipleDevices.cpp
index 3641e5d58b5ad..3976512e2d6e7 100644
--- a/sycl/test-e2e/Scheduler/MultipleDevices.cpp
+++ b/sycl/test-e2e/Scheduler/MultipleDevices.cpp
@@ -1,7 +1,5 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run-unfiltered-devices} %t.out
-//
-// XFAIL: hip_nvidia
 
 //===- MultipleDevices.cpp - Test checking multi-device execution --------===//
 //
diff --git a/sycl/test-e2e/Scheduler/ReleaseResourcesTest.cpp b/sycl/test-e2e/Scheduler/ReleaseResourcesTest.cpp
index 880ec728f8951..fb634c832114d 100644
--- a/sycl/test-e2e/Scheduler/ReleaseResourcesTest.cpp
+++ b/sycl/test-e2e/Scheduler/ReleaseResourcesTest.cpp
@@ -1,7 +1,5 @@
 // RUN: %{build} -Wno-error=unused-command-line-argument -fsycl-dead-args-optimization -o %t.out
 // RUN: env SYCL_UR_TRACE=2 %{run} %t.out 2>&1 | FileCheck %s %if !windows %{--check-prefix=CHECK-RELEASE%}
-//
-// XFAIL: hip_nvidia
 
 //==------------------- ReleaseResourcesTests.cpp --------------------------==//
 //
diff --git a/sycl/test-e2e/Tracing/buffer_printers.cpp b/sycl/test-e2e/Tracing/buffer_printers.cpp
index 4e29cbb02ff6c..d712b0009c1ae 100644
--- a/sycl/test-e2e/Tracing/buffer_printers.cpp
+++ b/sycl/test-e2e/Tracing/buffer_printers.cpp
@@ -1,7 +1,5 @@
 // RUN: %{build} -o %t.out
 // RUN: env SYCL_UR_TRACE=2 %{run} %t.out | FileCheck %s
-//
-// XFAIL: hip_nvidia
 
 #include <sycl/detail/core.hpp>
 #include <vector>

From b5471f8b3c03168975efdadaab1ce5bc6eb165db Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Fri, 6 Dec 2024 07:55:28 -0800
Subject: [PATCH 09/22] Fix formatting

---
 sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp    | 1 -
 sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp      | 1 -
 sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp | 2 +-
 3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp b/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp
index f9ff2f64140c7..eb27a5b76c9d9 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp
@@ -1,7 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-
 // This test only checks that the method queue::parallel_for() accepting
 // reduction, can be properly translated into queue::submit + parallel_for().
 
diff --git a/sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp b/sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp
index 6a2c062ba9ffa..474c67aaf9b47 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp
@@ -1,7 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
 
diff --git a/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp b/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp
index c42505b01f3b5..a4e0a388fe9c0 100644
--- a/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp
+++ b/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp
@@ -51,7 +51,7 @@
 // tests to match the required format and in that case you should just update
 // (i.e. reduce) the number and the list below.
 //
-// NUMBER-OF-XFAIL-WITHOUT-TRACKER: 46
+// NUMBER-OF-XFAIL-WITHOUT-TRACKER: 41
 //
 // List of improperly XFAIL-ed tests.
 // Remove the CHECK once the test has been properly XFAIL-ed.

From 6df2d88c46e913b749fcb911e57702cc7e5b41dc Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Fri, 6 Dec 2024 08:19:04 -0800
Subject: [PATCH 10/22] Fix typos and remove the tests from the list of
 improperly XFAIL-ed tests

---
 "\\"                                          | 113 ++++++++++++++++++
 sycl/test-e2e/Basic/queue/release.cpp         |   3 -
 .../no-xfail-without-tracker.cpp              |  36 ------
 3 files changed, 113 insertions(+), 39 deletions(-)
 create mode 100644 "\\"

diff --git "a/\\" "b/\\"
new file mode 100644
index 0000000000000..2a112ba4740d5
--- /dev/null
+++ "b/\\"
@@ -0,0 +1,113 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+//
+// REQUIRES: aspect-usm_shared_allocations
+#include <numeric>
+
+#include <sycl/detail/core.hpp>
+
+#include <sycl/sycl_span.hpp>
+#include <sycl/usm.hpp>
+#include <sycl/usm/usm_allocator.hpp>
+
+using namespace sycl;
+
+void testSpanCapture() {
+  // This test creates spans that are backed by USM.
+  // ensures they can be captured by device lambda
+  // and that read and write operations function correctly
+  // across capture.
+  queue Q;
+
+  constexpr long numReadTests = 2;
+  const range<1> NumberOfReadTestsRange(numReadTests);
+  buffer<int, 1> SpanRead(NumberOfReadTestsRange);
+
+  // span from a vector
+  // We will create a vector, backed by a USM allocator. And a span from that.
+  using vec_alloc = usm_allocator<int, usm::alloc::shared>;
+  // Create allocator for device associated with q
+  vec_alloc myAlloc(Q);
+  // Create std vector with the allocator
+  std::vector<int, vec_alloc> vecUSM(4, myAlloc);
+  std::iota(vecUSM.begin(), vecUSM.end(), 1);
+  sycl::span<int> vecUSM_span{vecUSM};
+  vecUSM_span[0] += 100; // 101  modify first value using span affordance.
+
+  // span from USM memory
+  auto *usm_data = malloc_shared<int>(4, Q);
+  sycl::span<int> usm_span(usm_data, 4);
+  std::iota(usm_span.begin(), usm_span.end(), 1);
+  usm_span[0] += 100; // 101 modify first value using span affordance.
+
+  event E = Q.submit([&](handler &cgh) {
+    auto can_read_from_span_acc = SpanRead.get_access<access::mode::write>(cgh);
+    cgh.single_task<class hi>([=] {
+      // read from the spans.
+      can_read_from_span_acc[0] = vecUSM_span[0];
+      can_read_from_span_acc[1] = usm_span[0];
+
+      // write to the spans
+      vecUSM_span[1] += 1000;
+      usm_span[1] += 1000;
+    });
+  });
+  E.wait();
+
+  // check out the read operations, should have gotten 101 from each
+  host_accessor can_read_from_span_acc(SpanRead, read_only);
+  for (int i = 0; i < numReadTests; i++) {
+    assert(can_read_from_span_acc[i] == 101 &&
+           "read check should have gotten 100");
+  }
+
+  // were the spans successfully modified via write?
+  assert(vecUSM_span[1] == 1002 &&
+         "vecUSM_span write check should have gotten 1001");
+  assert(usm_span[1] == 1002 && "usm_span write check should have gotten 1001");
+
+  free(usm_data, Q);
+}
+
+void set_all_span_values(sycl::span<int> container, int v) {
+  for (auto &e : container)
+    e = v;
+}
+
+void testSpanOnDevice() {
+  // this test creates a simple span on device,
+  // passes it to a function that operates on it
+  // and ensures it worked correctly
+  queue Q;
+  constexpr long numReadTests = 4;
+  const range<1> NumberOfReadTestsRange(numReadTests);
+  buffer<int, 1> SpanRead(NumberOfReadTestsRange);
+
+  event E = Q.submit([&](handler &cgh) {
+    auto can_read_from_span_acc = SpanRead.get_access<access::mode::write>(cgh);
+    cgh.single_task<class ha>([=] {
+      // create a span on device, pass it to function that modifies it
+      // read values back out.
+      int a[]{1, 2, 3, 4};
+      sycl::span<int> a_span{a};
+      set_all_span_values(a_span, 10);
+      for (int i = 0; i < numReadTests; i++)
+        can_read_from_span_acc[i] = a_span[i];
+    });
+  });
+  E.wait();
+
+  // check out the read operations, should have gotten 10 from each
+  host_accessor can_read_from_span_acc(SpanRead, read_only);
+  for (int i = 0; i < numReadTests; i++) {
+    assert(can_read_from_span_acc[i] == 10 &&
+           "read check should have gotten 10");
+  }
+}
+
+int main() {
+  testSpanCapture();
+  testSpanOnDevice();
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Basic/queue/release.cpp b/sycl/test-e2e/Basic/queue/release.cpp
index b041871fb31a3..13ee5d6ee22bf 100644
--- a/sycl/test-e2e/Basic/queue/release.cpp
+++ b/sycl/test-e2e/Basic/queue/release.cpp
@@ -1,8 +1,5 @@
 // RUN: %{build} -o %t.out
 // RUN: env SYCL_UR_TRACE=2 %{run} %t.out | FileCheck %s %if !windows %{--check-prefixes=CHECK-RELEASE%}
-//
-// XFAIL: hip_nvidia
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16197
 
 #include <sycl/detail/core.hpp>
 int main() {
diff --git a/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp b/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp
index a4e0a388fe9c0..66257a935610e 100644
--- a/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp
+++ b/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp
@@ -58,33 +58,14 @@
 //
 // CHECK: AddressSanitizer/nullpointer/private_nullptr.cpp
 // CHECK-NEXT: Basic/aspects.cpp
-// CHECK-NEXT: Basic/buffer/reinterpret.cpp
 // CHECK-NEXT: Basic/device_event.cpp
 // CHECK-NEXT: Basic/diagnostics/handler.cpp
 // CHECK-NEXT: Basic/max_linear_work_group_size_props.cpp
 // CHECK-NEXT: Basic/max_work_group_size_props.cpp
 // CHECK-NEXT: Basic/partition_supported.cpp
-// CHECK-NEXT: Basic/queue/queue.cpp
-// CHECK-NEXT: Basic/queue/release.cpp
-// CHECK-NEXT: Basic/span.cpp
-// CHECK-NEXT: Basic/stream/auto_flush.cpp
-// CHECK-NEXT: DeprecatedFeatures/queue_old_interop.cpp
-// CHECK-NEXT: DeviceCodeSplit/split-per-kernel.cpp
-// CHECK-NEXT: DeviceCodeSplit/split-per-source-main.cpp
 // CHECK-NEXT: DeviceLib/assert-windows.cpp
 // CHECK-NEXT: ESIMD/hardware_dispatch.cpp
-// CHECK-NEXT: GroupAlgorithm/root_group.cpp
-// CHECK-NEXT: GroupLocalMemory/group_local_memory.cpp
-// CHECK-NEXT: GroupLocalMemory/no_early_opt.cpp
 // CHECK-NEXT: InlineAsm/asm_multiple_instructions.cpp
-// CHECK-NEXT: InvokeSimd/Feature/ImplicitSubgroup/invoke_simd_struct.cpp
-// CHECK-NEXT: InvokeSimd/Feature/invoke_simd_struct.cpp
-// CHECK-NEXT: InvokeSimd/Spec/ImplicitSubgroup/tuple.cpp
-// CHECK-NEXT: InvokeSimd/Spec/ImplicitSubgroup/tuple_return.cpp
-// CHECK-NEXT: InvokeSimd/Spec/ImplicitSubgroup/tuple_vadd.cpp
-// CHECK-NEXT: InvokeSimd/Spec/tuple.cpp
-// CHECK-NEXT: InvokeSimd/Spec/tuple_return.cpp
-// CHECK-NEXT: InvokeSimd/Spec/tuple_vadd.cpp
 // CHECK-NEXT: KernelAndProgram/kernel-bundle-merge-options.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_annotated_ptr.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
@@ -108,28 +89,11 @@
 // CHECK-NEXT: Printf/mixed-address-space.cpp
 // CHECK-NEXT: Printf/percent-symbol.cpp
 // CHECK-NEXT: Reduction/reduction_big_data.cpp
-// CHECK-NEXT: Reduction/reduction_nd_N_queue_shortcut.cpp
-// CHECK-NEXT: Reduction/reduction_nd_conditional.cpp
-// CHECK-NEXT: Reduction/reduction_nd_dw.cpp
-// CHECK-NEXT: Reduction/reduction_nd_ext_double.cpp
-// CHECK-NEXT: Reduction/reduction_nd_ext_half.cpp
-// CHECK-NEXT: Reduction/reduction_nd_queue_shortcut.cpp
 // CHECK-NEXT: Reduction/reduction_nd_reducer_skip.cpp
-// CHECK-NEXT: Reduction/reduction_nd_rw.cpp
-// CHECK-NEXT: Reduction/reduction_range_queue_shortcut.cpp
-// CHECK-NEXT: Reduction/reduction_range_usm_dw.cpp
 // CHECK-NEXT: Reduction/reduction_reducer_op_eq.cpp
-// CHECK-NEXT: Reduction/reduction_span_pack.cpp
-// CHECK-NEXT: Reduction/reduction_usm.cpp
-// CHECK-NEXT: Reduction/reduction_usm_dw.cpp
 // CHECK-NEXT: Regression/build_log.cpp
 // CHECK-NEXT: Regression/complex_global_object.cpp
 // CHECK-NEXT: Regression/context_is_destroyed_after_exception.cpp
 // CHECK-NEXT: Regression/kernel_bundle_ignore_sycl_external.cpp
 // CHECK-NEXT: Regression/multiple-targets.cpp
 // CHECK-NEXT: Regression/reduction_resource_leak_dw.cpp
-// CHECK-NEXT: Scheduler/InOrderQueueDeps.cpp
-// CHECK-NEXT: Scheduler/MemObjRemapping.cpp
-// CHECK-NEXT: Scheduler/MultipleDevices.cpp
-// CHECK-NEXT: Scheduler/ReleaseResourcesTest.cpp
-// CHECK-NEXT: Tracing/buffer_printers.cpp

From 357539d89045e77baa6e49aa8bb973dc33ffd22f Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Fri, 6 Dec 2024 11:19:55 -0500
Subject: [PATCH 11/22] Delete rogue file

---
 "\\" | 113 -----------------------------------------------------------
 1 file changed, 113 deletions(-)
 delete mode 100644 "\\"

diff --git "a/\\" "b/\\"
deleted file mode 100644
index 2a112ba4740d5..0000000000000
--- "a/\\"
+++ /dev/null
@@ -1,113 +0,0 @@
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-//
-// REQUIRES: aspect-usm_shared_allocations
-#include <numeric>
-
-#include <sycl/detail/core.hpp>
-
-#include <sycl/sycl_span.hpp>
-#include <sycl/usm.hpp>
-#include <sycl/usm/usm_allocator.hpp>
-
-using namespace sycl;
-
-void testSpanCapture() {
-  // This test creates spans that are backed by USM.
-  // ensures they can be captured by device lambda
-  // and that read and write operations function correctly
-  // across capture.
-  queue Q;
-
-  constexpr long numReadTests = 2;
-  const range<1> NumberOfReadTestsRange(numReadTests);
-  buffer<int, 1> SpanRead(NumberOfReadTestsRange);
-
-  // span from a vector
-  // We will create a vector, backed by a USM allocator. And a span from that.
-  using vec_alloc = usm_allocator<int, usm::alloc::shared>;
-  // Create allocator for device associated with q
-  vec_alloc myAlloc(Q);
-  // Create std vector with the allocator
-  std::vector<int, vec_alloc> vecUSM(4, myAlloc);
-  std::iota(vecUSM.begin(), vecUSM.end(), 1);
-  sycl::span<int> vecUSM_span{vecUSM};
-  vecUSM_span[0] += 100; // 101  modify first value using span affordance.
-
-  // span from USM memory
-  auto *usm_data = malloc_shared<int>(4, Q);
-  sycl::span<int> usm_span(usm_data, 4);
-  std::iota(usm_span.begin(), usm_span.end(), 1);
-  usm_span[0] += 100; // 101 modify first value using span affordance.
-
-  event E = Q.submit([&](handler &cgh) {
-    auto can_read_from_span_acc = SpanRead.get_access<access::mode::write>(cgh);
-    cgh.single_task<class hi>([=] {
-      // read from the spans.
-      can_read_from_span_acc[0] = vecUSM_span[0];
-      can_read_from_span_acc[1] = usm_span[0];
-
-      // write to the spans
-      vecUSM_span[1] += 1000;
-      usm_span[1] += 1000;
-    });
-  });
-  E.wait();
-
-  // check out the read operations, should have gotten 101 from each
-  host_accessor can_read_from_span_acc(SpanRead, read_only);
-  for (int i = 0; i < numReadTests; i++) {
-    assert(can_read_from_span_acc[i] == 101 &&
-           "read check should have gotten 100");
-  }
-
-  // were the spans successfully modified via write?
-  assert(vecUSM_span[1] == 1002 &&
-         "vecUSM_span write check should have gotten 1001");
-  assert(usm_span[1] == 1002 && "usm_span write check should have gotten 1001");
-
-  free(usm_data, Q);
-}
-
-void set_all_span_values(sycl::span<int> container, int v) {
-  for (auto &e : container)
-    e = v;
-}
-
-void testSpanOnDevice() {
-  // this test creates a simple span on device,
-  // passes it to a function that operates on it
-  // and ensures it worked correctly
-  queue Q;
-  constexpr long numReadTests = 4;
-  const range<1> NumberOfReadTestsRange(numReadTests);
-  buffer<int, 1> SpanRead(NumberOfReadTestsRange);
-
-  event E = Q.submit([&](handler &cgh) {
-    auto can_read_from_span_acc = SpanRead.get_access<access::mode::write>(cgh);
-    cgh.single_task<class ha>([=] {
-      // create a span on device, pass it to function that modifies it
-      // read values back out.
-      int a[]{1, 2, 3, 4};
-      sycl::span<int> a_span{a};
-      set_all_span_values(a_span, 10);
-      for (int i = 0; i < numReadTests; i++)
-        can_read_from_span_acc[i] = a_span[i];
-    });
-  });
-  E.wait();
-
-  // check out the read operations, should have gotten 10 from each
-  host_accessor can_read_from_span_acc(SpanRead, read_only);
-  for (int i = 0; i < numReadTests; i++) {
-    assert(can_read_from_span_acc[i] == 10 &&
-           "read check should have gotten 10");
-  }
-}
-
-int main() {
-  testSpanCapture();
-  testSpanOnDevice();
-
-  return 0;
-}

From 999006d5fb7ba068296cad671abd8e2f32d2d86e Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Mon, 9 Dec 2024 08:49:16 -0800
Subject: [PATCH 12/22] Add tracking info for XFAIL tests and remove
 hip_nvidia xfails

---
 sycl/test-e2e/Basic/aspects.cpp                              | 3 ---
 sycl/test-e2e/Basic/device_event.cpp                         | 4 ----
 sycl/test-e2e/Basic/diagnostics/handler.cpp                  | 5 +----
 sycl/test-e2e/Basic/partition_supported.cpp                  | 3 ---
 .../KernelAndProgram/kernel-bundle-merge-options.cpp         | 1 -
 sycl/test-e2e/Printf/int.cpp                                 | 2 --
 sycl/test-e2e/Printf/mixed-address-space.cpp                 | 2 +-
 sycl/test-e2e/Printf/percent-symbol.cpp                      | 2 +-
 .../Regression/context_is_destroyed_after_exception.cpp      | 2 --
 sycl/test-e2e/Regression/multiple-targets.cpp                | 3 ---
 10 files changed, 3 insertions(+), 24 deletions(-)

diff --git a/sycl/test-e2e/Basic/aspects.cpp b/sycl/test-e2e/Basic/aspects.cpp
index 114f069eee40b..ea1bbec27762d 100644
--- a/sycl/test-e2e/Basic/aspects.cpp
+++ b/sycl/test-e2e/Basic/aspects.cpp
@@ -1,8 +1,5 @@
 // RUN: %{build} -o %t.out -DSYCL_DISABLE_IMAGE_ASPECT_WARNING
 // RUN: %{run-unfiltered-devices} %t.out
-//
-// Hip is missing some of the parameters tested here so it fails with NVIDIA
-// XFAIL: hip_nvidia
 
 //==--------------- aspects.cpp - SYCL device test ------------------------==//
 //
diff --git a/sycl/test-e2e/Basic/device_event.cpp b/sycl/test-e2e/Basic/device_event.cpp
index 25631fac20843..c306fa69c4538 100644
--- a/sycl/test-e2e/Basic/device_event.cpp
+++ b/sycl/test-e2e/Basic/device_event.cpp
@@ -1,9 +1,5 @@
 // RUN: %{build} -o %t.run
 // RUN: %{run} %t.run
-//
-// Returns error "Barrier is not supported on the host device
-// yet." with Nvidia.
-// XFAIL: hip_nvidia
 
 //==--------device_event.cpp - SYCL class device_event test ----------------==//
 //
diff --git a/sycl/test-e2e/Basic/diagnostics/handler.cpp b/sycl/test-e2e/Basic/diagnostics/handler.cpp
index 70b5dbba1472c..a36a2c197f0a2 100644
--- a/sycl/test-e2e/Basic/diagnostics/handler.cpp
+++ b/sycl/test-e2e/Basic/diagnostics/handler.cpp
@@ -1,9 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out | FileCheck %s
-//
-// Appears to fail on HIP Nvidia because 'no device of requested type available'
-// when constructing a queue with an exception_list.
-// XFAIL: hip_nvidia
+
 //==------------------- handler.cpp ----------------------------------------==//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/sycl/test-e2e/Basic/partition_supported.cpp b/sycl/test-e2e/Basic/partition_supported.cpp
index 1d659e726c8ef..ef0cf813d216d 100644
--- a/sycl/test-e2e/Basic/partition_supported.cpp
+++ b/sycl/test-e2e/Basic/partition_supported.cpp
@@ -1,8 +1,5 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-//
-// Nvidia should not allow sub_devices but does not throw corresponding error.
-// XFAIL: hip_nvidia
 /* Check that:
 1) if partition_equally is supported, then we check that the correct
 invalid errc is returned if more than max_compute_units are requested
diff --git a/sycl/test-e2e/KernelAndProgram/kernel-bundle-merge-options.cpp b/sycl/test-e2e/KernelAndProgram/kernel-bundle-merge-options.cpp
index f8f84be97f6cb..2b16b5d8663e9 100644
--- a/sycl/test-e2e/KernelAndProgram/kernel-bundle-merge-options.cpp
+++ b/sycl/test-e2e/KernelAndProgram/kernel-bundle-merge-options.cpp
@@ -5,7 +5,6 @@
 
 // Debug option -g is not passed to device code compiler when CL-style driver
 // is used and /DEBUG options is passed.
-// XFAIL: cl_options
 
 #include "kernel-bundle-merge-options.hpp"
 
diff --git a/sycl/test-e2e/Printf/int.cpp b/sycl/test-e2e/Printf/int.cpp
index fccb679cb0297..479434b0ab234 100644
--- a/sycl/test-e2e/Printf/int.cpp
+++ b/sycl/test-e2e/Printf/int.cpp
@@ -6,8 +6,6 @@
 //
 // UNSUPPORTED: hip_amd
 // FIXME: The 'short' type gets overflown with sporadic values on CUDA.
-// XFAIL: cuda
-//
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out | FileCheck %s
 // FIXME: Remove dedicated constant address space testing once generic AS
diff --git a/sycl/test-e2e/Printf/mixed-address-space.cpp b/sycl/test-e2e/Printf/mixed-address-space.cpp
index 9f089847307cd..d79013007ca03 100644
--- a/sycl/test-e2e/Printf/mixed-address-space.cpp
+++ b/sycl/test-e2e/Printf/mixed-address-space.cpp
@@ -3,7 +3,7 @@
 //
 // UNSUPPORTED: hip_amd
 // XFAIL: cuda && windows
-//
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14733
 // FIXME: Drop the test once generic AS support is considered stable and the
 //        dedicated constant AS overload of printf is removed from the library.
 //
diff --git a/sycl/test-e2e/Printf/percent-symbol.cpp b/sycl/test-e2e/Printf/percent-symbol.cpp
index 30d65698a0a40..f08cd3e085d0d 100644
--- a/sycl/test-e2e/Printf/percent-symbol.cpp
+++ b/sycl/test-e2e/Printf/percent-symbol.cpp
@@ -6,7 +6,7 @@
 //
 // UNSUPPORTED: hip_amd
 // XFAIL: cuda && windows
-//
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14733
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out | FileCheck %s
 // FIXME: Remove dedicated constant address space testing once generic AS
diff --git a/sycl/test-e2e/Regression/context_is_destroyed_after_exception.cpp b/sycl/test-e2e/Regression/context_is_destroyed_after_exception.cpp
index 57e48df0c664a..813b3be226626 100644
--- a/sycl/test-e2e/Regression/context_is_destroyed_after_exception.cpp
+++ b/sycl/test-e2e/Regression/context_is_destroyed_after_exception.cpp
@@ -2,8 +2,6 @@
 
 // RUN: %{build} -o %t.out
 // RUN: env SYCL_UR_TRACE=2 %{run} %t.out %if !windows %{2>&1 | FileCheck %s %}
-//
-// XFAIL: hip_nvidia
 
 #include <sycl/detail/core.hpp>
 
diff --git a/sycl/test-e2e/Regression/multiple-targets.cpp b/sycl/test-e2e/Regression/multiple-targets.cpp
index 8a8c893f45ff2..a2498c3301b99 100644
--- a/sycl/test-e2e/Regression/multiple-targets.cpp
+++ b/sycl/test-e2e/Regression/multiple-targets.cpp
@@ -15,9 +15,6 @@
 //
 // RUN: %clangxx -fsycl -fsycl-targets=spir64,%{sycl_triple} -fsycl-device-code-split=per_kernel -o %t4.out %s
 // RUN: %{run} %t4.out
-//
-// XFAIL: hip_nvidia
-//
 
 #include <sycl/detail/core.hpp>
 

From e9df0a4d0cd9ac7dead4471217334eff3e5888f2 Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Mon, 9 Dec 2024 13:14:13 -0500
Subject: [PATCH 13/22] Uncomment check commands to get failures

---
 .../KernelAndProgram/kernel-bundle-merge-options.cpp      | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sycl/test-e2e/KernelAndProgram/kernel-bundle-merge-options.cpp b/sycl/test-e2e/KernelAndProgram/kernel-bundle-merge-options.cpp
index 2b16b5d8663e9..a10a0ec140e87 100644
--- a/sycl/test-e2e/KernelAndProgram/kernel-bundle-merge-options.cpp
+++ b/sycl/test-e2e/KernelAndProgram/kernel-bundle-merge-options.cpp
@@ -13,7 +13,7 @@
 
 // TODO: Uncomment when build options are properly passed to compile and link
 //       commands for kernel_bundle
-// xCHECK: <--- urProgramCompile(
-// xCHECK-SAME: -g
-// xCHECK: <--- urProgramLink(
-// xCHECK-SAME: -g
+// CHECK: <--- urProgramCompile(
+// CHECK-SAME: -g
+// CHECK: <--- urProgramLink(
+// CHECK-SAME: -g

From 3be9b214da3e3f3254cb2ce1c6adfb40aab689ba Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Mon, 9 Dec 2024 14:16:02 -0500
Subject: [PATCH 14/22] Add XFAIL tracker issue to
 kernel-bundle-merge-options.cpp

---
 .../KernelAndProgram/kernel-bundle-merge-options.cpp   | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/sycl/test-e2e/KernelAndProgram/kernel-bundle-merge-options.cpp b/sycl/test-e2e/KernelAndProgram/kernel-bundle-merge-options.cpp
index a10a0ec140e87..ba09fa1ea16ea 100644
--- a/sycl/test-e2e/KernelAndProgram/kernel-bundle-merge-options.cpp
+++ b/sycl/test-e2e/KernelAndProgram/kernel-bundle-merge-options.cpp
@@ -5,6 +5,8 @@
 
 // Debug option -g is not passed to device code compiler when CL-style driver
 // is used and /DEBUG options is passed.
+// XFAIL: cl_options
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16309
 
 #include "kernel-bundle-merge-options.hpp"
 
@@ -13,7 +15,7 @@
 
 // TODO: Uncomment when build options are properly passed to compile and link
 //       commands for kernel_bundle
-// CHECK: <--- urProgramCompile(
-// CHECK-SAME: -g
-// CHECK: <--- urProgramLink(
-// CHECK-SAME: -g
+// xCHECK: <--- urProgramCompile(
+// xCHECK-SAME: -g
+// xCHECK: <--- urProgramLink(
+// xCHECK-SAME: -g

From 58a3767a87377be658e4dd5da0aba7200e880770 Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Mon, 9 Dec 2024 14:20:27 -0500
Subject: [PATCH 15/22] Update number of XFAILs without tracker

---
 sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp b/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp
index 66257a935610e..3f90f8243a3fc 100644
--- a/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp
+++ b/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp
@@ -51,7 +51,7 @@
 // tests to match the required format and in that case you should just update
 // (i.e. reduce) the number and the list below.
 //
-// NUMBER-OF-XFAIL-WITHOUT-TRACKER: 41
+// NUMBER-OF-XFAIL-WITHOUT-TRACKER: 31
 //
 // List of improperly XFAIL-ed tests.
 // Remove the CHECK once the test has been properly XFAIL-ed.

From 48ed23ed1c310c0e2308ca9b09492592f46ebb2f Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Mon, 9 Dec 2024 14:45:03 -0500
Subject: [PATCH 16/22] Update no-xfail-without-tracker.cpp

---
 .../e2e_test_requirements/no-xfail-without-tracker.cpp | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp b/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp
index 3f90f8243a3fc..c0355385350d5 100644
--- a/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp
+++ b/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp
@@ -57,16 +57,11 @@
 // Remove the CHECK once the test has been properly XFAIL-ed.
 //
 // CHECK: AddressSanitizer/nullpointer/private_nullptr.cpp
-// CHECK-NEXT: Basic/aspects.cpp
-// CHECK-NEXT: Basic/device_event.cpp
-// CHECK-NEXT: Basic/diagnostics/handler.cpp
 // CHECK-NEXT: Basic/max_linear_work_group_size_props.cpp
 // CHECK-NEXT: Basic/max_work_group_size_props.cpp
-// CHECK-NEXT: Basic/partition_supported.cpp
 // CHECK-NEXT: DeviceLib/assert-windows.cpp
 // CHECK-NEXT: ESIMD/hardware_dispatch.cpp
 // CHECK-NEXT: InlineAsm/asm_multiple_instructions.cpp
-// CHECK-NEXT: KernelAndProgram/kernel-bundle-merge-options.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_annotated_ptr.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_bfloat16_packedB.cpp
@@ -85,15 +80,10 @@
 // CHECK-NEXT: NewOffloadDriver/sycl-external-with-optional-features.cpp
 // CHECK-NEXT: OptionalKernelFeatures/throw-exception-for-out-of-registers-on-kernel-launch.cpp
 // CHECK-NEXT: PerformanceTests/Reduction/reduce_over_sub_group.cpp
-// CHECK-NEXT: Printf/int.cpp
-// CHECK-NEXT: Printf/mixed-address-space.cpp
-// CHECK-NEXT: Printf/percent-symbol.cpp
 // CHECK-NEXT: Reduction/reduction_big_data.cpp
 // CHECK-NEXT: Reduction/reduction_nd_reducer_skip.cpp
 // CHECK-NEXT: Reduction/reduction_reducer_op_eq.cpp
 // CHECK-NEXT: Regression/build_log.cpp
 // CHECK-NEXT: Regression/complex_global_object.cpp
-// CHECK-NEXT: Regression/context_is_destroyed_after_exception.cpp
 // CHECK-NEXT: Regression/kernel_bundle_ignore_sycl_external.cpp
-// CHECK-NEXT: Regression/multiple-targets.cpp
 // CHECK-NEXT: Regression/reduction_resource_leak_dw.cpp

From 78eefa4b80406f890f98828735e5359810994839 Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Mon, 9 Dec 2024 16:38:28 -0500
Subject: [PATCH 17/22] Update int.cpp

---
 sycl/test-e2e/Printf/int.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sycl/test-e2e/Printf/int.cpp b/sycl/test-e2e/Printf/int.cpp
index 479434b0ab234..17b3e212c5988 100644
--- a/sycl/test-e2e/Printf/int.cpp
+++ b/sycl/test-e2e/Printf/int.cpp
@@ -6,6 +6,9 @@
 //
 // UNSUPPORTED: hip_amd
 // FIXME: The 'short' type gets overflown with sporadic values on CUDA.
+// XFAIL: cuda
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14734
+
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out | FileCheck %s
 // FIXME: Remove dedicated constant address space testing once generic AS

From a50cc22052a74344b3e8720863872c02978093d4 Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Thu, 12 Dec 2024 07:01:07 -0800
Subject: [PATCH 18/22] Add XFAIL trackers for joint matrix tests

---
 sycl/test-e2e/Matrix/SG32/joint_matrix_annotated_ptr.cpp       | 3 ++-
 .../Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp  | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_packedB.cpp    | 1 +
 sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp      | 1 +
 .../Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp      | 1 +
 .../Matrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp      | 1 +
 sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp          | 1 +
 sycl/test-e2e/Matrix/SG32/joint_matrix_prefetch.cpp            | 1 +
 sycl/test-e2e/Matrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp | 1 +
 sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp         | 1 +
 10 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_annotated_ptr.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_annotated_ptr.cpp
index 21f101e000b52..96a6cb00dd062 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_annotated_ptr.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_annotated_ptr.cpp
@@ -13,7 +13,8 @@
 
 // Currently row major B fails when annotated_ptr is used
 // XFAIL: gpu
-
+// XFAIL-TRACKER: GSD-4181
+ 
 #include "../common.hpp"
 
 #define SG_SZ 32
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
index aba19833ee581..57a41d55f8fee 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
@@ -15,7 +15,7 @@
 // then VNNI transform. This is currently only available on AMX
 
 // XFAIL: gpu
-
+// XFAIL-TRACKER: GSD-5768
 #include "../common.hpp"
 #include <iostream>
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_packedB.cpp
index 36346ba4a9f1a..1944a331bfe3a 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_packedB.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_packedB.cpp
@@ -15,6 +15,7 @@
 // RUN: %{run} %t.out
 
 // XFAIL: gpu
+// XFAIL-TRACKER: GSD-4181
 
 #include "../common.hpp"
 
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp
index 0c3b778457036..4186ad1acc943 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp
@@ -12,6 +12,7 @@
 // RUN: %{run} %t.out
 
 // XFAIL: run-mode
+// XFAIL-TRACKER: GSD-5768
 
 #include "../common.hpp"
 
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp
index 177e4d6d861fa..7db1b0a618823 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp
@@ -15,6 +15,7 @@
 // then VNNI transform. This is currently only available on AMX
 
 // XFAIL: gpu
+// XFAIL-TRACKER: GSD-5768
 
 #include "../common.hpp"
 
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp
index d73a2300888d3..3068b6bcca684 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp
@@ -14,6 +14,7 @@
 // RUN: %{run} %t.out
 
 // XFAIL: gpu
+// XFAIL-TRACKER: GSD-4181
 
 #include "../common.hpp"
 
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp
index 3464677d17398..97a6d17b4e1ee 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp
@@ -14,6 +14,7 @@
 // RUN: %{run} %t.out
 
 // XFAIL:gpu
+// XFAIL-TRACKER: GSD-4181
 
 #include "../common.hpp"
 
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_prefetch.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_prefetch.cpp
index a6ce09fe13b9f..7e1520e95ff1c 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_prefetch.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_prefetch.cpp
@@ -11,6 +11,7 @@
 // RUN: %{run} %t.out
 
 // XFAIL: gpu
+// XFAIL-TRACKER: GSD-4181
 
 // SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
 // UNSUPPORTED: gpu-intel-dg2
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp
index 0fe45a24c499f..ca7310f6f5e15 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp
@@ -16,6 +16,7 @@
 
 // Sub-group size 32 support for this test is not currently available in IGC
 // XFAIL: gpu
+// XFAIL-TRACKER: GSD-4181
 
 #include "../common.hpp"
 
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp
index ceb196e07b9ca..342648810fea4 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp
@@ -14,6 +14,7 @@
 // RUN: %{run} %t.out
 
 // XFAIL:gpu
+// XFAIL-TRACKER: GSD-4181
 
 #include "../common.hpp"
 

From 7cbfaca72e032279c6e05027bed40277f3218a16 Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Thu, 12 Dec 2024 07:11:00 -0800
Subject: [PATCH 19/22] Update count of improperly XFAIL-ed tests

---
 .../joint_matrix_bfloat16_colmajorA_colmajorB.cpp |  1 +
 .../Matrix/joint_matrix_colA_rowB_colC.cpp        |  1 +
 .../joint_matrix_int8_colmajorA_colmajorB.cpp     |  1 +
 .../no-xfail-without-tracker.cpp                  | 15 +--------------
 4 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
index a9326e2ba312b..82bedf7043e9d 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
@@ -14,6 +14,7 @@
 // then VNNI transform. This is currently only available on AMX
 
 // XFAIL: gpu
+// XFAIL-TRACKER: GSD-5768
 
 #include "common.hpp"
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp
index 338564207c4ac..c62175a8af439 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp
@@ -11,6 +11,7 @@
 // RUN: %{run} %t.out
 
 // XFAIL: run-mode
+// XFAIL-TRACKER: GSD-5768
 
 #include "common.hpp"
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
index f8dc8452668a1..aa4bac536e357 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
@@ -14,6 +14,7 @@
 // then VNNI transform. This is currently only available on AMX
 
 // XFAIL: gpu
+// XFAIL-TRACKER: GSD-5768
 
 #include "common.hpp"
 
diff --git a/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp b/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp
index c0355385350d5..4134c192bebb0 100644
--- a/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp
+++ b/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp
@@ -51,7 +51,7 @@
 // tests to match the required format and in that case you should just update
 // (i.e. reduce) the number and the list below.
 //
-// NUMBER-OF-XFAIL-WITHOUT-TRACKER: 31
+// NUMBER-OF-XFAIL-WITHOUT-TRACKER: 18
 //
 // List of improperly XFAIL-ed tests.
 // Remove the CHECK once the test has been properly XFAIL-ed.
@@ -62,19 +62,6 @@
 // CHECK-NEXT: DeviceLib/assert-windows.cpp
 // CHECK-NEXT: ESIMD/hardware_dispatch.cpp
 // CHECK-NEXT: InlineAsm/asm_multiple_instructions.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_annotated_ptr.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_bfloat16_packedB.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_colA_rowB_colC.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_out_bounds.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_prefetch.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_unaligned_k.cpp
-// CHECK-NEXT: Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
-// CHECK-NEXT: Matrix/joint_matrix_colA_rowB_colC.cpp
-// CHECK-NEXT: Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
 // CHECK-NEXT: NewOffloadDriver/multisource.cpp
 // CHECK-NEXT: NewOffloadDriver/split-per-source-main.cpp
 // CHECK-NEXT: NewOffloadDriver/sycl-external-with-optional-features.cpp

From 03e9e85fc6d46c30d569c4458fff9dc6d19f152a Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Thu, 12 Dec 2024 07:21:58 -0800
Subject: [PATCH 20/22] Update count of improperly XFAIL-ed tests

---
 sycl/test-e2e/Matrix/SG32/joint_matrix_annotated_ptr.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_annotated_ptr.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_annotated_ptr.cpp
index 96a6cb00dd062..13656b8e3f4ec 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_annotated_ptr.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_annotated_ptr.cpp
@@ -14,7 +14,7 @@
 // Currently row major B fails when annotated_ptr is used
 // XFAIL: gpu
 // XFAIL-TRACKER: GSD-4181
- 
+
 #include "../common.hpp"
 
 #define SG_SZ 32

From e33db4cb7037ce831b3fa3923f970e517779e5a0 Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Tue, 17 Dec 2024 10:31:22 -0800
Subject: [PATCH 21/22] Remove XFAILs from remaining E2E tests

---
 sycl/test-e2e/Basic/max_linear_work_group_size_props.cpp       | 1 -
 sycl/test-e2e/Basic/max_work_group_size_props.cpp              | 1 -
 sycl/test-e2e/DeviceLib/assert-windows.cpp                     | 1 -
 sycl/test-e2e/ESIMD/hardware_dispatch.cpp                      | 1 -
 sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp          | 1 -
 sycl/test-e2e/NewOffloadDriver/multisource.cpp                 | 1 -
 sycl/test-e2e/NewOffloadDriver/split-per-source-main.cpp       | 1 -
 .../NewOffloadDriver/sycl-external-with-optional-features.cpp  | 1 -
 .../throw-exception-for-out-of-registers-on-kernel-launch.cpp  | 1 -
 .../PerformanceTests/Reduction/reduce_over_sub_group.cpp       | 1 -
 sycl/test-e2e/Reduction/reduction_big_data.cpp                 | 3 ---
 sycl/test-e2e/Reduction/reduction_nd_reducer_skip.cpp          | 2 --
 sycl/test-e2e/Reduction/reduction_reducer_op_eq.cpp            | 2 --
 sycl/test-e2e/Regression/build_log.cpp                         | 2 --
 sycl/test-e2e/Regression/complex_global_object.cpp             | 1 -
 .../test-e2e/Regression/kernel_bundle_ignore_sycl_external.cpp | 1 -
 sycl/test-e2e/Regression/reduction_resource_leak_dw.cpp        | 2 --
 17 files changed, 23 deletions(-)

diff --git a/sycl/test-e2e/Basic/max_linear_work_group_size_props.cpp b/sycl/test-e2e/Basic/max_linear_work_group_size_props.cpp
index 7009ca367d8e9..a762f009a1d72 100644
--- a/sycl/test-e2e/Basic/max_linear_work_group_size_props.cpp
+++ b/sycl/test-e2e/Basic/max_linear_work_group_size_props.cpp
@@ -2,7 +2,6 @@
 // RUN: %{run} %t.out
 
 // This property is not yet supported by all UR adapters
-// XFAIL: level_zero, opencl, hip
 
 #include <sycl/detail/core.hpp>
 
diff --git a/sycl/test-e2e/Basic/max_work_group_size_props.cpp b/sycl/test-e2e/Basic/max_work_group_size_props.cpp
index 96439971d904a..8f1e1825f38b2 100644
--- a/sycl/test-e2e/Basic/max_work_group_size_props.cpp
+++ b/sycl/test-e2e/Basic/max_work_group_size_props.cpp
@@ -2,7 +2,6 @@
 // RUN: %{run} %t.out
 
 // This property is not yet supported by all UR adapters
-// XFAIL: level_zero, opencl, hip
 
 #include <sycl/detail/core.hpp>
 
diff --git a/sycl/test-e2e/DeviceLib/assert-windows.cpp b/sycl/test-e2e/DeviceLib/assert-windows.cpp
index c138bda944b69..77f81b80eea9b 100644
--- a/sycl/test-e2e/DeviceLib/assert-windows.cpp
+++ b/sycl/test-e2e/DeviceLib/assert-windows.cpp
@@ -2,7 +2,6 @@
 //
 // FIXME: OpenCL CPU backend compiler crashes on a call to _wassert.
 // Disable the test until the fix reaches SYCL test infrastructure.
-// XFAIL: *
 //
 // RUN: %{build} -o %t.out
 //
diff --git a/sycl/test-e2e/ESIMD/hardware_dispatch.cpp b/sycl/test-e2e/ESIMD/hardware_dispatch.cpp
index 4166d5dfbd44e..546fdac46425a 100644
--- a/sycl/test-e2e/ESIMD/hardware_dispatch.cpp
+++ b/sycl/test-e2e/ESIMD/hardware_dispatch.cpp
@@ -5,7 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// XFAIL: igc-dev
 // REQUIRES: ocloc && arch-intel_gpu_tgllp
 // RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_tgllp %s -o %t.out
 // RUN: %{run-unfiltered-devices} %t.out
diff --git a/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp b/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp
index 1e0140611389f..4e8c64eb4597a 100644
--- a/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp
@@ -6,7 +6,6 @@
 // RUN: %{run} %t.out
 // The test is failing when writing directly to output buffer.
 // If temporary variable is used (see TO_PASS mode) the test succeeded.
-// XFAIL: gpu && run-mode
 #include "include/asmhelper.h"
 #include <iostream>
 #include <vector>
diff --git a/sycl/test-e2e/NewOffloadDriver/multisource.cpp b/sycl/test-e2e/NewOffloadDriver/multisource.cpp
index 73c7ab87f3222..21bb4a9ea9cfe 100644
--- a/sycl/test-e2e/NewOffloadDriver/multisource.cpp
+++ b/sycl/test-e2e/NewOffloadDriver/multisource.cpp
@@ -5,7 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// XFAIL: cuda
 // Separate kernel sources and host code sources
 // Test with `--offload-new-driver`
 // RUN: %{build} --offload-new-driver -c -o %t.kernel.o -DINIT_KERNEL -DCALC_KERNEL
diff --git a/sycl/test-e2e/NewOffloadDriver/split-per-source-main.cpp b/sycl/test-e2e/NewOffloadDriver/split-per-source-main.cpp
index f3e1108becba1..2b527996f2af1 100644
--- a/sycl/test-e2e/NewOffloadDriver/split-per-source-main.cpp
+++ b/sycl/test-e2e/NewOffloadDriver/split-per-source-main.cpp
@@ -2,7 +2,6 @@
 // RUN: --offload-new-driver -fsycl-dead-args-optimization
 // RUN: %{run} %t.out
 //
-// XFAIL: hip_nvidia, cuda
 
 #include "Inputs/split-per-source.h"
 
diff --git a/sycl/test-e2e/NewOffloadDriver/sycl-external-with-optional-features.cpp b/sycl/test-e2e/NewOffloadDriver/sycl-external-with-optional-features.cpp
index c8d86d84747a9..d1e8ef20a9a70 100644
--- a/sycl/test-e2e/NewOffloadDriver/sycl-external-with-optional-features.cpp
+++ b/sycl/test-e2e/NewOffloadDriver/sycl-external-with-optional-features.cpp
@@ -3,7 +3,6 @@
 // RUN: %{build} -DSOURCE2 --offload-new-driver -c -o %t2.o
 // RUN: %clangxx -Wno-error=unused-command-line-argument -fsycl -fsycl-targets=%{sycl_triple} --offload-new-driver %t1.o %t2.o -o %t.exe
 // RUN: %{run} %t.exe
-// XFAIL: cuda
 
 #ifdef SOURCE1
 #include <iostream>
diff --git a/sycl/test-e2e/OptionalKernelFeatures/throw-exception-for-out-of-registers-on-kernel-launch.cpp b/sycl/test-e2e/OptionalKernelFeatures/throw-exception-for-out-of-registers-on-kernel-launch.cpp
index a701b7960c232..bfe01871fb70c 100644
--- a/sycl/test-e2e/OptionalKernelFeatures/throw-exception-for-out-of-registers-on-kernel-launch.cpp
+++ b/sycl/test-e2e/OptionalKernelFeatures/throw-exception-for-out-of-registers-on-kernel-launch.cpp
@@ -1,7 +1,6 @@
 // REQUIRES: cuda
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-// XFAIL: *
 
 #include <numeric>
 #include <string_view>
diff --git a/sycl/test-e2e/PerformanceTests/Reduction/reduce_over_sub_group.cpp b/sycl/test-e2e/PerformanceTests/Reduction/reduce_over_sub_group.cpp
index c28a3406b33c0..ea09a668d9997 100644
--- a/sycl/test-e2e/PerformanceTests/Reduction/reduce_over_sub_group.cpp
+++ b/sycl/test-e2e/PerformanceTests/Reduction/reduce_over_sub_group.cpp
@@ -1,4 +1,3 @@
-// XFAIL: native_cpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/Reduction/reduction_big_data.cpp b/sycl/test-e2e/Reduction/reduction_big_data.cpp
index 40e4e4f3e7ff7..c376fa3f31a34 100644
--- a/sycl/test-e2e/Reduction/reduction_big_data.cpp
+++ b/sycl/test-e2e/Reduction/reduction_big_data.cpp
@@ -1,9 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 //
-// Group algorithms are not supported on Nvidia.
-// XFAIL: hip_nvidia
-//
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Reduction/reduction_nd_reducer_skip.cpp b/sycl/test-e2e/Reduction/reduction_nd_reducer_skip.cpp
index ab2a583ce2f4c..cb345c1dd85fe 100644
--- a/sycl/test-e2e/Reduction/reduction_nd_reducer_skip.cpp
+++ b/sycl/test-e2e/Reduction/reduction_nd_reducer_skip.cpp
@@ -1,8 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 //
-// Group algorithms are not supported on Nvidia.
-// XFAIL: hip_nvidia
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Reduction/reduction_reducer_op_eq.cpp b/sycl/test-e2e/Reduction/reduction_reducer_op_eq.cpp
index 02419e04a4240..1fa43c91ef553 100644
--- a/sycl/test-e2e/Reduction/reduction_reducer_op_eq.cpp
+++ b/sycl/test-e2e/Reduction/reduction_reducer_op_eq.cpp
@@ -1,8 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 //
-// On nvidia a reduction appears to be unexpectedly executed via the host.
-// XFAIL: hip_nvidia
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Regression/build_log.cpp b/sycl/test-e2e/Regression/build_log.cpp
index d3dd3971122c5..bae74102a583e 100644
--- a/sycl/test-e2e/Regression/build_log.cpp
+++ b/sycl/test-e2e/Regression/build_log.cpp
@@ -5,8 +5,6 @@
 // RUN: FileCheck %s --check-prefix=CHECK-EXPECTED-ERROR --input-file %t.out
 // CHECK-EXPECTED-ERROR: error: backend compiler failed build
 
-// XFAIL: *
-
 #include <stdio.h>
 #include <sycl/detail/core.hpp>
 
diff --git a/sycl/test-e2e/Regression/complex_global_object.cpp b/sycl/test-e2e/Regression/complex_global_object.cpp
index 06a7187a46926..132a48cc5cdbf 100644
--- a/sycl/test-e2e/Regression/complex_global_object.cpp
+++ b/sycl/test-e2e/Regression/complex_global_object.cpp
@@ -1,6 +1,5 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-// XFAIL: gpu && linux
 
 // SYCL runtime may construct global objects at function scope. The test ensures
 // such objects do not cause problems if the first call to SYCL is inside main
diff --git a/sycl/test-e2e/Regression/kernel_bundle_ignore_sycl_external.cpp b/sycl/test-e2e/Regression/kernel_bundle_ignore_sycl_external.cpp
index 765cbbad75327..413dc4ab9827f 100644
--- a/sycl/test-e2e/Regression/kernel_bundle_ignore_sycl_external.cpp
+++ b/sycl/test-e2e/Regression/kernel_bundle_ignore_sycl_external.cpp
@@ -1,7 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 //
-// XFAIL: cuda
 // UNSUPPORTED: hip
 
 // Windows doesn't yet have full shutdown().
diff --git a/sycl/test-e2e/Regression/reduction_resource_leak_dw.cpp b/sycl/test-e2e/Regression/reduction_resource_leak_dw.cpp
index ac4cf20ad00f8..81eb463f2d19c 100644
--- a/sycl/test-e2e/Regression/reduction_resource_leak_dw.cpp
+++ b/sycl/test-e2e/Regression/reduction_resource_leak_dw.cpp
@@ -1,6 +1,4 @@
 // REQUIRES: level_zero, level_zero_dev_kit
-// XFAIL: windows
-//
 // RUN: %{build} %level_zero_options -o %t.out
 // RUN: %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s
 //

From f5816a4eafe6224daccbbae573ba1d1cdd4e57bb Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Tue, 17 Dec 2024 10:31:52 -0800
Subject: [PATCH 22/22] Remove XFAILs from remaining E2E tests

---
 sycl/test-e2e/AddressSanitizer/nullpointer/private_nullptr.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sycl/test-e2e/AddressSanitizer/nullpointer/private_nullptr.cpp b/sycl/test-e2e/AddressSanitizer/nullpointer/private_nullptr.cpp
index 27b4b5a6fab17..f33ed512dcd97 100644
--- a/sycl/test-e2e/AddressSanitizer/nullpointer/private_nullptr.cpp
+++ b/sycl/test-e2e/AddressSanitizer/nullpointer/private_nullptr.cpp
@@ -7,7 +7,6 @@
 // RUN: %{run} not %t3.out 2>&1 | FileCheck %s
 
 // FIXME: There's an issue in gfx driver, so this test pending here.
-// XFAIL: *
 
 #include <sycl/detail/core.hpp>
 #include <sycl/ext/oneapi/experimental/address_cast.hpp>