diff --git a/cmake/BuildFlags.cmake b/cmake/BuildFlags.cmake
index 0b5052f7b..0ee38df7f 100644
--- a/cmake/BuildFlags.cmake
+++ b/cmake/BuildFlags.cmake
@@ -120,14 +120,17 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
   set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "${SYCL_OFFLINE_COMPILER_CG_OPTIONS} -cl-fp32-correctly-rounded-divide-sqrt")
   set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "-options '${SYCL_OFFLINE_COMPILER_CG_OPTIONS}'")
 
-  if((DEFINED ENV{TORCH_XPU_ARCH_LIST}) AND NOT ("$ENV{TORCH_XPU_ARCH_LIST}" STREQUAL ""))
-    set(SYCL_OFFLINE_COMPILER_AOT_OPTIONS "-device $ENV{TORCH_XPU_ARCH_LIST}")
+  if(WIN32)
+    set(AOT_TARGETS "ats-m150,lnl-m,mtl-u,mtl-h")
   else()
-    set(SYCL_OFFLINE_COMPILER_AOT_OPTIONS "-device pvc,xe-lpg,ats-m150")
-    message(STATUS "'TORCH_XPU_ARCH_LIST' not set. Using default configuration for a full AOT build."
-      "Try specifying from 'pvc,xe-lpg,ats-m150' if you don't need.")
+    set(AOT_TARGETS "pvc,xe-lpg,ats-m150")
+  endif()
+  if((DEFINED ENV{TORCH_XPU_ARCH_LIST}) AND NOT ("$ENV{TORCH_XPU_ARCH_LIST}" STREQUAL ""))
+    set(AOT_TARGETS "$ENV{TORCH_XPU_ARCH_LIST}")
   endif()
-  message(STATUS " SYCL_OFFLINE_COMPILER_AOT_OPTIONS: ${SYCL_OFFLINE_COMPILER_AOT_OPTIONS}")
+
+  set(SYCL_OFFLINE_COMPILER_AOT_OPTIONS "-device ${AOT_TARGETS}")
+  message(STATUS "Compile Intel GPU AOT Targets for ${AOT_TARGETS}")
 
   set(SYCL_OFFLINE_COMPILER_FLAGS "${SYCL_OFFLINE_COMPILER_AOT_OPTIONS} ${SYCL_OFFLINE_COMPILER_CG_OPTIONS}")
 else()
diff --git a/cmake/Modules/FindSYCL/run_sycl.cmake b/cmake/Modules/FindSYCL/run_sycl.cmake
index dcc99c18f..f56d49768 100644
--- a/cmake/Modules/FindSYCL/run_sycl.cmake
+++ b/cmake/Modules/FindSYCL/run_sycl.cmake
@@ -56,6 +56,12 @@ endforeach()
 # Choose host flags in FindSYCL.cmake
 @SYCL_host_flags@
 
+# Adding permissive flag for MSVC build to overcome ambiguous symbol error.
+if(WIN32)
+  string(APPEND SYCL_host_compiler_flags "/permissive- ")
+endif()
+
+
 list(REMOVE_DUPLICATES CMAKE_HOST_FLAGS)
 foreach(flag ${CMAKE_HOST_FLAGS})
   # Extra quotes are added around each flag to help SYCL parse out flags with spaces.
diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt index 815ad018f..57c4783cd 100644 --- a/src/ATen/CMakeLists.txt +++ b/src/ATen/CMakeLists.txt @@ -1,10 +1,13 @@ # ATen XPU sources -file(GLOB xpu_cpp "xpu/*.cpp" "native/xpu/*.cpp" "native/sparse/*.cpp") +file(GLOB xpu_cpp "xpu/*.cpp") +file(GLOB xpu_native_cpp "native/xpu/*.cpp" "native/sparse/*.cpp") file(GLOB xpu_sycl "native/xpu/sycl/*.cpp") list(APPEND ATen_XPU_CPP_SRCS ${xpu_cpp}) +list(APPEND ATen_XPU_NATIVE_CPP_SRCS ${xpu_native_cpp}) list(APPEND ATen_XPU_SYCL_SRCS ${xpu_sycl}) set(ATen_XPU_CPP_SRCS ${ATen_XPU_CPP_SRCS} PARENT_SCOPE) +set(ATen_XPU_NATIVE_CPP_SRCS ${ATen_XPU_NATIVE_CPP_SRCS} PARENT_SCOPE) set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE) diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp index ba3d3a58b..22a031bc6 100644 --- a/src/ATen/native/xpu/RangeFactories.cpp +++ b/src/ATen/native/xpu/RangeFactories.cpp @@ -31,7 +31,7 @@ Tensor& arange_out( TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); TORCH_CHECK( - std::isfinite(xstart) && std::isfinite(xend), + std::isfinite(static_cast(xstart)) && std::isfinite(static_cast(xend)), "unsupported range: ", xstart, " -> ", @@ -87,5 +87,33 @@ Tensor& arange_out( return xpu::arange_kernel(start, end, step, out); } + +Tensor& range_xpu_out( + const Scalar& start, + const Scalar& end, + const Scalar& step, + Tensor& out) { + auto xstart = start.to(); + auto xend = end.to(); + auto xstep = step.to(); + + TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); + TORCH_CHECK( + std::isfinite(static_cast(xstart)) && std::isfinite(static_cast(xend)), + "unsupported range: ", + xstart, + " -> ", + xend); + TORCH_CHECK( + ((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), + "upper bound and larger bound inconsistent with step sign"); + int64_t size = static_cast(((xend - xstart) / xstep) + 1); + if (out.numel() != size) { + out.resize_({size}); + } + + return at::native::xpu::range_kernel(start, end, step, out); +} + } // namespace native } // namespace at diff --git a/src/ATen/native/xpu/sycl/AbsKernel.cpp b/src/ATen/native/xpu/sycl/AbsKernel.cpp index 596000ecb..3df01baee 100644 --- a/src/ATen/native/xpu/sycl/AbsKernel.cpp +++ b/src/ATen/native/xpu/sycl/AbsKernel.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/AbsKernel.h b/src/ATen/native/xpu/sycl/AbsKernel.h index 3b51e993e..3423c32cf 100644 --- a/src/ATen/native/xpu/sycl/AbsKernel.h +++ b/src/ATen/native/xpu/sycl/AbsKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void abs_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void abs_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp index bfb6eb58c..ee73856ee 100644 --- a/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationEluKernels.h b/src/ATen/native/xpu/sycl/ActivationEluKernels.h index 555bfb5f4..a15eaa590 100644 --- a/src/ATen/native/xpu/sycl/ActivationEluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationEluKernels.h @@ -4,13 +4,13 @@ namespace at::native::xpu { -void elu_kernel( +TORCH_XPU_API void elu_kernel( TensorIteratorBase& iter, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale); 
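The recurring change across the kernel headers above and below is the TORCH_XPU_API annotation on every public kernel entry point, so the symbols stay visible when the kernels are built into a separate library (in particular a Windows DLL). The macro's definition is not part of this diff; as an assumption, it follows the usual shared-library export/import pattern, roughly:

```cpp
// Hypothetical sketch of a TORCH_XPU_API-style export macro; the real
// definition lives elsewhere in the tree and may differ. On Windows, DLL
// symbols are hidden unless explicitly exported, so kernels called from
// another module need the annotation; on Linux the equivalent is default
// ELF visibility.
#ifdef _WIN32
#  if defined(TORCH_XPU_BUILD_MAIN_LIB) // assumed flag set while building the library itself
#    define TORCH_XPU_API __declspec(dllexport)
#  else
#    define TORCH_XPU_API __declspec(dllimport)
#  endif
#else
#  define TORCH_XPU_API __attribute__((visibility("default")))
#endif

// Usage then mirrors the header hunks in this patch, e.g.:
// TORCH_XPU_API void abs_kernel(TensorIteratorBase& iter);
```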
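Further up, the new range_xpu_out entry point in RangeFactories.cpp keeps range's inclusive upper bound, so the output is resized to ((end - start) / step) + 1 elements. A minimal standalone sketch of that count logic (plain C++, not the ATen code):

```cpp
#include <cassert>
#include <cstdint>

// Element count of an inclusive range [start, end] walked with `step`,
// matching the resize_({size}) computation in range_xpu_out above.
// Callers are assumed to have passed the TORCH_CHECKs in the hunk
// (nonzero step, finite bounds, step sign consistent with the bounds).
static int64_t range_numel(double start, double end, double step) {
  return static_cast<int64_t>(((end - start) / step) + 1);
}

int main() {
  assert(range_numel(0.0, 1.0, 0.25) == 5); // {0, 0.25, 0.5, 0.75, 1.0}
  assert(range_numel(5.0, 1.0, -2.0) == 3); // {5, 3, 1}
  return 0;
}
```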
-void elu_backward_kernel( +TORCH_XPU_API void elu_backward_kernel( TensorIteratorBase& iter, const Scalar& alpha, const Scalar& scale, diff --git a/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp b/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp index 29d6c5d09..2bfcccb69 100644 --- a/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp +++ b/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ActivationGeluKernel.h b/src/ATen/native/xpu/sycl/ActivationGeluKernel.h index 6c373a7cd..660fae9ff 100644 --- a/src/ATen/native/xpu/sycl/ActivationGeluKernel.h +++ b/src/ATen/native/xpu/sycl/ActivationGeluKernel.h @@ -6,9 +6,11 @@ namespace at { namespace native { namespace xpu { -void gelu_kernel(TensorIteratorBase& iter, c10::string_view approximate); +TORCH_XPU_API void gelu_kernel( + TensorIteratorBase& iter, + c10::string_view approximate); -void gelu_backward_kernel( +TORCH_XPU_API void gelu_backward_kernel( TensorIteratorBase& iter, c10::string_view approximate); diff --git a/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp index 056e8c332..f60613ec7 100644 --- a/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp @@ -2,9 +2,12 @@ #include #include +#include #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationGluKernels.h b/src/ATen/native/xpu/sycl/ActivationGluKernels.h index c0fd6e22c..a4b11dbb4 100644 --- a/src/ATen/native/xpu/sycl/ActivationGluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationGluKernels.h @@ -4,9 +4,9 @@ namespace at::native::xpu { -void glu_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void glu_kernel(TensorIteratorBase& iter); -void glu_backward_kernel( +TORCH_XPU_API void glu_backward_kernel( const TensorIteratorBase& iter, int64_t gI_stride, int64_t I_stride); diff --git a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp index 376d8d7ca..c3fe41fd8 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h index ee5f0e87c..47bfa636c 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void hardsigmoid_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void hardsigmoid_kernel(TensorIteratorBase& iter); -void hardsigmoid_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void hardsigmoid_backward_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp index 09270a5c0..8c6e47f77 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.h b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.h index ea4e81fc9..6b741e744 100644 --- 
a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.h @@ -6,9 +6,9 @@ namespace at { namespace native { namespace xpu { -void hardswish_kernel(TensorIterator& iter); +TORCH_XPU_API void hardswish_kernel(TensorIterator& iter); -void hardswish_backward_kernel(TensorIterator& iter); +TORCH_XPU_API void hardswish_backward_kernel(TensorIterator& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp index 730d65350..1dcd78b88 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h index c2d110bae..745c0f69b 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -void hardtanh_backward_kernel( +TORCH_XPU_API void hardtanh_backward_kernel( TensorIterator& iter, const Scalar& min, const Scalar& max); diff --git a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp index 9db04defc..88d170352 100644 --- a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp @@ -6,6 +6,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h index 758701ce9..5656c661e 100644 --- a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h @@ -4,9 +4,11 @@ namespace at::native::xpu { -void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_); +TORCH_XPU_API void leaky_relu_kernel( + TensorIteratorBase& iter, + const Scalar& negval_); -void leaky_relu_backward_kernel( +TORCH_XPU_API void leaky_relu_backward_kernel( TensorIteratorBase& iter, const Scalar& negval_); diff --git a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp index 113ea32ed..f4051184e 100644 --- a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp @@ -6,6 +6,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h index eb009d0cf..42ee9dbea 100644 --- a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void log_sigmoid_forward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log_sigmoid_forward_kernel(TensorIteratorBase& iter); -void log_sigmoid_backward_kernel(TensorIterator& iter); +TORCH_XPU_API void log_sigmoid_backward_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp b/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp index d7a78bb0f..9033b103c 100644 --- a/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp @@ -8,6 +8,8 @@ 
#include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationMishKernels.h b/src/ATen/native/xpu/sycl/ActivationMishKernels.h index 1c663dfe5..7c4efc0af 100644 --- a/src/ATen/native/xpu/sycl/ActivationMishKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationMishKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void mish_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void mish_kernel(TensorIteratorBase& iter); -void mish_backward_kernel(TensorIterator& iter); +TORCH_XPU_API void mish_backward_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationPreluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationPreluKernels.cpp index 8a2c7012f..9ec4e598b 100644 --- a/src/ATen/native/xpu/sycl/ActivationPreluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationPreluKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template @@ -40,4 +42,4 @@ void prelu_backward_kernel(TensorIterator& iter) { }); } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationPreluKernels.h b/src/ATen/native/xpu/sycl/ActivationPreluKernels.h index 23f60c242..4d33e9064 100644 --- a/src/ATen/native/xpu/sycl/ActivationPreluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationPreluKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void prelu_kernel(TensorIterator& iter); +TORCH_XPU_API void prelu_kernel(TensorIterator& iter); -void prelu_backward_kernel(TensorIterator& iter); +TORCH_XPU_API void prelu_backward_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp index da656dc00..7d9f0872a 100644 --- a/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationSiluKernels.h b/src/ATen/native/xpu/sycl/ActivationSiluKernels.h index c31acdc9a..eaec67986 100644 --- a/src/ATen/native/xpu/sycl/ActivationSiluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationSiluKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void silu_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void silu_kernel(TensorIteratorBase& iter); -void silu_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void silu_backward_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.cpp b/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.cpp index 99b1c5716..8dc3684e3 100644 --- a/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h b/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h index 8a5e5ef2b..a9b6027bc 100644 --- a/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void softplus_kernel( +TORCH_XPU_API void softplus_kernel( TensorIteratorBase& iter, const Scalar& beta_, const Scalar& threshold_); -void softplus_backward_kernel( +TORCH_XPU_API void softplus_backward_kernel( TensorIteratorBase& iter, const Scalar& beta_, const Scalar& threshold_); diff --git 
a/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp b/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp index 4393576e9..b96ab461e 100644 --- a/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h b/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h index 481d1e5a1..398e3f4a4 100644 --- a/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h @@ -4,8 +4,12 @@ namespace at::native::xpu { -void softshrink_kernel(TensorIteratorBase& iter, const Scalar& value); +TORCH_XPU_API void softshrink_kernel( + TensorIteratorBase& iter, + const Scalar& value); -void softshrink_backward_kernel(TensorIteratorBase& iter, const Scalar& value); +TORCH_XPU_API void softshrink_backward_kernel( + TensorIteratorBase& iter, + const Scalar& value); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp index 0b34c013f..3915a5560 100644 --- a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp +++ b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp @@ -4,6 +4,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.h b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.h index c22a6b18c..df38c6d3d 100644 --- a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.h +++ b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -void threshold_kernel( +TORCH_XPU_API void threshold_kernel( TensorIteratorBase& iter, const Scalar& threshold, const Scalar& value); diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp index ab7dd1396..86fd7edfe 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at::native::xpu { using namespace at::xpu; diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h index 9b6d9a046..0d58a6e7f 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void adaptive_avg_pool2d_backward_kernel( +TORCH_XPU_API void adaptive_avg_pool2d_backward_kernel( Tensor& gradInput, const Tensor& gradOutput, const Tensor& input); -void adaptive_avg_pool2d_kernel( +TORCH_XPU_API void adaptive_avg_pool2d_kernel( Tensor& output, const Tensor& input, IntArrayRef output_size); diff --git a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp index 4a8540bde..dacae3e68 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp @@ -12,6 +12,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h index 1a77f69cd..2714e6627 100644 --- 
a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h +++ b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h @@ -4,13 +4,13 @@ namespace at::native::xpu { -void adaptive_max_pool2d_kernel( +TORCH_XPU_API void adaptive_max_pool2d_kernel( const Tensor& input, IntArrayRef output_size, const Tensor& output, const Tensor& indices); -void adaptive_max_pool2d_backward_kernel( +TORCH_XPU_API void adaptive_max_pool2d_backward_kernel( const Tensor& grad_output, const Tensor& input, const Tensor& indices, diff --git a/src/ATen/native/xpu/sycl/AmpKernels.cpp b/src/ATen/native/xpu/sycl/AmpKernels.cpp index 87354eea9..137ae3902 100644 --- a/src/ATen/native/xpu/sycl/AmpKernels.cpp +++ b/src/ATen/native/xpu/sycl/AmpKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/AmpKernels.h b/src/ATen/native/xpu/sycl/AmpKernels.h index bfc17487f..7c703e70e 100644 --- a/src/ATen/native/xpu/sycl/AmpKernels.h +++ b/src/ATen/native/xpu/sycl/AmpKernels.h @@ -3,17 +3,17 @@ namespace at::native::xpu { -void amp_non_finite_check_and_unscale_kernel( +TORCH_XPU_API void amp_non_finite_check_and_unscale_kernel( Tensor& scaled_grad, Tensor& found_inf, const Tensor& inv_scale); -void amp_foreach_non_finite_check_and_unscale_kernel( +TORCH_XPU_API void amp_foreach_non_finite_check_and_unscale_kernel( std::vector> scaled_grads, Tensor& found_inf, const Tensor& inv_scale); -Tensor& amp_update_scale_kernel( +TORCH_XPU_API Tensor& amp_update_scale_kernel( Tensor& current_scale, Tensor& growth_tracker, const Tensor& found_inf, diff --git a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp index c2a0458c9..e7a2c2e1f 100644 --- a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at::native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/AveragePool2dKernels.h b/src/ATen/native/xpu/sycl/AveragePool2dKernels.h index 3229d306a..7667fe021 100644 --- a/src/ATen/native/xpu/sycl/AveragePool2dKernels.h +++ b/src/ATen/native/xpu/sycl/AveragePool2dKernels.h @@ -2,7 +2,7 @@ namespace at::native::xpu { -void avg_pool2d_kernel( +TORCH_XPU_API void avg_pool2d_kernel( const Tensor& input_, int64_t kH_, int64_t kW_, @@ -15,7 +15,7 @@ void avg_pool2d_kernel( c10::optional divisor_override, const Tensor& output); -void avg_pool2d_backward_kernel( +TORCH_XPU_API void avg_pool2d_backward_kernel( const Tensor& gradOutput_, const Tensor& input_, IntArrayRef kernel_size, diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp index 417b0e24a..5e14a0a94 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp @@ -14,6 +14,14 @@ #include #include +#include + +#ifdef _WIN32 +#define RESTRICT __restrict +#else +#define RESTRICT __restrict__ +#endif + namespace at { namespace native { namespace xpu { @@ -1009,9 +1017,9 @@ struct BatchNormCollectStatisticsChannelsLastKernelFunctor } BatchNormCollectStatisticsChannelsLastKernelFunctor( - const scalar_t* __restrict__ input, - accscalar_t* __restrict__ out_mean, - accscalar_t* __restrict__ out_invstd, + const scalar_t* RESTRICT input, + accscalar_t* RESTRICT out_mean, + accscalar_t* RESTRICT out_invstd, volatile accscalar_t* staging_data, int* semaphores, const int reduction_size, @@ -1029,9 +1037,9 @@ struct 
BatchNormCollectStatisticsChannelsLastKernelFunctor wg_size_(wg_size) {} private: - const scalar_t* __restrict__ input_; - accscalar_t* __restrict__ out_mean_; - accscalar_t* __restrict__ out_invstd_; + const scalar_t* RESTRICT input_; + accscalar_t* RESTRICT out_mean_; + accscalar_t* RESTRICT out_invstd_; volatile accscalar_t* staging_data_; int* semaphores_; const int reduction_size_; @@ -1376,13 +1384,13 @@ struct BatchNormTransformInputChannelsLastKernelFunctor { } BatchNormTransformInputChannelsLastKernelFunctor( - const scalar_t* __restrict__ input, - const scalar_t* __restrict__ z, - const accscalar_t* __restrict__ mean, - const accscalar_t* __restrict__ inv_std, - const layerscalar_t* __restrict__ weight, - const layerscalar_t* __restrict__ shift, - scalar_t* __restrict__ out, + const scalar_t* RESTRICT input, + const scalar_t* RESTRICT z, + const accscalar_t* RESTRICT mean, + const accscalar_t* RESTRICT inv_std, + const layerscalar_t* RESTRICT weight, + const layerscalar_t* RESTRICT shift, + scalar_t* RESTRICT out, const int reduction_size, const int stride, const bool fuse_relu) @@ -1398,13 +1406,13 @@ struct BatchNormTransformInputChannelsLastKernelFunctor { fuse_relu_(fuse_relu) {} private: - const scalar_t* __restrict__ input_; - const scalar_t* __restrict__ z_; - const accscalar_t* __restrict__ mean_; - const accscalar_t* __restrict__ inv_std_; - const layerscalar_t* __restrict__ weight_; - const layerscalar_t* __restrict__ shift_; - scalar_t* __restrict__ out_; + const scalar_t* RESTRICT input_; + const scalar_t* RESTRICT z_; + const accscalar_t* RESTRICT mean_; + const accscalar_t* RESTRICT inv_std_; + const layerscalar_t* RESTRICT weight_; + const layerscalar_t* RESTRICT shift_; + scalar_t* RESTRICT out_; const int reduction_size_; const int stride_; const bool fuse_relu_; @@ -2035,14 +2043,14 @@ struct BatchNormBackwardReduceChannelsLastKernelFunctor } BatchNormBackwardReduceChannelsLastKernelFunctor( - const scalar_t* __restrict__ input, - const scalar_t* __restrict__ grad_output, - const accscalar_t* __restrict__ mean, - const accscalar_t* __restrict__ inv_std, - accscalar_t* __restrict__ sum_dy_o, - accscalar_t* __restrict__ sum_dy_xmu_o, - layerscalar_t* __restrict__ grad_weight, - layerscalar_t* __restrict__ grad_bias, + const scalar_t* RESTRICT input, + const scalar_t* RESTRICT grad_output, + const accscalar_t* RESTRICT mean, + const accscalar_t* RESTRICT inv_std, + accscalar_t* RESTRICT sum_dy_o, + accscalar_t* RESTRICT sum_dy_xmu_o, + layerscalar_t* RESTRICT grad_weight, + layerscalar_t* RESTRICT grad_bias, volatile accscalar_t* staging_data, int* semaphores, const int reduction_size, @@ -2063,14 +2071,14 @@ struct BatchNormBackwardReduceChannelsLastKernelFunctor wg_size_(wg_size) {} private: - const scalar_t* __restrict__ input_; - const scalar_t* __restrict__ grad_output_; - const accscalar_t* __restrict__ mean_; - const accscalar_t* __restrict__ inv_std_; - accscalar_t* __restrict__ sum_dy_o_; - accscalar_t* __restrict__ sum_dy_xmu_o_; - layerscalar_t* __restrict__ grad_weight_; - layerscalar_t* __restrict__ grad_bias_; + const scalar_t* RESTRICT input_; + const scalar_t* RESTRICT grad_output_; + const accscalar_t* RESTRICT mean_; + const accscalar_t* RESTRICT inv_std_; + accscalar_t* RESTRICT sum_dy_o_; + accscalar_t* RESTRICT sum_dy_xmu_o_; + layerscalar_t* RESTRICT grad_weight_; + layerscalar_t* RESTRICT grad_bias_; volatile accscalar_t* staging_data_; int* semaphores_; const int reduction_size_; @@ -2396,7 +2404,7 @@ struct 
BatchNormBackwardElemtKernelFunctor { GenericPackedTensorAccessor grad_input, const stat_accscalar_t norm_fct, - const int* __restrict__ numel = nullptr, + const int* RESTRICT numel = nullptr, const int world_size = 0) : input_(input), grad_output_(grad_output), @@ -2452,7 +2460,7 @@ struct BatchNormBackwardElemtKernelFunctor { GenericPackedTensorAccessor grad_input_; const stat_accscalar_t norm_fct_; - const int* __restrict__ numel_; + const int* RESTRICT numel_; const int world_size_; }; @@ -2681,18 +2689,18 @@ struct BatchNormBackwardElemtChannelsLastKernelFunctor { } BatchNormBackwardElemtChannelsLastKernelFunctor( - const scalar_t* __restrict__ grad_output, - const scalar_t* __restrict__ input, - const accscalar_t* __restrict__ mean, - const accscalar_t* __restrict__ inv_std, - const layerscalar_t* __restrict__ weight, - const accscalar_t* __restrict__ sum_dy, - const accscalar_t* __restrict__ sum_dy_xmu, - scalar_t* __restrict__ grad_input, + const scalar_t* RESTRICT grad_output, + const scalar_t* RESTRICT input, + const accscalar_t* RESTRICT mean, + const accscalar_t* RESTRICT inv_std, + const layerscalar_t* RESTRICT weight, + const accscalar_t* RESTRICT sum_dy, + const accscalar_t* RESTRICT sum_dy_xmu, + scalar_t* RESTRICT grad_input, const accscalar_t norm_fct, const int reduction_size, const int stride, - const int* __restrict__ numel = nullptr, + const int* RESTRICT numel = nullptr, const int64_t world_size = 0) : grad_output_(grad_output), input_(input), @@ -2709,18 +2717,18 @@ struct BatchNormBackwardElemtChannelsLastKernelFunctor { world_size_(world_size) {} private: - const scalar_t* __restrict__ grad_output_; - const scalar_t* __restrict__ input_; - const accscalar_t* __restrict__ mean_; - const accscalar_t* __restrict__ inv_std_; - const layerscalar_t* __restrict__ weight_; - const accscalar_t* __restrict__ sum_dy_; - const accscalar_t* __restrict__ sum_dy_xmu_; - scalar_t* __restrict__ grad_input_; + const scalar_t* RESTRICT grad_output_; + const scalar_t* RESTRICT input_; + const accscalar_t* RESTRICT mean_; + const accscalar_t* RESTRICT inv_std_; + const layerscalar_t* RESTRICT weight_; + const accscalar_t* RESTRICT sum_dy_; + const accscalar_t* RESTRICT sum_dy_xmu_; + scalar_t* RESTRICT grad_input_; const accscalar_t norm_fct_; const int reduction_size_; const int stride_; - const int* __restrict__ numel_; + const int* RESTRICT numel_; const int64_t world_size_; }; diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.h b/src/ATen/native/xpu/sycl/BatchNormKernels.h index 3bc559b38..9d9ae780b 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.h +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.h @@ -6,11 +6,11 @@ namespace at { namespace native { namespace xpu { -std::tuple batch_norm_stats_kernel( +TORCH_XPU_API std::tuple batch_norm_stats_kernel( const Tensor& self, double epsilon); -void batch_norm_elemt_kernel( +TORCH_XPU_API void batch_norm_elemt_kernel( Tensor& out, const Tensor& self, const c10::optional& weight_opt, @@ -18,7 +18,8 @@ void batch_norm_elemt_kernel( const Tensor& mean_, const Tensor& invstd_); -std::tuple batch_norm_backward_reduce_kernel( +TORCH_XPU_API std::tuple +batch_norm_backward_reduce_kernel( const Tensor& grad_output, const Tensor& input, const Tensor& mean, @@ -28,7 +29,7 @@ std::tuple batch_norm_backward_reduce_kernel( bool weight_g, bool bias_g); -Tensor batch_norm_backward_elemt_kernel( +TORCH_XPU_API Tensor batch_norm_backward_elemt_kernel( const Tensor& self, const Tensor& input, const Tensor& mean, @@ -38,13 +39,13 @@ Tensor 
batch_norm_backward_elemt_kernel( const Tensor& sum_dy_xmu, const Tensor& count); -std::tuple batch_norm_update_stats_kernel( +TORCH_XPU_API std::tuple batch_norm_update_stats_kernel( const Tensor& self, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, double momentum); -std::tuple batch_norm_kernel( +TORCH_XPU_API std::tuple batch_norm_kernel( const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, @@ -57,7 +58,7 @@ std::tuple batch_norm_kernel( Tensor& save_mean, Tensor& save_invstd); -std::tuple batch_norm_backward_kernel( +TORCH_XPU_API std::tuple batch_norm_backward_kernel( const Tensor& grad_out, const Tensor& input, const c10::optional& weight_opt, diff --git a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp index 7634921d2..2c2dd1550 100644 --- a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h index 4247c0a79..f9712b85d 100644 --- a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h @@ -6,11 +6,11 @@ namespace at { namespace native { namespace xpu { -void bitwise_and_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void bitwise_and_kernel(TensorIteratorBase& iter); -void bitwise_or_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void bitwise_or_kernel(TensorIteratorBase& iter); -void bitwise_xor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void bitwise_xor_kernel(TensorIteratorBase& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/BinaryDivTrueKernel.cpp b/src/ATen/native/xpu/sycl/BinaryDivTrueKernel.cpp index e10d25dd5..6f35c0cb6 100644 --- a/src/ATen/native/xpu/sycl/BinaryDivTrueKernel.cpp +++ b/src/ATen/native/xpu/sycl/BinaryDivTrueKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { void div_true_kernel(TensorIteratorBase& iter) { diff --git a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp index 862162807..e0e8487f6 100644 --- a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h index 588d52c4f..61af8b511 100644 --- a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void atan2_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void atan2_kernel(TensorIteratorBase& iter); -void hypot_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void hypot_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryKernels.cpp b/src/ATen/native/xpu/sycl/BinaryKernels.cpp index f6ab24de9..daafadd23 100644 --- a/src/ATen/native/xpu/sycl/BinaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/BinaryKernels.h b/src/ATen/native/xpu/sycl/BinaryKernels.h index 78dfaa0da..f862595eb 100644 --- 
a/src/ATen/native/xpu/sycl/BinaryKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryKernels.h @@ -4,16 +4,16 @@ namespace at::native::xpu { -void add_kernel(TensorIteratorBase& iter, const Scalar& alpha); +TORCH_XPU_API void add_kernel(TensorIteratorBase& iter, const Scalar& alpha); -void sub_kernel(TensorIteratorBase& iter, const Scalar& alpha); +TORCH_XPU_API void sub_kernel(TensorIteratorBase& iter, const Scalar& alpha); -void mul_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void mul_kernel(TensorIteratorBase& iter); -void div_true_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void div_true_kernel(TensorIteratorBase& iter); -void div_trunc_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void div_trunc_kernel(TensorIteratorBase& iter); -void div_floor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void div_floor_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp index be4a25539..3268ab90f 100644 --- a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h index 552f961c3..dce1a1a5e 100644 --- a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void logical_and_kernel(TensorIterator& iter); +TORCH_XPU_API void logical_and_kernel(TensorIterator& iter); -void logical_or_kernel(TensorIterator& iter); +TORCH_XPU_API void logical_or_kernel(TensorIterator& iter); -void logical_xor_kernel(TensorIterator& iter); +TORCH_XPU_API void logical_xor_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp index fde7f5186..73732e02f 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.h index 37553191d..8bea1f44c 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.h @@ -4,10 +4,12 @@ namespace at::native::xpu { -void sigmoid_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sigmoid_backward_kernel(TensorIteratorBase& iter); -void tanh_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void tanh_backward_kernel(TensorIteratorBase& iter); -void logit_backward_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar); +TORCH_XPU_API void logit_backward_kernel( + TensorIteratorBase& iter, + const Scalar& eps_scalar); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp index 8d63e48b5..d96e5064e 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h index 
17672ec29..ffe08be3e 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void mse_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void mse_kernel(TensorIteratorBase& iter); -void smooth_l1_kernel(TensorIteratorBase& iter, double beta); +TORCH_XPU_API void smooth_l1_kernel(TensorIteratorBase& iter, double beta); -void huber_kernel(TensorIterator& iter, double delta); +TORCH_XPU_API void huber_kernel(TensorIterator& iter, double delta); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp index 7a3e7908f..92eac3da2 100644 --- a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp +++ b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.h b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.h index 1d400951f..f3d3e2390 100644 --- a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.h +++ b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.h @@ -6,9 +6,9 @@ namespace at { namespace native { namespace xpu { -void remainder_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void remainder_kernel(TensorIteratorBase& iter); -void fmod_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void fmod_kernel(TensorIteratorBase& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.cpp index 64adba17e..1a0027d2f 100644 --- a/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template @@ -45,4 +47,4 @@ void rshift_kernel(TensorIteratorBase& iter) { }); } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.h index 7e661d919..64ded1705 100644 --- a/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void lshift_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void lshift_kernel(TensorIteratorBase& iter); -void rshift_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void rshift_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BucketizationKernels.cpp b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp index d56eff222..c5686babb 100644 --- a/src/ATen/native/xpu/sycl/BucketizationKernels.cpp +++ b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { // customized lower_bound func to ensure the low bound of 'nan', 'inf' etc. 
be diff --git a/src/ATen/native/xpu/sycl/BucketizationKernels.h b/src/ATen/native/xpu/sycl/BucketizationKernels.h index f47cea2af..1c86ece08 100644 --- a/src/ATen/native/xpu/sycl/BucketizationKernels.h +++ b/src/ATen/native/xpu/sycl/BucketizationKernels.h @@ -2,11 +2,13 @@ #include namespace at::native::xpu { -void searchsorted_kernel( + +TORCH_XPU_API void searchsorted_kernel( Tensor& result, const Tensor& input, const Tensor& sorted_sequence, bool out_int32, bool right, const Tensor& sorter); -} // namespace at::native::xpu \ No newline at end of file + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/Col2ImKernel.cpp b/src/ATen/native/xpu/sycl/Col2ImKernel.cpp index 2ff7cbc96..9d1396209 100644 --- a/src/ATen/native/xpu/sycl/Col2ImKernel.cpp +++ b/src/ATen/native/xpu/sycl/Col2ImKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/Col2ImKernel.h b/src/ATen/native/xpu/sycl/Col2ImKernel.h index 337aec367..62d87b406 100644 --- a/src/ATen/native/xpu/sycl/Col2ImKernel.h +++ b/src/ATen/native/xpu/sycl/Col2ImKernel.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void col2im_kernel( +TORCH_XPU_API void col2im_kernel( Tensor& output, const Tensor& input_, IntArrayRef output_size, diff --git a/src/ATen/native/xpu/sycl/CompareKernels.cpp b/src/ATen/native/xpu/sycl/CompareKernels.cpp index 8c403a819..91c3ac614 100644 --- a/src/ATen/native/xpu/sycl/CompareKernels.cpp +++ b/src/ATen/native/xpu/sycl/CompareKernels.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/CompareKernels.h b/src/ATen/native/xpu/sycl/CompareKernels.h index d68827631..52cbc151c 100644 --- a/src/ATen/native/xpu/sycl/CompareKernels.h +++ b/src/ATen/native/xpu/sycl/CompareKernels.h @@ -6,17 +6,17 @@ namespace at { namespace native { namespace xpu { -void eq_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void eq_kernel(TensorIteratorBase& iter); -void ne_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void ne_kernel(TensorIteratorBase& iter); -void lt_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void lt_kernel(TensorIteratorBase& iter); -void le_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void le_kernel(TensorIteratorBase& iter); -void gt_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void gt_kernel(TensorIteratorBase& iter); -void ge_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void ge_kernel(TensorIteratorBase& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.cpp b/src/ATen/native/xpu/sycl/ComplexKernels.cpp index 87504bd5e..4e44d9064 100644 --- a/src/ATen/native/xpu/sycl/ComplexKernels.cpp +++ b/src/ATen/native/xpu/sycl/ComplexKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.h b/src/ATen/native/xpu/sycl/ComplexKernels.h index d51556b4f..900bbaca6 100644 --- a/src/ATen/native/xpu/sycl/ComplexKernels.h +++ b/src/ATen/native/xpu/sycl/ComplexKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void complex_kernel(TensorIterator& iter); +TORCH_XPU_API void complex_kernel(TensorIterator& iter); -void polar_kernel(TensorIterator& iter); +TORCH_XPU_API void polar_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp index 552d148ae..dc2991cc5 100644 --- a/src/ATen/native/xpu/sycl/CopyKernel.cpp +++ 
b/src/ATen/native/xpu/sycl/CopyKernel.cpp @@ -7,6 +7,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/CopyKernel.h b/src/ATen/native/xpu/sycl/CopyKernel.h index da078f3ea..3a8e4d263 100644 --- a/src/ATen/native/xpu/sycl/CopyKernel.h +++ b/src/ATen/native/xpu/sycl/CopyKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void copy_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void copy_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/CopysignKernel.cpp b/src/ATen/native/xpu/sycl/CopysignKernel.cpp index 3b8351abf..327828adc 100644 --- a/src/ATen/native/xpu/sycl/CopysignKernel.cpp +++ b/src/ATen/native/xpu/sycl/CopysignKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/CopysignKernel.h b/src/ATen/native/xpu/sycl/CopysignKernel.h index cf856728c..86b6a40d9 100644 --- a/src/ATen/native/xpu/sycl/CopysignKernel.h +++ b/src/ATen/native/xpu/sycl/CopysignKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void copysign_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void copysign_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/CrossKernel.cpp b/src/ATen/native/xpu/sycl/CrossKernel.cpp index 0ed5f11eb..313261c9d 100644 --- a/src/ATen/native/xpu/sycl/CrossKernel.cpp +++ b/src/ATen/native/xpu/sycl/CrossKernel.cpp @@ -4,6 +4,8 @@ #include #include +#include + namespace at::native::xpu { template struct CrossKernelFunctor { diff --git a/src/ATen/native/xpu/sycl/CrossKernel.h b/src/ATen/native/xpu/sycl/CrossKernel.h index cc77229c4..8d382b97e 100644 --- a/src/ATen/native/xpu/sycl/CrossKernel.h +++ b/src/ATen/native/xpu/sycl/CrossKernel.h @@ -1,9 +1,12 @@ #pragma once #include + namespace at::native::xpu { -void linalg_cross_kernel( + +TORCH_XPU_API void linalg_cross_kernel( const Tensor& result, const Tensor& x1, const Tensor& x2, int64_t dim); + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/CumprodKernel.cpp b/src/ATen/native/xpu/sycl/CumprodKernel.cpp index f35cc7979..375e6a80c 100644 --- a/src/ATen/native/xpu/sycl/CumprodKernel.cpp +++ b/src/ATen/native/xpu/sycl/CumprodKernel.cpp @@ -3,6 +3,9 @@ #include +#include +#include + namespace at::native::xpu { void launch_cumprod_kernel( diff --git a/src/ATen/native/xpu/sycl/CumprodKernel.h b/src/ATen/native/xpu/sycl/CumprodKernel.h index 71e8c7693..d78301f04 100644 --- a/src/ATen/native/xpu/sycl/CumprodKernel.h +++ b/src/ATen/native/xpu/sycl/CumprodKernel.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void launch_cumprod_kernel( +TORCH_XPU_API void launch_cumprod_kernel( const Tensor& result, const Tensor& self, int64_t dim); diff --git a/src/ATen/native/xpu/sycl/CumsumKernel.cpp b/src/ATen/native/xpu/sycl/CumsumKernel.cpp index c2754869b..0523db84b 100644 --- a/src/ATen/native/xpu/sycl/CumsumKernel.cpp +++ b/src/ATen/native/xpu/sycl/CumsumKernel.cpp @@ -3,6 +3,9 @@ #include +#include +#include + namespace at::native::xpu { void launch_cumsum_kernel( diff --git a/src/ATen/native/xpu/sycl/CumsumKernel.h b/src/ATen/native/xpu/sycl/CumsumKernel.h index 79c299608..77ce62b61 100644 --- a/src/ATen/native/xpu/sycl/CumsumKernel.h +++ b/src/ATen/native/xpu/sycl/CumsumKernel.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void launch_cumsum_kernel( +TORCH_XPU_API void launch_cumsum_kernel( const Tensor& result, const Tensor& self, int64_t dim); diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp 
b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index 86283fba6..ba0283b8b 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -15,6 +15,8 @@ #include #include +#include + namespace at::native::xpu { static inline int p_start( diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h index 578b43ae7..d530560e6 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void max_pool2d_with_indices_kernel( +TORCH_XPU_API void max_pool2d_with_indices_kernel( const Tensor& input, IntArrayRef kernel_size, IntArrayRef stride, @@ -14,7 +14,7 @@ void max_pool2d_with_indices_kernel( const Tensor& output, const Tensor& indices); -void max_pool2d_with_indices_backward_kernel( +TORCH_XPU_API void max_pool2d_with_indices_backward_kernel( const Tensor& gradInput, const Tensor& gradOutput, const Tensor& input, diff --git a/src/ATen/native/xpu/sycl/DistanceKernels.cpp b/src/ATen/native/xpu/sycl/DistanceKernels.cpp index 2f0fc68c1..5006dd8b4 100644 --- a/src/ATen/native/xpu/sycl/DistanceKernels.cpp +++ b/src/ATen/native/xpu/sycl/DistanceKernels.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { template static double device_sqrt(scalar_t val) { @@ -177,6 +179,8 @@ static inline scalar_t group_reduce_agg_without_broadcast( do { agg = subgroup_reduce_agg_without_broadcast( item, agg, sg_size); + if (num_active_sg == 1) + return agg; item.barrier(sycl_local_fence); if (0 == lane_id) { local_shared_mem[sg_id] = agg; @@ -184,14 +188,13 @@ static inline scalar_t group_reduce_agg_without_broadcast( item.barrier(sycl_local_fence); agg = local_id < num_active_sg ? local_shared_mem[local_id] : (scalar_t)0.0f; - num_active_sg = (num_active_sg + sg_size - 1) / sg_size; + if (num_active_sg > sg_size) + num_active_sg = (num_active_sg + sg_size - 1) / sg_size; } while (num_active_sg > sg_size); // num of active sgs < sg_size item.barrier(sycl_local_fence); if (0 == sg_id) { - agg = - local_id < num_active_sg ? 
local_shared_mem[local_id] : (scalar_t)0.0f; agg = subgroup_reduce_agg_without_broadcast( item, agg, sg_size); } diff --git a/src/ATen/native/xpu/sycl/DistanceKernels.h b/src/ATen/native/xpu/sycl/DistanceKernels.h index ea263d94c..46a34d031 100644 --- a/src/ATen/native/xpu/sycl/DistanceKernels.h +++ b/src/ATen/native/xpu/sycl/DistanceKernels.h @@ -2,9 +2,11 @@ #include namespace at::native::xpu { -void cdist_kernel( + +TORCH_XPU_API void cdist_kernel( Tensor& result, const Tensor& x1_expanded, const Tensor& x2_expanded, double p); + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp b/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp index 0c17910be..c3de4c593 100644 --- a/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp +++ b/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp b/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp index 4c1b83689..cfc3e1dfa 100644 --- a/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp +++ b/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at::native::xpu { void exponential_kernel( diff --git a/src/ATen/native/xpu/sycl/DistributionKernels.h b/src/ATen/native/xpu/sycl/DistributionKernels.h index e5700f26e..88d62933f 100644 --- a/src/ATen/native/xpu/sycl/DistributionKernels.h +++ b/src/ATen/native/xpu/sycl/DistributionKernels.h @@ -4,41 +4,43 @@ namespace at::native::xpu { -void random_from_to_kernel( +TORCH_XPU_API void random_from_to_kernel( TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen_); -void random_full_64_bits_range_kernel( +TORCH_XPU_API void random_full_64_bits_range_kernel( TensorIteratorBase& iter, c10::optional gen_); -void random_kernel(TensorIteratorBase& iter, c10::optional gen_); +TORCH_XPU_API void random_kernel( + TensorIteratorBase& iter, + c10::optional gen_); -void uniform_kernel( +TORCH_XPU_API void uniform_kernel( TensorIteratorBase& iter, double from, double to, c10::optional gen); -void normal_kernel( +TORCH_XPU_API void normal_kernel( const TensorBase& self, double mean, double std, c10::optional gen); -void bernoulli_tensor_kernel( +TORCH_XPU_API void bernoulli_tensor_kernel( const TensorBase& self, const TensorBase& p_, c10::optional gen_); -void bernoulli_scalar_kernel( +TORCH_XPU_API void bernoulli_scalar_kernel( const TensorBase& self, double p, c10::optional gen); -void exponential_kernel( +TORCH_XPU_API void exponential_kernel( TensorIteratorBase& iter, double lambda, c10::optional gen); diff --git a/src/ATen/native/xpu/sycl/DistributionNormal.cpp b/src/ATen/native/xpu/sycl/DistributionNormal.cpp index 9630eaf7f..3aacf6639 100644 --- a/src/ATen/native/xpu/sycl/DistributionNormal.cpp +++ b/src/ATen/native/xpu/sycl/DistributionNormal.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp b/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp index 995f4979c..e4698a723 100644 --- a/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp +++ b/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/DistributionUniform.cpp b/src/ATen/native/xpu/sycl/DistributionUniform.cpp 
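The DistanceKernels.cpp hunk above changes group_reduce_agg_without_broadcast so the ladder of subgroup reductions returns as soon as a single subgroup holds all remaining partials, and only shrinks num_active_sg while more than one subgroup's worth of partials is left. A serial, plain-C++ schematic of that control flow (illustrative names, not the SYCL kernel):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Each "subgroup" of width sg_size reduces its own slice, the per-subgroup
// partials are staged, and the ladder repeats until one subgroup can finish.
// The early return models the num_active_sg == 1 case added in the diff.
static double reduce_ladder(const std::vector<double>& vals, std::size_t sg_size) {
  std::vector<double> level(vals);
  while (level.size() > 1) {
    std::vector<double> next;
    for (std::size_t i = 0; i < level.size(); i += sg_size) {
      double agg = 0.0;
      for (std::size_t j = i; j < std::min(i + sg_size, level.size()); ++j)
        agg += level[j];       // "subgroup" reduction over one slice
      next.push_back(agg);     // staged partial, one per subgroup
    }
    if (next.size() == 1)      // a single subgroup covered everything:
      return next.front();     // nothing left to combine, exit early
    level.swap(next);          // otherwise run another pass over the partials
  }
  return level.front();
}

int main() {
  std::vector<double> v(100, 1.0);
  assert(reduce_ladder(v, 16) == 100.0);
  return 0;
}
```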
index 321dffd58..17ff4d698 100644 --- a/src/ATen/native/xpu/sycl/DistributionUniform.cpp +++ b/src/ATen/native/xpu/sycl/DistributionUniform.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/Dropout.cpp b/src/ATen/native/xpu/sycl/Dropout.cpp index 2432c804a..54b1b4d4c 100644 --- a/src/ATen/native/xpu/sycl/Dropout.cpp +++ b/src/ATen/native/xpu/sycl/Dropout.cpp @@ -13,6 +13,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/DropoutKernels.h b/src/ATen/native/xpu/sycl/DropoutKernels.h index a72e8e2f8..922b2838c 100644 --- a/src/ATen/native/xpu/sycl/DropoutKernels.h +++ b/src/ATen/native/xpu/sycl/DropoutKernels.h @@ -6,15 +6,13 @@ namespace at { namespace native { namespace xpu { -std::tuple dropout_kernel( +TORCH_XPU_API std::tuple dropout_kernel( const Tensor& self, double p, c10::optional train); -Tensor dropout_backward_kernel( - const Tensor& grad, - const Tensor& mask, - double scale); +TORCH_XPU_API Tensor +dropout_backward_kernel(const Tensor& grad, const Tensor& mask, double scale); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/Embedding.cpp b/src/ATen/native/xpu/sycl/Embedding.cpp index 0ed27196b..4ea45b4c2 100644 --- a/src/ATen/native/xpu/sycl/Embedding.cpp +++ b/src/ATen/native/xpu/sycl/Embedding.cpp @@ -3,6 +3,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp index c4cf312fd..0e2d02b6a 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp +++ b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { std::pair promoteIndicesAndOffsets( diff --git a/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h b/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h index dd8812f97..f73dabb6b 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h +++ b/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -std::tuple _embedding_bag_kernel( +TORCH_XPU_API std::tuple _embedding_bag_kernel( const Tensor& weight_t, const Tensor& indices_t, const Tensor& offsets_t, diff --git a/src/ATen/native/xpu/sycl/EmbeddingKernels.h b/src/ATen/native/xpu/sycl/EmbeddingKernels.h index 5aaa3257a..f2e75b37d 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingKernels.h +++ b/src/ATen/native/xpu/sycl/EmbeddingKernels.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -Tensor embedding_dense_backward_kernel( +TORCH_XPU_API Tensor embedding_dense_backward_kernel( const Tensor& grad_, const Tensor& indices_, int64_t num_weights, diff --git a/src/ATen/native/xpu/sycl/FillKernel.cpp b/src/ATen/native/xpu/sycl/FillKernel.cpp index 483177210..6dbb55c67 100644 --- a/src/ATen/native/xpu/sycl/FillKernel.cpp +++ b/src/ATen/native/xpu/sycl/FillKernel.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/FillKernel.h b/src/ATen/native/xpu/sycl/FillKernel.h index 82bdd5f39..11c813cc0 100644 --- a/src/ATen/native/xpu/sycl/FillKernel.h +++ b/src/ATen/native/xpu/sycl/FillKernel.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -void fill_kernel(TensorIterator& iter, const Scalar& scalar); +TORCH_XPU_API void fill_kernel(TensorIterator& iter, const Scalar& scalar); } } // namespace native diff --git 
a/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h b/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h index 1fc19d34d..2c501a2bc 100644 --- a/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h @@ -31,11 +31,11 @@ namespace at::native::xpu { std::vector FOREACH_BINARY_LIST_ALPHA_KERNEL_NAME(NAME)( \ TensorList tensor1, TensorList tensor2, const Scalar& alpha) -FOREACH_BINARY_LIST_ALPHA_INPLACE_KERNEL(add); -FOREACH_BINARY_LIST_ALPHA_KERNEL(add); -FOREACH_BINARY_LIST_INPLACE_KERNEL(mul); -FOREACH_BINARY_LIST_KERNEL(mul); -FOREACH_BINARY_LIST_INPLACE_KERNEL(div); -FOREACH_BINARY_LIST_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_LIST_ALPHA_INPLACE_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_LIST_ALPHA_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_LIST_INPLACE_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_LIST_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_LIST_INPLACE_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_LIST_KERNEL(div); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h index fe63fb8de..00044fa7d 100644 --- a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h @@ -17,11 +17,11 @@ namespace at::native::xpu { std::vector FOREACH_BINARY_SCALAR_KERNEL_NAME(NAME)( \ TensorList tensors, const Scalar& scalar) -FOREACH_BINARY_SCALAR_INPLACE_KERNEL(add); -FOREACH_BINARY_SCALAR_KERNEL(add); -FOREACH_BINARY_SCALAR_INPLACE_KERNEL(mul); -FOREACH_BINARY_SCALAR_KERNEL(mul); -FOREACH_BINARY_SCALAR_INPLACE_KERNEL(div); -FOREACH_BINARY_SCALAR_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_SCALAR_INPLACE_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_SCALAR_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_SCALAR_INPLACE_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_SCALAR_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_SCALAR_INPLACE_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_SCALAR_KERNEL(div); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h index 723b07619..bafd220c2 100644 --- a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h @@ -17,11 +17,11 @@ namespace at::native::xpu { std::vector FOREACH_BINARY_SCALARLIST_KERNEL_NAME(NAME)( \ TensorList tensors, at::ArrayRef scalars) -FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(add); -FOREACH_BINARY_SCALARLIST_KERNEL(add); -FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(mul); -FOREACH_BINARY_SCALARLIST_KERNEL(mul); -FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(div); -FOREACH_BINARY_SCALARLIST_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_KERNEL(div); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h b/src/ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h index 4fc55fcf3..9623b7370 100644 --- a/src/ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h @@ -11,9 +11,9 @@ namespace at::native::xpu { #define FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(NAME) \ 
FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(NAME) -FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcmul); -FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcmul); -FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcdiv); -FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcdiv); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h b/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h index 94445b86d..f15795ef5 100644 --- a/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h @@ -17,9 +17,9 @@ namespace at::native::xpu { TensorList tensors2, \ const Scalar& scalar) -FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcmul); -FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcmul); -FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcdiv); -FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcdiv); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h b/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h index 8df3aa4e0..8a5bf444d 100644 --- a/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h @@ -17,9 +17,9 @@ namespace at::native::xpu { TensorList tensors2, \ at::ArrayRef scalars) -FOREACH_POINTWISE_OP_SCALARLIST_KERNEL(addcmul); -FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(addcmul); -FOREACH_POINTWISE_OP_SCALARLIST_KERNEL(addcdiv); -FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALARLIST_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALARLIST_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(addcdiv); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp index 849a77443..cc90fa893 100644 --- a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp +++ b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + enum class NormType { L1, L2 }; namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/ForeachReduceKernels.h b/src/ATen/native/xpu/sycl/ForeachReduceKernels.h index a7dceefec..cefb3d840 100644 --- a/src/ATen/native/xpu/sycl/ForeachReduceKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachReduceKernels.h @@ -3,7 +3,7 @@ namespace at::native::xpu { -std::vector foreach_norm_kernel( +TORCH_XPU_API std::vector foreach_norm_kernel( TensorList tensors, const Scalar& ord, double p, diff --git a/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp b/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp index fa7062cfa..21168f8a3 100644 --- a/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp @@ -5,6 +5,9 @@ #include #include +#include +#include + namespace at::native::xpu { template diff --git 
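The foreach declarations above are generated by macros, so the export attribute is attached by prefixing the macro invocation; after expansion the attribute sits directly in front of the declared function. A reduced stand-in (macro name and simplified signature are illustrative only):

#include <ATen/ATen.h>
#include <vector>

#ifndef TORCH_XPU_API
#define TORCH_XPU_API  // fallback so this sketch is self-contained
#endif

// Stand-in for the FOREACH_* declaration macros: each invocation expands to a
// complete function declaration.
#define FOREACH_BINARY_LIST_KERNEL_DECL(NAME)      \
  std::vector<at::Tensor> foreach_##NAME##_kernel( \
      at::TensorList tensors1, at::TensorList tensors2)

// Expands to:
//   TORCH_XPU_API std::vector<at::Tensor> foreach_mul_kernel(
//       at::TensorList tensors1, at::TensorList tensors2);
TORCH_XPU_API FOREACH_BINARY_LIST_KERNEL_DECL(mul);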
a/src/ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h b/src/ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h index 50392598e..7b776b1eb 100644 --- a/src/ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h @@ -3,13 +3,13 @@ namespace at::native::xpu { -void foreach_lerp_list_kernel( +TORCH_XPU_API void foreach_lerp_list_kernel( TensorList tensors1, TensorList tensors2, TensorList tensors3, TensorList result); -void foreach_lerp_list_kernel_( +TORCH_XPU_API void foreach_lerp_list_kernel_( TensorList tensors1, TensorList tensors2, TensorList tensors3); diff --git a/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h b/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h index 0f36a5215..9d5e252eb 100644 --- a/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h @@ -3,13 +3,13 @@ namespace at::native::xpu { -void foreach_lerp_scalar_kernel( +TORCH_XPU_API void foreach_lerp_scalar_kernel( TensorList tensors1, TensorList tensors2, const Scalar& weight, TensorList result); -void foreach_lerp_scalar_kernel_( +TORCH_XPU_API void foreach_lerp_scalar_kernel_( TensorList tensors1, TensorList tensors2, const Scalar& weight); diff --git a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp index db0ac1769..9341c0ab6 100644 --- a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp @@ -8,6 +8,8 @@ #include #include +#include + namespace at::native::xpu { template class Op> diff --git a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h index cd5312106..26ab95fa8 100644 --- a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h @@ -3,8 +3,8 @@ namespace at::native::xpu { -std::vector foreach_sqrt_kernel(TensorList tensors); +TORCH_XPU_API std::vector foreach_sqrt_kernel(TensorList tensors); -void foreach_sqrt_kernel_(TensorList tensors); +TORCH_XPU_API void foreach_sqrt_kernel_(TensorList tensors); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/GcdLcmKernels.cpp b/src/ATen/native/xpu/sycl/GcdLcmKernels.cpp index 6c832270c..fb74e19bc 100644 --- a/src/ATen/native/xpu/sycl/GcdLcmKernels.cpp +++ b/src/ATen/native/xpu/sycl/GcdLcmKernels.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template @@ -20,4 +22,4 @@ void gcd_kernel(TensorIteratorBase& iter) { }); } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/GcdLcmKernels.h b/src/ATen/native/xpu/sycl/GcdLcmKernels.h index acf855d40..ba429d762 100644 --- a/src/ATen/native/xpu/sycl/GcdLcmKernels.h +++ b/src/ATen/native/xpu/sycl/GcdLcmKernels.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -void gcd_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void gcd_kernel(TensorIteratorBase& iter); } } // namespace native diff --git a/src/ATen/native/xpu/sycl/GridSampler.cpp b/src/ATen/native/xpu/sycl/GridSampler.cpp index affd6b02b..2bfd0505a 100644 --- a/src/ATen/native/xpu/sycl/GridSampler.cpp +++ b/src/ATen/native/xpu/sycl/GridSampler.cpp @@ -14,6 +14,8 @@ #include #include +#include + namespace at::native::xpu { using namespace at::xpu::detail; diff --git a/src/ATen/native/xpu/sycl/GridSamplerKernels.h b/src/ATen/native/xpu/sycl/GridSamplerKernels.h index deb35006f..b56ed8dcd 100644 --- 
a/src/ATen/native/xpu/sycl/GridSamplerKernels.h +++ b/src/ATen/native/xpu/sycl/GridSamplerKernels.h @@ -4,14 +4,14 @@ namespace at::native::xpu { -Tensor grid_sampler_2d_kernel( +TORCH_XPU_API Tensor grid_sampler_2d_kernel( const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners); -void grid_sampler_2d_backward_kernel( +TORCH_XPU_API void grid_sampler_2d_backward_kernel( const Tensor& grad_input, const Tensor& grad_grid, const Tensor& grad_output, @@ -22,4 +22,4 @@ void grid_sampler_2d_backward_kernel( bool align_corners, std::array output_mask); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp index c7d0818ee..572b4d62b 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp @@ -10,6 +10,8 @@ #include #include +#include + namespace at::native::xpu { template < diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.h b/src/ATen/native/xpu/sycl/GroupNormKernels.h index 7f0cf38d3..943d6bcb7 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.h +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void group_norm_kernel( +TORCH_XPU_API void group_norm_kernel( const Tensor& X, const Tensor& gamma, const Tensor& beta, @@ -17,7 +17,7 @@ void group_norm_kernel( Tensor& mean, Tensor& rstd); -void group_norm_backward_kernel( +TORCH_XPU_API void group_norm_backward_kernel( const Tensor& dY, const Tensor& X, const Tensor& mean, diff --git a/src/ATen/native/xpu/sycl/HistogramKernels.h b/src/ATen/native/xpu/sycl/HistogramKernels.h index bb865da20..581ee04c5 100644 --- a/src/ATen/native/xpu/sycl/HistogramKernels.h +++ b/src/ATen/native/xpu/sycl/HistogramKernels.h @@ -4,14 +4,14 @@ namespace at::native::xpu { -void histogramdd_kernel( +TORCH_XPU_API void histogramdd_kernel( const Tensor& self, const std::optional& weight, bool density, Tensor& hist, const TensorList& bin_edges_); -void histogramdd_linear_kernel( +TORCH_XPU_API void histogramdd_linear_kernel( const Tensor& self, const std::optional& weight, bool density, @@ -19,10 +19,10 @@ void histogramdd_linear_kernel( const TensorList& bin_edges_, bool local_search); -void histogram_select_outer_bin_edges_kernel( +TORCH_XPU_API void histogram_select_outer_bin_edges_kernel( const Tensor& input, const int64_t N, std::vector& leftmost_edges, std::vector& rightmost_edges); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/HistogramddKernels.cpp b/src/ATen/native/xpu/sycl/HistogramddKernels.cpp index 3b5accfb9..d269d94ae 100644 --- a/src/ATen/native/xpu/sycl/HistogramddKernels.cpp +++ b/src/ATen/native/xpu/sycl/HistogramddKernels.cpp @@ -17,6 +17,8 @@ #include #endif +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/Im2ColKernel.cpp b/src/ATen/native/xpu/sycl/Im2ColKernel.cpp index 1922d9498..dc6394b39 100644 --- a/src/ATen/native/xpu/sycl/Im2ColKernel.cpp +++ b/src/ATen/native/xpu/sycl/Im2ColKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/Im2ColKernel.h b/src/ATen/native/xpu/sycl/Im2ColKernel.h index 4efae219e..3ca9a852b 100644 --- a/src/ATen/native/xpu/sycl/Im2ColKernel.h +++ b/src/ATen/native/xpu/sycl/Im2ColKernel.h @@ -4,7 +4,7 @@ namespace 
at::native::xpu { -void im2col_kernel( +TORCH_XPU_API void im2col_kernel( Tensor& output, const Tensor& input_, IntArrayRef kernel_size, diff --git a/src/ATen/native/xpu/sycl/Indexing.cpp b/src/ATen/native/xpu/sycl/Indexing.cpp index dd588e7e1..8c755a64b 100644 --- a/src/ATen/native/xpu/sycl/Indexing.cpp +++ b/src/ATen/native/xpu/sycl/Indexing.cpp @@ -18,6 +18,8 @@ #include #include +#include + using namespace at::xpu::detail; using namespace at::xpu; diff --git a/src/ATen/native/xpu/sycl/IndexingKernels.h b/src/ATen/native/xpu/sycl/IndexingKernels.h index 89f7c2ca5..ca2b866b8 100644 --- a/src/ATen/native/xpu/sycl/IndexingKernels.h +++ b/src/ATen/native/xpu/sycl/IndexingKernels.h @@ -3,20 +3,22 @@ namespace at::native::xpu { -void index_kernel( +TORCH_XPU_API void index_kernel( TensorIteratorBase& iter, at::IntArrayRef index_size, at::IntArrayRef index_stride); -void index_select_kernel( +TORCH_XPU_API void index_select_kernel( const Tensor& self, int64_t dim, const Tensor& index, const Tensor& out); -void masked_fill_kernel(TensorIterator& iter, const Scalar& value); +TORCH_XPU_API void masked_fill_kernel( + TensorIterator& iter, + const Scalar& value); -void index_add_kernel( +TORCH_XPU_API void index_add_kernel( const Tensor& self, int64_t dim, const Tensor& index, @@ -24,27 +26,27 @@ void index_add_kernel( const Scalar& alpha, const Tensor& out); -void index_fill_kernel( +TORCH_XPU_API void index_fill_kernel( TensorIterator& iter, const int64_t dim, const int64_t self_dim_size, const int64_t self_dim_stride, const Scalar& source); -void index_put_kernel( +TORCH_XPU_API void index_put_kernel( TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, bool accumulate); -void index_put_deterministic_kernel( +TORCH_XPU_API void index_put_deterministic_kernel( Tensor& self, const c10::List>& indices, const Tensor& value, bool accumulate, bool unsafe); -void masked_scatter_kernel( +TORCH_XPU_API void masked_scatter_kernel( const TensorBase& self, const TensorBase& mask, const TensorBase& maskPrefixSum, diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp index b73ee96b3..6a4efc440 100644 --- a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.h b/src/ATen/native/xpu/sycl/LayerNormKernels.h index 0797d50fd..0c57a61ba 100644 --- a/src/ATen/native/xpu/sycl/LayerNormKernels.h +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -std::tuple layer_norm_kernel( +TORCH_XPU_API std::tuple layer_norm_kernel( const Tensor& X, const Tensor& gamma, const Tensor& beta, @@ -17,7 +17,7 @@ std::tuple layer_norm_kernel( Tensor& mean, Tensor& rstd); -std::tuple layer_norm_backward_kernel( +TORCH_XPU_API std::tuple layer_norm_backward_kernel( const Tensor& dY, const Tensor& X, const Tensor& mean, diff --git a/src/ATen/native/xpu/sycl/LerpKernels.cpp b/src/ATen/native/xpu/sycl/LerpKernels.cpp index b0f480ac3..1648f193b 100644 --- a/src/ATen/native/xpu/sycl/LerpKernels.cpp +++ b/src/ATen/native/xpu/sycl/LerpKernels.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/LerpKernels.h b/src/ATen/native/xpu/sycl/LerpKernels.h index c455adee8..6f13ec6eb 100644 --- a/src/ATen/native/xpu/sycl/LerpKernels.h +++ 
b/src/ATen/native/xpu/sycl/LerpKernels.h @@ -4,8 +4,10 @@ namespace at::native::xpu { -void lerp_tensor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void lerp_tensor_kernel(TensorIteratorBase& iter); -void lerp_scalar_kernel(TensorIteratorBase& iter, const c10::Scalar& weight); +TORCH_XPU_API void lerp_scalar_kernel( + TensorIteratorBase& iter, + const c10::Scalar& weight); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp index 8d3128e9e..0f478eceb 100644 --- a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp +++ b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp @@ -1,6 +1,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h index 32f987a2e..ca61016b9 100644 --- a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h +++ b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h @@ -3,6 +3,9 @@ namespace at::native::xpu { -void addr_kernel(TensorIterator& iter, const Scalar& beta, const Scalar& alpha); +TORCH_XPU_API void addr_kernel( + TensorIterator& iter, + const Scalar& beta, + const Scalar& alpha); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LogAddExpKernels.cpp b/src/ATen/native/xpu/sycl/LogAddExpKernels.cpp index 69b6a73a1..d79964295 100644 --- a/src/ATen/native/xpu/sycl/LogAddExpKernels.cpp +++ b/src/ATen/native/xpu/sycl/LogAddExpKernels.cpp @@ -5,6 +5,9 @@ #include #include #include + +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/LogAddExpKernels.h b/src/ATen/native/xpu/sycl/LogAddExpKernels.h index 01b9f400f..3a5ae4c34 100644 --- a/src/ATen/native/xpu/sycl/LogAddExpKernels.h +++ b/src/ATen/native/xpu/sycl/LogAddExpKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void logaddexp_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logaddexp_kernel(TensorIteratorBase& iter); -void logaddexp2_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logaddexp2_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LossKernels.cpp b/src/ATen/native/xpu/sycl/LossKernels.cpp index f7c260535..69315adf5 100644 --- a/src/ATen/native/xpu/sycl/LossKernels.cpp +++ b/src/ATen/native/xpu/sycl/LossKernels.cpp @@ -4,6 +4,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/LossKernels.h b/src/ATen/native/xpu/sycl/LossKernels.h index 17f61afbb..560eb2c44 100644 --- a/src/ATen/native/xpu/sycl/LossKernels.h +++ b/src/ATen/native/xpu/sycl/LossKernels.h @@ -3,14 +3,14 @@ namespace at::native::xpu { -Tensor& binary_cross_entropy_kernel( +TORCH_XPU_API Tensor& binary_cross_entropy_kernel( const Tensor& input, const Tensor& target, const Tensor& weight, int64_t reduction, Tensor& loss); -Tensor& binary_cross_entropy_backward_kernel( +TORCH_XPU_API Tensor& binary_cross_entropy_backward_kernel( const Tensor& grad, const Tensor& input, const Tensor& target, diff --git a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp index c84613a4f..4b93cb3c3 100644 --- a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp @@ -11,6 +11,8 @@ #include #include +#include + namespace at::native::xpu { inline Tensor optional_contiguous(const Tensor& source) { return source.defined() ? 
source.contiguous() : source; @@ -251,7 +253,15 @@ void nll_loss2d_forward_kernel( at::ScalarType::BFloat16, input.scalar_type(), "nll_loss2d_forward_kernel", - [&] { + [&input_, + &weight_, + &target_, + &output, + &total_weight, + &input, + &target, + &reduction, + &ignore_index] { using accscalar_t = acc_type_device; AT_DISPATCH_INDEX_TYPES( at::native::canUse32BitIndexMath(input_, INT_MAX) @@ -527,4 +537,4 @@ void nll_loss2d_backward_kernel( } // namespace at::native::xpu #pragma GCC diagnostic pop -#pragma clang diagnostic pop \ No newline at end of file +#pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/LossNLL2dKernels.h b/src/ATen/native/xpu/sycl/LossNLL2dKernels.h index cb2084649..87519b5e7 100644 --- a/src/ATen/native/xpu/sycl/LossNLL2dKernels.h +++ b/src/ATen/native/xpu/sycl/LossNLL2dKernels.h @@ -3,7 +3,8 @@ #include namespace at::native::xpu { -void nll_loss2d_forward_kernel( + +TORCH_XPU_API void nll_loss2d_forward_kernel( Tensor& output, Tensor& total_weight, const Tensor& input, @@ -12,7 +13,7 @@ void nll_loss2d_forward_kernel( int64_t reduction, int64_t ignore_index); -void nll_loss2d_backward_kernel( +TORCH_XPU_API void nll_loss2d_backward_kernel( Tensor& grad_input, const Tensor& grad_output, const Tensor& input, @@ -21,4 +22,5 @@ void nll_loss2d_backward_kernel( int64_t reduction, int64_t ignore_index, const Tensor& total_weight); -} // namespace at::native::xpu \ No newline at end of file + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LossNLLKernel.cpp b/src/ATen/native/xpu/sycl/LossNLLKernel.cpp index 27610c814..d45d06545 100644 --- a/src/ATen/native/xpu/sycl/LossNLLKernel.cpp +++ b/src/ATen/native/xpu/sycl/LossNLLKernel.cpp @@ -4,6 +4,8 @@ #include #include +#include + namespace at::native::xpu { using namespace at::xpu; diff --git a/src/ATen/native/xpu/sycl/LossNLLKernel.h b/src/ATen/native/xpu/sycl/LossNLLKernel.h index bde50aa7b..f680aeb29 100644 --- a/src/ATen/native/xpu/sycl/LossNLLKernel.h +++ b/src/ATen/native/xpu/sycl/LossNLLKernel.h @@ -3,7 +3,7 @@ namespace at::native::xpu { -void nll_loss_forward_kernel( +TORCH_XPU_API void nll_loss_forward_kernel( const Tensor& self, const Tensor& target, const OptionalTensorRef weight_opt, @@ -12,7 +12,7 @@ void nll_loss_forward_kernel( const Tensor& output, const Tensor& total_weight); -void nll_loss_backward_kernel( +TORCH_XPU_API void nll_loss_backward_kernel( const Tensor& grad_output, const Tensor& self, const Tensor& target, @@ -22,4 +22,4 @@ void nll_loss_backward_kernel( const Tensor& total_weight, const Tensor& grad_input); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/Math.h b/src/ATen/native/xpu/sycl/MathExtensions.h similarity index 93% rename from src/ATen/native/xpu/sycl/Math.h rename to src/ATen/native/xpu/sycl/MathExtensions.h index 71e49d902..3a66e5983 100644 --- a/src/ATen/native/xpu/sycl/Math.h +++ b/src/ATen/native/xpu/sycl/MathExtensions.h @@ -9,12 +9,12 @@ namespace at::native::xpu { * For licensing information, please refer to the cpu implementation located in * "ATen/native/Math.h". 
*/ -template +template static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) { // [C++ Standard Reference: Gamma Function] // https://en.cppreference.com/w/cpp/numeric/math/tgamma using accscalar_t = at::acc_type_device; - static const double PI_f64 = 3.14159265358979323846; + static const pi_t PI_f64 = 3.14159265358979323846; const accscalar_t PSI_10 = 2.25175258906672110764; const accscalar_t A[] = { 8.33333333333333333333E-2, @@ -27,7 +27,7 @@ static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) { }; accscalar_t x = static_cast(in); - if (x == 0) { + if (x == accscalar_t(0)) { // As per C++ standard for gamma related functions and SciPy, // If the argument is ±0, ±∞ is returned return std::copysign(static_cast(INFINITY), -x); @@ -35,7 +35,7 @@ static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) { bool x_is_integer = x == std::trunc(x); accscalar_t result = 0; - if (x < 0) { + if (x < accscalar_t(0)) { if (x_is_integer) { // As per C++ standard for gamma related functions and SciPy, // If the argument is a negative integer, NaN is returned @@ -46,23 +46,23 @@ static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) { // mathematically equivalent since both x and r are in radians and tan() has // a periodicity of pi, in practice the computation of pi * x is a source of // error (when |x| > 1). - double q, r; - r = std::modf(static_cast(x), &q); + pi_t q, r; + r = std::modf(static_cast(x), &q); result = static_cast(-PI_f64 / std::tan(PI_f64 * r)); x = 1 - x; } - while (x < 10) { + while (x < accscalar_t(10)) { result -= 1 / x; x += 1; } - if (x == 10) { + if (x == accscalar_t(10)) { return static_cast(result + PSI_10); } accscalar_t y = 0; - if (x < 1.0e17) { - accscalar_t z = 1 / (x * x); + if (x < accscalar_t(1.0e17)) { + accscalar_t z = accscalar_t(1) / (x * x); accscalar_t polevl_result = 0; for (int i = 0; i <= 6; i++) { @@ -82,20 +82,23 @@ static inline C10_HOST_DEVICE scalar_t calc_trigamma(scalar_t in) { accscalar_t x = static_cast(in); accscalar_t sign = +1; accscalar_t result = 0; - if (x < 0.5f) { + if (x < accscalar_t(0.5)) { sign = -1; accscalar_t sin_pi_x = std::sin(PI * x); result -= (PI * PI) / (sin_pi_x * sin_pi_x); - x = 1 - x; + x = accscalar_t(1) - x; } for (int i = 0; i < 6; ++i) { - result += 1 / (x * x); - x += 1; + result += accscalar_t(1) / (x * x); + x += accscalar_t(1); } - const accscalar_t one = static_cast(1); - const accscalar_t ixx = 1 / (x * x); - result += (1 + 1 / (2 * x) + - ixx * (one / 6 - ixx * (one / 30 - ixx * (one / 42)))) / + const accscalar_t one = accscalar_t(1); + const accscalar_t ixx = accscalar_t(1) / (x * x); + result += + (accscalar_t(1) + accscalar_t(1) / (accscalar_t(2) * x) + + ixx * + (one / accscalar_t(6) - + ixx * (one / accscalar_t(30) - ixx * (one / accscalar_t(42))))) / x; return static_cast(sign * result); } @@ -122,7 +125,7 @@ chbevl(scalar_t _x, const scalar_t array[], size_t len) { b0 = _x * b1 - b2 + array[i]; } - return (0.5 * (b0 - b2)); + return (scalar_t(0.5) * (b0 - b2)); } /* diff --git a/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.cpp b/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.cpp index 0d38a4882..6d535a40d 100644 --- a/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.cpp +++ b/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.h b/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.h index 063a086cd..b54c69717 100644 --- 
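The literal changes in calc_digamma, calc_trigamma and chbevl keep the arithmetic in the accumulate type: a bare double literal such as 0.5 or 1.0e17 promotes the whole expression to double, which a device without native FP64 cannot execute, so constants are wrapped in accscalar_t and the PI constant's type becomes the pi_t template parameter. A minimal illustration of the promotion being avoided (function name is illustrative):

template <typename accscalar_t>
accscalar_t halve(accscalar_t x) {
  // return x * 0.5;            // 0.5 is a double literal: the multiply is
  //                            // performed in double
  return x * accscalar_t(0.5);  // stays in accscalar_t (e.g. float)
}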
a/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.h +++ b/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void maximum_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void maximum_kernel(TensorIteratorBase& iter); -void minimum_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void minimum_kernel(TensorIteratorBase& iter); -void fmax_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void fmax_kernel(TensorIteratorBase& iter); -void fmin_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void fmin_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/MultinomialKernel.cpp b/src/ATen/native/xpu/sycl/MultinomialKernel.cpp index 866f62a1a..d8418fef9 100644 --- a/src/ATen/native/xpu/sycl/MultinomialKernel.cpp +++ b/src/ATen/native/xpu/sycl/MultinomialKernel.cpp @@ -15,6 +15,9 @@ #include #include #include + +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/MultinomialKernel.h b/src/ATen/native/xpu/sycl/MultinomialKernel.h index d400e51cd..5adede6f6 100644 --- a/src/ATen/native/xpu/sycl/MultinomialKernel.h +++ b/src/ATen/native/xpu/sycl/MultinomialKernel.h @@ -3,10 +3,10 @@ namespace at::native::xpu { -void multinomial_kernel( +TORCH_XPU_API void multinomial_kernel( Tensor& result, const Tensor& self, const int64_t n_sample, c10::optional generator); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/NMSKernel.cpp b/src/ATen/native/xpu/sycl/NMSKernel.cpp index ea4d0b08f..fe42226c5 100644 --- a/src/ATen/native/xpu/sycl/NMSKernel.cpp +++ b/src/ATen/native/xpu/sycl/NMSKernel.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/NMSKernel.h b/src/ATen/native/xpu/sycl/NMSKernel.h index 17aec3f7b..198223731 100644 --- a/src/ATen/native/xpu/sycl/NMSKernel.h +++ b/src/ATen/native/xpu/sycl/NMSKernel.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -Tensor nms_kernel(const Tensor& dets_sorted, float iou_threshold); +TORCH_XPU_API Tensor nms_kernel(const Tensor& dets_sorted, float iou_threshold); } } // namespace native diff --git a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp index e1c50a263..0cb77d52f 100644 --- a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp +++ b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { struct FlattenIdxtoRealIdxKernelFunctor { diff --git a/src/ATen/native/xpu/sycl/NonzeroKernel.h b/src/ATen/native/xpu/sycl/NonzeroKernel.h index ae79d01f8..f8e602ed2 100644 --- a/src/ATen/native/xpu/sycl/NonzeroKernel.h +++ b/src/ATen/native/xpu/sycl/NonzeroKernel.h @@ -3,6 +3,6 @@ namespace at::native::xpu { -void nonzero_kernel(const Tensor& self, Tensor& out); +TORCH_XPU_API void nonzero_kernel(const Tensor& self, Tensor& out); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp index 68ef76bff..10f3765ac 100644 --- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp @@ -6,6 +6,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h index 8c8bc0d26..230b693f5 100644 --- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h 
+++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h @@ -4,18 +4,20 @@ namespace at::native::xpu { -void addcmul_kernel(TensorIteratorBase& iter, const Scalar& value); +TORCH_XPU_API void addcmul_kernel(TensorIteratorBase& iter, const Scalar& value); -void addcdiv_kernel(TensorIteratorBase& iter, const Scalar& value); +TORCH_XPU_API void addcdiv_kernel(TensorIteratorBase& iter, const Scalar& value); -void mse_backward_kernel(TensorIterator& iter, const Scalar& value); +TORCH_XPU_API void mse_backward_kernel( + TensorIterator& iter, + const Scalar& value); -void smooth_l1_backward_kernel( +TORCH_XPU_API void smooth_l1_backward_kernel( TensorIterator& iter, const Scalar& norm, double beta); -void huber_backward_kernel( +TORCH_XPU_API void huber_backward_kernel( TensorIterator& iter, const Scalar& norm, double delta); diff --git a/src/ATen/native/xpu/sycl/PowKernels.cpp b/src/ATen/native/xpu/sycl/PowKernels.cpp index f96b3f482..e080511d2 100644 --- a/src/ATen/native/xpu/sycl/PowKernels.cpp +++ b/src/ATen/native/xpu/sycl/PowKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { @@ -35,6 +37,16 @@ static inline c10::complex pow_(c10::complex base, c10::complex exp) { } // namespace impl +#ifdef _MSC_VER +// Divergence for MSVC due to accuracy issue. https://github.com/intel/torch-xpu-ops/issues/842. +template +struct PowTensorTensorCastFunctor { + using opmath_t = at::opmath_type; + opmath_t operator()(opmath_t base, opmath_t exp) const { + return impl::pow_(base, exp); + } +}; +#else template struct PowTensorTensorCastFunctor { scalar_t operator()(scalar_t base, scalar_t exp) const { @@ -42,6 +54,7 @@ struct PowTensorTensorCastFunctor { return impl::pow_(opmath_t{base}, opmath_t{exp}); } }; +#endif template struct PowTensorTensorFunctor { diff --git a/src/ATen/native/xpu/sycl/PowKernels.h b/src/ATen/native/xpu/sycl/PowKernels.h index 31a31c420..d21d0d1a7 100644 --- a/src/ATen/native/xpu/sycl/PowKernels.h +++ b/src/ATen/native/xpu/sycl/PowKernels.h @@ -6,11 +6,11 @@ namespace at { namespace native { namespace xpu { -void pow_tensor_scalar_kernel( +TORCH_XPU_API void pow_tensor_scalar_kernel( TensorIteratorBase& iter, const Scalar& exp_scalar); -void pow_tensor_tensor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void pow_tensor_tensor_kernel(TensorIteratorBase& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/RandpermKernel.cpp b/src/ATen/native/xpu/sycl/RandpermKernel.cpp index cfc595fb5..824cd2d0e 100644 --- a/src/ATen/native/xpu/sycl/RandpermKernel.cpp +++ b/src/ATen/native/xpu/sycl/RandpermKernel.cpp @@ -9,6 +9,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/RandpermKernel.h b/src/ATen/native/xpu/sycl/RandpermKernel.h index 3547c74ad..ef73145d4 100644 --- a/src/ATen/native/xpu/sycl/RandpermKernel.h +++ b/src/ATen/native/xpu/sycl/RandpermKernel.h @@ -2,8 +2,8 @@ #include namespace at::native::xpu { -Tensor randperm_kernel( - Tensor& result, - int64_t n, - c10::optional generator); -} // namespace at::native::xpu \ No newline at end of file + +TORCH_XPU_API Tensor +randperm_kernel(Tensor& result, int64_t n, c10::optional generator); + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp index b7d542b48..152d75eae 100644 --- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp +++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp @@ -5,6 +5,8 @@ #include 
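The MSVC-only PowTensorTensorCastFunctor takes and returns the wider opmath type instead of scalar_t, sidestepping the accuracy issue referenced in the comment; on other compilers the functor converts to opmath_t internally and narrows on return. A self-contained sketch of that compute-in-opmath pattern, using std::pow in place of the in-tree impl::pow_:

#include <ATen/OpMathType.h>
#include <cmath>

// Non-MSVC shape of the functor: widen the operands, compute in opmath_t,
// narrow once at the end.
template <typename scalar_t>
struct PowInOpmathSketch {
  using opmath_t = at::opmath_type<scalar_t>;
  scalar_t operator()(scalar_t base, scalar_t exp) const {
    return static_cast<scalar_t>(std::pow(opmath_t(base), opmath_t(exp)));
  }
};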
#include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h index 3cf08ca5d..71dc4f5fb 100644 --- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h +++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h @@ -6,13 +6,13 @@ namespace at { namespace native { namespace xpu { -Tensor& arange_kernel( +TORCH_XPU_API Tensor& arange_kernel( const Scalar& start, const Scalar& end, const Scalar& step, Tensor& result); -Tensor& range_kernel( +TORCH_XPU_API Tensor& range_kernel( const Scalar& start, const Scalar& end, const Scalar& step, diff --git a/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp b/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp index 121761053..a2df2bd63 100644 --- a/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp b/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp index 1e18b2e5b..c522dbe7c 100644 --- a/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp b/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp index 3c9f8453d..60f92ef3b 100644 --- a/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ReduceLogicKernels.cpp b/src/ATen/native/xpu/sycl/ReduceLogicKernels.cpp index b9148494a..3c70c8951 100644 --- a/src/ATen/native/xpu/sycl/ReduceLogicKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceLogicKernels.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { template struct AndFunctor { diff --git a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp index 16095056c..3dfbb05c2 100644 --- a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.h b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.h index 6503f8842..9a669ec7a 100644 --- a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.h +++ b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void max_values_kernel(TensorIterator& iter); +TORCH_XPU_API void max_values_kernel(TensorIterator& iter); -void max_kernel(TensorIterator& iter); +TORCH_XPU_API void max_kernel(TensorIterator& iter); -void max_all_kernel(TensorIterator& iter); +TORCH_XPU_API void max_all_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp index 2a0ce889c..f73df51a0 100644 --- a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.h b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.h index d5600f5d4..65f3acf8e 100644 
--- a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.h +++ b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void min_values_kernel(TensorIterator& iter); +TORCH_XPU_API void min_values_kernel(TensorIterator& iter); -void min_kernel(TensorIterator& iter); +TORCH_XPU_API void min_kernel(TensorIterator& iter); -void min_all_kernel(TensorIterator& iter); +TORCH_XPU_API void min_all_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp b/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp index 6d0e75680..7e24f7ecc 100644 --- a/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp b/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp index 074e8022d..ef405be49 100644 --- a/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp @@ -1,8 +1,10 @@ #include -#include #include #include +#include + +#include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceNormKernel.h b/src/ATen/native/xpu/sycl/ReduceNormKernel.h index 5e255dd5b..bc14826ec 100644 --- a/src/ATen/native/xpu/sycl/ReduceNormKernel.h +++ b/src/ATen/native/xpu/sycl/ReduceNormKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void norm_kernel(TensorIterator& iter, const Scalar& val); +TORCH_XPU_API void norm_kernel(TensorIterator& iter, const Scalar& val); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReduceOpsKernels.h b/src/ATen/native/xpu/sycl/ReduceOpsKernels.h index d85754efb..8b03fb137 100644 --- a/src/ATen/native/xpu/sycl/ReduceOpsKernels.h +++ b/src/ATen/native/xpu/sycl/ReduceOpsKernels.h @@ -4,26 +4,29 @@ namespace at::native::xpu { -void argmax_kernel(TensorIterator& iter); +TORCH_XPU_API void argmax_kernel(TensorIterator& iter); -void argmin_kernel(TensorIterator& iter); +TORCH_XPU_API void argmin_kernel(TensorIterator& iter); -void and_kernel(TensorIterator& iter); +TORCH_XPU_API void and_kernel(TensorIterator& iter); -void or_kernel(TensorIterator& iter); +TORCH_XPU_API void or_kernel(TensorIterator& iter); -void mean_kernel(TensorIterator& iter); +TORCH_XPU_API void mean_kernel(TensorIterator& iter); -void sum_kernel(TensorIterator& iter); +TORCH_XPU_API void sum_kernel(TensorIterator& iter); -void prod_kernel(TensorIterator& iter); +TORCH_XPU_API void prod_kernel(TensorIterator& iter); -void nansum_kernel(TensorIterator& iter); +TORCH_XPU_API void nansum_kernel(TensorIterator& iter); -void std_var_kernel(TensorIterator& iter, double correction, bool take_sqrt); +TORCH_XPU_API void std_var_kernel( + TensorIterator& iter, + double correction, + bool take_sqrt); -void aminmax_kernel(TensorIterator& iter); +TORCH_XPU_API void aminmax_kernel(TensorIterator& iter); -void aminmax_allreduce_kernel(TensorIterator& iter); +TORCH_XPU_API void aminmax_allreduce_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp index f250b34ad..7bdc3a188 100644 --- a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp 
b/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp index d90f63e18..ef838a969 100644 --- a/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp @@ -13,6 +13,8 @@ #include #include +#include + namespace at::native::xpu { inline std::pair get_index_mapping1d( @@ -613,7 +615,7 @@ void reflection_pad1d_backward_kernel( } void reflection_pad2d_kernel( - Tensor& output, + const Tensor& output, const Tensor& input_, IntArrayRef padding) { TORCH_CHECK( @@ -710,7 +712,7 @@ void reflection_pad2d_kernel( } void reflection_pad2d_backward_kernel( - Tensor& grad_input, + const Tensor& grad_input, const Tensor& grad_output_, const Tensor& input, IntArrayRef padding) { @@ -873,4 +875,4 @@ void reflection_pad3d_backward_kernel( } // namespace at::native::xpu #pragma GCC diagnostic pop -#pragma clang diagnostic pop \ No newline at end of file +#pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/ReflectionPadKernels.h b/src/ATen/native/xpu/sycl/ReflectionPadKernels.h index efd61f208..fc41b0b75 100644 --- a/src/ATen/native/xpu/sycl/ReflectionPadKernels.h +++ b/src/ATen/native/xpu/sycl/ReflectionPadKernels.h @@ -4,37 +4,37 @@ namespace at::native::xpu { -void reflection_pad1d_kernel( +TORCH_XPU_API void reflection_pad1d_kernel( const Tensor& output, const Tensor& input_, IntArrayRef padding); -void reflection_pad1d_backward_kernel( +TORCH_XPU_API void reflection_pad1d_backward_kernel( const Tensor& grad_input, const Tensor& grad_output_, const Tensor& input, IntArrayRef padding); -void reflection_pad2d_kernel( - Tensor& output, +TORCH_XPU_API void reflection_pad2d_kernel( + const Tensor& output, const Tensor& input_, IntArrayRef padding); -void reflection_pad2d_backward_kernel( - Tensor& grad_input, +TORCH_XPU_API void reflection_pad2d_backward_kernel( + const Tensor& grad_input, const Tensor& grad_output_, const Tensor& input, IntArrayRef padding); -void reflection_pad3d_kernel( +TORCH_XPU_API void reflection_pad3d_kernel( const Tensor& output, const Tensor& input_, IntArrayRef padding); -void reflection_pad3d_backward_kernel( +TORCH_XPU_API void reflection_pad3d_backward_kernel( const Tensor& grad_input, const Tensor& grad_output, const Tensor& input, IntArrayRef padding); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/RenormKernel.cpp b/src/ATen/native/xpu/sycl/RenormKernel.cpp index 0a1696d73..0f08a6566 100644 --- a/src/ATen/native/xpu/sycl/RenormKernel.cpp +++ b/src/ATen/native/xpu/sycl/RenormKernel.cpp @@ -2,6 +2,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/RenormKernel.h b/src/ATen/native/xpu/sycl/RenormKernel.h index a619cf708..488e8365a 100644 --- a/src/ATen/native/xpu/sycl/RenormKernel.h +++ b/src/ATen/native/xpu/sycl/RenormKernel.h @@ -3,6 +3,8 @@ namespace at::native::xpu { -void renorm_scale_factor_kernel(TensorIteratorBase& iter, double maxnorm); +TORCH_XPU_API void renorm_scale_factor_kernel( + TensorIteratorBase& iter, + double maxnorm); } diff --git a/src/ATen/native/xpu/sycl/RepeatKernel.cpp b/src/ATen/native/xpu/sycl/RepeatKernel.cpp index 0b10f982a..f1eeb426b 100644 --- a/src/ATen/native/xpu/sycl/RepeatKernel.cpp +++ b/src/ATen/native/xpu/sycl/RepeatKernel.cpp @@ -2,6 +2,9 @@ #include #include #include + +#include + namespace at::native::xpu { template struct RepeatInterleaveKernelFunctor { @@ -72,4 +75,4 @@ Tensor repeat_interleave_kernel( }); return output; } -} // namespace 
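reflection_pad2d_kernel and reflection_pad2d_backward_kernel now take the tensors they write to as const Tensor&, in line with the convention used for out arguments elsewhere in ATen: const binds to the Tensor handle, not to the storage behind it, so the kernel can still write into the tensor. A small illustration (the function name is made up for the example):

#include <ATen/ATen.h>

// Writing through a const Tensor& is allowed: Tensor is a handle type and its
// mutating methods (fill_, copy_, ...) are const member functions.
void write_ones(const at::Tensor& out) {
  out.fill_(1);  // mutates the underlying storage despite the const reference
}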
at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/RepeatKernel.h b/src/ATen/native/xpu/sycl/RepeatKernel.h index 3df41a956..9856c4305 100644 --- a/src/ATen/native/xpu/sycl/RepeatKernel.h +++ b/src/ATen/native/xpu/sycl/RepeatKernel.h @@ -2,8 +2,8 @@ #include namespace at::native::xpu { -Tensor repeat_interleave_kernel( +TORCH_XPU_API Tensor repeat_interleave_kernel( const Tensor& repeats, c10::optional output_size); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp index 6b85e9318..75b9322b3 100644 --- a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp @@ -13,6 +13,8 @@ #include #include +#include + namespace at::native::xpu { inline int imin(int a, int b) { @@ -517,7 +519,7 @@ void replication_pad2d_kernel( } void replication_pad2d_backward_kernel( - Tensor& grad_input, + const Tensor& grad_input, const Tensor& grad_output, const Tensor& input, IntArrayRef padding) { @@ -763,4 +765,4 @@ void replication_pad3d_backward_kernel( } // namespace at::native::xpu #pragma GCC diagnostic pop -#pragma clang diagnostic pop \ No newline at end of file +#pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h index 545b26da8..35e20c269 100644 --- a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h +++ b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h @@ -4,37 +4,37 @@ namespace at::native::xpu { -void replication_pad1d_kernel( +TORCH_XPU_API void replication_pad1d_kernel( const Tensor& output, const Tensor& input, IntArrayRef padding); -void replication_pad1d_backward_kernel( +TORCH_XPU_API void replication_pad1d_backward_kernel( const Tensor& grad_input, const Tensor& grad_output, const Tensor& input, IntArrayRef padding); -void replication_pad2d_kernel( +TORCH_XPU_API void replication_pad2d_kernel( const Tensor& output, const Tensor& input, IntArrayRef padding); -void replication_pad2d_backward_kernel( - Tensor& grad_input, +TORCH_XPU_API void replication_pad2d_backward_kernel( + const Tensor& grad_input, const Tensor& grad_output, const Tensor& input, IntArrayRef padding); -void replication_pad3d_kernel( +TORCH_XPU_API void replication_pad3d_kernel( const Tensor& output, const Tensor& input, IntArrayRef padding); -void replication_pad3d_backward_kernel( +TORCH_XPU_API void replication_pad3d_backward_kernel( const Tensor& grad_input, const Tensor& grad_output, const Tensor& input, IntArrayRef padding); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ResizeKernel.cpp b/src/ATen/native/xpu/sycl/ResizeKernel.cpp index 195d526d8..0835b53ff 100644 --- a/src/ATen/native/xpu/sycl/ResizeKernel.cpp +++ b/src/ATen/native/xpu/sycl/ResizeKernel.cpp @@ -3,6 +3,8 @@ #include #include +#include + namespace at::native::xpu { void resize_bytes_xpu(StorageImpl* storage, size_t size_bytes) { @@ -58,7 +60,7 @@ TensorImpl* resize_impl_xpu_( TensorImpl* self, IntArrayRef size, at::OptionalIntArrayRef stride, - bool device_guard = true) { + bool device_guard) { if (self->sizes() == size && (!stride || self->strides() == stride)) { return self; } diff --git a/src/ATen/native/xpu/sycl/ResizeKernel.h b/src/ATen/native/xpu/sycl/ResizeKernel.h index 
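In ResizeKernel.cpp the '= true' default on device_guard is dropped from the definition of resize_impl_xpu_. C++ allows a default argument to be specified only once along a declaration chain, so once the exported declaration in ResizeKernel.h carries it (an assumption here, since that part of the header is outside the hunk), repeating it on the out-of-line definition would be an error. Minimal illustration of the rule, with made-up names:

// header: the declaration owns the default argument
int resize_impl(int size, bool device_guard = true);

// source file: the definition must not repeat '= true'
int resize_impl(int size, bool device_guard) {
  return device_guard ? size : -size;
}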
5cef196ed..c5c113194 100644 --- a/src/ATen/native/xpu/sycl/ResizeKernel.h +++ b/src/ATen/native/xpu/sycl/ResizeKernel.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -TensorImpl* resize_impl_xpu_( +TORCH_XPU_API TensorImpl* resize_impl_xpu_( TensorImpl* self, IntArrayRef size, at::OptionalIntArrayRef stride, diff --git a/src/ATen/native/xpu/sycl/ScanKernels.h b/src/ATen/native/xpu/sycl/ScanKernels.h index af632fa83..ea20e4460 100644 --- a/src/ATen/native/xpu/sycl/ScanKernels.h +++ b/src/ATen/native/xpu/sycl/ScanKernels.h @@ -3,8 +3,14 @@ namespace at::native::xpu { -void cumsum_kernel(const Tensor& result, const Tensor& self, int64_t dim); +TORCH_XPU_API void cumsum_kernel( + const Tensor& result, + const Tensor& self, + int64_t dim); -void cumprod_kernel(const Tensor& result, const Tensor& self, int64_t dim); +TORCH_XPU_API void cumprod_kernel( + const Tensor& result, + const Tensor& self, + int64_t dim); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ScatterGatherKernels.cpp b/src/ATen/native/xpu/sycl/ScatterGatherKernels.cpp index 81d4b9e9a..597be8553 100644 --- a/src/ATen/native/xpu/sycl/ScatterGatherKernels.cpp +++ b/src/ATen/native/xpu/sycl/ScatterGatherKernels.cpp @@ -14,6 +14,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ScatterGatherKernels.h b/src/ATen/native/xpu/sycl/ScatterGatherKernels.h index 0f8b7e5be..4d7d375b7 100644 --- a/src/ATen/native/xpu/sycl/ScatterGatherKernels.h +++ b/src/ATen/native/xpu/sycl/ScatterGatherKernels.h @@ -6,45 +6,45 @@ namespace at { namespace native { namespace xpu { -void gather_kernel( +TORCH_XPU_API void gather_kernel( const Tensor& result, const Tensor& self, int64_t dim, const Tensor& index); -void scatter_kernel( +TORCH_XPU_API void scatter_kernel( const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src); -void scatter_fill_kernel( +TORCH_XPU_API void scatter_fill_kernel( const Tensor& self, int64_t dim, const Tensor& index, const Scalar& src); -void scatter_add_kernel( +TORCH_XPU_API void scatter_add_kernel( const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src); -void scatter_reduce_kernel( +TORCH_XPU_API void scatter_reduce_kernel( const Tensor& self, const int64_t dim, const Tensor& index, const Tensor& src, const ReductionType& reduce); -void scatter_reduce_two_kernel( +TORCH_XPU_API void scatter_reduce_two_kernel( const Tensor& self, const int64_t dim, const Tensor& index, const Tensor& src, const ReductionType& reduce); -void scatter_scalar_reduce_kernel( +TORCH_XPU_API void scatter_scalar_reduce_kernel( const Tensor& self, const int64_t dim, const Tensor& index, diff --git a/src/ATen/native/xpu/sycl/Shape.cpp b/src/ATen/native/xpu/sycl/Shape.cpp index 23d94b037..eb1f0e090 100644 --- a/src/ATen/native/xpu/sycl/Shape.cpp +++ b/src/ATen/native/xpu/sycl/Shape.cpp @@ -11,6 +11,8 @@ #include #include +#include + namespace at::native::xpu { // The best performance is achieved for parallel computing with 1024 batch sizes diff --git a/src/ATen/native/xpu/sycl/ShapeKernels.h b/src/ATen/native/xpu/sycl/ShapeKernels.h index 494f37020..8f758bfb2 100644 --- a/src/ATen/native/xpu/sycl/ShapeKernels.h +++ b/src/ATen/native/xpu/sycl/ShapeKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void cat_out_kernel( +TORCH_XPU_API void cat_out_kernel( const ITensorListRef& tensors, int64_t dim, int64_t valid, diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index 
e0a35a295..45c9cb016 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -10,6 +10,8 @@ #include +#include + using namespace xpu::sycl; namespace at { diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.h b/src/ATen/native/xpu/sycl/SoftMaxKernels.h index 4b8741a2a..87d205442 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.h +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.h @@ -6,26 +6,26 @@ namespace at { namespace native { namespace xpu { -void _softmax_kernel( +TORCH_XPU_API void _softmax_kernel( const Tensor& input, const int64_t dim, const bool half_to_float, const Tensor& output); -void _log_softmax_kernel( +TORCH_XPU_API void _log_softmax_kernel( const Tensor& input, const int64_t dim, const bool half_to_float, const Tensor& output); -void _softmax_backward_kernel( +TORCH_XPU_API void _softmax_backward_kernel( const Tensor& grad, const Tensor& output, int64_t dim, bool half_to_float, const Tensor& grad_input); -void _log_softmax_backward_kernel( +TORCH_XPU_API void _log_softmax_backward_kernel( const Tensor& grad, const Tensor& output, int64_t dim, diff --git a/src/ATen/native/xpu/sycl/Sorting.cpp b/src/ATen/native/xpu/sycl/Sorting.cpp index f75b50ac5..cf41810dc 100644 --- a/src/ATen/native/xpu/sycl/Sorting.cpp +++ b/src/ATen/native/xpu/sycl/Sorting.cpp @@ -17,6 +17,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/Sorting.h b/src/ATen/native/xpu/sycl/Sorting.h index f4ce097bc..14164ddf9 100644 --- a/src/ATen/native/xpu/sycl/Sorting.h +++ b/src/ATen/native/xpu/sycl/Sorting.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void sort_stable_kernel( +TORCH_XPU_API void sort_stable_kernel( const TensorBase& self_base, const TensorBase& values_base, const TensorBase& indices_base, @@ -12,7 +12,7 @@ void sort_stable_kernel( bool descending, bool stable); -void launch_median_kernel( +TORCH_XPU_API void launch_median_kernel( const TensorBase& vals, const TensorBase& inds, const TensorBase& self, diff --git a/src/ATen/native/xpu/sycl/StepKernels.cpp b/src/ATen/native/xpu/sycl/StepKernels.cpp index d3d76926b..34f703591 100644 --- a/src/ATen/native/xpu/sycl/StepKernels.cpp +++ b/src/ATen/native/xpu/sycl/StepKernels.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/StepKernels.h b/src/ATen/native/xpu/sycl/StepKernels.h index 90f949d9a..c026a6b65 100644 --- a/src/ATen/native/xpu/sycl/StepKernels.h +++ b/src/ATen/native/xpu/sycl/StepKernels.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void nextafter_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void nextafter_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp b/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp index 265170821..6e88a987f 100644 --- a/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp @@ -10,6 +10,8 @@ #include #include +#include + namespace at::native::xpu { using namespace at::native; using namespace at::xpu::detail; diff --git a/src/ATen/native/xpu/sycl/SummaryOpsKernels.h b/src/ATen/native/xpu/sycl/SummaryOpsKernels.h index 4d0f06dfb..50f863212 100644 --- a/src/ATen/native/xpu/sycl/SummaryOpsKernels.h +++ b/src/ATen/native/xpu/sycl/SummaryOpsKernels.h @@ -3,9 +3,7 @@ namespace at::native::xpu { -Tensor bincount_kernel( - const Tensor& self, - const Tensor& weights, - int64_t minlength); +TORCH_XPU_API Tensor 
+bincount_kernel(const Tensor& self, const Tensor& weights, int64_t minlength); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp b/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp index 7185b6453..c6471aac2 100644 --- a/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp +++ b/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp @@ -6,6 +6,8 @@ #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/TensorCompareKernels.h b/src/ATen/native/xpu/sycl/TensorCompareKernels.h index 51cb74a40..0e83f33e9 100644 --- a/src/ATen/native/xpu/sycl/TensorCompareKernels.h +++ b/src/ATen/native/xpu/sycl/TensorCompareKernels.h @@ -4,20 +4,24 @@ namespace at::native::xpu { -void where_kernel(TensorIterator& iter); +TORCH_XPU_API void where_kernel(TensorIterator& iter); -void clamp_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void clamp_kernel(TensorIteratorBase& iter); -void clamp_scalar_kernel( +TORCH_XPU_API void clamp_scalar_kernel( TensorIteratorBase& iter, const Scalar& min, const Scalar& max); -void clamp_min_scalar_kernel(TensorIteratorBase& iter, Scalar min); +TORCH_XPU_API void clamp_min_scalar_kernel( + TensorIteratorBase& iter, + Scalar min); -void clamp_max_scalar_kernel(TensorIteratorBase& iter, Scalar max); +TORCH_XPU_API void clamp_max_scalar_kernel( + TensorIteratorBase& iter, + Scalar max); -void isin_kernel( +TORCH_XPU_API void isin_kernel( const Tensor& elements, const Tensor& test_elements, bool invert, diff --git a/src/ATen/native/xpu/sycl/TensorTopKKernel.cpp b/src/ATen/native/xpu/sycl/TensorTopKKernel.cpp index 581182ec9..8bd6c7c6b 100644 --- a/src/ATen/native/xpu/sycl/TensorTopKKernel.cpp +++ b/src/ATen/native/xpu/sycl/TensorTopKKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/TensorTopKKernel.h b/src/ATen/native/xpu/sycl/TensorTopKKernel.h index 7259e3f44..f04d5065e 100644 --- a/src/ATen/native/xpu/sycl/TensorTopKKernel.h +++ b/src/ATen/native/xpu/sycl/TensorTopKKernel.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -void topk_kernel( +TORCH_XPU_API void topk_kernel( const at::Tensor& input, int64_t k, int64_t dim, diff --git a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp index 79d27b93a..9cdb0dacf 100644 --- a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp +++ b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + #ifdef _WIN32 #define RESTRICT __restrict #else diff --git a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.h b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.h index 84fa03f0a..1de970634 100644 --- a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.h +++ b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.h @@ -4,9 +4,9 @@ namespace at::native::xpu { -void flip_kernel(TensorIterator& iter, bool quantized); +TORCH_XPU_API void flip_kernel(TensorIterator& iter, bool quantized); -void roll_kernel( +TORCH_XPU_API void roll_kernel( const Tensor& input, Tensor& output, IntArrayRef shifts, diff --git a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp index c503e5b16..45fcc655e 100644 --- a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace 
at::native::xpu { using namespace at::xpu; diff --git a/src/ATen/native/xpu/sycl/TriangularOpsKernels.h b/src/ATen/native/xpu/sycl/TriangularOpsKernels.h index 85e144308..0cf9630b9 100644 --- a/src/ATen/native/xpu/sycl/TriangularOpsKernels.h +++ b/src/ATen/native/xpu/sycl/TriangularOpsKernels.h @@ -3,8 +3,14 @@ namespace at::native::xpu { -void tril_kernel(const Tensor& result, const Tensor& self, int64_t k); +TORCH_XPU_API void tril_kernel( + const Tensor& result, + const Tensor& self, + int64_t k); -void triu_kernel(const Tensor& result, const Tensor& self, int64_t k); +TORCH_XPU_API void triu_kernel( + const Tensor& result, + const Tensor& self, + int64_t k); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp b/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp index d00720cd8..31117dd17 100644 --- a/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp @@ -9,6 +9,10 @@ #include #include +#include +#include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryComplexKernels.h b/src/ATen/native/xpu/sycl/UnaryComplexKernels.h index 877a1e311..aef6cde9f 100644 --- a/src/ATen/native/xpu/sycl/UnaryComplexKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryComplexKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void conj_kernel(TensorIterator& iter); +TORCH_XPU_API void conj_kernel(TensorIterator& iter); -void conj_physical_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void conj_physical_kernel(TensorIteratorBase& iter); -void neg_conj_kernel(TensorIterator& iter); +TORCH_XPU_API void neg_conj_kernel(TensorIterator& iter); -void neg_kernel(TensorIterator& iter); +TORCH_XPU_API void neg_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp b/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp index 96510a0e0..a8551c262 100644 --- a/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryFractionKernels.h b/src/ATen/native/xpu/sycl/UnaryFractionKernels.h index 10f4889b0..a3a2015df 100644 --- a/src/ATen/native/xpu/sycl/UnaryFractionKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryFractionKernels.h @@ -4,16 +4,18 @@ namespace at::native::xpu { -void reciprocal_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void reciprocal_kernel(TensorIteratorBase& iter); -void floor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void floor_kernel(TensorIteratorBase& iter); -void ceil_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void ceil_kernel(TensorIteratorBase& iter); -void round_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void round_kernel(TensorIteratorBase& iter); -void round_decimals_kernel(TensorIteratorBase& iter, int64_t decimals); +TORCH_XPU_API void round_decimals_kernel( + TensorIteratorBase& iter, + int64_t decimals); -void frac_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void frac_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGammaKernels.cpp b/src/ATen/native/xpu/sycl/UnaryGammaKernels.cpp index 7b23e22ed..d2546e842 100644 --- a/src/ATen/native/xpu/sycl/UnaryGammaKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGammaKernels.cpp @@ -5,16 +5,23 @@ #include #include #include -#include +#include #include 
#include +#include + namespace at::native::xpu { -template +template struct DigammaFunctor { scalar_t operator()(scalar_t a) const { - return calc_digamma(a); + if constexpr (USE_FP64_PI) { + return calc_digamma(a); + } else { + using pi_t = at::acc_type_device; + return calc_digamma(a); + } } }; @@ -24,7 +31,13 @@ void digamma_kernel(TensorIteratorBase& iter) { at::ScalarType::BFloat16, iter.common_dtype(), "digamma_xpu", - [&]() { gpu_kernel(iter, DigammaFunctor()); }); + [&]() { + if (syclHasFloat64()) { + gpu_kernel(iter, DigammaFunctor()); + } else { + gpu_kernel(iter, DigammaFunctor()); + } + }); } template diff --git a/src/ATen/native/xpu/sycl/UnaryGammaKernels.h b/src/ATen/native/xpu/sycl/UnaryGammaKernels.h index 5eda977f6..2908a4b13 100644 --- a/src/ATen/native/xpu/sycl/UnaryGammaKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryGammaKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void digamma_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void digamma_kernel(TensorIteratorBase& iter); -void polygamma_kernel(TensorIteratorBase& iter, int64_t n); +TORCH_XPU_API void polygamma_kernel(TensorIteratorBase& iter, int64_t n); -void lgamma_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void lgamma_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.cpp index e1b8ff80f..d35dedf96 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { template struct AcosFunctor { diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.h index 4ff06d256..17acb1061 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.h @@ -3,6 +3,7 @@ #include namespace at::native::xpu { -void acos_kernel(TensorIteratorBase& iter); -} // namespace at::native::xpu \ No newline at end of file +TORCH_XPU_API void acos_kernel(TensorIteratorBase& iter); + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.cpp index cd9864275..9062ee17a 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.h index 98dc2eed5..732d307ec 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void acosh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void acosh_kernel(TensorIteratorBase& iter); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp index c8b1aaca6..1819b47dc 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h index 
194ce6479..ff449e2cd 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void asin_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void asin_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp index 009a68b47..24169a1dc 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h index 4d37288de..2668ee963 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void asinh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void asinh_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp index f241e9da3..c6ce4b1b0 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h index 022720223..6ee127539 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void atan_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void atan_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp index 630a64d39..3033ff8f8 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h index 5536641e7..267b59eb6 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void atanh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void atanh_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.cpp index 21552d99d..e4eee5e7a 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.h index 35123a666..fc31580b7 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void cos_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void cos_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git 
a/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp index 11440b3d3..16cffe893 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h index 7f031e3ff..bbd10e1aa 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void cosh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void cosh_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.cpp index 2a17cefcb..27e3e2906 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.h index 76dca73c8..ecd4da1d4 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void sin_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sin_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.cpp index 2155aafad..06309f45c 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.h index e3ceaf643..a5005e1d0 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void sinh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sinh_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.cpp index 57c0049b1..996da809b 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.h index b96a35f93..9446ff058 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void tan_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void tan_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.cpp index dcbe8a970..1b71b3974 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git 
a/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.h index 6cecc99e2..831421a12 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void tanh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void tanh_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryKernels.cpp b/src/ATen/native/xpu/sycl/UnaryKernels.cpp index d9291022d..614838f65 100644 --- a/src/ATen/native/xpu/sycl/UnaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryKernels.cpp @@ -11,6 +11,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryKernels.h b/src/ATen/native/xpu/sycl/UnaryKernels.h index cc394de6b..db7005523 100644 --- a/src/ATen/native/xpu/sycl/UnaryKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryKernels.h @@ -4,17 +4,17 @@ namespace at::native::xpu { -void sqrt_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sqrt_kernel(TensorIteratorBase& iter); -void rsqrt_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void rsqrt_kernel(TensorIteratorBase& iter); -void bitwise_not_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void bitwise_not_kernel(TensorIteratorBase& iter); -void exp_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void exp_kernel(TensorIteratorBase& iter); -void expm1_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void expm1_kernel(TensorIteratorBase& iter); -void nan_to_num_kernel( +TORCH_XPU_API void nan_to_num_kernel( TensorIteratorBase& iter, std::optional nan, std::optional pos_inf, diff --git a/src/ATen/native/xpu/sycl/UnaryLogKernels.cpp b/src/ATen/native/xpu/sycl/UnaryLogKernels.cpp index eb531cfcc..6be7cee18 100644 --- a/src/ATen/native/xpu/sycl/UnaryLogKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryLogKernels.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryLogKernels.h b/src/ATen/native/xpu/sycl/UnaryLogKernels.h index c8d16ceae..72db9235a 100644 --- a/src/ATen/native/xpu/sycl/UnaryLogKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryLogKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void log_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log_kernel(TensorIteratorBase& iter); -void log10_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log10_kernel(TensorIteratorBase& iter); -void log1p_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log1p_kernel(TensorIteratorBase& iter); -void log2_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log2_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnarySignKernels.cpp b/src/ATen/native/xpu/sycl/UnarySignKernels.cpp index 655c1b503..7e878fdeb 100644 --- a/src/ATen/native/xpu/sycl/UnarySignKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnarySignKernels.cpp @@ -7,6 +7,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnarySignKernels.h b/src/ATen/native/xpu/sycl/UnarySignKernels.h index 98b6eeb5f..01759a725 100644 --- a/src/ATen/native/xpu/sycl/UnarySignKernels.h +++ b/src/ATen/native/xpu/sycl/UnarySignKernels.h @@ -4,14 +4,14 @@ namespace at::native::xpu { -void logical_not_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logical_not_kernel(TensorIteratorBase& iter); -void neg_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void neg_kernel(TensorIteratorBase& 
iter); -void sgn_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sgn_kernel(TensorIteratorBase& iter); -void sign_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sign_kernel(TensorIteratorBase& iter); -void signbit_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void signbit_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp index e5150176a..05cdd31b2 100644 --- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp @@ -11,6 +11,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h index 6bed9c6de..f810da576 100644 --- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h +++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h @@ -4,16 +4,18 @@ namespace at::native::xpu { -void sigmoid_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sigmoid_kernel(TensorIteratorBase& iter); -void erf_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void erf_kernel(TensorIteratorBase& iter); -void erfc_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void erfc_kernel(TensorIteratorBase& iter); -void erfinv_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void erfinv_kernel(TensorIteratorBase& iter); -void exp2_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void exp2_kernel(TensorIteratorBase& iter); -void logit_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar); +TORCH_XPU_API void logit_kernel( + TensorIteratorBase& iter, + const Scalar& eps_scalar); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.cpp b/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.cpp index d4b0d32f2..77f310029 100644 --- a/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.cpp @@ -8,6 +8,8 @@ #include #include +#include + namespace at::native::xpu { constexpr int n_elems_per_work_item = 4; // UNROLLED_ELEM_PER_WORK_ITEM; diff --git a/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.h b/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.h index 2d618ce03..c74a8a950 100644 --- a/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.h +++ b/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void unfold_backward_kernel( +TORCH_XPU_API void unfold_backward_kernel( Tensor& grad_out, const Tensor& grad_in, int64_t dim, diff --git a/src/ATen/native/xpu/sycl/UniqueKernels.cpp b/src/ATen/native/xpu/sycl/UniqueKernels.cpp index b57274a8e..359712d1f 100644 --- a/src/ATen/native/xpu/sycl/UniqueKernels.cpp +++ b/src/ATen/native/xpu/sycl/UniqueKernels.cpp @@ -3,6 +3,8 @@ #include #include +#include + namespace at::native::xpu { struct UniqueNotEqualFunctor { diff --git a/src/ATen/native/xpu/sycl/UniqueKernels.h b/src/ATen/native/xpu/sycl/UniqueKernels.h index b7a7cd1ae..25b36cc64 100644 --- a/src/ATen/native/xpu/sycl/UniqueKernels.h +++ b/src/ATen/native/xpu/sycl/UniqueKernels.h @@ -3,29 +3,30 @@ #include namespace at::native::xpu { -std::tuple unique_consecutive_kernel( + +TORCH_XPU_API std::tuple unique_consecutive_kernel( const Tensor& self, const bool return_inverse, const bool return_counts, c10::optional dim); -std::tuple unique_dim_consecutive_kernel( +TORCH_XPU_API std::tuple unique_dim_consecutive_kernel( const Tensor& self, const int64_t dim, const bool return_inverse, const 
bool return_counts); -std::tuple unique_dim_kernel( +TORCH_XPU_API std::tuple unique_dim_kernel( const Tensor& self, const int64_t dim, const bool return_inverse, const bool return_counts); -std::tuple _unique_kernel( +TORCH_XPU_API std::tuple _unique_kernel( const Tensor& self, const bool return_inverse); -std::tuple _unique2_kernel( +TORCH_XPU_API std::tuple _unique2_kernel( const Tensor& self, const bool return_inverse, const bool return_counts); diff --git a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp index 5023d5241..1a64e233b 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.h b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.h index a70a08c8c..fbf900238 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void upsample_bicubic2d_kernel( +TORCH_XPU_API void upsample_bicubic2d_kernel( const Tensor& output, const Tensor& input, IntArrayRef output_size, @@ -12,4 +12,4 @@ void upsample_bicubic2d_kernel( std::optional scales_h, std::optional scales_w); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp index f9075c139..653a293d8 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp @@ -14,6 +14,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h index 3071c51c3..aa5ee2c09 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void upsample_bilinear2d_out_kernel( +TORCH_XPU_API void upsample_bilinear2d_out_kernel( const Tensor& output, const Tensor& input, IntArrayRef output_size, @@ -12,7 +12,7 @@ void upsample_bilinear2d_out_kernel( std::optional scales_h, std::optional scales_w); -void upsample_bilinear2d_backward_out_kernel( +TORCH_XPU_API void upsample_bilinear2d_backward_out_kernel( const Tensor& grad_input, const Tensor& grad_output_, IntArrayRef output_size, @@ -21,4 +21,4 @@ void upsample_bilinear2d_backward_out_kernel( c10::optional scales_h, c10::optional scales_w); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.cpp index f3ae596af..2bce3027e 100644 --- a/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.cpp @@ -14,6 +14,8 @@ #include "ATen/Context.h" #include "ATen/core/TensorBase.h" +#include + namespace at::native::xpu { template struct UpsampleLinear1dKernelFunctor { @@ -213,4 +215,4 @@ void upsample_linear1d_backward_kernel( } // namespace at::native::xpu #pragma GCC diagnostic pop -#pragma clang diagnostic pop \ No newline at end of file +#pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.h 
b/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.h index 8928ea906..70befad77 100644 --- a/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.h @@ -3,14 +3,15 @@ #include namespace at::native::xpu { -void upsample_linear1d_kernel( + +TORCH_XPU_API void upsample_linear1d_kernel( const Tensor& input, IntArrayRef output_size, bool align_corners, std::optional scales, const Tensor& output); -void upsample_linear1d_backward_kernel( +TORCH_XPU_API void upsample_linear1d_backward_kernel( const Tensor& grad_output_, IntArrayRef output_size, IntArrayRef input_size, @@ -18,4 +19,4 @@ void upsample_linear1d_backward_kernel( std::optional scales, const Tensor& grad_input); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp index b482978f2..0751749fc 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h index 9f7b80d21..79801b4d6 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h @@ -5,14 +5,14 @@ namespace at::native::xpu { -void upsample_nearest1d_kernel( +TORCH_XPU_API void upsample_nearest1d_kernel( const Tensor& output, const Tensor& input_, IntArrayRef output_size, c10::optional scales, bool is_exact); -void upsample_nearest1d_backward_kernel( +TORCH_XPU_API void upsample_nearest1d_backward_kernel( const Tensor& grad_input, const Tensor& grad_output_, IntArrayRef output_size, diff --git a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp index 3eab9f202..76e95c1ed 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp @@ -707,4 +707,4 @@ void upsample_nearest2d_kernel( } } } // namespace xpu -} // namespace at::native \ No newline at end of file +} // namespace at::native diff --git a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h index c6383a56c..ea12e7b39 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h @@ -5,7 +5,7 @@ namespace at::native::xpu { -void upsample_nearest2d_kernel( +TORCH_XPU_API void upsample_nearest2d_kernel( const Tensor& output, const Tensor& input_, IntArrayRef output_size, @@ -13,7 +13,7 @@ void upsample_nearest2d_kernel( c10::optional scales_w, bool is_exact); -void upsample_nearest2d_backward_kernel( +TORCH_XPU_API void upsample_nearest2d_backward_kernel( const Tensor& grad_input, const Tensor& grad_output_, IntArrayRef output_size, diff --git a/src/ATen/native/xpu/sycl/WeightNormKernels.cpp b/src/ATen/native/xpu/sycl/WeightNormKernels.cpp index dd93f68c3..ac67d5d34 100644 --- a/src/ATen/native/xpu/sycl/WeightNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/WeightNormKernels.cpp @@ -7,6 +7,8 @@ #include #include "comm/Runtime.h" +#include + namespace at::native::xpu { template @@ -1024,4 +1026,4 @@ std::tuple weight_norm_backward_kernel( return {grad_v, grad_g}; } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu 
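The TORCH_XPU_API annotations added across these kernel headers are what keep the declarations usable once the kernels move into separate shared libraries (DLLs on Windows, where symbols are not exported by default). A minimal sketch of the usual export/import macro pattern is shown below; the exact definition of TORCH_XPU_API in this repository may differ, so the guard shown here (keyed off the TORCH_XPU_BUILD_MAIN_LIB definition that the Windows build logic later in this patch passes to the kernel libraries) is an assumption for illustration only.

// Hypothetical sketch of an export/import macro for the split kernel DLLs.
// Assumption: the library that implements the kernels is compiled with
// TORCH_XPU_BUILD_MAIN_LIB defined (see target_compile_definitions in
// BuildOnWindows.cmake below), so it exports the symbols; consumers import them.
#if defined(_WIN32)
#  if defined(TORCH_XPU_BUILD_MAIN_LIB)
#    define TORCH_XPU_API __declspec(dllexport)
#  else
#    define TORCH_XPU_API __declspec(dllimport)
#  endif
#else
// On Linux, make the annotated symbols visible under -fvisibility=hidden builds.
#  define TORCH_XPU_API __attribute__((visibility("default")))
#endif

// Usage then matches the header changes above, e.g.
// TORCH_XPU_API void tanh_kernel(TensorIteratorBase& iter);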
diff --git a/src/ATen/native/xpu/sycl/WeightNormKernels.h b/src/ATen/native/xpu/sycl/WeightNormKernels.h index 5b0d7afd2..88a45b298 100644 --- a/src/ATen/native/xpu/sycl/WeightNormKernels.h +++ b/src/ATen/native/xpu/sycl/WeightNormKernels.h @@ -2,15 +2,17 @@ #include namespace at::native::xpu { -std::tuple weight_norm_kernel( + +TORCH_XPU_API std::tuple weight_norm_kernel( const Tensor& v, const Tensor& g, int64_t dim); -std::tuple weight_norm_backward_kernel( +TORCH_XPU_API std::tuple weight_norm_backward_kernel( const Tensor& grad_w, const Tensor& saved_v, const Tensor& saved_g, const Tensor& saved_norms, int64_t dim); -} // namespace at::native::xpu \ No newline at end of file + +} // namespace at::native::xpu diff --git a/src/BuildOnLinux.cmake b/src/BuildOnLinux.cmake new file mode 100644 index 000000000..3e304b31f --- /dev/null +++ b/src/BuildOnLinux.cmake @@ -0,0 +1,79 @@ +# Build on Linux + +set(TORCH_XPU_OPS_LIBRARIES) +set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE) + +add_library( + torch_xpu_ops + STATIC + ${ATen_XPU_CPP_SRCS} + ${ATen_XPU_NATIVE_CPP_SRCS} + ${ATen_XPU_GEN_SRCS}) + +if(BUILD_SEPARATE_OPS) + foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + get_filename_component(name ${sycl_src} NAME_WLE REALPATH) + set(sycl_lib torch-xpu-ops-sycl-${name}) + sycl_add_library( + ${sycl_lib} + SHARED + SYCL_SOURCES ${sycl_src}) + target_link_libraries(torch_xpu_ops PUBLIC ${sycl_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + endforeach() +else() + # Split SYCL kernels into 2 libraries as categories 1) Unary+Binary 2) Others. + set(ATen_XPU_SYCL_UNARY_BINARY_SRCS) + set(ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS) + + foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + string(REGEX MATCH "Binary" IS_BINARY ${sycl_src}) + string(REGEX MATCH "Unary" IS_UNARY ${sycl_src}) + if(IS_BINARY STREQUAL "" AND IS_UNARY STREQUAL "") + list(APPEND ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS ${sycl_src}) + else() + list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src}) + endif() + endforeach() + + set(sycl_unary_binary_lib torch_xpu_ops_sycl_unary_binary_kernels) + sycl_add_library( + ${sycl_unary_binary_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_BINARY_SRCS}) + target_link_libraries(torch_xpu_ops PUBLIC ${sycl_unary_binary_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_binary_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_unary_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + set(sycl_lib torch_xpu_ops_sycl_kernels) + sycl_add_library( + ${sycl_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS}) + target_link_libraries(torch_xpu_ops PUBLIC ${sycl_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) + + # Decouple with PyTorch cmake definition. 
+ install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") +endif() +set(SYCL_LINK_LIBRARIES_KEYWORD) + +list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) + +foreach(lib ${TORCH_XPU_OPS_LIBRARIES}) + # Align with PyTorch compile options PYTORCH_SRC_DIR/cmake/public/utils.cmake + torch_compile_options(${lib}) + target_compile_options_if_supported(${lib} "-Wno-deprecated-copy") + target_compile_options(${lib} PRIVATE ${TORCH_XPU_OPS_FLAGS}) + + target_include_directories(${lib} PUBLIC ${TORCH_XPU_OPS_INCLUDE_DIRS}) + target_include_directories(${lib} PUBLIC ${ATen_XPU_INCLUDE_DIRS}) + target_include_directories(${lib} PUBLIC ${SYCL_INCLUDE_DIR}) + + target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARY}) +endforeach() diff --git a/src/BuildOnWindows.cmake b/src/BuildOnWindows.cmake new file mode 100644 index 000000000..a8e00cb55 --- /dev/null +++ b/src/BuildOnWindows.cmake @@ -0,0 +1,106 @@ +# Build on Windows + +set(TORCH_XPU_OPS_LIBRARIES) +set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE) + +# Walk around cyclic dependence +# libtorch_xpu.so links to libtorch_xpu_ops.a +# libtorch_xpu_ops.a dlopens libtorch_xpu_ops_aten.so (Break cycle) +# libtorch_xpu_ops_aten.so links to libtorch_xpu_ops_sycl_unary_binary_kernels.so and libtorch_xpu_ops_sycl_kernels.so +# libtorch_xpu_ops_sycl_unary_binary_kernels.so and libtorch_xpu_ops_sycl_kernels.so links to libtorch_xpu.so +add_library( + torch_xpu_ops + STATIC + ${ATen_XPU_CPP_SRCS} + "bridge.cpp") +set(PATH_TO_TORCH_XPU_OPS_ATEN_LIB \"torch_xpu_ops_aten.dll\") +target_compile_options(torch_xpu_ops PRIVATE -DPATH_TO_TORCH_XPU_OPS_ATEN_LIB=${PATH_TO_TORCH_XPU_OPS_ATEN_LIB}) + +add_library( + torch_xpu_ops_aten + SHARED + ${ATen_XPU_NATIVE_CPP_SRCS} + ${ATen_XPU_GEN_SRCS}) +install(TARGETS torch_xpu_ops_aten DESTINATION "${TORCH_INSTALL_LIB_DIR}") +target_link_libraries(torch_xpu_ops_aten PUBLIC torch_xpu) + +if(BUILD_SEPARATE_OPS) + foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + get_filename_component(name ${sycl_src} NAME_WLE REALPATH) + set(sycl_lib torch-xpu-ops-sycl-${name}) + sycl_add_library( + ${sycl_lib} + SHARED + SYCL_SOURCES ${sycl_src}) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + endforeach() +else() + # Split SYCL kernels into 2 libraries as categories 1) Unary+Binary 2) Others. + set(ATen_XPU_SYCL_UNARY_BINARY_SRCS) + set(ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS) + + foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + string(REGEX MATCH "Binary" IS_BINARY ${sycl_src}) + string(REGEX MATCH "Unary" IS_UNARY ${sycl_src}) + # Resolve cyclic dependences between + # torch_xpu_ops_sycl_unary_binary_kernels.dll and + # torch_xpu_ops_sycl_kernels.dll. Move definition and invoke of kernels + # into a same kernel library. Here we move elementwise kernel pow and copy + # into torch_xpu_ops_sycl_unary_binary_kernels.dll. 
+ string(REGEX MATCH "Pow" IS_POW ${sycl_src}) + string(REGEX MATCH "Copy" IS_COPY ${sycl_src}) + if(IS_BINARY STREQUAL "" AND IS_UNARY STREQUAL "" AND IS_POW STREQUAL "" AND IS_COPY STREQUAL "") + list(APPEND ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS ${sycl_src}) + else() + list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src}) + endif() + endforeach() + + set(sycl_unary_binary_lib torch_xpu_ops_sycl_unary_binary_kernels) + sycl_add_library( + ${sycl_unary_binary_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_BINARY_SRCS}) + target_compile_definitions(${sycl_unary_binary_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_unary_binary_lib}) + target_link_libraries(${sycl_unary_binary_lib} PUBLIC torch_xpu) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_binary_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_unary_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + set(sycl_lib torch_xpu_ops_sycl_kernels) + sycl_add_library( + ${sycl_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS}) + target_compile_definitions(${sycl_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_lib}) + target_link_libraries(${sycl_lib} PUBLIC torch_xpu) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") +endif() +set(SYCL_LINK_LIBRARIES_KEYWORD) + +list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) +list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten) + +foreach(lib ${TORCH_XPU_OPS_LIBRARIES}) + # Align with PyTorch compile options PYTORCH_SRC_DIR/cmake/public/utils.cmake + torch_compile_options(${lib}) + target_compile_options_if_supported(${lib} "-Wno-deprecated-copy") + target_compile_options(${lib} PRIVATE ${TORCH_XPU_OPS_FLAGS}) + + target_include_directories(${lib} PUBLIC ${TORCH_XPU_OPS_INCLUDE_DIRS}) + target_include_directories(${lib} PUBLIC ${ATen_XPU_INCLUDE_DIRS}) + target_include_directories(${lib} PUBLIC ${SYCL_INCLUDE_DIR}) + + target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARY}) + target_link_libraries(${lib} PUBLIC c10_xpu) +endforeach() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index aa7551ebe..0716ca5af 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,95 +2,24 @@ include(${TORCH_XPU_OPS_ROOT}/cmake/Codegen.cmake) set(ATen_XPU_CPP_SRCS) +set(ATen_XPU_NATIVE_CPP_SRCS) set(ATen_XPU_SYCL_SRCS) set(ATen_XPU_INCLUDE_DIRS ${TORCH_XPU_OPS_ROOT}/src CACHE STRING "ATen XPU Include directory") add_subdirectory(ATen) -set(TORCH_XPU_OPS_LIBRARIES) -set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE) - -add_library( - torch_xpu_ops - STATIC - ${ATen_XPU_CPP_SRCS} - ${ATen_XPU_GEN_SRCS}) - -if(BUILD_SEPARATE_OPS) - foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) - get_filename_component(name ${sycl_src} NAME_WLE REALPATH) - set(sycl_lib torch-xpu-ops-sycl-${name}) - sycl_add_library( - ${sycl_lib} - SHARED - SYCL_SOURCES ${sycl_src}) - target_link_libraries(torch_xpu_ops PUBLIC ${sycl_lib}) - if(USE_PER_OPERATOR_HEADERS) - target_compile_definitions(torch_xpu_ops PUBLIC -DAT_PER_OPERATOR_HEADERS) - endif() - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - endforeach() +# With the increasement of bin size, we have to split libtorch_xpu.so into +# multiple libraries. 
Because of strict linkage requirements on Windows, +# we add extra logic to resolve 1) cyclic dependences and 2) symbol visibility. +# To isolate the impact and keep both variants maintainable, separate code pieces +# set up the CMake targets. We will revert the library split and unify these once +# device code compression is supported by the SYCL/backend compiler. +if(WIN32) + include(BuildOnWindows.cmake) else() - # Split SYCL kernels into 2 libraries as categories 1) Unary+Binary 2) Others. - set(ATen_XPU_SYCL_UNARY_BINARY_SRCS) - set(ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS) - - foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) - string(REGEX MATCH "Binary" IS_BINARY ${sycl_src}) - string(REGEX MATCH "Unary" IS_UNARY ${sycl_src}) - if(IS_BINARY STREQUAL "" AND IS_UNARY STREQUAL "") - list(APPEND ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS ${sycl_src}) - else() - list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src}) - endif() - endforeach() - - set(sycl_unary_binary_lib torch_xpu_ops_sycl_unary_binary_kernels) - sycl_add_library( - ${sycl_unary_binary_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_BINARY_SRCS}) - target_link_libraries(torch_xpu_ops PUBLIC ${sycl_unary_binary_lib}) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_binary_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_unary_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - - set(sycl_lib torch_xpu_ops_sycl_kernels) - sycl_add_library( - ${sycl_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS}) - target_link_libraries(torch_xpu_ops PUBLIC ${sycl_lib}) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + include(BuildOnLinux.cmake) endif() -set(SYCL_LINK_LIBRARIES_KEYWORD) - -list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) - -foreach(lib ${TORCH_XPU_OPS_LIBRARIES}) - # Align with PyTorch compile options PYTORCH_SRC_DIR/cmake/public/utils.cmake - torch_compile_options(${lib}) - target_compile_options_if_supported(${lib} "-Wno-deprecated-copy") - target_compile_options(${lib} PRIVATE ${TORCH_XPU_OPS_FLAGS}) - - if(USE_PER_OPERATOR_HEADERS) - target_compile_definitions(${lib} PUBLIC -DAT_PER_OPERATOR_HEADERS) - endif() - - target_include_directories(${lib} PUBLIC ${TORCH_XPU_OPS_INCLUDE_DIRS}) - target_include_directories(${lib} PUBLIC ${ATen_XPU_INCLUDE_DIRS}) - target_include_directories(${lib} PUBLIC ${SYCL_INCLUDE_DIR}) - - target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARY}) -endforeach() include(${TORCH_XPU_OPS_ROOT}/cmake/ClangFormat.cmake) if(CLANG_FORMAT) diff --git a/src/bridge.cpp b/src/bridge.cpp new file mode 100644 index 000000000..c19ce2554 --- /dev/null +++ b/src/bridge.cpp @@ -0,0 +1,34 @@ +#include +#include +#include + +// The implementation helps work around a cyclic dependence when we separate +// kernels into multiple dll/so files to avoid a large binary (>2GB). +// The file is built into libtorch_xpu.so. libtorch_xpu.so won't depend on +// libtorch_xpu_ops_aten.so but dlopens the library to work around the cyclic +// dependence during linkage.
To break cycle like, +// libtorch_xpu.so -> (dlopen) libtorch_xpu_ops_aten.so -> (link) +// libtorch_xpu_ops_kernels.so +// -> (link) +// libtorch_xpu_ops_unary_binary_kernels.so +// libtorch_xpu_ops_kernels.so -> (link) libtorch_xpu.so +// libtorch_xpu_ops_unary_binary_kernels.so -> (link) libtorch_xpu.so +namespace { + +class LoadTorchXPUOps { + public: + LoadTorchXPUOps() { + if (NULL == LoadLibrary(PATH_TO_TORCH_XPU_OPS_ATEN_LIB)) { + std::ostringstream error; + error << "PyTorch XPU operators library is not loaded (ERROR: " + << GetLastError() + << "). Please check if PyTorch is installed correctly." + << " Or please file an issue on https://github.com/intel/torch-xpu-ops/issues."; + throw std::runtime_error(error.str()); + } + } +}; + +static LoadTorchXPUOps init; + +} // namespace diff --git a/src/comm/DeviceProperties.h b/src/comm/DeviceProperties.h index 0f4c084c8..01beffa74 100644 --- a/src/comm/DeviceProperties.h +++ b/src/comm/DeviceProperties.h @@ -185,5 +185,11 @@ uint32_t syclNativeVectorWidth( "Invalid data type to fetch native vector width!"); } +static inline bool syclHasFloat64( + at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + auto* dev_prop = at::xpu::getDeviceProperties(dev_id); + return dev_prop->has_fp64; +} + } // namespace sycl } // namespace xpu diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index 86ef8176a..9c7b2270f 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -1,165 +1,22 @@ import os import sys +from skip_list_common import skip_dict +from skip_list_win import skip_dict as skip_dict_win -skip_list = ( - # Calculation error between XPU implementation and CPU implementation, - # 1. Compiler optimization causes failing to promote data type to higher precision. - # 2. Accumulate error is amplified by some operations in some extreme cases. (std::exp(extreme_large_num)) - # 3. Accumulate error is amplified by a large number of accumalate operations. - # 4. Accumulate error is different on different implementations due to different accumulation order. - # a. Different kernel implementations. - # b. Different std functions. (std::log, std::tanh, std::exp) - # 5. The result of division between two same float values is not 1. - # 6. std functions get different results when input is nan or inf between GCC and SYCL. 
- "test_compare_cpu_cumsum_xpu_bfloat16", - "test_compare_cpu_cumsum_xpu_float16", - "test_compare_cpu_log_xpu_complex64", - "test_compare_cpu_log10_xpu_complex64", - "test_compare_cpu_log1p_xpu_complex64", - "test_compare_cpu_log2_xpu_complex64", - "test_compare_cpu_log2_xpu_complex128", - "test_compare_cpu_mul_xpu_complex64", - "test_compare_cpu_pow_xpu_complex128", - "test_compare_cpu_pow_xpu_complex64", - "test_compare_cpu_tan_xpu_complex128", - "test_compare_cpu_tan_xpu_complex64", - "test_compare_cpu_tanh_xpu_complex128", - "test_compare_cpu_tanh_xpu_complex64", - "test_compare_cpu_rsqrt_xpu_bfloat16", - "test_compare_cpu_pow_xpu_bfloat16", - # cuda has the same issue on this case - "test_compare_cpu__refs_rsub_xpu_bfloat16", - "test_compare_cpu_add_xpu_bfloat16", - "test_compare_cpu_sub_xpu_bfloat16", - "test_compare_cpu_acos_xpu_complex128", - "test_compare_cpu_acos_xpu_complex64", - "test_compare_cpu_acosh_xpu_complex64", - "test_compare_cpu_cross_xpu_float16", - "test_compare_cpu_floor_divide_xpu_bfloat16", - "test_compare_cpu_floor_divide_xpu_float16", - "test_compare_cpu_polygamma_polygamma_n_0_xpu_bfloat16", - "test_compare_cpu_exp_xpu_bfloat16", - "test_compare_cpu_exp_xpu_complex128", - "test_compare_cpu_exp_xpu_complex64", - "test_compare_cpu_acosh_xpu_complex64", - "test_compare_cpu_asin_xpu_complex128", - "test_compare_cpu_asin_xpu_complex64", - "test_compare_cpu_asinh_xpu_complex128", - "test_compare_cpu_asinh_xpu_complex64", - "test_compare_cpu_atan_xpu_complex128", - "test_compare_cpu_atan_xpu_complex64", - "test_compare_cpu_exp2_xpu_complex128", - "test_compare_cpu_exp2_xpu_complex64", - "test_compare_cpu_nextafter_xpu_bfloat16", - # CUDA does not support the data type either - "test_non_standard_bool_values_native_dropout_backward_xpu_bool", - # Need FP64 golden ref for more accurate comparison - "test_compare_cpu_log_softmax_xpu_bfloat16", - # TestCompositeCompliance - # CPU fallback fails - # Require implementing aten::embedding_renorm_ - "test_view_replay_nn_functional_embedding_xpu_float32", - # TestCompositeCompliance::test_cow_input - # XPU Tensor fails in copy-on-write cases - # AssertionError: False is not true : Keyword argument 'output grad 0' during backward call unexpectedly materializes. Either set `supports_cow_input_no_materialize_backward=False` in this operation's OpInfo, add the arg to the OpInfo's `allow_cow_input_materialize_backward` list, or change the implementation to avoid materialization. - # https://github.com/intel/torch-xpu-ops/issues/281 - "test_cow_input", - # XPU implementation is correct. - # std::exp{-inf, nan}, the result is (±0,±0) (signs are unspecified) - # std::exp{-inf, inf}, the result is (±0,±0) (signs are unspecified) - # CPU implementation gets NaN in the cases. - # https://en.cppreference.com/w/cpp/numeric/complex/exp - "test_compare_cpu_sigmoid_xpu_complex64", - "test_compare_cpu_sigmoid_xpu_complex128", - # Special handle (different calculation order) in CPU reference impl. - # https://github.com/pytorch/pytorch/blob/c97e3ebb96d7457075b019b94411e8c2d058e68b/aten/src/ATen/native/EmbeddingBag.cpp#L300 - "test_compare_cpu_nn_functional_embedding_bag_xpu_bfloat16", - "test_compare_cpu_nn_functional_embedding_bag_xpu_float16", - # Not implemented operators, aten::embedding_renorm_. - # To retrieve cases when the operators are supported. 
- # https://github.com/intel/torch-xpu-ops/issues/380 - "test_compare_cpu_nn_functional_embedding_bag_xpu_float32", - "test_compare_cpu_nn_functional_embedding_bag_xpu_float64", - "test_view_replay_nn_functional_embedding_bag_xpu_float32", - # Double and complex datatype matmul is not supported in oneDNN - "test_compare_cpu_cdist_xpu_float64", - # bilinear interpolate includes large calculation steps, accuracy reduces in half-precision - # Not in CUDA test scope too - "test_compare_cpu_nn_functional_upsample_bilinear_xpu_bfloat16", - "test_compare_cpu_nn_functional_upsample_bilinear_xpu_float16", - # CPU result is not golden reference - "test_compare_cpu_nn_functional_group_norm_xpu_bfloat16", - "test_compare_cpu_nn_functional_group_norm_xpu_float16", - "test_compare_cpu_nn_functional_nll_loss_xpu_bfloat16", - "test_compare_cpu_nn_functional_nll_loss_xpu_float16", - "test_compare_cpu_nn_functional_batch_norm_xpu_bfloat16", - "test_compare_cpu__batch_norm_with_update_xpu_bfloat16", - "test_compare_cpu__batch_norm_with_update_xpu_float16", - "test_compare_cpu_nn_functional_huber_loss_xpu_bfloat16", - "test_compare_cpu_nansum_xpu_bfloat16", - "test_compare_cpu_nanmean_xpu_bfloat16", - # Align with CUDA impl by using accumulate type. But CPU doesn't use. - # When XPU uses original data type, the case passes. - "test_compare_cpu_logit_xpu_bfloat16", - # precison error - # Mismatched elements: 1 / 24 (4.2%) - # Greatest absolute difference: 0.03125 at index (0, 1, 0, 1) (up to 0.001 allowed) - # Greatest relative difference: 0.0048828125 at index (0, 1, 0, 1) (up to 0.001 allowed) - "test_compare_cpu_nn_functional_interpolate_bilinear_xpu_bfloat16", - # RuntimeError: "compute_index_ranges_weights" not implemented for 'Half' - "test_compare_cpu_nn_functional_interpolate_bilinear_xpu_float16", - # AssertionError: False is not true : Argument 0 during forward call unexpectedly materializes. Either set `supports_cow_input_no_materialize_forward=False... - "test_cow_input_nn_functional_interpolate_bilinear_xpu_float32", - "test_cow_input_nn_functional_interpolate_linear_xpu_float32", - "test_cow_input_nn_functional_interpolate_trilinear_xpu_float32", - #The results of XPU and CUDA are consistent, but the results of CPU and CUDA are inconsistent - "test_compare_cpu_nn_functional_interpolate_linear_xpu_bfloat16", - "test_compare_cpu_nn_functional_interpolate_linear_xpu_float16", - # bicubic interpolate includes large calculation steps, accuracy reduces in half-precision - # Not in CUDA test scope too - "test_compare_cpu_nn_functional_interpolate_bicubic_xpu_bfloat16", - "test_compare_cpu_nn_functional_interpolate_bicubic_xpu_float16", - # Not all operators are implemented for XPU tested in the case. - # Retrieve it once the operator is implemented. - # Error: The operator 'aten::glu_jvp' is not currently implemented for the XPU device. - "test_forward_ad_nn_functional_glu_xpu_float32", - # Precision error. - # Mismatched elements: 1 / 812 (0.1%) - # Greatest absolute difference: 0.03125 at index (610,) (up to 0.001 allowed) - # Greatest relative difference: 0.00396728515625 at index (610,) (up to 0.001 allowed) - "test_compare_cpu_hypot_xpu_bfloat16", - # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16. - # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error. - "test_compare_cpu_polar_xpu_bfloat16", - # Precision error. 
- # Mismatched elements: 1 / 25 (4.0%) - # Greatest absolute difference: 0.00146484375 at index (0, 0) (up to 0.001 allowed) - # Greatest relative difference: 0.0163116455078125 at index (0, 0) (up to 0.001 allowed) - "test_compare_cpu_sub_xpu_float16", - # different results for value index due to unstable sort. - # XPU and CUDA have the same result. - "test_compare_cpu_median_xpu_int16", - "test_compare_cpu_median_xpu_int32", - "test_compare_cpu_median_xpu_int64", - "test_compare_cpu_median_xpu_int8", - "test_compare_cpu_median_xpu_uint8", - "test_compare_cpu_nanmedian_xpu_int16", - "test_compare_cpu_nanmedian_xpu_int32", - "test_compare_cpu_nanmedian_xpu_int64", - "test_compare_cpu_nanmedian_xpu_int8", - "test_compare_cpu_nanmedian_xpu_uint8", -) +IS_WINDOWS = sys.platform == "win32" +skip_list = skip_dict["test_ops_xpu.py"] +if IS_WINDOWS: + skip_list += skip_dict_win["test_ops_xpu.py"] -skip_options = " -k 'not " + skip_list[0] +skip_options = " -k \"not " + skip_list[0] for skip_case in skip_list[1:]: skip_option = " and not " + skip_case skip_options += skip_option -skip_options += "'" +skip_options += "\"" -test_command = "PYTORCH_TEST_WITH_SLOW=1 pytest -v test_ops_xpu.py" +os.environ["PYTORCH_TEST_WITH_SLOW"]="1" +test_command = "pytest -v test_ops_xpu.py" test_command += skip_options - res = os.system(test_command) -exit_code = os.WEXITSTATUS(res) -sys.exit(exit_code) +sys.exit(res) diff --git a/test/xpu/extended/run_test_with_skip_arc.py b/test/xpu/extended/run_test_with_skip_arc.py new file mode 100644 index 000000000..30fd2c0e0 --- /dev/null +++ b/test/xpu/extended/run_test_with_skip_arc.py @@ -0,0 +1,24 @@ +import os +import sys +from skip_list_common import skip_dict +from skip_list_arc import skip_dict as skip_dict_specifical +from skip_list_win import skip_dict as skip_dict_win +from skip_list_win_arc import skip_dict as skip_dict_win_arc + +IS_WINDOWS = sys.platform == "win32" + +skip_list = skip_dict["test_ops_xpu.py"] + skip_dict_specifical["test_ops_xpu.py"] +if IS_WINDOWS: + skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_arc["test_ops_xpu.py"] + +skip_options = " -k \"not " + skip_list[0] +for skip_case in skip_list[1:]: + skip_option = " and not " + skip_case + skip_options += skip_option +skip_options += "\"" + +os.environ["PYTORCH_TEST_WITH_SLOW"]="1" +test_command = "pytest -v test_ops_xpu.py" +test_command += skip_options +res = os.system(test_command) +sys.exit(res) \ No newline at end of file diff --git a/test/xpu/extended/skip_list_arc.py b/test/xpu/extended/skip_list_arc.py new file mode 100644 index 000000000..e1e701b84 --- /dev/null +++ b/test/xpu/extended/skip_list_arc.py @@ -0,0 +1,11 @@ +skip_dict = { + "test_ops_xpu.py": ( + # RuntimeError: Required aspect fp64 is not supported on the device + # https://github.com/intel/torch-xpu-ops/issues/628 + "test_compare_cpu_bincount_xpu_int16", + "test_compare_cpu_bincount_xpu_int32", + "test_compare_cpu_bincount_xpu_int64", + "test_compare_cpu_bincount_xpu_int8", + "test_compare_cpu_bincount_xpu_uint8", + ), +} diff --git a/test/xpu/extended/skip_list_common.py b/test/xpu/extended/skip_list_common.py new file mode 100644 index 000000000..d754c9150 --- /dev/null +++ b/test/xpu/extended/skip_list_common.py @@ -0,0 +1,166 @@ +skip_dict = { + "test_ops_xpu.py": ( + # Calculation error between XPU implementation and CPU implementation, + # 1. Compiler optimization causes failing to promote data type to higher precision. + # 2. Accumulate error is amplified by some operations in some extreme cases. 
(std::exp(extreme_large_num)) + # 3. Accumulate error is amplified by a large number of accumalate operations. + # 4. Accumulate error is different on different implementations due to different accumulation order. + # a. Different kernel implementations. + # b. Different std functions. (std::log, std::tanh, std::exp) + # 5. The result of division between two same float values is not 1. + # 6. std functions get different results when input is nan or inf between GCC and SYCL. + "test_compare_cpu_cumsum_xpu_bfloat16", + "test_compare_cpu_cumsum_xpu_float16", + "test_compare_cpu_log_xpu_complex64", + "test_compare_cpu_log10_xpu_complex64", + "test_compare_cpu_log1p_xpu_complex64", + "test_compare_cpu_log2_xpu_complex64", + "test_compare_cpu_log2_xpu_complex128", + "test_compare_cpu_mul_xpu_complex64", + "test_compare_cpu_pow_xpu_complex128", + "test_compare_cpu_pow_xpu_complex64", + "test_compare_cpu_tan_xpu_complex128", + "test_compare_cpu_tan_xpu_complex64", + "test_compare_cpu_tanh_xpu_complex128", + "test_compare_cpu_tanh_xpu_complex64", + "test_compare_cpu_rsqrt_xpu_bfloat16", + "test_compare_cpu_pow_xpu_bfloat16", + # cuda has the same issue on this case + "test_compare_cpu__refs_rsub_xpu_bfloat16", + "test_compare_cpu_add_xpu_bfloat16", + "test_compare_cpu_sub_xpu_bfloat16", + "test_compare_cpu_acos_xpu_complex128", + "test_compare_cpu_acos_xpu_complex64", + "test_compare_cpu_acosh_xpu_complex64", + "test_compare_cpu_cross_xpu_float16", + "test_compare_cpu_floor_divide_xpu_bfloat16", + "test_compare_cpu_floor_divide_xpu_float16", + "test_compare_cpu_polygamma_polygamma_n_0_xpu_bfloat16", + "test_compare_cpu_exp_xpu_bfloat16", + "test_compare_cpu_exp_xpu_complex128", + "test_compare_cpu_exp_xpu_complex64", + "test_compare_cpu_acosh_xpu_complex64", + "test_compare_cpu_asin_xpu_complex128", + "test_compare_cpu_asin_xpu_complex64", + "test_compare_cpu_asinh_xpu_complex128", + "test_compare_cpu_asinh_xpu_complex64", + "test_compare_cpu_atan_xpu_complex128", + "test_compare_cpu_atan_xpu_complex64", + "test_compare_cpu_exp2_xpu_complex128", + "test_compare_cpu_exp2_xpu_complex64", + "test_compare_cpu_nextafter_xpu_bfloat16", + # CUDA does not support the data type either + "test_non_standard_bool_values_native_dropout_backward_xpu_bool", + # Need FP64 golden ref for more accurate comparison + "test_compare_cpu_log_softmax_xpu_bfloat16", + # TestCompositeCompliance + # CPU fallback fails + # Require implementing aten::embedding_renorm_ + "test_view_replay_nn_functional_embedding_xpu_float32", + # TestCompositeCompliance::test_cow_input + # XPU Tensor fails in copy-on-write cases + # AssertionError: False is not true : Keyword argument 'output grad 0' during backward call unexpectedly materializes. Either set `supports_cow_input_no_materialize_backward=False` in this operation's OpInfo, add the arg to the OpInfo's `allow_cow_input_materialize_backward` list, or change the implementation to avoid materialization. + # https://github.com/intel/torch-xpu-ops/issues/281 + "test_cow_input", + # XPU implementation is correct. + # std::exp{-inf, nan}, the result is (±0,±0) (signs are unspecified) + # std::exp{-inf, inf}, the result is (±0,±0) (signs are unspecified) + # CPU implementation gets NaN in the cases. + # https://en.cppreference.com/w/cpp/numeric/complex/exp + "test_compare_cpu_sigmoid_xpu_complex64", + "test_compare_cpu_sigmoid_xpu_complex128", + # Special handle (different calculation order) in CPU reference impl. 
+ # https://github.com/pytorch/pytorch/blob/c97e3ebb96d7457075b019b94411e8c2d058e68b/aten/src/ATen/native/EmbeddingBag.cpp#L300 + "test_compare_cpu_nn_functional_embedding_bag_xpu_bfloat16", + "test_compare_cpu_nn_functional_embedding_bag_xpu_float16", + # Not implemented operators, aten::embedding_renorm_. + # To retrieve cases when the operators are supported. + # https://github.com/intel/torch-xpu-ops/issues/380 + "test_compare_cpu_nn_functional_embedding_bag_xpu_float32", + "test_compare_cpu_nn_functional_embedding_bag_xpu_float64", + "test_view_replay_nn_functional_embedding_bag_xpu_float32", + # Double and complex datatype matmul is not supported in oneDNN + "test_compare_cpu_cdist_xpu_float64", + # bilinear interpolate includes large calculation steps, accuracy reduces in half-precision + # Not in CUDA test scope too + "test_compare_cpu_nn_functional_upsample_bilinear_xpu_bfloat16", + "test_compare_cpu_nn_functional_upsample_bilinear_xpu_float16", + # CPU result is not golden reference + "test_compare_cpu_nn_functional_group_norm_xpu_bfloat16", + "test_compare_cpu_nn_functional_group_norm_xpu_float16", + "test_compare_cpu_nn_functional_nll_loss_xpu_bfloat16", + "test_compare_cpu_nn_functional_nll_loss_xpu_float16", + "test_compare_cpu_nn_functional_batch_norm_xpu_bfloat16", + "test_compare_cpu__batch_norm_with_update_xpu_bfloat16", + "test_compare_cpu__batch_norm_with_update_xpu_float16", + "test_compare_cpu_nn_functional_huber_loss_xpu_bfloat16", + "test_compare_cpu_nansum_xpu_bfloat16", + "test_compare_cpu_nanmean_xpu_bfloat16", + # Align with CUDA impl by using accumulate type. But CPU doesn't use. + # When XPU uses original data type, the case passes. + "test_compare_cpu_logit_xpu_bfloat16", + # precison error + # Mismatched elements: 1 / 24 (4.2%) + # Greatest absolute difference: 0.03125 at index (0, 1, 0, 1) (up to 0.001 allowed) + # Greatest relative difference: 0.0048828125 at index (0, 1, 0, 1) (up to 0.001 allowed) + "test_compare_cpu_nn_functional_interpolate_bilinear_xpu_bfloat16", + # RuntimeError: "compute_index_ranges_weights" not implemented for 'Half' + "test_compare_cpu_nn_functional_interpolate_bilinear_xpu_float16", + # AssertionError: False is not true : Argument 0 during forward call unexpectedly materializes. Either set `supports_cow_input_no_materialize_forward=False... + "test_cow_input_nn_functional_interpolate_bilinear_xpu_float32", + "test_cow_input_nn_functional_interpolate_linear_xpu_float32", + "test_cow_input_nn_functional_interpolate_trilinear_xpu_float32", + #The results of XPU and CUDA are consistent, but the results of CPU and CUDA are inconsistent + "test_compare_cpu_nn_functional_interpolate_linear_xpu_bfloat16", + "test_compare_cpu_nn_functional_interpolate_linear_xpu_float16", + # bicubic interpolate includes large calculation steps, accuracy reduces in half-precision + # Not in CUDA test scope too + "test_compare_cpu_nn_functional_interpolate_bicubic_xpu_bfloat16", + "test_compare_cpu_nn_functional_interpolate_bicubic_xpu_float16", + # Not all operators are implemented for XPU tested in the case. + # Retrieve it once the operator is implemented. + # Error: The operator 'aten::glu_jvp' is not currently implemented for the XPU device. + "test_forward_ad_nn_functional_glu_xpu_float32", + # Precision error. 
+ # Mismatched elements: 1 / 812 (0.1%) + # Greatest absolute difference: 0.03125 at index (610,) (up to 0.001 allowed) + # Greatest relative difference: 0.00396728515625 at index (610,) (up to 0.001 allowed) + "test_compare_cpu_hypot_xpu_bfloat16", + # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16. + # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error. + "test_compare_cpu_polar_xpu_bfloat16", + # Precision error. + # Mismatched elements: 1 / 25 (4.0%) + # Greatest absolute difference: 0.00146484375 at index (0, 0) (up to 0.001 allowed) + # Greatest relative difference: 0.0163116455078125 at index (0, 0) (up to 0.001 allowed) + "test_compare_cpu_sub_xpu_float16", + # different results for value index due to unstable sort. + # XPU and CUDA have the same result. + "test_compare_cpu_median_xpu_int16", + "test_compare_cpu_median_xpu_int32", + "test_compare_cpu_median_xpu_int64", + "test_compare_cpu_median_xpu_int8", + "test_compare_cpu_median_xpu_uint8", + "test_compare_cpu_nanmedian_xpu_int16", + "test_compare_cpu_nanmedian_xpu_int32", + "test_compare_cpu_nanmedian_xpu_int64", + "test_compare_cpu_nanmedian_xpu_int8", + "test_compare_cpu_nanmedian_xpu_uint8", + + # sort algorithm is different to cpu + "test_compare_cpu_argsort_xpu_bfloat16", + "test_compare_cpu_argsort_xpu_float16", + "test_compare_cpu_argsort_xpu_int8", + "test_compare_cpu_argsort_xpu_uint8", + + # AssertionError: The values for attribute 'dtype' do not match: torch.float32 != torch.bfloat16 + # https://github.com/intel/torch-xpu-ops/issues/780 + "test_compare_cpu_native_layer_norm_xpu_bfloat16", + "test_compare_cpu_native_layer_norm_xpu_float16", + + # AssertionError: Tensor-likes are not close! 
+ # https://github.com/intel/torch-xpu-ops/issues/781 + "test_compare_cpu_square_xpu_complex64", + + ), +} diff --git a/test/xpu/extended/skip_list_win.py b/test/xpu/extended/skip_list_win.py new file mode 100644 index 000000000..e8faa008d --- /dev/null +++ b/test/xpu/extended/skip_list_win.py @@ -0,0 +1,5 @@ +skip_dict = { + "test_ops_xpu.py": ( + "test_compare_cpu_pow_xpu_bfloat16", # https://github.com/intel/torch-xpu-ops/pull/764 + ), +} diff --git a/test/xpu/extended/skip_list_win_arc.py b/test/xpu/extended/skip_list_win_arc.py new file mode 100644 index 000000000..9d9221a4d --- /dev/null +++ b/test/xpu/extended/skip_list_win_arc.py @@ -0,0 +1,8 @@ +skip_dict = { + # SYCL Compiler on Windows removed the following operations when '-cl-poison-unsupported-fp64-kernels' is on + # Hence, skip the following windows specific errors + "test_ops_xpu.py": ( + "test_compare_cpu_sqrt_xpu_complex64", + "test_backward_nn_functional_adaptive_avg_pool2d_xpu_float32", + ), +} diff --git a/test/xpu/extended/test_ops_xpu.py b/test/xpu/extended/test_ops_xpu.py index e6d46fccb..93fc8f359 100644 --- a/test/xpu/extended/test_ops_xpu.py +++ b/test/xpu/extended/test_ops_xpu.py @@ -44,7 +44,11 @@ xpu_device = torch.device("xpu") any_common_cpu_xpu_one = OpDTypes.any_common_cpu_cuda_one -cpu_xpu_all = (torch.bfloat16, torch.complex128, torch.complex64, torch.float16, torch.float32, torch.float64, torch.int16, torch.int32, torch.int64, torch.int8, torch.uint8, torch.bool) +cpu_xpu_all = (torch.bfloat16, torch.complex64, torch.float16, torch.float32, torch.int16, + torch.int32, torch.int64, torch.int8, torch.uint8, torch.bool) \ + if "has_fp64=0" in str(torch.xpu.get_device_properties(0)) else \ + (torch.bfloat16, torch.complex128, torch.complex64, torch.float16, torch.float32, + torch.float64, torch.int16, torch.int32, torch.int64, torch.int8, torch.uint8, torch.bool) _ops_and_refs_with_no_numpy_ref = [op for op in ops_and_refs if op.ref is None] _xpu_computation_ops = [ diff --git a/test/xpu/run_test_with_only.py b/test/xpu/run_test_with_only.py index 9b48b4959..28ce0a072 100644 --- a/test/xpu/run_test_with_only.py +++ b/test/xpu/run_test_with_only.py @@ -6,37 +6,27 @@ def launch_test(test_case, skip_list=None, exe_list=None): + os.environ["PYTORCH_ENABLE_XPU_FALLBACK"]="1" + os.environ["PYTORCH_TEST_WITH_SLOW"]="1" if skip_list != None: - skip_options = " -k 'not " + skip_list[0] + skip_options = " -k \"not " + skip_list[0] for skip_case in skip_list[1:]: skip_option = " and not " + skip_case skip_options += skip_option - skip_options += "'" - test_command = ( - "PYTORCH_ENABLE_XPU_FALLBACK=1 PYTORCH_TEST_WITH_SLOW=1 pytest -v " - + test_case - ) - test_command += skip_options + skip_options += "\"" + test_command = "pytest -v " + test_case + skip_options return os.system(test_command) elif exe_list != None: - exe_options = " -k '" + exe_list[0] + exe_options = " -k \"" + exe_list[0] for exe_case in exe_list[1:]: exe_option = " or " + exe_case exe_options += exe_option - exe_options += "'" - test_command = ( - "PYTORCH_ENABLE_XPU_FALLBACK=1 PYTORCH_TEST_WITH_SLOW=1 pytest -v " - + test_case - ) - test_command += exe_options + exe_options += "\"" + test_command = "pytest -v " + test_case + exe_options return os.system(test_command) else: - test_command = ( - "PYTORCH_ENABLE_XPU_FALLBACK=1 PYTORCH_TEST_WITH_SLOW=1 pytest -v " - + test_case - ) - return os.system(test_command) - + test_command = "pytest -v " + test_case + return os.system(test_command) res = 0 @@ -51,5 +41,8 @@ def launch_test(test_case, 
skip_list=None, exe_list=None): ) res += launch_test("test_decomp_xpu.py", exe_list=execute_list) -exit_code = os.WEXITSTATUS(res) -sys.exit(exit_code) +if os.name == "nt": + sys.exit(res) +else: + exit_code = os.WEXITSTATUS(res) + sys.exit(exit_code) diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index e348355b5..c321b7a9b 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -17,5 +17,9 @@ if fail_test: print(",".join(fail_test) + " have failures") -exit_code = os.WEXITSTATUS(res) -sys.exit(exit_code) + +if os.name == "nt": + sys.exit(res) +else: + exit_code = os.WEXITSTATUS(res) + sys.exit(exit_code) diff --git a/test/xpu/run_test_with_skip_arc.py b/test/xpu/run_test_with_skip_arc.py index c20b1e5ac..3c99244a1 100644 --- a/test/xpu/run_test_with_skip_arc.py +++ b/test/xpu/run_test_with_skip_arc.py @@ -20,5 +20,8 @@ skip_list += skip_dict_win_arc[key] res += launch_test(key, skip_list) -exit_code = os.WEXITSTATUS(res) -sys.exit(exit_code) +if os.name == "nt": + sys.exit(res) +else: + exit_code = os.WEXITSTATUS(res) + sys.exit(exit_code) \ No newline at end of file diff --git a/test/xpu/run_test_with_skip_mtl.py b/test/xpu/run_test_with_skip_mtl.py index 828466b11..b8f6c180e 100644 --- a/test/xpu/run_test_with_skip_mtl.py +++ b/test/xpu/run_test_with_skip_mtl.py @@ -16,5 +16,8 @@ skip_list += skip_dict_win[key] res += launch_test(key, skip_list) -exit_code = os.WEXITSTATUS(res) -sys.exit(exit_code) +if os.name == "nt": + sys.exit(res) +else: + exit_code = os.WEXITSTATUS(res) + sys.exit(exit_code) \ No newline at end of file diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py index a83fa6749..6b5df037b 100644 --- a/test/xpu/skip_list_common.py +++ b/test/xpu/skip_list_common.py @@ -1,6 +1,9 @@ skip_dict = { "test_ops_xpu.py": ( # Skip list of base line + + # Need to revisit when the ops are enabled + # AssertionError: The supported dtypes for xxx on device type xpu are incorrect! "test_dtypes___rmod___xpu", "test_dtypes_nn_functional_conv1d_xpu", "test_dtypes_nn_functional_conv2d_xpu", @@ -10,176 +13,122 @@ "test_dtypes_nn_functional_conv_transpose3d_xpu", "test_dtypes_nn_functional_softsign_xpu", "test_dtypes_sparse_sampled_addmm_xpu", - "test_compare_cpu_sparse_sampled_addmm_xpu_float32", - "test_compare_cpu_to_sparse_xpu_float32", + # AssertionError: RuntimeError not raised "test_errors_dot_xpu", - "test_errors_kthvalue_xpu", - "test_errors_sparse_mul_layout0_xpu", - "test_errors_sparse_mul_layout1_xpu", - "test_errors_sparse_mul_layout2_xpu", - "test_errors_sparse_mul_layout3_xpu", + "test_errors_kthvalue_xpu", "test_errors_take_xpu", "test_errors_vdot_xpu", - "test_non_standard_bool_values_to_sparse_xpu_bool", + # Fallback cases with skipCPUIfNoLapack, AssertionError: Tensor-likes are not close! 
"test_noncontiguous_samples_linalg_det_xpu_float32", "test_noncontiguous_samples_linalg_slogdet_xpu_float32", "test_noncontiguous_samples_linalg_solve_ex_xpu_float32", "test_noncontiguous_samples_linalg_solve_xpu_float32", "test_noncontiguous_samples_linalg_tensorsolve_xpu_float32", "test_noncontiguous_samples_logdet_xpu_float32", - "test_noncontiguous_samples_nn_functional_conv1d_xpu_int64", - "test_noncontiguous_samples_nn_functional_conv2d_xpu_int64", - "test_noncontiguous_samples_nn_functional_conv3d_xpu_int64", + "test_noncontiguous_samples_nn_functional_rrelu_xpu_float32", "test_noncontiguous_samples_nn_functional_conv3d_xpu_complex64", - "test_noncontiguous_samples_nn_functional_conv_transpose1d_xpu_int64", + "test_variant_consistency_eager_nn_functional_rrelu_xpu_float32", + + # RuntimeError: device type of values (xpu) must be CPU or CUDA or Meta + # https://github.com/intel/torch-xpu-ops/issues/357 + "test_compare_cpu_sparse_sampled_addmm_xpu_float32", + "test_errors_sparse_mul_layout0_xpu", + "test_errors_sparse_mul_layout1_xpu", + "test_errors_sparse_mul_layout2_xpu", + "test_errors_sparse_mul_layout3_xpu", + "test_out_requires_grad_error_sparse_sampled_addmm_xpu_complex64", + "test_out_requires_grad_error_sparse_sampled_addmm_xpu_float32", + + # NotImplementedError: Could not run 'aten::_to_dense' with arguments from the 'SparseXPU' backend. + # https://github.com/intel/torch-xpu-ops/issues/357 + "test_compare_cpu_to_sparse_xpu_float32", + "test_variant_consistency_eager_to_sparse_xpu_float32", + + # RuntimeError: sparse_dim expected sparse or strided tensor layout but got Sparse + # Issue https://github.com/intel/torch-xpu-ops/issues/357 + "test_variant_consistency_eager_to_sparse_xpu_complex64", + "test_non_standard_bool_values_to_sparse_xpu_bool", + + # OneDNN issues, https://github.com/intel/torch-xpu-ops/issues/253 + # RuntimeError: Long is not supported in oneDNN! + # RuntimeError: could not create a primitive descriptor for a deconvolution forward propagation primitive + # RuntimeError: Double and complex datatype matmul is not supported in oneDNN + "test_noncontiguous_samples_nn_functional_conv3d_xpu_int64", + "test_noncontiguous_samples_nn_functional_conv_transpose1d_xpu_int64", # "test_noncontiguous_samples_nn_functional_conv_transpose2d_xpu_complex64", "test_noncontiguous_samples_nn_functional_conv_transpose2d_xpu_float32", "test_noncontiguous_samples_nn_functional_conv_transpose2d_xpu_int64", "test_noncontiguous_samples_nn_functional_conv_transpose3d_xpu_complex64", "test_noncontiguous_samples_nn_functional_conv_transpose3d_xpu_float32", "test_noncontiguous_samples_nn_functional_conv_transpose3d_xpu_int64", - "test_noncontiguous_samples_nn_functional_rrelu_xpu_float32", + "test_noncontiguous_samples_nn_functional_conv1d_xpu_int64", + "test_noncontiguous_samples_nn_functional_conv2d_xpu_int64", + + # RuntimeError: mode only supports CPU AND CUDA device type, got: xpu + # Issue https://github.com/intel/torch-xpu-ops/issues/327 "test_numpy_ref_linalg_tensorinv_xpu_float64", "test_out_mode_xpu_float32", + + # RuntimeError: false INTERNAL ASSERT FAILED at "/home/gta/daisyden/pytorch4/aten/src/ATen/native/DispatchStub.cpp":220, please report a bug to PyTorch. 
DispatchStub: missing kernel for xpu "test_out_nanmean_xpu_float32", - "test_out_requires_grad_error_sparse_sampled_addmm_xpu_complex64", - "test_out_requires_grad_error_sparse_sampled_addmm_xpu_float32", - "test_out_warning_nanmean_xpu", - "test_python_ref__refs_linspace_tensor_overload_xpu_int16", - "test_python_ref__refs_linspace_tensor_overload_xpu_int32", - "test_python_ref__refs_linspace_tensor_overload_xpu_int64", - "test_python_ref__refs_linspace_tensor_overload_xpu_int8", - "test_python_ref__refs_linspace_tensor_overload_xpu_uint8", - "test_python_ref__refs_linspace_xpu_int16", - "test_python_ref__refs_linspace_xpu_int32", - "test_python_ref__refs_linspace_xpu_int64", - "test_python_ref__refs_linspace_xpu_int8", - "test_python_ref__refs_linspace_xpu_uint8", - "test_python_ref__refs_logaddexp_xpu_complex128", - "test_python_ref__refs_logaddexp_xpu_complex64", - "test_python_ref__refs_native_layer_norm_xpu_bfloat16", - "test_python_ref__refs_native_layer_norm_xpu_float16", - "test_python_ref__refs_native_layer_norm_xpu_float32", - "test_python_ref__refs_nn_functional_hinge_embedding_loss_xpu_bfloat16", - "test_python_ref__refs_nn_functional_hinge_embedding_loss_xpu_float16", - "test_python_ref__refs_nn_functional_margin_ranking_loss_xpu_bfloat16", - "test_python_ref__refs_nn_functional_margin_ranking_loss_xpu_float16", - "test_python_ref__refs_nn_functional_triplet_margin_loss_xpu_uint8", - "test_python_ref__refs_square_xpu_bool", - "test_python_ref__refs_trunc_xpu_float64", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_bfloat16", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_float16", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_float32", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_float64", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_int16", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_int32", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_int64", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_int8", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_uint8", - "test_python_ref_executor__refs_linspace_executor_aten_xpu_int16", - "test_python_ref_executor__refs_linspace_executor_aten_xpu_int32", - "test_python_ref_executor__refs_linspace_executor_aten_xpu_int64", - "test_python_ref_executor__refs_linspace_executor_aten_xpu_int8", - "test_python_ref_executor__refs_linspace_executor_aten_xpu_uint8", - "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_int16", - "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_int32", - "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_int64", - "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_int8", - "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_uint8", - "test_python_ref_executor__refs_log_normal_executor_aten_xpu_bfloat16", - "test_python_ref_executor__refs_log_normal_executor_aten_xpu_float16", - "test_python_ref_executor__refs_log_normal_executor_aten_xpu_float32", - "test_python_ref_executor__refs_log_normal_executor_aten_xpu_float64", + "test_out_warning_nanmean_xpu", + + # NameError: name 'nanj' is not defined. Did you mean: 'nan'? 
+ # https://github.com/intel/torch-xpu-ops/issues/768 "test_python_ref_executor__refs_logaddexp_executor_aten_xpu_complex128", "test_python_ref_executor__refs_logaddexp_executor_aten_xpu_complex64", - "test_python_ref_executor__refs_native_layer_norm_executor_aten_xpu_bfloat16", - "test_python_ref_executor__refs_native_layer_norm_executor_aten_xpu_float16", - "test_python_ref_executor__refs_native_layer_norm_executor_aten_xpu_float32", - "test_python_ref_executor__refs_nn_functional_alpha_dropout_executor_aten_xpu_bfloat16", - "test_python_ref_executor__refs_nn_functional_alpha_dropout_executor_aten_xpu_float16", - "test_python_ref_executor__refs_nn_functional_alpha_dropout_executor_aten_xpu_float32", - "test_python_ref_executor__refs_nn_functional_alpha_dropout_executor_aten_xpu_float64", - "test_python_ref_executor__refs_nn_functional_hinge_embedding_loss_executor_aten_xpu_bfloat16", - "test_python_ref_executor__refs_nn_functional_hinge_embedding_loss_executor_aten_xpu_float16", - "test_python_ref_executor__refs_nn_functional_margin_ranking_loss_executor_aten_xpu_bfloat16", - "test_python_ref_executor__refs_nn_functional_margin_ranking_loss_executor_aten_xpu_float16", - "test_python_ref_executor__refs_nn_functional_triplet_margin_loss_executor_aten_xpu_uint8", - "test_python_ref_executor__refs_square_executor_aten_xpu_bool", - "test_python_ref_executor__refs_vdot_executor_aten_xpu_complex128", - "test_python_ref_executor__refs_vdot_executor_aten_xpu_complex64", - "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int16", - "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int32", - "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int64", - "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int8", - "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_uint8", - "test_python_ref_torch_fallback__refs_linspace_xpu_int16", - "test_python_ref_torch_fallback__refs_linspace_xpu_int32", - "test_python_ref_torch_fallback__refs_linspace_xpu_int64", - "test_python_ref_torch_fallback__refs_linspace_xpu_int8", - "test_python_ref_torch_fallback__refs_linspace_xpu_uint8", - "test_python_ref_torch_fallback__refs_logaddexp_xpu_complex128", - "test_python_ref_torch_fallback__refs_logaddexp_xpu_complex64", - "test_python_ref_torch_fallback__refs_native_layer_norm_xpu_bfloat16", - "test_python_ref_torch_fallback__refs_native_layer_norm_xpu_float16", - "test_python_ref_torch_fallback__refs_native_layer_norm_xpu_float32", - "test_python_ref_torch_fallback__refs_nn_functional_hinge_embedding_loss_xpu_bfloat16", - "test_python_ref_torch_fallback__refs_nn_functional_hinge_embedding_loss_xpu_float16", - "test_python_ref_torch_fallback__refs_nn_functional_margin_ranking_loss_xpu_bfloat16", - "test_python_ref_torch_fallback__refs_nn_functional_margin_ranking_loss_xpu_float16", - "test_python_ref_torch_fallback__refs_sinh_xpu_complex128", - "test_python_ref_torch_fallback__refs_special_multigammaln_mvlgamma_p_5_xpu_int32", - "test_python_ref_torch_fallback__refs_square_xpu_bool", - "test_python_ref_torch_fallback__refs_vdot_xpu_complex128", - "test_python_ref_torch_fallback__refs_vdot_xpu_complex64", + + # RuntimeError: could not create a primitive descriptor for a deconvolution + # https://github.com/intel/torch-xpu-ops/issues/253 "test_variant_consistency_eager_nn_functional_conv_transpose2d_xpu_complex64", "test_variant_consistency_eager_nn_functional_conv_transpose2d_xpu_float32", 
"test_variant_consistency_eager_nn_functional_conv_transpose3d_xpu_complex64", "test_variant_consistency_eager_nn_functional_conv_transpose3d_xpu_float32", - "test_variant_consistency_eager_nn_functional_rrelu_xpu_float32", - "test_variant_consistency_eager_to_sparse_xpu_complex64", - "test_variant_consistency_eager_to_sparse_xpu_float32", - "test_compare_cpu__refs_special_zeta_xpu_float32", + + # Need revisit when the op is enabled + # Unexpected success, xpu passed because it compares to cpu "test_compare_cpu_linalg_lu_factor_ex_xpu_float32", "test_compare_cpu_linalg_lu_factor_xpu_float32", "test_compare_cpu_linalg_lu_xpu_float32", "test_compare_cpu_special_hermite_polynomial_h_xpu_float32", - "test_compare_cpu_special_zeta_xpu_float32", + "test_compare_cpu_special_zeta_xpu_float32", + + # XFAIL of CUDA and XPU, unexpected success in fallback "test_out_cholesky_inverse_xpu_float32", "test_out_geqrf_xpu_float32", "test_out_narrow_copy_xpu_float32", "test_out_ormqr_xpu_float32", - "test_out_triangular_solve_xpu_float32", - "test_python_ref__refs_heaviside_xpu_int64", - "test_python_ref__refs_special_bessel_j0_xpu_int64", - "test_python_ref_errors__refs_dstack_xpu", - "test_python_ref_errors__refs_hstack_xpu", - "test_python_ref_errors__refs_linalg_cross_xpu", - "test_python_ref_errors__refs_vstack_xpu", + + # XFAIL of CUDA, XPU got unexpected success + "test_python_ref__refs_div_no_rounding_mode_xpu_complex32", + "test_python_ref__refs_pow_xpu_complex32", "test_python_ref_executor__refs_mul_executor_aten_xpu_complex32", - "test_python_ref__refs_special_multigammaln_mvlgamma_p_5_xpu_float64", - "test_python_ref_executor__refs_special_multigammaln_mvlgamma_p_3_executor_aten_xpu_float64", - "test_python_ref__refs_square_xpu_complex128", - "test_python_ref__refs_square_xpu_complex64", - "test_python_ref_executor__refs_istft_executor_aten_xpu_complex128", - "test_python_ref_executor__refs_square_executor_aten_xpu_complex128", - "test_python_ref_torch_fallback__refs_square_xpu_complex128", - "test_python_ref_torch_fallback__refs_square_xpu_complex64", - # Skip list of new added when porting XPU operators. - # See: https://github.com/intel/torch-xpu-ops/issues/128 - "test_dtypes_view_as_complex_xpu", # Didn't align with CUDA, The following dtypes did not work in backward but are listed by the OpInfo: {torch.bfloat16} - "test_dtypes_view_as_real_xpu", # Didn't align with CUDA, The following dtypes did not work in backward but are listed by the OpInfo: {torch.bfloat16} - "test_noncontiguous_samples_native_dropout_backward_xpu_int64", # The implementation aligns with CUDA, RuntimeError: "masked_scale" not implemented for 'Long'. - "test_non_standard_bool_values_native_dropout_backward_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "masked_scale" not implemented for 'Bool'. - "test_non_standard_bool_values_scatter_reduce_amax_xpu_bool", # Align with CUDA dtypes - "scatter_gather_base_kernel_func" not implemented for 'Bool' - "test_non_standard_bool_values_scatter_reduce_amin_xpu_bool", # Align with CUDA dtypes - "scatter_gather_base_kernel_func" not implemented for 'Bool' - "test_non_standard_bool_values_scatter_reduce_prod_xpu_bool", # Align with CUDA dtypes - "scatter_gather_base_kernel_func" not implemented for 'Bool' - "test_non_standard_bool_values_argsort_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "argsort" not implemented for 'Bool'. 
- "test_non_standard_bool_values_msort_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "msort" not implemented for 'Bool'. + "test_python_ref_torch_fallback__refs_div_no_rounding_mode_xpu_complex32", + "test_python_ref__refs_pow_xpu_complex32", + "test_python_ref_executor__refs_mul_executor_aten_xpu_complex32", + "test_python_ref_torch_fallback__refs_div_no_rounding_mode_xpu_complex32", + "test_python_ref_torch_fallback__refs_pow_xpu_complex32", + + + # unexpected success because of cpu fallback + "test_out_triangular_solve_xpu_float32", + + # Newly added: + + # Cuda skipped it "test_non_standard_bool_values_sort_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "sort" not implemented for 'Bool'. + + # Cuda skipped it + "test_non_standard_bool_values_msort_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "msort" not implemented for 'Bool'. + + + # Unexpected success "test_python_ref_executor__refs_pow_executor_aten_xpu_complex32", # Didn't align with CUDA, Unexpected success + # Unexpected success - "test_errors_histogramdd_xpu", + # "test_errors_histogramdd_xpu", #XFAIL now # Jiterator is only supported on CUDA and ROCm GPUs, none are available. + # https://github.com/intel/torch-xpu-ops/issues/584 "_jiterator_", # https://github.com/intel/torch-xpu-ops/issues/157 # Segfault: @@ -473,9 +422,11 @@ "test_variant_consistency_eager_triangular_solve_xpu_complex64", # oneDNN issues # RuntimeError: value cannot be converted to type float without overflow + # https://github.com/intel/torch-xpu-ops/issues/683 "test_conj_view_addbmm_xpu_complex64", "test_neg_conj_view_addbmm_xpu_complex128", # CPU fallback error: AssertionError: Tensor-likes are not close! + # https://github.com/intel/torch-xpu-ops/issues/271 "test_neg_view_nn_functional_rrelu_xpu_float64", ### Error #0 in TestMathBitsXPU , RuntimeError: Double and complex datatype matmul is not supported in oneDNN # https://github.com/intel/torch-xpu-ops/issues/254 @@ -693,7 +644,7 @@ "test_neg_view_nn_functional_conv_transpose2d_xpu_float64", "test_neg_view_nn_functional_conv_transpose3d_xpu_float64", ### Error #2 in TestMathBitsXPU , NotImplementedError: Could not run 'aten::_sparse_coo_tensor_with_dims_and_tensors' with arguments from the 'SparseXPU' backend. - # https://github.com/intel/torch-xpu-ops/issues/242 + # https://github.com/intel/torch-xpu-ops/issues/242 and https://github.com/intel/torch-xpu-ops/issues/240 "test_conj_view_to_sparse_xpu_complex64", "test_neg_conj_view_to_sparse_xpu_complex128", "test_neg_view_to_sparse_xpu_float64", @@ -703,13 +654,14 @@ # in XPU supported operators. Then the case will work. "test_noncontiguous_samples_nn_functional_avg_pool1d_xpu_int64", "test_noncontiguous_samples_nn_functional_local_response_norm_xpu_int64", + #AssertionError: The supported dtypes for unique_consecutive on device type xpu are incorrect! #The following dtypes worked in forward but are not listed by the OpInfo: {torch.bfloat16}. - #XPU supports bfloat16, CUDA doesn't support it. + # XPU supports bfloat16, CUDA doesn't support it. "test_dtypes_unique_xpu", # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16. # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error. 
- "test_dtypes_polar_xpu", + #"test_dtypes_polar_xpu", # implemented aten::histogram to align MPS operators coverage, CUDA doesn't support # but test_dtypes infrastructure leverage CUDA supported datatypes "test_dtypes_histogram_xpu", @@ -718,32 +670,15 @@ # 1. most cases of nextafter require Half dtype. # 2. Half dtype is a common dtype in workloads. # So far CUDA doesn't support Half, so that XPU fails as we aligned claimed dtypes with CUDA in test infra. + # https://github.com/intel/torch-xpu-ops/issues/623 "test_dtypes_nextafter_xpu", - # AssertionError: The supported dtypes for unique_consecutive on device type xpu are incorrect! - # The following dtypes worked in forward but are not listed by the OpInfo: {torch.bfloat16}. - # XPU supports bfloat16, CUDA doesn't support it. - "test_dtypes_unique_consecutive_xpu", - # 0816 add - "test_dtypes__unsafe_masked_index_put_accumulate_xpu", - "test_noncontiguous_samples__unsafe_masked_index_put_accumulate_xpu_complex64", - "test_conj_view__unsafe_masked_index_put_accumulate_xpu_complex64", - "test_neg_conj_view__unsafe_masked_index_put_accumulate_xpu_complex128", - "test_neg_view__unsafe_masked_index_put_accumulate_xpu_float64", - "test_compare_cpu__unsafe_masked_index_put_accumulate_xpu_float32", - "test_dtypes__unsafe_masked_index_put_accumulate_xpu", - "test_multiple_devices__unsafe_masked_index_put_accumulate_xpu_float32", - "test_multiple_devices__unsafe_masked_index_put_accumulate_xpu_int64", - "test_non_standard_bool_values__unsafe_masked_index_put_accumulate_xpu_bool", - "test_noncontiguous_samples__unsafe_masked_index_put_accumulate_xpu_complex64", - "test_noncontiguous_samples__unsafe_masked_index_put_accumulate_xpu_float32", - "test_noncontiguous_samples__unsafe_masked_index_put_accumulate_xpu_int64", - "test_out__unsafe_masked_index_put_accumulate_xpu_float32", - "test_out_warning__unsafe_masked_index_put_accumulate_xpu", - "test_variant_consistency_eager__unsafe_masked_index_put_accumulate_xpu_complex64", - "test_variant_consistency_eager__unsafe_masked_index_put_accumulate_xpu_float32", - "test_conj_view__unsafe_masked_index_put_accumulate_xpu_complex64", - "test_neg_conj_view__unsafe_masked_index_put_accumulate_xpu_complex128", - "test_neg_view__unsafe_masked_index_put_accumulate_xpu_float64", + + # AssertionError: The supported dtypes for argsort on device type xpu are incorrect! + # The following dtypes worked in forward but are not listed by the OpInfo: {torch.bool}. + # CUDA does not have torch.bool support on argsort. + "test_dtypes_argsort_xpu", + # Unexpected success, CUDA got XFAIL because CUDA does not have historgramadd supported" + "test_errors_histogramdd_xpu", ), "test_binary_ufuncs_xpu.py": ( @@ -1185,9 +1120,9 @@ # It is kernel assert on XPU implementation not exception on host. # We are same as CUDA implementation. And CUDA skips these cases. "test_trivial_fancy_out_of_bounds_xpu", - "test_advancedindex", - # CUDA bias case - "test_index_put_accumulate_with_optional_tensors_xpu", + # index boundary should be checked. 
+ # https://github.com/intel/torch-xpu-ops/issues/783 + "test_advancedindex_xpu_float64", # XPU implementation doesn't claimn FP8 now # https://github.com/intel/torch-xpu-ops/issues/461 "test_index_put_src_datatype_xpu_float8_e5m2", @@ -1330,10 +1265,18 @@ "test_reference_numerics_normal_polygamma_polygamma_n_2_xpu_float16", "test_reference_numerics_normal_polygamma_polygamma_n_3_xpu_float16", "test_reference_numerics_normal_polygamma_polygamma_n_4_xpu_float16", + + # CUDA XFAIL + "test_reference_numerics_large__refs_rsqrt_xpu_complex32", + + # Compiler issue in handling tanh with real or imag inf. + # https://github.com/intel/torch-xpu-ops/issues/184, https://jira.devtools.intel.com/browse/CMPLRLIBS-34974 + "test_reference_numerics_large__refs_tanh_xpu_complex32", ), "test_masked_xpu.py": ( # RuntimeError: is_coalesced expected sparse coordinate tensor layout but got Sparse. + # https://github.com/intel/torch-xpu-ops/issues/357 "test_mask_layout_sparse_coo_masked_amax_xpu_bfloat16", "test_mask_layout_sparse_coo_masked_amax_xpu_float16", "test_mask_layout_sparse_coo_masked_amax_xpu_float32", @@ -1645,7 +1588,7 @@ "test_triangular_solve_xpu_complex64", "test_triangular_solve_xpu_float64", # https://github.com/intel/torch-xpu-ops/issues/317 - # addmm.out, addmv.out, linalg_lstsq, linalg_vector_norm.out, norm.out, vdot&dot lack XPU support and fallback to CPU + # addmm.out, addmv.out, linalg_lstsq, vdot&dot lack XPU support and fallback to CPU "test_addmm_sizes_xpu_complex128", "test_addmm_sizes_xpu_complex64", "test_blas_alpha_beta_empty_xpu_complex128", @@ -1654,21 +1597,18 @@ "test_linalg_lstsq_input_checks_xpu_complex64", "test_linalg_lstsq_input_checks_xpu_float32", "test_linalg_lstsq_input_checks_xpu_float64", - "test_norm_fused_type_promotion_xpu_bfloat16", - "test_norm_fused_type_promotion_xpu_float16", "test_dot_invalid_args_xpu", "test_vdot_invalid_args_xpu", + # https://github.com/intel/torch-xpu-ops/issues/821 + # RuntimeError: Fail to enable Kineto Profiler on XPU due to error code: 200 + "test_norm_fused_type_promotion_xpu_bfloat16", + # AssertionError: True is not false + "test_norm_fused_type_promotion_xpu_float16", + # https://github.com/intel/torch-xpu-ops/issues/814 # xpu does not have '_cuda_tunableop_is_enabled' API "test_matmul_small_brute_force_tunableop_xpu_float16", "test_matmul_small_brute_force_tunableop_xpu_float32", "test_matmul_small_brute_force_tunableop_xpu_float64", - # TypeError: Invalid NaN comparison. - "test_compile_int4_mm_m_32_k_32_n_48_xpu", - "test_compile_int4_mm_m_32_k_32_n_64_xpu", - "test_compile_int4_mm_m_32_k_64_n_48_xpu", - "test_compile_int4_mm_m_32_k_64_n_64_xpu", - "test_compile_int4_mm_m_64_k_32_n_48_xpu", - "test_compile_int4_mm_m_64_k_32_n_64_xpu", # XPU does not support tunable. 
"test_bmm_tunableop_rocm_xpu_float32", "test_numeric_check_leak_tunableop_rocm_xpu_float32", @@ -1991,6 +1931,7 @@ "test_maskedtensor_xpu.py": ( # RuntimeError: is_coalesced expected sparse coordinate tensor layout but got Sparse + # https://github.com/intel/torch-xpu-ops/issues/357 "test_contiguous_xpu", "test_invalid_sparse_coo_values_xpu", "test_to_dense_and_sparse_coo_xpu", @@ -2710,6 +2651,7 @@ "test_autodiff__foreach_sigmoid_outplace_xpu_complex128", "test_binary_op_with_scalar_self_support__foreach_pow_is_fastpath_True_xpu_bool", # AssertionError: RuntimeError not raised + # https://github.com/intel/torch-xpu-ops/issues/784 "test_0dim_tensor_overload_exception_xpu", # RuntimeError: Tried to instantiate dummy base class CUDAGraph "test_big_num_tensors__foreach_max_use_cuda_graph_True_xpu_float32", diff --git a/test/xpu/skip_list_win.py b/test/xpu/skip_list_win.py index 35bf3014a..f6f74c817 100644 --- a/test/xpu/skip_list_win.py +++ b/test/xpu/skip_list_win.py @@ -68,8 +68,4 @@ "test_reference_numerics_large_sin_xpu_complex64", "test_reference_numerics_small_acos_xpu_complex32", ), - "test_ops_xpu.py": ( - "test_compare_cpu_pow_xpu_bfloat16", # https://github.com/intel/torch-xpu-ops/pull/764 - ), - } diff --git a/test/xpu/skip_list_win_arc.py b/test/xpu/skip_list_win_arc.py index 0b1d079e1..de5d03891 100644 --- a/test/xpu/skip_list_win_arc.py +++ b/test/xpu/skip_list_win_arc.py @@ -1,10 +1,4 @@ skip_dict = { - # SYCL Compiler on Windows removed the following operations when '-cl-poison-unsupported-fp64-kernels' is on - # Hence, skip the following windows specific errors - "test_ops_xpu.py": ( - "test_compare_cpu_sqrt_xpu_complex64", - "test_backward_nn_functional_adaptive_avg_pool2d_xpu_float32", - ), "test_binary_ufuncs_xpu": ( "test_batch_vs_slicing___rpow___xpu_complex64", "test_batch_vs_slicing__refs_pow_xpu_complex64", diff --git a/test/xpu/test_indexing_xpu.py b/test/xpu/test_indexing_xpu.py index c3637b0fe..d57567318 100644 --- a/test/xpu/test_indexing_xpu.py +++ b/test/xpu/test_indexing_xpu.py @@ -11,6 +11,37 @@ with XPUPatchForImport(False): from test_indexing import NumpyTests,TestIndexing + import torch + + + def __test_index_put_accumulate_with_optional_tensors(self, device): + # TODO: replace with a better solution. + # Currently, here using torchscript to put None into indices. + # on C++ it gives indices as a list of 2 optional tensors: first is null and + # the second is a valid tensor. 
+ @torch.jit.script + def func(x, i, v): + idx = [None, i] + x.index_put_(idx, v, accumulate=True) + return x + + n = 4 + t = torch.arange(n * 2, dtype=torch.float32).reshape(n, 2) + t_dev = t.to(device) + indices = torch.tensor([1, 0]) + indices_dev = indices.to(device) + value0d = torch.tensor(10.0) + value1d = torch.tensor([1.0, 2.0]) + + out_cuda = func(t_dev, indices_dev, value0d.xpu()) + out_cpu = func(t, indices, value0d) + self.assertEqual(out_cuda.cpu(), out_cpu) + + out_cuda = func(t_dev, indices_dev, value1d.xpu()) + out_cpu = func(t, indices, value1d) + self.assertEqual(out_cuda.cpu(), out_cpu) + + TestIndexing.test_index_put_accumulate_with_optional_tensors = __test_index_put_accumulate_with_optional_tensors instantiate_device_type_tests(NumpyTests, globals(), only_for=("xpu"), allow_xpu=True) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index debab066a..01d140858 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -237,6 +237,11 @@ "nan_to_num", "scatter_reduce", "nanmean", + "native_layer_norm", + "native_layer_norm_backward", + "square", + "heaviside", + "argsort", ] _ops_without_cuda_support = [ @@ -415,6 +420,7 @@ def ModuleTest_test_xpu(self, test_case): xpu_gradInput = test_case._backward( xpu_module, xpu_input_tuple, xpu_output, xpu_gradOutput ) + test_case.assertEqual( cpu_gradInput, xpu_gradInput, @@ -570,7 +576,7 @@ def convert_dtype(obj, dtype, requires_grad=False): CriterionTest.test_cuda = CriterionTest_test_xpu from torch.testing._internal.common_methods_invocations import sample_inputs_cat_concat, S, M -from torch.testing._internal.common_methods_invocations import make_tensor +from torch.testing._internal.common_methods_invocations import make_tensor, mask_not_all_zeros from functools import partial from torch.testing._internal.opinfo.core import SampleInput @@ -604,6 +610,21 @@ def index_variable_nofp64(shape, max_indices, device=torch.device('cpu')): index = torch.rand(*shape, dtype=torch.float32, device=device).mul_(max_indices).floor_().long() return index +def sample_inputs_index_put_nofp64(op_info, device, dtype, requires_grad, **kwargs): + make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) + + for accumulate in [False, True]: + # Test with indices arg + yield SampleInput( + make_arg((S, S,)), + (index_variable_nofp64(2, S, device=device),), + make_arg((2, S)), + accumulate=accumulate) + + # Test with mask arg + mask = torch.zeros(S, dtype=torch.bool) if accumulate else mask_not_all_zeros((S,)) + yield SampleInput( + make_arg((S, S)), (mask, ), make_arg((S,)), accumulate=accumulate) def sample_inputs_softmax_variant_nofp64( op_info, @@ -638,6 +659,30 @@ def sample_inputs_softmax_variant_nofp64( SampleInput(make_arg(shape), args=dim, kwargs=kwargs) for shape, dim in cases ) +def sample_inputs_like_fns_nofp64(self, device, dtype, requires_grad, **kwargs): + + inputs = [ + ((), {}), + ((S, S), {}), + ((0, S, 0), {}), + ((S,), {'dtype': dtype, 'device': device}), + # Hard-code some dtypes/devices. 
We want to test cases where the + # (dtype, device) is different from the input's (dtype, device) + # disabled for ARC + # ((S,), {'dtype': torch.double}), + ((S,), {'device': 'cpu'}), + # disabled for ARC + #((S,), {'dtype': torch.double, 'device': 'cpu'}), + ] + if torch.cuda.is_available(): + inputs.append(((S,), {'device': 'cuda'})) + + for shape, kwargs in inputs: + t = make_tensor(shape, dtype=dtype, device=device, + low=None, high=None, + requires_grad=requires_grad) + yield SampleInput(t, **kwargs) + class XPUPatchForImport: def __init__(self, patch_test_case=True) -> None: self.test_package = ( @@ -671,10 +716,6 @@ def __init__(self, patch_test_case=True) -> None: self.cuda_is_available = cuda.is_available self.cuda_is_bf16_supported = cuda.is_bf16_supported - if "has_fp64=0" in str(torch.xpu.get_device_properties(0)): - self.sample_inputs_softmax_variant = common_methods_invocations.sample_inputs_softmax_variant - self.index_variable = common_methods_invocations.index_variable - self.reference_inputs_cat = common_methods_invocations.reference_inputs_cat def align_db_decorators(self, db): def gen_xpu_wrappers(op_name, wrappers): @@ -722,14 +763,13 @@ def gen_xpu_wrappers(op_name, wrappers): def align_supported_dtypes(self, db): for opinfo in db: - if ( - opinfo.name not in _xpu_computation_op_list - or opinfo.name in _ops_without_cuda_support - ): + if ( opinfo.name not in _xpu_computation_op_list and (opinfo.torch_opinfo.name not in _xpu_computation_op_list + if db == common_methods_invocations.python_ref_db else True)) or opinfo.name in _ops_without_cuda_support: opinfo.dtypesIfXPU = opinfo.dtypes else: backward_dtypes = set(opinfo.backward_dtypesIfCUDA) - backward_dtypes.add(bfloat16) + if bfloat16 in opinfo.dtypesIfXPU: + backward_dtypes.add(bfloat16) opinfo.backward_dtypes = tuple(backward_dtypes) if "has_fp64=0" in str(torch.xpu.get_device_properties(0)): @@ -737,16 +777,31 @@ def align_supported_dtypes(self, db): opinfo.dtypesIfXPU = set(filter(lambda x: (x not in fp64_dtypes), list(opinfo.dtypesIfXPU))) opinfo.backward_dtypes = tuple(filter(lambda x: (x not in fp64_dtypes), list(opinfo.backward_dtypes))) + def filter_fp64_sample_input(self, db): + # Only for platform without fp64 support + if "has_fp64=0" in str(torch.xpu.get_device_properties(0)): + for opinfo in db: + if opinfo.name in _xpu_computation_op_list: + if opinfo.variant_test_name == "with_dtype" and \ + opinfo.name in ["log_softmax", "softmax", "nn.functional.softmin", ] and \ + get_wrapped_fn(opinfo.sample_inputs_func) != opinfo.sample_inputs_func and \ + get_wrapped_fn(opinfo.sample_inputs_func).func.__name__ == common_methods_invocations.sample_inputs_softmax_variant.__name__: + opinfo.sample_inputs_func = torch.no_grad()(partial(sample_inputs_softmax_variant_nofp64, with_dtype=True)) + elif opinfo.sample_inputs_func.__name__ == common_methods_invocations.sample_inputs_softmax_variant.__name__: + opinfo.sample_inputs_func = sample_inputs_softmax_variant_nofp64 + elif opinfo.sample_inputs_func.__name__ == common_methods_invocations.sample_inputs_like_fns.__name__: + opinfo.sample_inputs_func = sample_inputs_like_fns_nofp64 + elif opinfo.sample_inputs_func.__name__ == common_methods_invocations.sample_inputs_index_put.__name__: + opinfo.sample_inputs_func = sample_inputs_index_put_nofp64 + + if opinfo.reference_inputs_func != None and opinfo.reference_inputs_func.__name__ == common_methods_invocations.reference_inputs_cat.__name__: + opinfo.reference_inputs_func = reference_inputs_cat_nofp64 + def 
__enter__(self): # Monkey patch until we have a fancy way common_device_type.onlyCUDA = common_device_type.onlyXPU - if "has_fp64=0" in str(torch.xpu.get_device_properties(0)): - common_methods_invocations.sample_inputs_softmax_variant = sample_inputs_softmax_variant_nofp64 - common_methods_invocations.index_variable = index_variable_nofp64 - common_methods_invocations.reference_inputs_cat = reference_inputs_cat_nofp64 - class dtypesIfXPU(common_device_type.dtypes): def __init__(self, *args): super().__init__(*args, device_type="xpu") @@ -773,6 +828,7 @@ def __init__(self, *args): ]: self.align_supported_dtypes(db) self.align_db_decorators(db) + self.filter_fp64_sample_input(db) self.align_db_decorators(module_db) common_methods_invocations.python_ref_db = [ op @@ -869,11 +925,6 @@ def __exit__(self, exc_type, exc_value, traceback): cuda.is_available = self.cuda_is_available cuda.is_bf16_supported = self.cuda_is_bf16_supported - if "has_fp64=0" in str(torch.xpu.get_device_properties(0)): - common_methods_invocations.sample_inputs_softmax_variant = self.sample_inputs_softmax_variant - common_methods_invocations.index_variable = self.index_variable - common_methods_invocations.reference_inputs_cat = self.reference_inputs_cat - # Copy the test cases from generic_base_class to generic_test_class. # It serves to reuse test cases. Regarding some newly added hardware, @@ -915,33 +966,33 @@ def copy_tests( def launch_test(test_case, skip_list=None, exe_list=None): + os.environ["PYTORCH_ENABLE_XPU_FALLBACK"]="1" + os.environ["PYTORCH_TEST_WITH_SLOW"]="1" if skip_list != None: - skip_options = " -k 'not " + skip_list[0] + skip_options = " -k \"not " + skip_list[0] for skip_case in skip_list[1:]: skip_option = " and not " + skip_case skip_options += skip_option - skip_options += "'" + skip_options += "\"" test_command = ( - "PYTORCH_ENABLE_XPU_FALLBACK=1 PYTORCH_TEST_WITH_SLOW=1 pytest -v " + "pytest -v " + test_case ) test_command += skip_options - return os.system(test_command) elif exe_list != None: - exe_options = " -k '" + exe_list[0] + exe_options = " -k \"" + exe_list[0] for exe_case in exe_list[1:]: exe_option = " or " + exe_case exe_options += exe_option - exe_options += "'" + exe_options += "\"" test_command = ( - "PYTORCH_ENABLE_XPU_FALLBACK=1 PYTORCH_TEST_WITH_SLOW=1 pytest -v " + "pytest -v " + test_case ) test_command += exe_options - return os.system(test_command) else: test_command = ( - "PYTORCH_ENABLE_XPU_FALLBACK=1 PYTORCH_TEST_WITH_SLOW=1 pytest -v " + "pytest -v " + test_case ) - return os.system(test_command) + return os.system(test_command) diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml index df72a5e4c..75a1b8efa 100644 --- a/yaml/native/native_functions.yaml +++ b/yaml/native/native_functions.yaml @@ -5147,4 +5147,21 @@ structured_inherits: TensorIteratorBase dispatch: XPU: sign_out - tags: pointwise \ No newline at end of file + tags: pointwise + +- func: range.step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: range + +- func: range(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: range + +- func: range.out_(Scalar start, Scalar end, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CompositeExplicitAutograd: range_out_no_step + +- func: range.out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + XPU: range_xpu_out + cpp_no_default_args: ['step']
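Note on the `cpu_xpu_all` change in `test/xpu/extended/test_ops_xpu.py`: it can be read as "take the full CPU/XPU dtype tuple and drop the double-precision dtypes when the device reports `has_fp64=0` in its properties string". A minimal sketch of the same selection, assuming a PyTorch build with the XPU backend available (the helper name `cpu_xpu_dtypes` is not part of the patch):

```python
import torch

ALL_DTYPES = (torch.bfloat16, torch.complex128, torch.complex64, torch.float16,
              torch.float32, torch.float64, torch.int16, torch.int32,
              torch.int64, torch.int8, torch.uint8, torch.bool)
FP64_DTYPES = {torch.float64, torch.complex128}

def cpu_xpu_dtypes(device_index: int = 0):
    # Devices without native double support advertise "has_fp64=0" in the
    # string form of their device properties; skip fp64/complex128 there.
    props = str(torch.xpu.get_device_properties(device_index))
    if "has_fp64=0" in props:
        return tuple(d for d in ALL_DTYPES if d not in FP64_DTYPES)
    return ALL_DTYPES
```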
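Note on the `launch_test` changes in `run_test_with_only.py` and `xpu_test_utils.py`: the pytest `-k` expression is now wrapped in double quotes because `cmd.exe` on Windows does not strip single quotes, and the `PYTORCH_ENABLE_XPU_FALLBACK=1 ... pytest` command-line prefix only works in POSIX shells, so the variables are exported via `os.environ` instead. The sketch below is a hypothetical condensed equivalent of that command construction (the helper name `build_pytest_command` is illustrative, not part of the patch):

```python
import os

def build_pytest_command(test_case, skip_list=None, exe_list=None):
    # Export the toggles through the environment so the child pytest process
    # inherits them on both Windows and POSIX.
    os.environ["PYTORCH_ENABLE_XPU_FALLBACK"] = "1"
    os.environ["PYTORCH_TEST_WITH_SLOW"] = "1"
    command = "pytest -v " + test_case
    # Double quotes survive both cmd.exe and POSIX shells.
    if skip_list:
        command += ' -k "not ' + " and not ".join(skip_list) + '"'
    elif exe_list:
        command += ' -k "' + " or ".join(exe_list) + '"'
    return command

# e.g. build_pytest_command("test_ops_xpu.py", skip_list=["test_a", "test_b"])
# -> 'pytest -v test_ops_xpu.py -k "not test_a and not test_b"'
```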
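Note on the repeated exit-code hunks in the `run_test_with_*.py` scripts: they all encode the same rule. On POSIX, `os.system()` returns a wait status that must be decoded with `os.WEXITSTATUS()`; on Windows that function does not exist and `os.system()` already returns the shell's exit code. A minimal sketch of the shared pattern (the helper name `exit_with` is illustrative only):

```python
import os
import sys

def exit_with(status: int) -> None:
    # os.system() returns a raw wait status on POSIX (decode it with
    # WEXITSTATUS), but the plain shell exit code on Windows, where
    # os.WEXITSTATUS is not available at all.
    if os.name == "nt":
        sys.exit(status)
    sys.exit(os.WEXITSTATUS(status))

# e.g. exit_with(os.system("pytest -v test_ops_xpu.py"))
```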
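Note on the new `range.out` entry in `native_functions.yaml`: the out-variant dispatches to `range_xpu_out` on XPU, and like other range-style factories its output length follows the closed-interval rule `size = int((end - start) / step) + 1`. A hedged usage sketch, assuming an XPU-enabled build with this patch applied; `torch.range` itself is deprecated upstream in favor of `torch.arange`, so this is illustration only:

```python
import torch

# Closed-interval length rule used by range-style factories:
#   size = int((end - start) / step) + 1
out = torch.empty(0, device="xpu")
torch.range(1.0, 4.0, 0.5, out=out)  # 1.0, 1.5, ..., 4.0 on the XPU device
assert out.numel() == int((4.0 - 1.0) / 0.5) + 1  # 7 elements
```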