From 6636a0686268548fd4e3b2f0dac0c07e731f98c2 Mon Sep 17 00:00:00 2001
From: Feng Yuan
Date: Thu, 5 Sep 2024 01:35:14 -0700
Subject: [PATCH] Windows: Enable building for latest torch-xpu-ops (#865)

In the latest torch-xpu-ops, we have split libtorch_xpu.so into multiple libraries to avoid a single binary of excessive size. This change to the build system leads to two issues on Windows, which this PR resolves: 1) cyclic dependence, and 2) symbol visibility.

Signed-off-by: Feng Yuan
---
 cmake/Modules/FindSYCL/run_sycl.cmake | 6 + src/ATen/CMakeLists.txt | 5 +- src/ATen/native/xpu/RangeFactories.cpp | 4 +- src/ATen/native/xpu/sycl/AbsKernel.cpp | 2 + src/ATen/native/xpu/sycl/AbsKernel.h | 2 +- .../native/xpu/sycl/ActivationEluKernels.cpp | 2 + .../native/xpu/sycl/ActivationEluKernels.h | 4 +- .../native/xpu/sycl/ActivationGeluKernel.cpp | 2 + .../native/xpu/sycl/ActivationGeluKernel.h | 6 +- .../native/xpu/sycl/ActivationGluKernels.cpp | 3 + .../native/xpu/sycl/ActivationGluKernels.h | 4 +- .../xpu/sycl/ActivationHardsigmoidKernels.cpp | 2 + .../xpu/sycl/ActivationHardsigmoidKernels.h | 4 +- .../xpu/sycl/ActivationHardswishKernels.cpp | 2 + .../xpu/sycl/ActivationHardswishKernels.h | 4 +- .../xpu/sycl/ActivationHardtanhKernels.cpp | 2 + .../xpu/sycl/ActivationHardtanhKernels.h | 2 +- .../xpu/sycl/ActivationLeakyReluKernels.cpp | 2 + .../xpu/sycl/ActivationLeakyReluKernels.h | 6 +- .../xpu/sycl/ActivationLogSigmoidKernels.cpp | 2 + .../xpu/sycl/ActivationLogSigmoidKernels.h | 4 +- .../native/xpu/sycl/ActivationMishKernels.cpp | 2 + .../native/xpu/sycl/ActivationMishKernels.h | 4 +- .../xpu/sycl/ActivationPreluKernels.cpp | 4 +- .../native/xpu/sycl/ActivationPreluKernels.h | 4 +- .../native/xpu/sycl/ActivationSiluKernels.cpp | 2 + .../native/xpu/sycl/ActivationSiluKernels.h | 4 +- .../xpu/sycl/ActivationSoftplusKernels.cpp | 2 + .../xpu/sycl/ActivationSoftplusKernels.h | 4 +- .../xpu/sycl/ActivationSoftshrinkKernels.cpp | 2 + .../xpu/sycl/ActivationSoftshrinkKernels.h | 8 +- .../xpu/sycl/ActivationThresholdKernel.cpp | 2 + .../xpu/sycl/ActivationThresholdKernel.h | 2 +- .../sycl/AdaptiveAveragePooling2dKernels.cpp | 2 + .../sycl/AdaptiveAveragePooling2dKernels.h | 4 +- .../xpu/sycl/AdaptiveMaxPooling2dKernels.cpp | 2 + .../xpu/sycl/AdaptiveMaxPooling2dKernels.h | 4 +- src/ATen/native/xpu/sycl/AmpKernels.cpp | 2 + src/ATen/native/xpu/sycl/AmpKernels.h | 6 +- .../native/xpu/sycl/AveragePool2dKernels.cpp | 2 + .../native/xpu/sycl/AveragePool2dKernels.h | 4 +- src/ATen/native/xpu/sycl/BatchNormKernels.cpp | 120 ++++++++++-------- src/ATen/native/xpu/sycl/BatchNormKernels.h | 15 ++- .../xpu/sycl/BinaryBitwiseOpsKernels.cpp | 2 + .../native/xpu/sycl/BinaryBitwiseOpsKernels.h | 6 +- .../native/xpu/sycl/BinaryDivTrueKernel.cpp | 2 + .../xpu/sycl/BinaryGeometricKernels.cpp | 2 + .../native/xpu/sycl/BinaryGeometricKernels.h | 4 +- src/ATen/native/xpu/sycl/BinaryKernels.cpp | 2 + src/ATen/native/xpu/sycl/BinaryKernels.h | 12 +- .../xpu/sycl/BinaryLogicalOpsKernels.cpp | 2 + .../native/xpu/sycl/BinaryLogicalOpsKernels.h | 6 +- .../xpu/sycl/BinaryMiscBackwardOpsKernels.cpp | 2 + .../xpu/sycl/BinaryMiscBackwardOpsKernels.h | 8 +- .../native/xpu/sycl/BinaryMiscOpsKernels.cpp | 2 + .../native/xpu/sycl/BinaryMiscOpsKernels.h | 6 +- .../native/xpu/sycl/BinaryRemainderKernel.cpp | 2 + .../native/xpu/sycl/BinaryRemainderKernel.h | 4 +- .../native/xpu/sycl/BinaryShiftOpsKernels.cpp | 4 +- .../native/xpu/sycl/BinaryShiftOpsKernels.h | 4 +- .../native/xpu/sycl/BucketizationKernels.cpp | 2 + 
.../native/xpu/sycl/BucketizationKernels.h | 6 +- src/ATen/native/xpu/sycl/Col2ImKernel.cpp | 2 + src/ATen/native/xpu/sycl/Col2ImKernel.h | 2 +- src/ATen/native/xpu/sycl/CompareKernels.cpp | 2 + src/ATen/native/xpu/sycl/CompareKernels.h | 12 +- src/ATen/native/xpu/sycl/ComplexKernels.cpp | 2 + src/ATen/native/xpu/sycl/ComplexKernels.h | 4 +- src/ATen/native/xpu/sycl/CopyKernel.cpp | 2 + src/ATen/native/xpu/sycl/CopyKernel.h | 2 +- src/ATen/native/xpu/sycl/CopysignKernel.cpp | 2 + src/ATen/native/xpu/sycl/CopysignKernel.h | 2 +- src/ATen/native/xpu/sycl/CrossKernel.cpp | 2 + src/ATen/native/xpu/sycl/CrossKernel.h | 5 +- src/ATen/native/xpu/sycl/CumprodKernel.cpp | 3 + src/ATen/native/xpu/sycl/CumprodKernel.h | 2 +- src/ATen/native/xpu/sycl/CumsumKernel.cpp | 3 + src/ATen/native/xpu/sycl/CumsumKernel.h | 2 +- src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp | 2 + src/ATen/native/xpu/sycl/DilatedMaxPool2d.h | 4 +- src/ATen/native/xpu/sycl/DistanceKernels.cpp | 2 + src/ATen/native/xpu/sycl/DistanceKernels.h | 4 +- .../native/xpu/sycl/DistributionBernoulli.cpp | 2 + .../sycl/DistributionExponentialKernel.cpp | 2 + .../native/xpu/sycl/DistributionKernels.h | 18 +-- .../native/xpu/sycl/DistributionNormal.cpp | 2 + .../xpu/sycl/DistributionRandomKernel.cpp | 2 + .../native/xpu/sycl/DistributionUniform.cpp | 2 + src/ATen/native/xpu/sycl/Dropout.cpp | 2 + src/ATen/native/xpu/sycl/DropoutKernels.h | 8 +- src/ATen/native/xpu/sycl/Embedding.cpp | 2 + src/ATen/native/xpu/sycl/EmbeddingBag.cpp | 2 + .../native/xpu/sycl/EmbeddingBagKernels.h | 2 +- src/ATen/native/xpu/sycl/EmbeddingKernels.h | 2 +- src/ATen/native/xpu/sycl/FillKernel.cpp | 2 + src/ATen/native/xpu/sycl/FillKernel.h | 2 +- .../xpu/sycl/ForeachBinaryOpListKernels.h | 12 +- .../xpu/sycl/ForeachBinaryOpScalarKernels.h | 12 +- .../sycl/ForeachBinaryOpScalarListKernels.h | 12 +- .../xpu/sycl/ForeachPointwiseOpListKernels.h | 8 +- .../sycl/ForeachPointwiseOpScalarKernels.h | 8 +- .../ForeachPointwiseOpScalarListKernels.h | 8 +- .../native/xpu/sycl/ForeachReduceKernels.cpp | 2 + .../native/xpu/sycl/ForeachReduceKernels.h | 2 +- .../native/xpu/sycl/ForeachTernaryKernels.cpp | 3 + .../xpu/sycl/ForeachTernaryOpListKernels.h | 4 +- .../xpu/sycl/ForeachTernaryOpScalarKernels.h | 4 +- .../native/xpu/sycl/ForeachUnaryKernels.cpp | 2 + .../native/xpu/sycl/ForeachUnaryKernels.h | 4 +- src/ATen/native/xpu/sycl/GcdLcmKernels.cpp | 4 +- src/ATen/native/xpu/sycl/GcdLcmKernels.h | 2 +- src/ATen/native/xpu/sycl/GridSampler.cpp | 2 + src/ATen/native/xpu/sycl/GridSamplerKernels.h | 6 +- src/ATen/native/xpu/sycl/GroupNormKernels.cpp | 2 + src/ATen/native/xpu/sycl/GroupNormKernels.h | 4 +- src/ATen/native/xpu/sycl/HistogramKernels.h | 6 +- .../native/xpu/sycl/HistogramddKernels.cpp | 2 + src/ATen/native/xpu/sycl/Im2ColKernel.cpp | 2 + src/ATen/native/xpu/sycl/Im2ColKernel.h | 2 +- src/ATen/native/xpu/sycl/Indexing.cpp | 2 + src/ATen/native/xpu/sycl/IndexingKernels.h | 18 +-- src/ATen/native/xpu/sycl/LayerNormKernels.cpp | 2 + src/ATen/native/xpu/sycl/LayerNormKernels.h | 4 +- src/ATen/native/xpu/sycl/LerpKernels.cpp | 2 + src/ATen/native/xpu/sycl/LerpKernels.h | 6 +- .../native/xpu/sycl/LinearAlgebraKernels.cpp | 2 + .../native/xpu/sycl/LinearAlgebraKernels.h | 5 +- src/ATen/native/xpu/sycl/LogAddExpKernels.cpp | 3 + src/ATen/native/xpu/sycl/LogAddExpKernels.h | 4 +- src/ATen/native/xpu/sycl/LossKernels.cpp | 2 + src/ATen/native/xpu/sycl/LossKernels.h | 4 +- src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp | 14 +- src/ATen/native/xpu/sycl/LossNLL2dKernels.h | 8 +- 
src/ATen/native/xpu/sycl/LossNLLKernel.cpp | 2 + src/ATen/native/xpu/sycl/LossNLLKernel.h | 6 +- .../xpu/sycl/{Math.h => MathExtensions.h} | 0 .../xpu/sycl/MaxMinElementwiseKernels.cpp | 2 + .../xpu/sycl/MaxMinElementwiseKernels.h | 8 +- .../native/xpu/sycl/MultinomialKernel.cpp | 3 + src/ATen/native/xpu/sycl/MultinomialKernel.h | 4 +- src/ATen/native/xpu/sycl/NMSKernel.cpp | 2 + src/ATen/native/xpu/sycl/NMSKernel.h | 2 +- src/ATen/native/xpu/sycl/NonzeroKernel.cpp | 2 + src/ATen/native/xpu/sycl/NonzeroKernel.h | 2 +- .../native/xpu/sycl/PointwiseOpsKernels.cpp | 2 + .../native/xpu/sycl/PointwiseOpsKernels.h | 15 ++- src/ATen/native/xpu/sycl/PowKernels.cpp | 2 + src/ATen/native/xpu/sycl/PowKernels.h | 4 +- src/ATen/native/xpu/sycl/RandpermKernel.cpp | 2 + src/ATen/native/xpu/sycl/RandpermKernel.h | 10 +- .../native/xpu/sycl/RangeFactoriesKernel.cpp | 2 + .../native/xpu/sycl/RangeFactoriesKernel.h | 4 +- .../native/xpu/sycl/ReduceAMinMaxKernel.cpp | 2 + .../native/xpu/sycl/ReduceArgMaxKernel.cpp | 2 + .../native/xpu/sycl/ReduceArgMinKernel.cpp | 2 + .../native/xpu/sycl/ReduceLogicKernels.cpp | 2 + .../xpu/sycl/ReduceMaxValuesKernels.cpp | 2 + .../native/xpu/sycl/ReduceMaxValuesKernels.h | 6 +- .../xpu/sycl/ReduceMinValuesKernels.cpp | 2 + .../native/xpu/sycl/ReduceMinValuesKernels.h | 6 +- .../native/xpu/sycl/ReduceMomentKernels.cpp | 2 + src/ATen/native/xpu/sycl/ReduceNormKernel.cpp | 4 +- src/ATen/native/xpu/sycl/ReduceNormKernel.h | 2 +- src/ATen/native/xpu/sycl/ReduceOpsKernels.h | 25 ++-- .../native/xpu/sycl/ReduceSumProdKernels.cpp | 2 + .../native/xpu/sycl/ReflectionPadKernels.cpp | 4 +- .../native/xpu/sycl/ReflectionPadKernels.h | 14 +- src/ATen/native/xpu/sycl/RenormKernel.cpp | 2 + src/ATen/native/xpu/sycl/RenormKernel.h | 4 +- src/ATen/native/xpu/sycl/RepeatKernel.cpp | 5 +- src/ATen/native/xpu/sycl/RepeatKernel.h | 4 +- .../xpu/sycl/ReplicationPaddingKernels.cpp | 4 +- .../xpu/sycl/ReplicationPaddingKernels.h | 14 +- src/ATen/native/xpu/sycl/ResizeKernel.cpp | 4 +- src/ATen/native/xpu/sycl/ResizeKernel.h | 2 +- src/ATen/native/xpu/sycl/ScanKernels.h | 10 +- .../native/xpu/sycl/ScatterGatherKernels.cpp | 2 + .../native/xpu/sycl/ScatterGatherKernels.h | 14 +- src/ATen/native/xpu/sycl/Shape.cpp | 2 + src/ATen/native/xpu/sycl/ShapeKernels.h | 2 +- src/ATen/native/xpu/sycl/SoftMaxKernels.cpp | 2 + src/ATen/native/xpu/sycl/SoftMaxKernels.h | 8 +- src/ATen/native/xpu/sycl/Sorting.cpp | 2 + src/ATen/native/xpu/sycl/Sorting.h | 4 +- src/ATen/native/xpu/sycl/StepKernels.cpp | 2 + src/ATen/native/xpu/sycl/StepKernels.h | 2 +- .../native/xpu/sycl/SummaryOpsKernels.cpp | 2 + src/ATen/native/xpu/sycl/SummaryOpsKernels.h | 6 +- .../native/xpu/sycl/TensorCompareKernels.cpp | 2 + .../native/xpu/sycl/TensorCompareKernels.h | 16 ++- src/ATen/native/xpu/sycl/TensorTopKKernel.cpp | 2 + src/ATen/native/xpu/sycl/TensorTopKKernel.h | 2 +- .../xpu/sycl/TensorTransformationsKernels.cpp | 2 + .../xpu/sycl/TensorTransformationsKernels.h | 4 +- .../native/xpu/sycl/TriangularOpsKernels.cpp | 2 + .../native/xpu/sycl/TriangularOpsKernels.h | 12 +- .../native/xpu/sycl/UnaryComplexKernels.cpp | 2 + .../native/xpu/sycl/UnaryComplexKernels.h | 8 +- .../native/xpu/sycl/UnaryFractionKernels.cpp | 2 + .../native/xpu/sycl/UnaryFractionKernels.h | 14 +- .../native/xpu/sycl/UnaryGammaKernels.cpp | 4 +- src/ATen/native/xpu/sycl/UnaryGammaKernels.h | 6 +- .../xpu/sycl/UnaryGeometricAcosKernel.cpp | 2 + .../xpu/sycl/UnaryGeometricAcosKernel.h | 5 +- .../xpu/sycl/UnaryGeometricAcoshKernel.cpp | 2 + 
.../xpu/sycl/UnaryGeometricAcoshKernel.h | 4 +- .../xpu/sycl/UnaryGeometricAsinKernel.cpp | 2 + .../xpu/sycl/UnaryGeometricAsinKernel.h | 2 +- .../xpu/sycl/UnaryGeometricAsinhKernel.cpp | 2 + .../xpu/sycl/UnaryGeometricAsinhKernel.h | 2 +- .../xpu/sycl/UnaryGeometricAtanKernel.cpp | 2 + .../xpu/sycl/UnaryGeometricAtanKernel.h | 2 +- .../xpu/sycl/UnaryGeometricAtanhKernel.cpp | 2 + .../xpu/sycl/UnaryGeometricAtanhKernel.h | 2 +- .../xpu/sycl/UnaryGeometricCosKernel.cpp | 2 + .../native/xpu/sycl/UnaryGeometricCosKernel.h | 2 +- .../xpu/sycl/UnaryGeometricCoshKernel.cpp | 2 + .../xpu/sycl/UnaryGeometricCoshKernel.h | 2 +- .../xpu/sycl/UnaryGeometricSinKernel.cpp | 2 + .../native/xpu/sycl/UnaryGeometricSinKernel.h | 2 +- .../xpu/sycl/UnaryGeometricSinhKernel.cpp | 2 + .../xpu/sycl/UnaryGeometricSinhKernel.h | 2 +- .../xpu/sycl/UnaryGeometricTanKernel.cpp | 2 + .../native/xpu/sycl/UnaryGeometricTanKernel.h | 2 +- .../xpu/sycl/UnaryGeometricTanhKernel.cpp | 2 + .../xpu/sycl/UnaryGeometricTanhKernel.h | 2 +- src/ATen/native/xpu/sycl/UnaryKernels.cpp | 2 + src/ATen/native/xpu/sycl/UnaryKernels.h | 12 +- src/ATen/native/xpu/sycl/UnaryLogKernels.cpp | 2 + src/ATen/native/xpu/sycl/UnaryLogKernels.h | 8 +- src/ATen/native/xpu/sycl/UnarySignKernels.cpp | 2 + src/ATen/native/xpu/sycl/UnarySignKernels.h | 10 +- .../xpu/sycl/UnarySpecialOpsKernels.cpp | 2 + .../native/xpu/sycl/UnarySpecialOpsKernels.h | 14 +- .../native/xpu/sycl/UnfoldBackwardKernels.cpp | 2 + .../native/xpu/sycl/UnfoldBackwardKernels.h | 2 +- src/ATen/native/xpu/sycl/UniqueKernels.cpp | 2 + src/ATen/native/xpu/sycl/UniqueKernels.h | 11 +- .../xpu/sycl/UpSampleBicubic2dKernels.cpp | 2 + .../xpu/sycl/UpSampleBicubic2dKernels.h | 4 +- .../xpu/sycl/UpSampleBilinear2dKernels.cpp | 2 + .../xpu/sycl/UpSampleBilinear2dKernels.h | 6 +- .../xpu/sycl/UpSampleLinear1dKernels.cpp | 4 +- .../native/xpu/sycl/UpSampleLinear1dKernels.h | 7 +- .../xpu/sycl/UpSampleNearest1dKernels.cpp | 2 + .../xpu/sycl/UpSampleNearest1dKernels.h | 4 +- .../xpu/sycl/UpSampleNearest2dKernels.cpp | 2 +- .../xpu/sycl/UpSampleNearest2dKernels.h | 4 +- .../native/xpu/sycl/WeightNormKernels.cpp | 4 +- src/ATen/native/xpu/sycl/WeightNormKernels.h | 8 +- src/BuildOnLinux.cmake | 79 ++++++++++++ src/BuildOnWindows.cmake | 106 ++++++++++++++++ src/CMakeLists.txt | 84 ++---------- src/bridge.cpp | 34 +++++ 254 files changed, 970 insertions(+), 479 deletions(-) rename src/ATen/native/xpu/sycl/{Math.h => MathExtensions.h} (100%) create mode 100644 src/BuildOnLinux.cmake create mode 100644 src/BuildOnWindows.cmake create mode 100644 src/bridge.cpp diff --git a/cmake/Modules/FindSYCL/run_sycl.cmake index dcc99c18f..f56d49768 100644 --- a/cmake/Modules/FindSYCL/run_sycl.cmake +++ b/cmake/Modules/FindSYCL/run_sycl.cmake @@ -56,6 +56,12 @@ endforeach() # Choose host flags in FindSYCL.cmake @SYCL_host_flags@ +# Add the /permissive- flag for MSVC builds to overcome ambiguous symbol errors. +if(WIN32) + string(APPEND SYCL_host_compiler_flags "/permissive- ") +endif() + + list(REMOVE_DUPLICATES CMAKE_HOST_FLAGS) foreach(flag ${CMAKE_HOST_FLAGS}) # Extra quotes are added around each flag to help SYCL parse out flags with spaces. 
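Note on the hunk above: /permissive- puts MSVC into standards-conforming mode, which among other things enforces two-phase name lookup in templates; that stricter binding of names is what typically clears up MSVC "ambiguous symbol" diagnostics of the kind the comment mentions (the PR does not show the exact failing code). A small, hypothetical C++ illustration of how the flag changes name lookup, not code from this PR:

// Hypothetical example: accepted under legacy MSVC lookup (/permissive),
// rejected under /permissive- because conformant two-phase lookup applies.
template <typename T>
int call_g(T v) {
  return g(v); // dependent call: g must be visible here, or found via ADL
               // at the point of instantiation
}

int g(int v) { // declared only *after* the template definition
  return v;
}

int main() {
  // Legacy lookup defers resolution to the instantiation point and finds g;
  // under /permissive-, g is not visible at the template definition and int
  // has no associated namespaces for ADL, so this fails to compile.
  return call_g(42);
}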
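The header diffs that follow attach TORCH_XPU_API to every kernel entry point. This is the symbol-visibility half of the fix: once libtorch_xpu is split into several libraries, every symbol referenced across a library boundary must be explicitly exported on Windows, where DLL symbols are hidden by default. A minimal sketch of the usual export/import macro pattern such a macro needs to follow; the guard name TORCH_XPU_BUILD_MAIN_LIB is illustrative, not taken from this PR:

// Sketch of a Windows-aware export/import macro. The library that defines
// the symbols is built with the guard set and dllexports them; every
// consumer sees dllimport. On non-Windows builds, default ELF visibility
// is sufficient.
#if defined(_WIN32)
#  if defined(TORCH_XPU_BUILD_MAIN_LIB)
#    define TORCH_XPU_API __declspec(dllexport) // building the library itself
#  else
#    define TORCH_XPU_API __declspec(dllimport) // linking against it
#  endif
#else
#  define TORCH_XPU_API __attribute__((visibility("default")))
#endif

// A declaration like the ones patched below then resolves correctly across
// DLL boundaries:
//   TORCH_XPU_API void abs_kernel(TensorIteratorBase& iter);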
diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt index 815ad018f..57c4783cd 100644 --- a/src/ATen/CMakeLists.txt +++ b/src/ATen/CMakeLists.txt @@ -1,10 +1,13 @@ # ATen XPU sources -file(GLOB xpu_cpp "xpu/*.cpp" "native/xpu/*.cpp" "native/sparse/*.cpp") +file(GLOB xpu_cpp "xpu/*.cpp") +file(GLOB xpu_native_cpp "native/xpu/*.cpp" "native/sparse/*.cpp") file(GLOB xpu_sycl "native/xpu/sycl/*.cpp") list(APPEND ATen_XPU_CPP_SRCS ${xpu_cpp}) +list(APPEND ATen_XPU_NATIVE_CPP_SRCS ${xpu_native_cpp}) list(APPEND ATen_XPU_SYCL_SRCS ${xpu_sycl}) set(ATen_XPU_CPP_SRCS ${ATen_XPU_CPP_SRCS} PARENT_SCOPE) +set(ATen_XPU_NATIVE_CPP_SRCS ${ATen_XPU_NATIVE_CPP_SRCS} PARENT_SCOPE) set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE) diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp index f32a89a75..3aca0d5c7 100644 --- a/src/ATen/native/xpu/RangeFactories.cpp +++ b/src/ATen/native/xpu/RangeFactories.cpp @@ -29,7 +29,7 @@ Tensor& XPUNativeFunctions::arange_out( TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); TORCH_CHECK( - std::isfinite(xstart) && std::isfinite(xend), + std::isfinite(static_cast(xstart)) && std::isfinite(static_cast(xend)), "unsupported range: ", xstart, " -> ", @@ -97,7 +97,7 @@ Tensor& XPUNativeFunctions::range_out( TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); TORCH_CHECK( - std::isfinite(xstart) && std::isfinite(xend), + std::isfinite(static_cast(xstart)) && std::isfinite(static_cast(xend)), "unsupported range: ", xstart, " -> ", diff --git a/src/ATen/native/xpu/sycl/AbsKernel.cpp b/src/ATen/native/xpu/sycl/AbsKernel.cpp index 596000ecb..3df01baee 100644 --- a/src/ATen/native/xpu/sycl/AbsKernel.cpp +++ b/src/ATen/native/xpu/sycl/AbsKernel.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/AbsKernel.h b/src/ATen/native/xpu/sycl/AbsKernel.h index 3b51e993e..3423c32cf 100644 --- a/src/ATen/native/xpu/sycl/AbsKernel.h +++ b/src/ATen/native/xpu/sycl/AbsKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void abs_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void abs_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp index de1e6447b..57b5e34b3 100644 --- a/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationEluKernels.h b/src/ATen/native/xpu/sycl/ActivationEluKernels.h index 555bfb5f4..a15eaa590 100644 --- a/src/ATen/native/xpu/sycl/ActivationEluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationEluKernels.h @@ -4,13 +4,13 @@ namespace at::native::xpu { -void elu_kernel( +TORCH_XPU_API void elu_kernel( TensorIteratorBase& iter, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale); -void elu_backward_kernel( +TORCH_XPU_API void elu_backward_kernel( TensorIteratorBase& iter, const Scalar& alpha, const Scalar& scale, diff --git a/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp b/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp index 19a5de78c..5bbaa1ab0 100644 --- a/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp +++ b/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ActivationGeluKernel.h 
b/src/ATen/native/xpu/sycl/ActivationGeluKernel.h index 6c373a7cd..660fae9ff 100644 --- a/src/ATen/native/xpu/sycl/ActivationGeluKernel.h +++ b/src/ATen/native/xpu/sycl/ActivationGeluKernel.h @@ -6,9 +6,11 @@ namespace at { namespace native { namespace xpu { -void gelu_kernel(TensorIteratorBase& iter, c10::string_view approximate); +TORCH_XPU_API void gelu_kernel( + TensorIteratorBase& iter, + c10::string_view approximate); -void gelu_backward_kernel( +TORCH_XPU_API void gelu_backward_kernel( TensorIteratorBase& iter, c10::string_view approximate); diff --git a/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp index 056e8c332..f60613ec7 100644 --- a/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp @@ -2,9 +2,12 @@ #include #include +#include #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationGluKernels.h b/src/ATen/native/xpu/sycl/ActivationGluKernels.h index c0fd6e22c..a4b11dbb4 100644 --- a/src/ATen/native/xpu/sycl/ActivationGluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationGluKernels.h @@ -4,9 +4,9 @@ namespace at::native::xpu { -void glu_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void glu_kernel(TensorIteratorBase& iter); -void glu_backward_kernel( +TORCH_XPU_API void glu_backward_kernel( const TensorIteratorBase& iter, int64_t gI_stride, int64_t I_stride); diff --git a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp index d2c53e2b0..6399e6b23 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h index ee5f0e87c..47bfa636c 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void hardsigmoid_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void hardsigmoid_kernel(TensorIteratorBase& iter); -void hardsigmoid_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void hardsigmoid_backward_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp index 40099584a..7dadbf3aa 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.h b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.h index ea4e81fc9..6b741e744 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.h @@ -6,9 +6,9 @@ namespace at { namespace native { namespace xpu { -void hardswish_kernel(TensorIterator& iter); +TORCH_XPU_API void hardswish_kernel(TensorIterator& iter); -void hardswish_backward_kernel(TensorIterator& iter); +TORCH_XPU_API void hardswish_backward_kernel(TensorIterator& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp 
b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp index f712b951d..2f009757b 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h index c2d110bae..745c0f69b 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -void hardtanh_backward_kernel( +TORCH_XPU_API void hardtanh_backward_kernel( TensorIterator& iter, const Scalar& min, const Scalar& max); diff --git a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp index 9da5a275b..81e2cb5ba 100644 --- a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp @@ -6,6 +6,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h index 758701ce9..5656c661e 100644 --- a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h @@ -4,9 +4,11 @@ namespace at::native::xpu { -void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_); +TORCH_XPU_API void leaky_relu_kernel( + TensorIteratorBase& iter, + const Scalar& negval_); -void leaky_relu_backward_kernel( +TORCH_XPU_API void leaky_relu_backward_kernel( TensorIteratorBase& iter, const Scalar& negval_); diff --git a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp index a61694d32..09487462e 100644 --- a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp @@ -6,6 +6,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h index 1cc6c2e44..be8e7266c 100644 --- a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void log_sigmoid_forward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log_sigmoid_forward_kernel(TensorIteratorBase& iter); -void log_sigmoid_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log_sigmoid_backward_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp b/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp index fe0154326..5e517911b 100644 --- a/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp @@ -8,6 +8,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationMishKernels.h b/src/ATen/native/xpu/sycl/ActivationMishKernels.h index bd68197cd..a2c7e499d 100644 --- a/src/ATen/native/xpu/sycl/ActivationMishKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationMishKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void mish_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void mish_kernel(TensorIteratorBase& iter); -void 
mish_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void mish_backward_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationPreluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationPreluKernels.cpp index 8a2c7012f..9ec4e598b 100644 --- a/src/ATen/native/xpu/sycl/ActivationPreluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationPreluKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template @@ -40,4 +42,4 @@ void prelu_backward_kernel(TensorIterator& iter) { }); } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationPreluKernels.h b/src/ATen/native/xpu/sycl/ActivationPreluKernels.h index 23f60c242..4d33e9064 100644 --- a/src/ATen/native/xpu/sycl/ActivationPreluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationPreluKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void prelu_kernel(TensorIterator& iter); +TORCH_XPU_API void prelu_kernel(TensorIterator& iter); -void prelu_backward_kernel(TensorIterator& iter); +TORCH_XPU_API void prelu_backward_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp index 093a9a79b..924b75d81 100644 --- a/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationSiluKernels.h b/src/ATen/native/xpu/sycl/ActivationSiluKernels.h index c31acdc9a..eaec67986 100644 --- a/src/ATen/native/xpu/sycl/ActivationSiluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationSiluKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void silu_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void silu_kernel(TensorIteratorBase& iter); -void silu_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void silu_backward_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.cpp b/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.cpp index 99b1c5716..8dc3684e3 100644 --- a/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h b/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h index 8a5e5ef2b..a9b6027bc 100644 --- a/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void softplus_kernel( +TORCH_XPU_API void softplus_kernel( TensorIteratorBase& iter, const Scalar& beta_, const Scalar& threshold_); -void softplus_backward_kernel( +TORCH_XPU_API void softplus_backward_kernel( TensorIteratorBase& iter, const Scalar& beta_, const Scalar& threshold_); diff --git a/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp b/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp index 4393576e9..b96ab461e 100644 --- a/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h 
b/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h index 481d1e5a1..398e3f4a4 100644 --- a/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h @@ -4,8 +4,12 @@ namespace at::native::xpu { -void softshrink_kernel(TensorIteratorBase& iter, const Scalar& value); +TORCH_XPU_API void softshrink_kernel( + TensorIteratorBase& iter, + const Scalar& value); -void softshrink_backward_kernel(TensorIteratorBase& iter, const Scalar& value); +TORCH_XPU_API void softshrink_backward_kernel( + TensorIteratorBase& iter, + const Scalar& value); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp index 055aa262d..175db7753 100644 --- a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp +++ b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.h b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.h index c22a6b18c..df38c6d3d 100644 --- a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.h +++ b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -void threshold_kernel( +TORCH_XPU_API void threshold_kernel( TensorIteratorBase& iter, const Scalar& threshold, const Scalar& value); diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp index 4d7ef286d..0221ceec0 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at::native::xpu { using namespace at::xpu; diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h index 9b6d9a046..0d58a6e7f 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void adaptive_avg_pool2d_backward_kernel( +TORCH_XPU_API void adaptive_avg_pool2d_backward_kernel( Tensor& gradInput, const Tensor& gradOutput, const Tensor& input); -void adaptive_avg_pool2d_kernel( +TORCH_XPU_API void adaptive_avg_pool2d_kernel( Tensor& output, const Tensor& input, IntArrayRef output_size); diff --git a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp index 9da231b0d..cb2c6b083 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp @@ -12,6 +12,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h index 028fed1d2..abba4e354 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h +++ b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h @@ -4,13 +4,13 @@ namespace at::native::xpu { -void adaptive_max_pool2d_kernel( +TORCH_XPU_API void adaptive_max_pool2d_kernel( const Tensor& input, IntArrayRef output_size, Tensor& output, Tensor& indices); -void adaptive_max_pool2d_backward_kernel( +TORCH_XPU_API void adaptive_max_pool2d_backward_kernel( const Tensor& grad_output, const Tensor& input, 
const Tensor& indices, diff --git a/src/ATen/native/xpu/sycl/AmpKernels.cpp b/src/ATen/native/xpu/sycl/AmpKernels.cpp index 87354eea9..137ae3902 100644 --- a/src/ATen/native/xpu/sycl/AmpKernels.cpp +++ b/src/ATen/native/xpu/sycl/AmpKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/AmpKernels.h b/src/ATen/native/xpu/sycl/AmpKernels.h index 4bf863597..4f828c1f2 100644 --- a/src/ATen/native/xpu/sycl/AmpKernels.h +++ b/src/ATen/native/xpu/sycl/AmpKernels.h @@ -3,17 +3,17 @@ namespace at::native::xpu { -void amp_non_finite_check_and_unscale_kernel( +TORCH_XPU_API void amp_non_finite_check_and_unscale_kernel( Tensor& scaled_grad, Tensor& found_inf, const Tensor& inv_scale); -void amp_foreach_non_finite_check_and_unscale_kernel( +TORCH_XPU_API void amp_foreach_non_finite_check_and_unscale_kernel( std::vector> scaled_grads, Tensor& found_inf, const Tensor& inv_scale); -Tensor& amp_update_scale_kernel( +TORCH_XPU_API Tensor& amp_update_scale_kernel( Tensor& current_scale, Tensor& growth_tracker, const Tensor& found_inf, diff --git a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp index 4e2328641..7373935aa 100644 --- a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at::native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/AveragePool2dKernels.h b/src/ATen/native/xpu/sycl/AveragePool2dKernels.h index 75ac83a4e..84842355d 100644 --- a/src/ATen/native/xpu/sycl/AveragePool2dKernels.h +++ b/src/ATen/native/xpu/sycl/AveragePool2dKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void avg_pool2d_kernel( +TORCH_XPU_API void avg_pool2d_kernel( const Tensor& input_, IntArrayRef kernel_size, IntArrayRef stride, @@ -14,7 +14,7 @@ void avg_pool2d_kernel( c10::optional divisor_override, Tensor& output); -void avg_pool2d_backward_kernel( +TORCH_XPU_API void avg_pool2d_backward_kernel( const Tensor& gradOutput_, const Tensor& input_, IntArrayRef kernel_size, diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp index 25acd873a..084a9b65f 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp @@ -13,6 +13,14 @@ #include #include +#include + +#ifdef _WIN32 +#define RESTRICT __restrict +#else +#define RESTRICT __restrict__ +#endif + namespace at { namespace native { namespace xpu { @@ -1008,9 +1016,9 @@ struct BatchNormCollectStatisticsChannelsLastKernelFunctor } BatchNormCollectStatisticsChannelsLastKernelFunctor( - const scalar_t* __restrict__ input, - accscalar_t* __restrict__ out_mean, - accscalar_t* __restrict__ out_invstd, + const scalar_t* RESTRICT input, + accscalar_t* RESTRICT out_mean, + accscalar_t* RESTRICT out_invstd, volatile accscalar_t* staging_data, int* semaphores, const int reduction_size, @@ -1028,9 +1036,9 @@ struct BatchNormCollectStatisticsChannelsLastKernelFunctor wg_size_(wg_size) {} private: - const scalar_t* __restrict__ input_; - accscalar_t* __restrict__ out_mean_; - accscalar_t* __restrict__ out_invstd_; + const scalar_t* RESTRICT input_; + accscalar_t* RESTRICT out_mean_; + accscalar_t* RESTRICT out_invstd_; volatile accscalar_t* staging_data_; int* semaphores_; const int reduction_size_; @@ -1375,13 +1383,13 @@ struct BatchNormTransformInputChannelsLastKernelFunctor { } BatchNormTransformInputChannelsLastKernelFunctor( 
- const scalar_t* __restrict__ input, - const scalar_t* __restrict__ z, - const accscalar_t* __restrict__ mean, - const accscalar_t* __restrict__ inv_std, - const layerscalar_t* __restrict__ weight, - const layerscalar_t* __restrict__ shift, - scalar_t* __restrict__ out, + const scalar_t* RESTRICT input, + const scalar_t* RESTRICT z, + const accscalar_t* RESTRICT mean, + const accscalar_t* RESTRICT inv_std, + const layerscalar_t* RESTRICT weight, + const layerscalar_t* RESTRICT shift, + scalar_t* RESTRICT out, const int reduction_size, const int stride, const bool fuse_relu) @@ -1397,13 +1405,13 @@ struct BatchNormTransformInputChannelsLastKernelFunctor { fuse_relu_(fuse_relu) {} private: - const scalar_t* __restrict__ input_; - const scalar_t* __restrict__ z_; - const accscalar_t* __restrict__ mean_; - const accscalar_t* __restrict__ inv_std_; - const layerscalar_t* __restrict__ weight_; - const layerscalar_t* __restrict__ shift_; - scalar_t* __restrict__ out_; + const scalar_t* RESTRICT input_; + const scalar_t* RESTRICT z_; + const accscalar_t* RESTRICT mean_; + const accscalar_t* RESTRICT inv_std_; + const layerscalar_t* RESTRICT weight_; + const layerscalar_t* RESTRICT shift_; + scalar_t* RESTRICT out_; const int reduction_size_; const int stride_; const bool fuse_relu_; @@ -2034,14 +2042,14 @@ struct BatchNormBackwardReduceChannelsLastKernelFunctor } BatchNormBackwardReduceChannelsLastKernelFunctor( - const scalar_t* __restrict__ input, - const scalar_t* __restrict__ grad_output, - const accscalar_t* __restrict__ mean, - const accscalar_t* __restrict__ inv_std, - accscalar_t* __restrict__ sum_dy_o, - accscalar_t* __restrict__ sum_dy_xmu_o, - layerscalar_t* __restrict__ grad_weight, - layerscalar_t* __restrict__ grad_bias, + const scalar_t* RESTRICT input, + const scalar_t* RESTRICT grad_output, + const accscalar_t* RESTRICT mean, + const accscalar_t* RESTRICT inv_std, + accscalar_t* RESTRICT sum_dy_o, + accscalar_t* RESTRICT sum_dy_xmu_o, + layerscalar_t* RESTRICT grad_weight, + layerscalar_t* RESTRICT grad_bias, volatile accscalar_t* staging_data, int* semaphores, const int reduction_size, @@ -2062,14 +2070,14 @@ struct BatchNormBackwardReduceChannelsLastKernelFunctor wg_size_(wg_size) {} private: - const scalar_t* __restrict__ input_; - const scalar_t* __restrict__ grad_output_; - const accscalar_t* __restrict__ mean_; - const accscalar_t* __restrict__ inv_std_; - accscalar_t* __restrict__ sum_dy_o_; - accscalar_t* __restrict__ sum_dy_xmu_o_; - layerscalar_t* __restrict__ grad_weight_; - layerscalar_t* __restrict__ grad_bias_; + const scalar_t* RESTRICT input_; + const scalar_t* RESTRICT grad_output_; + const accscalar_t* RESTRICT mean_; + const accscalar_t* RESTRICT inv_std_; + accscalar_t* RESTRICT sum_dy_o_; + accscalar_t* RESTRICT sum_dy_xmu_o_; + layerscalar_t* RESTRICT grad_weight_; + layerscalar_t* RESTRICT grad_bias_; volatile accscalar_t* staging_data_; int* semaphores_; const int reduction_size_; @@ -2395,7 +2403,7 @@ struct BatchNormBackwardElemtKernelFunctor { GenericPackedTensorAccessor grad_input, const stat_accscalar_t norm_fct, - const int* __restrict__ numel = nullptr, + const int* RESTRICT numel = nullptr, const int world_size = 0) : input_(input), grad_output_(grad_output), @@ -2451,7 +2459,7 @@ struct BatchNormBackwardElemtKernelFunctor { GenericPackedTensorAccessor grad_input_; const stat_accscalar_t norm_fct_; - const int* __restrict__ numel_; + const int* RESTRICT numel_; const int world_size_; }; @@ -2680,18 +2688,18 @@ struct 
BatchNormBackwardElemtChannelsLastKernelFunctor { } BatchNormBackwardElemtChannelsLastKernelFunctor( - const scalar_t* __restrict__ grad_output, - const scalar_t* __restrict__ input, - const accscalar_t* __restrict__ mean, - const accscalar_t* __restrict__ inv_std, - const layerscalar_t* __restrict__ weight, - const accscalar_t* __restrict__ sum_dy, - const accscalar_t* __restrict__ sum_dy_xmu, - scalar_t* __restrict__ grad_input, + const scalar_t* RESTRICT grad_output, + const scalar_t* RESTRICT input, + const accscalar_t* RESTRICT mean, + const accscalar_t* RESTRICT inv_std, + const layerscalar_t* RESTRICT weight, + const accscalar_t* RESTRICT sum_dy, + const accscalar_t* RESTRICT sum_dy_xmu, + scalar_t* RESTRICT grad_input, const accscalar_t norm_fct, const int reduction_size, const int stride, - const int* __restrict__ numel = nullptr, + const int* RESTRICT numel = nullptr, const int64_t world_size = 0) : grad_output_(grad_output), input_(input), @@ -2708,18 +2716,18 @@ struct BatchNormBackwardElemtChannelsLastKernelFunctor { world_size_(world_size) {} private: - const scalar_t* __restrict__ grad_output_; - const scalar_t* __restrict__ input_; - const accscalar_t* __restrict__ mean_; - const accscalar_t* __restrict__ inv_std_; - const layerscalar_t* __restrict__ weight_; - const accscalar_t* __restrict__ sum_dy_; - const accscalar_t* __restrict__ sum_dy_xmu_; - scalar_t* __restrict__ grad_input_; + const scalar_t* RESTRICT grad_output_; + const scalar_t* RESTRICT input_; + const accscalar_t* RESTRICT mean_; + const accscalar_t* RESTRICT inv_std_; + const layerscalar_t* RESTRICT weight_; + const accscalar_t* RESTRICT sum_dy_; + const accscalar_t* RESTRICT sum_dy_xmu_; + scalar_t* RESTRICT grad_input_; const accscalar_t norm_fct_; const int reduction_size_; const int stride_; - const int* __restrict__ numel_; + const int* RESTRICT numel_; const int64_t world_size_; }; diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.h b/src/ATen/native/xpu/sycl/BatchNormKernels.h index 3bc559b38..9d9ae780b 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.h +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.h @@ -6,11 +6,11 @@ namespace at { namespace native { namespace xpu { -std::tuple batch_norm_stats_kernel( +TORCH_XPU_API std::tuple batch_norm_stats_kernel( const Tensor& self, double epsilon); -void batch_norm_elemt_kernel( +TORCH_XPU_API void batch_norm_elemt_kernel( Tensor& out, const Tensor& self, const c10::optional& weight_opt, @@ -18,7 +18,8 @@ void batch_norm_elemt_kernel( const Tensor& mean_, const Tensor& invstd_); -std::tuple batch_norm_backward_reduce_kernel( +TORCH_XPU_API std::tuple +batch_norm_backward_reduce_kernel( const Tensor& grad_output, const Tensor& input, const Tensor& mean, @@ -28,7 +29,7 @@ std::tuple batch_norm_backward_reduce_kernel( bool weight_g, bool bias_g); -Tensor batch_norm_backward_elemt_kernel( +TORCH_XPU_API Tensor batch_norm_backward_elemt_kernel( const Tensor& self, const Tensor& input, const Tensor& mean, @@ -38,13 +39,13 @@ Tensor batch_norm_backward_elemt_kernel( const Tensor& sum_dy_xmu, const Tensor& count); -std::tuple batch_norm_update_stats_kernel( +TORCH_XPU_API std::tuple batch_norm_update_stats_kernel( const Tensor& self, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, double momentum); -std::tuple batch_norm_kernel( +TORCH_XPU_API std::tuple batch_norm_kernel( const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, @@ -57,7 +58,7 @@ std::tuple batch_norm_kernel( Tensor& save_mean, 
Tensor& save_invstd); -std::tuple batch_norm_backward_kernel( +TORCH_XPU_API std::tuple batch_norm_backward_kernel( const Tensor& grad_out, const Tensor& input, const c10::optional& weight_opt, diff --git a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp index 7e2906e8b..34e50222e 100644 --- a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h index 4247c0a79..f9712b85d 100644 --- a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h @@ -6,11 +6,11 @@ namespace at { namespace native { namespace xpu { -void bitwise_and_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void bitwise_and_kernel(TensorIteratorBase& iter); -void bitwise_or_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void bitwise_or_kernel(TensorIteratorBase& iter); -void bitwise_xor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void bitwise_xor_kernel(TensorIteratorBase& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/BinaryDivTrueKernel.cpp b/src/ATen/native/xpu/sycl/BinaryDivTrueKernel.cpp index e10d25dd5..6f35c0cb6 100644 --- a/src/ATen/native/xpu/sycl/BinaryDivTrueKernel.cpp +++ b/src/ATen/native/xpu/sycl/BinaryDivTrueKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { void div_true_kernel(TensorIteratorBase& iter) { diff --git a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp index e170760e8..3224ba54f 100644 --- a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp @@ -3,6 +3,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h index 588d52c4f..61af8b511 100644 --- a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void atan2_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void atan2_kernel(TensorIteratorBase& iter); -void hypot_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void hypot_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryKernels.cpp b/src/ATen/native/xpu/sycl/BinaryKernels.cpp index 327eb9486..2902486bf 100644 --- a/src/ATen/native/xpu/sycl/BinaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/BinaryKernels.h b/src/ATen/native/xpu/sycl/BinaryKernels.h index 78dfaa0da..f862595eb 100644 --- a/src/ATen/native/xpu/sycl/BinaryKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryKernels.h @@ -4,16 +4,16 @@ namespace at::native::xpu { -void add_kernel(TensorIteratorBase& iter, const Scalar& alpha); +TORCH_XPU_API void add_kernel(TensorIteratorBase& iter, const Scalar& alpha); -void sub_kernel(TensorIteratorBase& iter, const Scalar& alpha); +TORCH_XPU_API void sub_kernel(TensorIteratorBase& iter, const Scalar& alpha); -void mul_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void mul_kernel(TensorIteratorBase& iter); 
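Aside on the RESTRICT macro introduced in BatchNormKernels.cpp above: it exists because MSVC spells the no-alias extension __restrict, while GCC/Clang-style compilers (including icpx) spell it __restrict__. A self-contained sketch of the same pattern and of what the qualifier buys; the scale() function is an illustrative stand-in, not code from this PR:

#include <cstddef>

// Portable restrict qualifier, mirroring the macro added in
// BatchNormKernels.cpp: MSVC accepts __restrict, GCC/Clang accept
// __restrict__.
#ifdef _WIN32
#define RESTRICT __restrict
#else
#define RESTRICT __restrict__
#endif

// RESTRICT promises the compiler that out and in never alias, letting it
// vectorize the loop without emitting runtime overlap checks.
void scale(float* RESTRICT out, const float* RESTRICT in, float s, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = s * in[i];
  }
}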
-void div_true_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void div_true_kernel(TensorIteratorBase& iter); -void div_trunc_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void div_trunc_kernel(TensorIteratorBase& iter); -void div_floor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void div_floor_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp index 23146b47d..18bcffca6 100644 --- a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h index ee641d9fb..82271e6a1 100644 --- a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void logical_and_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logical_and_kernel(TensorIteratorBase& iter); -void logical_or_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logical_or_kernel(TensorIteratorBase& iter); -void logical_xor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logical_xor_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp index 667236123..a7a676675 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.h index 37553191d..8bea1f44c 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.h @@ -4,10 +4,12 @@ namespace at::native::xpu { -void sigmoid_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sigmoid_backward_kernel(TensorIteratorBase& iter); -void tanh_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void tanh_backward_kernel(TensorIteratorBase& iter); -void logit_backward_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar); +TORCH_XPU_API void logit_backward_kernel( + TensorIteratorBase& iter, + const Scalar& eps_scalar); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp index 5ac71c163..3e0989b24 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h index 17672ec29..ffe08be3e 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void mse_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void mse_kernel(TensorIteratorBase& iter); -void smooth_l1_kernel(TensorIteratorBase& iter, double beta); +TORCH_XPU_API void smooth_l1_kernel(TensorIteratorBase& iter, double beta); -void huber_kernel(TensorIterator& iter, double delta); +TORCH_XPU_API void 
huber_kernel(TensorIterator& iter, double delta); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp index 4f06a94fc..9a5320c68 100644 --- a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp +++ b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp @@ -3,6 +3,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.h b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.h index 1d400951f..f3d3e2390 100644 --- a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.h +++ b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.h @@ -6,9 +6,9 @@ namespace at { namespace native { namespace xpu { -void remainder_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void remainder_kernel(TensorIteratorBase& iter); -void fmod_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void fmod_kernel(TensorIteratorBase& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.cpp index 64adba17e..1a0027d2f 100644 --- a/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template @@ -45,4 +47,4 @@ void rshift_kernel(TensorIteratorBase& iter) { }); } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.h index 7e661d919..64ded1705 100644 --- a/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void lshift_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void lshift_kernel(TensorIteratorBase& iter); -void rshift_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void rshift_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BucketizationKernels.cpp b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp index d56eff222..c5686babb 100644 --- a/src/ATen/native/xpu/sycl/BucketizationKernels.cpp +++ b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { // customized lower_bound func to ensure the low bound of 'nan', 'inf' etc. 
be diff --git a/src/ATen/native/xpu/sycl/BucketizationKernels.h b/src/ATen/native/xpu/sycl/BucketizationKernels.h index f47cea2af..1c86ece08 100644 --- a/src/ATen/native/xpu/sycl/BucketizationKernels.h +++ b/src/ATen/native/xpu/sycl/BucketizationKernels.h @@ -2,11 +2,13 @@ #include namespace at::native::xpu { -void searchsorted_kernel( + +TORCH_XPU_API void searchsorted_kernel( Tensor& result, const Tensor& input, const Tensor& sorted_sequence, bool out_int32, bool right, const Tensor& sorter); -} // namespace at::native::xpu \ No newline at end of file + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/Col2ImKernel.cpp b/src/ATen/native/xpu/sycl/Col2ImKernel.cpp index 299711162..1f9321692 100644 --- a/src/ATen/native/xpu/sycl/Col2ImKernel.cpp +++ b/src/ATen/native/xpu/sycl/Col2ImKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/Col2ImKernel.h b/src/ATen/native/xpu/sycl/Col2ImKernel.h index db220a6db..a910112e0 100644 --- a/src/ATen/native/xpu/sycl/Col2ImKernel.h +++ b/src/ATen/native/xpu/sycl/Col2ImKernel.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void col2im_kernel( +TORCH_XPU_API void col2im_kernel( Tensor& output, const Tensor& input_, IntArrayRef output_size, diff --git a/src/ATen/native/xpu/sycl/CompareKernels.cpp b/src/ATen/native/xpu/sycl/CompareKernels.cpp index b9af47cb7..1096dfa40 100644 --- a/src/ATen/native/xpu/sycl/CompareKernels.cpp +++ b/src/ATen/native/xpu/sycl/CompareKernels.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/CompareKernels.h b/src/ATen/native/xpu/sycl/CompareKernels.h index d68827631..52cbc151c 100644 --- a/src/ATen/native/xpu/sycl/CompareKernels.h +++ b/src/ATen/native/xpu/sycl/CompareKernels.h @@ -6,17 +6,17 @@ namespace at { namespace native { namespace xpu { -void eq_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void eq_kernel(TensorIteratorBase& iter); -void ne_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void ne_kernel(TensorIteratorBase& iter); -void lt_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void lt_kernel(TensorIteratorBase& iter); -void le_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void le_kernel(TensorIteratorBase& iter); -void gt_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void gt_kernel(TensorIteratorBase& iter); -void ge_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void ge_kernel(TensorIteratorBase& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.cpp b/src/ATen/native/xpu/sycl/ComplexKernels.cpp index 87504bd5e..4e44d9064 100644 --- a/src/ATen/native/xpu/sycl/ComplexKernels.cpp +++ b/src/ATen/native/xpu/sycl/ComplexKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.h b/src/ATen/native/xpu/sycl/ComplexKernels.h index d51556b4f..900bbaca6 100644 --- a/src/ATen/native/xpu/sycl/ComplexKernels.h +++ b/src/ATen/native/xpu/sycl/ComplexKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void complex_kernel(TensorIterator& iter); +TORCH_XPU_API void complex_kernel(TensorIterator& iter); -void polar_kernel(TensorIterator& iter); +TORCH_XPU_API void polar_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp index c2dde5056..bdddd3f44 100644 --- a/src/ATen/native/xpu/sycl/CopyKernel.cpp +++ 
b/src/ATen/native/xpu/sycl/CopyKernel.cpp @@ -7,6 +7,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/CopyKernel.h b/src/ATen/native/xpu/sycl/CopyKernel.h index d02a1a45b..30232b27a 100644 --- a/src/ATen/native/xpu/sycl/CopyKernel.h +++ b/src/ATen/native/xpu/sycl/CopyKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void copy_kernel(TensorIterator& iter); +TORCH_XPU_API void copy_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/CopysignKernel.cpp b/src/ATen/native/xpu/sycl/CopysignKernel.cpp index 3b8351abf..327828adc 100644 --- a/src/ATen/native/xpu/sycl/CopysignKernel.cpp +++ b/src/ATen/native/xpu/sycl/CopysignKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/CopysignKernel.h b/src/ATen/native/xpu/sycl/CopysignKernel.h index cf856728c..86b6a40d9 100644 --- a/src/ATen/native/xpu/sycl/CopysignKernel.h +++ b/src/ATen/native/xpu/sycl/CopysignKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void copysign_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void copysign_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/CrossKernel.cpp b/src/ATen/native/xpu/sycl/CrossKernel.cpp index 0ed5f11eb..313261c9d 100644 --- a/src/ATen/native/xpu/sycl/CrossKernel.cpp +++ b/src/ATen/native/xpu/sycl/CrossKernel.cpp @@ -4,6 +4,8 @@ #include #include +#include + namespace at::native::xpu { template struct CrossKernelFunctor { diff --git a/src/ATen/native/xpu/sycl/CrossKernel.h b/src/ATen/native/xpu/sycl/CrossKernel.h index cc77229c4..8d382b97e 100644 --- a/src/ATen/native/xpu/sycl/CrossKernel.h +++ b/src/ATen/native/xpu/sycl/CrossKernel.h @@ -1,9 +1,12 @@ #pragma once #include + namespace at::native::xpu { -void linalg_cross_kernel( + +TORCH_XPU_API void linalg_cross_kernel( const Tensor& result, const Tensor& x1, const Tensor& x2, int64_t dim); + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/CumprodKernel.cpp b/src/ATen/native/xpu/sycl/CumprodKernel.cpp index f35cc7979..375e6a80c 100644 --- a/src/ATen/native/xpu/sycl/CumprodKernel.cpp +++ b/src/ATen/native/xpu/sycl/CumprodKernel.cpp @@ -3,6 +3,9 @@ #include +#include +#include + namespace at::native::xpu { void launch_cumprod_kernel( diff --git a/src/ATen/native/xpu/sycl/CumprodKernel.h b/src/ATen/native/xpu/sycl/CumprodKernel.h index 71e8c7693..d78301f04 100644 --- a/src/ATen/native/xpu/sycl/CumprodKernel.h +++ b/src/ATen/native/xpu/sycl/CumprodKernel.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void launch_cumprod_kernel( +TORCH_XPU_API void launch_cumprod_kernel( const Tensor& result, const Tensor& self, int64_t dim); diff --git a/src/ATen/native/xpu/sycl/CumsumKernel.cpp b/src/ATen/native/xpu/sycl/CumsumKernel.cpp index c2754869b..0523db84b 100644 --- a/src/ATen/native/xpu/sycl/CumsumKernel.cpp +++ b/src/ATen/native/xpu/sycl/CumsumKernel.cpp @@ -3,6 +3,9 @@ #include +#include +#include + namespace at::native::xpu { void launch_cumsum_kernel( diff --git a/src/ATen/native/xpu/sycl/CumsumKernel.h b/src/ATen/native/xpu/sycl/CumsumKernel.h index 79c299608..77ce62b61 100644 --- a/src/ATen/native/xpu/sycl/CumsumKernel.h +++ b/src/ATen/native/xpu/sycl/CumsumKernel.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void launch_cumsum_kernel( +TORCH_XPU_API void launch_cumsum_kernel( const Tensor& result, const Tensor& self, int64_t dim); diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp 
b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index eab6f4c48..8c130a0b9 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -15,6 +15,8 @@ #include #include +#include + namespace at::native::xpu { static inline int p_start( diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h index de550f4be..9d827c642 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void max_pool2d_with_indices_kernel( +TORCH_XPU_API void max_pool2d_with_indices_kernel( const Tensor& input, IntArrayRef kernel_size, IntArrayRef stride, @@ -14,7 +14,7 @@ void max_pool2d_with_indices_kernel( Tensor& output, Tensor& indices); -Tensor& max_pool2d_with_indices_backward_kernel( +TORCH_XPU_API Tensor& max_pool2d_with_indices_backward_kernel( Tensor& gradInput, const Tensor& gradOutput, const Tensor& input, diff --git a/src/ATen/native/xpu/sycl/DistanceKernels.cpp b/src/ATen/native/xpu/sycl/DistanceKernels.cpp index eb0f1f50e..446466a3e 100644 --- a/src/ATen/native/xpu/sycl/DistanceKernels.cpp +++ b/src/ATen/native/xpu/sycl/DistanceKernels.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { template static double device_sqrt(scalar_t val) { diff --git a/src/ATen/native/xpu/sycl/DistanceKernels.h b/src/ATen/native/xpu/sycl/DistanceKernels.h index 446a6b7c2..a53b84cdf 100644 --- a/src/ATen/native/xpu/sycl/DistanceKernels.h +++ b/src/ATen/native/xpu/sycl/DistanceKernels.h @@ -2,9 +2,11 @@ #include namespace at::native::xpu { -void cdist_kernel( + +TORCH_XPU_API void cdist_kernel( Tensor& result, const Tensor& x1_expanded, const Tensor& x2_expanded, double p); + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp b/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp index ab7bc609c..1a01a7cfc 100644 --- a/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp +++ b/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp b/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp index 4c1b83689..cfc3e1dfa 100644 --- a/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp +++ b/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at::native::xpu { void exponential_kernel( diff --git a/src/ATen/native/xpu/sycl/DistributionKernels.h b/src/ATen/native/xpu/sycl/DistributionKernels.h index e5700f26e..88d62933f 100644 --- a/src/ATen/native/xpu/sycl/DistributionKernels.h +++ b/src/ATen/native/xpu/sycl/DistributionKernels.h @@ -4,41 +4,43 @@ namespace at::native::xpu { -void random_from_to_kernel( +TORCH_XPU_API void random_from_to_kernel( TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen_); -void random_full_64_bits_range_kernel( +TORCH_XPU_API void random_full_64_bits_range_kernel( TensorIteratorBase& iter, c10::optional gen_); -void random_kernel(TensorIteratorBase& iter, c10::optional gen_); +TORCH_XPU_API void random_kernel( + TensorIteratorBase& iter, + c10::optional gen_); -void uniform_kernel( +TORCH_XPU_API void uniform_kernel( TensorIteratorBase& iter, double from, double to, c10::optional gen); -void normal_kernel( +TORCH_XPU_API void normal_kernel( const TensorBase& self, double mean, double std, 
c10::optional gen); -void bernoulli_tensor_kernel( +TORCH_XPU_API void bernoulli_tensor_kernel( const TensorBase& self, const TensorBase& p_, c10::optional gen_); -void bernoulli_scalar_kernel( +TORCH_XPU_API void bernoulli_scalar_kernel( const TensorBase& self, double p, c10::optional gen); -void exponential_kernel( +TORCH_XPU_API void exponential_kernel( TensorIteratorBase& iter, double lambda, c10::optional gen); diff --git a/src/ATen/native/xpu/sycl/DistributionNormal.cpp b/src/ATen/native/xpu/sycl/DistributionNormal.cpp index 41b7c6df5..93b938210 100644 --- a/src/ATen/native/xpu/sycl/DistributionNormal.cpp +++ b/src/ATen/native/xpu/sycl/DistributionNormal.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp b/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp index 138d81454..3c15de1d9 100644 --- a/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp +++ b/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/DistributionUniform.cpp b/src/ATen/native/xpu/sycl/DistributionUniform.cpp index 91261e18d..c38626909 100644 --- a/src/ATen/native/xpu/sycl/DistributionUniform.cpp +++ b/src/ATen/native/xpu/sycl/DistributionUniform.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/Dropout.cpp b/src/ATen/native/xpu/sycl/Dropout.cpp index 3886a2a72..ddaee4d4c 100644 --- a/src/ATen/native/xpu/sycl/Dropout.cpp +++ b/src/ATen/native/xpu/sycl/Dropout.cpp @@ -10,6 +10,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/DropoutKernels.h b/src/ATen/native/xpu/sycl/DropoutKernels.h index a72e8e2f8..922b2838c 100644 --- a/src/ATen/native/xpu/sycl/DropoutKernels.h +++ b/src/ATen/native/xpu/sycl/DropoutKernels.h @@ -6,15 +6,13 @@ namespace at { namespace native { namespace xpu { -std::tuple dropout_kernel( +TORCH_XPU_API std::tuple dropout_kernel( const Tensor& self, double p, c10::optional train); -Tensor dropout_backward_kernel( - const Tensor& grad, - const Tensor& mask, - double scale); +TORCH_XPU_API Tensor +dropout_backward_kernel(const Tensor& grad, const Tensor& mask, double scale); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/Embedding.cpp b/src/ATen/native/xpu/sycl/Embedding.cpp index b75494ca2..d905a4d97 100644 --- a/src/ATen/native/xpu/sycl/Embedding.cpp +++ b/src/ATen/native/xpu/sycl/Embedding.cpp @@ -4,6 +4,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp index 0c7338d5d..f4f35061f 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp +++ b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { std::pair promoteIndicesAndOffsets( diff --git a/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h b/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h index 047981b57..97ad6f0d0 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h +++ b/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -std::tuple _embedding_bag_kernel( +TORCH_XPU_API std::tuple _embedding_bag_kernel( const Tensor& weight_t, const Tensor& indices_t, const Tensor& 
offsets_t, diff --git a/src/ATen/native/xpu/sycl/EmbeddingKernels.h b/src/ATen/native/xpu/sycl/EmbeddingKernels.h index 5aaa3257a..f2e75b37d 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingKernels.h +++ b/src/ATen/native/xpu/sycl/EmbeddingKernels.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -Tensor embedding_dense_backward_kernel( +TORCH_XPU_API Tensor embedding_dense_backward_kernel( const Tensor& grad_, const Tensor& indices_, int64_t num_weights, diff --git a/src/ATen/native/xpu/sycl/FillKernel.cpp b/src/ATen/native/xpu/sycl/FillKernel.cpp index a0ee93b81..22e845364 100644 --- a/src/ATen/native/xpu/sycl/FillKernel.cpp +++ b/src/ATen/native/xpu/sycl/FillKernel.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/FillKernel.h b/src/ATen/native/xpu/sycl/FillKernel.h index 82bdd5f39..11c813cc0 100644 --- a/src/ATen/native/xpu/sycl/FillKernel.h +++ b/src/ATen/native/xpu/sycl/FillKernel.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -void fill_kernel(TensorIterator& iter, const Scalar& scalar); +TORCH_XPU_API void fill_kernel(TensorIterator& iter, const Scalar& scalar); } } // namespace native diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h b/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h index 56afbf630..d4448ca1b 100644 --- a/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h @@ -31,11 +31,11 @@ namespace at::native::xpu { std::vector FOREACH_BINARY_LIST_ALPHA_KERNEL_NAME(NAME)( \ TensorList tensor1, TensorList tensor2, const Scalar& alpha) -FOREACH_BINARY_LIST_ALPHA_INPLACE_KERNEL(add); -FOREACH_BINARY_LIST_ALPHA_KERNEL(add); -FOREACH_BINARY_LIST_INPLACE_KERNEL(mul); -FOREACH_BINARY_LIST_KERNEL(mul); -FOREACH_BINARY_LIST_INPLACE_KERNEL(div); -FOREACH_BINARY_LIST_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_LIST_ALPHA_INPLACE_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_LIST_ALPHA_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_LIST_INPLACE_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_LIST_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_LIST_INPLACE_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_LIST_KERNEL(div); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h index 1f3ab4afc..4aac91b47 100644 --- a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h @@ -17,11 +17,11 @@ namespace at::native::xpu { std::vector FOREACH_BINARY_SCALAR_KERNEL_NAME(NAME)( \ TensorList tensors, const Scalar& scalar) -FOREACH_BINARY_SCALAR_INPLACE_KERNEL(add); -FOREACH_BINARY_SCALAR_KERNEL(add); -FOREACH_BINARY_SCALAR_INPLACE_KERNEL(mul); -FOREACH_BINARY_SCALAR_KERNEL(mul); -FOREACH_BINARY_SCALAR_INPLACE_KERNEL(div); -FOREACH_BINARY_SCALAR_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_SCALAR_INPLACE_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_SCALAR_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_SCALAR_INPLACE_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_SCALAR_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_SCALAR_INPLACE_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_SCALAR_KERNEL(div); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h index 331f5c0ba..a863983dd 100644 --- a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h +++ 
b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h @@ -17,11 +17,11 @@ namespace at::native::xpu { std::vector FOREACH_BINARY_SCALARLIST_KERNEL_NAME(NAME)( \ TensorList tensors, at::ArrayRef scalars) -FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(add); -FOREACH_BINARY_SCALARLIST_KERNEL(add); -FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(mul); -FOREACH_BINARY_SCALARLIST_KERNEL(mul); -FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(div); -FOREACH_BINARY_SCALARLIST_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_KERNEL(div); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h b/src/ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h index 89e3b0ea2..4f2eb478d 100644 --- a/src/ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h @@ -11,9 +11,9 @@ namespace at::native::xpu { #define FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(NAME) \ FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(NAME) -FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcmul); -FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcmul); -FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcdiv); -FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcdiv); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h b/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h index ed8b7f95b..cac1d235f 100644 --- a/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h @@ -17,9 +17,9 @@ namespace at::native::xpu { TensorList tensors2, \ const Scalar& scalar) -FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcmul); -FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcmul); -FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcdiv); -FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcdiv); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h b/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h index e1afdf6fb..69b89796d 100644 --- a/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h @@ -17,9 +17,9 @@ namespace at::native::xpu { TensorList tensors2, \ at::ArrayRef scalars) -FOREACH_POINTWISE_OP_SCALARLIST_KERNEL(addcmul); -FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(addcmul); -FOREACH_POINTWISE_OP_SCALARLIST_KERNEL(addcdiv); -FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALARLIST_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALARLIST_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(addcdiv); 
} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp index 849a77443..cc90fa893 100644 --- a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp +++ b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + enum class NormType { L1, L2 }; namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/ForeachReduceKernels.h b/src/ATen/native/xpu/sycl/ForeachReduceKernels.h index a5cd8b448..187246aca 100644 --- a/src/ATen/native/xpu/sycl/ForeachReduceKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachReduceKernels.h @@ -3,7 +3,7 @@ namespace at::native::xpu { -std::vector foreach_norm_kernel( +TORCH_XPU_API std::vector foreach_norm_kernel( TensorList tensors, const Scalar& ord, double p, diff --git a/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp b/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp index 297662928..44b11cfd9 100644 --- a/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp @@ -5,6 +5,9 @@ #include #include +#include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h b/src/ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h index f8a725a5d..67d90a9db 100644 --- a/src/ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h @@ -3,13 +3,13 @@ namespace at::native::xpu { -void foreach_lerp_list_kernel( +TORCH_XPU_API void foreach_lerp_list_kernel( TensorList tensors1, TensorList tensors2, TensorList tensors3, TensorList result); -void foreach_lerp_list_kernel_( +TORCH_XPU_API void foreach_lerp_list_kernel_( TensorList tensors1, TensorList tensors2, TensorList tensors3); diff --git a/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h b/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h index f6b8a71b7..2c52b99f4 100644 --- a/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h @@ -3,13 +3,13 @@ namespace at::native::xpu { -void foreach_lerp_scalar_kernel( +TORCH_XPU_API void foreach_lerp_scalar_kernel( TensorList tensors1, TensorList tensors2, const Scalar& weight, TensorList result); -void foreach_lerp_scalar_kernel_( +TORCH_XPU_API void foreach_lerp_scalar_kernel_( TensorList tensors1, TensorList tensors2, const Scalar& weight); diff --git a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp index 9b046472c..791345e54 100644 --- a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at::native::xpu { template class Op> diff --git a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h index 3c910b7df..7d7b79d5c 100644 --- a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h @@ -3,8 +3,8 @@ namespace at::native::xpu { -std::vector foreach_sqrt_kernel(TensorList tensors); +TORCH_XPU_API std::vector foreach_sqrt_kernel(TensorList tensors); -void foreach_sqrt_kernel_(TensorList tensors); +TORCH_XPU_API void foreach_sqrt_kernel_(TensorList tensors); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/GcdLcmKernels.cpp b/src/ATen/native/xpu/sycl/GcdLcmKernels.cpp index 2c257d4d2..a3b70383b 100644 --- 
a/src/ATen/native/xpu/sycl/GcdLcmKernels.cpp +++ b/src/ATen/native/xpu/sycl/GcdLcmKernels.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template @@ -20,4 +22,4 @@ void gcd_kernel(TensorIteratorBase& iter) { }); } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/GcdLcmKernels.h b/src/ATen/native/xpu/sycl/GcdLcmKernels.h index acf855d40..ba429d762 100644 --- a/src/ATen/native/xpu/sycl/GcdLcmKernels.h +++ b/src/ATen/native/xpu/sycl/GcdLcmKernels.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -void gcd_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void gcd_kernel(TensorIteratorBase& iter); } } // namespace native diff --git a/src/ATen/native/xpu/sycl/GridSampler.cpp b/src/ATen/native/xpu/sycl/GridSampler.cpp index 9427fd4ce..e0a51293f 100644 --- a/src/ATen/native/xpu/sycl/GridSampler.cpp +++ b/src/ATen/native/xpu/sycl/GridSampler.cpp @@ -14,6 +14,8 @@ #include #include +#include + namespace at::native::xpu { using namespace at::xpu::detail; diff --git a/src/ATen/native/xpu/sycl/GridSamplerKernels.h b/src/ATen/native/xpu/sycl/GridSamplerKernels.h index cc9ba4be4..28bfc6972 100644 --- a/src/ATen/native/xpu/sycl/GridSamplerKernels.h +++ b/src/ATen/native/xpu/sycl/GridSamplerKernels.h @@ -4,14 +4,14 @@ namespace at::native::xpu { -Tensor grid_sampler_2d_kernel( +TORCH_XPU_API Tensor grid_sampler_2d_kernel( const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners); -void grid_sampler_2d_backward_kernel( +TORCH_XPU_API void grid_sampler_2d_backward_kernel( const Tensor& grad_input, const Tensor& grad_grid, const Tensor& grad_output, @@ -22,4 +22,4 @@ void grid_sampler_2d_backward_kernel( bool align_corners, std::array output_mask); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp index 8dafdc8d2..66b3275e1 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp @@ -10,6 +10,8 @@ #include #include +#include + namespace at::native::xpu { template < diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.h b/src/ATen/native/xpu/sycl/GroupNormKernels.h index 97dac5332..3cc680c0b 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.h +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void group_norm_kernel( +TORCH_XPU_API void group_norm_kernel( const Tensor& X, const Tensor& gamma, const Tensor& beta, @@ -18,7 +18,7 @@ void group_norm_kernel( Tensor& rstd, ScalarType dtype); -void group_norm_backward_kernel( +TORCH_XPU_API void group_norm_backward_kernel( const Tensor& dY, const Tensor& X, const Tensor& mean, diff --git a/src/ATen/native/xpu/sycl/HistogramKernels.h b/src/ATen/native/xpu/sycl/HistogramKernels.h index f05e91e25..e51f0fd10 100644 --- a/src/ATen/native/xpu/sycl/HistogramKernels.h +++ b/src/ATen/native/xpu/sycl/HistogramKernels.h @@ -4,14 +4,14 @@ namespace at::native::xpu { -void histogramdd_kernel( +TORCH_XPU_API void histogramdd_kernel( const Tensor& self, const std::optional& weight, bool density, Tensor& hist, const TensorList& bin_edges_); -void histogramdd_linear_kernel( +TORCH_XPU_API void histogramdd_linear_kernel( const Tensor& self, const std::optional& weight, bool density, @@ -19,4 +19,4 @@ void histogramdd_linear_kernel( const TensorList& bin_edges_, bool 
local_search); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/HistogramddKernels.cpp b/src/ATen/native/xpu/sycl/HistogramddKernels.cpp index 8a8a9beaf..4ffa24e48 100644 --- a/src/ATen/native/xpu/sycl/HistogramddKernels.cpp +++ b/src/ATen/native/xpu/sycl/HistogramddKernels.cpp @@ -17,6 +17,8 @@ #include #endif +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/Im2ColKernel.cpp b/src/ATen/native/xpu/sycl/Im2ColKernel.cpp index aa511e6df..996a85126 100644 --- a/src/ATen/native/xpu/sycl/Im2ColKernel.cpp +++ b/src/ATen/native/xpu/sycl/Im2ColKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/Im2ColKernel.h b/src/ATen/native/xpu/sycl/Im2ColKernel.h index e18fa063b..4b7090231 100644 --- a/src/ATen/native/xpu/sycl/Im2ColKernel.h +++ b/src/ATen/native/xpu/sycl/Im2ColKernel.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void im2col_kernel( +TORCH_XPU_API void im2col_kernel( Tensor& output, const Tensor& input_, IntArrayRef kernel_size, diff --git a/src/ATen/native/xpu/sycl/Indexing.cpp b/src/ATen/native/xpu/sycl/Indexing.cpp index fcee372d1..e9b2a0b00 100644 --- a/src/ATen/native/xpu/sycl/Indexing.cpp +++ b/src/ATen/native/xpu/sycl/Indexing.cpp @@ -19,6 +19,8 @@ #include #include +#include + using namespace at::xpu::detail; using namespace at::xpu; diff --git a/src/ATen/native/xpu/sycl/IndexingKernels.h b/src/ATen/native/xpu/sycl/IndexingKernels.h index 8f32f49f9..288cd5391 100644 --- a/src/ATen/native/xpu/sycl/IndexingKernels.h +++ b/src/ATen/native/xpu/sycl/IndexingKernels.h @@ -3,22 +3,24 @@ namespace at::native::xpu { -void index_kernel( +TORCH_XPU_API void index_kernel( TensorIterator& iter, at::IntArrayRef index_size, at::IntArrayRef index_stride, at::IntArrayRef non_index_size, at::IntArrayRef non_index_stride); -void index_select_kernel( +TORCH_XPU_API void index_select_kernel( const Tensor& self, int64_t dim, const Tensor& index, const Tensor& out); -void masked_fill_kernel(TensorIterator& iter, const Scalar& value); +TORCH_XPU_API void masked_fill_kernel( + TensorIterator& iter, + const Scalar& value); -void index_add_kernel( +TORCH_XPU_API void index_add_kernel( const Tensor& self, int64_t dim, const Tensor& index, @@ -26,13 +28,13 @@ void index_add_kernel( const Scalar& alpha, const Tensor& out); -void index_fill_kernel( +TORCH_XPU_API void index_fill_kernel( Tensor& self, int64_t dim, const Tensor& index, const Scalar& source); -void index_put_kernel( +TORCH_XPU_API void index_put_kernel( TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, @@ -40,14 +42,14 @@ void index_put_kernel( IntArrayRef non_index_stride, bool accumulate); -void index_put_deterministic_kernel( +TORCH_XPU_API void index_put_deterministic_kernel( Tensor& self, const c10::List>& indices, const Tensor& value, bool accumulate, bool unsafe); -void masked_scatter_kernel( +TORCH_XPU_API void masked_scatter_kernel( const TensorBase& self, const TensorBase& mask, const TensorBase& maskPrefixSum, diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp index b21ec1589..60d40b71d 100644 --- a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.h 
b/src/ATen/native/xpu/sycl/LayerNormKernels.h index 0797d50fd..0c57a61ba 100644 --- a/src/ATen/native/xpu/sycl/LayerNormKernels.h +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -std::tuple layer_norm_kernel( +TORCH_XPU_API std::tuple layer_norm_kernel( const Tensor& X, const Tensor& gamma, const Tensor& beta, @@ -17,7 +17,7 @@ std::tuple layer_norm_kernel( Tensor& mean, Tensor& rstd); -std::tuple layer_norm_backward_kernel( +TORCH_XPU_API std::tuple layer_norm_backward_kernel( const Tensor& dY, const Tensor& X, const Tensor& mean, diff --git a/src/ATen/native/xpu/sycl/LerpKernels.cpp b/src/ATen/native/xpu/sycl/LerpKernels.cpp index b0f480ac3..1648f193b 100644 --- a/src/ATen/native/xpu/sycl/LerpKernels.cpp +++ b/src/ATen/native/xpu/sycl/LerpKernels.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/LerpKernels.h b/src/ATen/native/xpu/sycl/LerpKernels.h index c455adee8..6f13ec6eb 100644 --- a/src/ATen/native/xpu/sycl/LerpKernels.h +++ b/src/ATen/native/xpu/sycl/LerpKernels.h @@ -4,8 +4,10 @@ namespace at::native::xpu { -void lerp_tensor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void lerp_tensor_kernel(TensorIteratorBase& iter); -void lerp_scalar_kernel(TensorIteratorBase& iter, const c10::Scalar& weight); +TORCH_XPU_API void lerp_scalar_kernel( + TensorIteratorBase& iter, + const c10::Scalar& weight); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp index 8d3128e9e..0f478eceb 100644 --- a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp +++ b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp @@ -1,6 +1,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h index 32f987a2e..ca61016b9 100644 --- a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h +++ b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h @@ -3,6 +3,9 @@ namespace at::native::xpu { -void addr_kernel(TensorIterator& iter, const Scalar& beta, const Scalar& alpha); +TORCH_XPU_API void addr_kernel( + TensorIterator& iter, + const Scalar& beta, + const Scalar& alpha); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LogAddExpKernels.cpp b/src/ATen/native/xpu/sycl/LogAddExpKernels.cpp index 69b6a73a1..d79964295 100644 --- a/src/ATen/native/xpu/sycl/LogAddExpKernels.cpp +++ b/src/ATen/native/xpu/sycl/LogAddExpKernels.cpp @@ -5,6 +5,9 @@ #include #include #include + +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/LogAddExpKernels.h b/src/ATen/native/xpu/sycl/LogAddExpKernels.h index 01b9f400f..3a5ae4c34 100644 --- a/src/ATen/native/xpu/sycl/LogAddExpKernels.h +++ b/src/ATen/native/xpu/sycl/LogAddExpKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void logaddexp_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logaddexp_kernel(TensorIteratorBase& iter); -void logaddexp2_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logaddexp2_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LossKernels.cpp b/src/ATen/native/xpu/sycl/LossKernels.cpp index f7c260535..69315adf5 100644 --- a/src/ATen/native/xpu/sycl/LossKernels.cpp +++ b/src/ATen/native/xpu/sycl/LossKernels.cpp @@ -4,6 +4,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git 
a/src/ATen/native/xpu/sycl/LossKernels.h b/src/ATen/native/xpu/sycl/LossKernels.h index 17f61afbb..560eb2c44 100644 --- a/src/ATen/native/xpu/sycl/LossKernels.h +++ b/src/ATen/native/xpu/sycl/LossKernels.h @@ -3,14 +3,14 @@ namespace at::native::xpu { -Tensor& binary_cross_entropy_kernel( +TORCH_XPU_API Tensor& binary_cross_entropy_kernel( const Tensor& input, const Tensor& target, const Tensor& weight, int64_t reduction, Tensor& loss); -Tensor& binary_cross_entropy_backward_kernel( +TORCH_XPU_API Tensor& binary_cross_entropy_backward_kernel( const Tensor& grad, const Tensor& input, const Tensor& target, diff --git a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp index c84613a4f..4b93cb3c3 100644 --- a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp @@ -11,6 +11,8 @@ #include #include +#include + namespace at::native::xpu { inline Tensor optional_contiguous(const Tensor& source) { return source.defined() ? source.contiguous() : source; @@ -251,7 +253,15 @@ void nll_loss2d_forward_kernel( at::ScalarType::BFloat16, input.scalar_type(), "nll_loss2d_forward_kernel", - [&] { + [&input_, + &weight_, + &target_, + &output, + &total_weight, + &input, + &target, + &reduction, + &ignore_index] { using accscalar_t = acc_type_device; AT_DISPATCH_INDEX_TYPES( at::native::canUse32BitIndexMath(input_, INT_MAX) @@ -527,4 +537,4 @@ void nll_loss2d_backward_kernel( } // namespace at::native::xpu #pragma GCC diagnostic pop -#pragma clang diagnostic pop \ No newline at end of file +#pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/LossNLL2dKernels.h b/src/ATen/native/xpu/sycl/LossNLL2dKernels.h index cb2084649..87519b5e7 100644 --- a/src/ATen/native/xpu/sycl/LossNLL2dKernels.h +++ b/src/ATen/native/xpu/sycl/LossNLL2dKernels.h @@ -3,7 +3,8 @@ #include namespace at::native::xpu { -void nll_loss2d_forward_kernel( + +TORCH_XPU_API void nll_loss2d_forward_kernel( Tensor& output, Tensor& total_weight, const Tensor& input, @@ -12,7 +13,7 @@ void nll_loss2d_forward_kernel( int64_t reduction, int64_t ignore_index); -void nll_loss2d_backward_kernel( +TORCH_XPU_API void nll_loss2d_backward_kernel( Tensor& grad_input, const Tensor& grad_output, const Tensor& input, @@ -21,4 +22,5 @@ void nll_loss2d_backward_kernel( int64_t reduction, int64_t ignore_index, const Tensor& total_weight); -} // namespace at::native::xpu \ No newline at end of file + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LossNLLKernel.cpp b/src/ATen/native/xpu/sycl/LossNLLKernel.cpp index f3d6c1f94..2f8e2a790 100644 --- a/src/ATen/native/xpu/sycl/LossNLLKernel.cpp +++ b/src/ATen/native/xpu/sycl/LossNLLKernel.cpp @@ -4,6 +4,8 @@ #include #include +#include + namespace at::native::xpu { using namespace at::xpu; diff --git a/src/ATen/native/xpu/sycl/LossNLLKernel.h b/src/ATen/native/xpu/sycl/LossNLLKernel.h index 301cf53bf..fdbd7a29c 100644 --- a/src/ATen/native/xpu/sycl/LossNLLKernel.h +++ b/src/ATen/native/xpu/sycl/LossNLLKernel.h @@ -3,7 +3,7 @@ namespace at::native::xpu { -std::tuple nll_loss_forward_kernel( +TORCH_XPU_API std::tuple nll_loss_forward_kernel( const Tensor& self, const Tensor& target, const OptionalTensorRef weight_opt, @@ -12,7 +12,7 @@ std::tuple nll_loss_forward_kernel( Tensor& output, Tensor& total_weight); -Tensor& nll_loss_backward_kernel( +TORCH_XPU_API Tensor& nll_loss_backward_kernel( const Tensor& grad_output, const Tensor& self, const Tensor& target, @@ -22,4 +22,4 @@ Tensor& 
nll_loss_backward_kernel( const Tensor& total_weight, Tensor& grad_input); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/Math.h b/src/ATen/native/xpu/sycl/MathExtensions.h similarity index 100% rename from src/ATen/native/xpu/sycl/Math.h rename to src/ATen/native/xpu/sycl/MathExtensions.h diff --git a/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.cpp b/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.cpp index 0d38a4882..6d535a40d 100644 --- a/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.cpp +++ b/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.h b/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.h index 063a086cd..b54c69717 100644 --- a/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.h +++ b/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void maximum_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void maximum_kernel(TensorIteratorBase& iter); -void minimum_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void minimum_kernel(TensorIteratorBase& iter); -void fmax_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void fmax_kernel(TensorIteratorBase& iter); -void fmin_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void fmin_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/MultinomialKernel.cpp b/src/ATen/native/xpu/sycl/MultinomialKernel.cpp index 866f62a1a..d8418fef9 100644 --- a/src/ATen/native/xpu/sycl/MultinomialKernel.cpp +++ b/src/ATen/native/xpu/sycl/MultinomialKernel.cpp @@ -15,6 +15,9 @@ #include #include #include + +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/MultinomialKernel.h b/src/ATen/native/xpu/sycl/MultinomialKernel.h index d400e51cd..5adede6f6 100644 --- a/src/ATen/native/xpu/sycl/MultinomialKernel.h +++ b/src/ATen/native/xpu/sycl/MultinomialKernel.h @@ -3,10 +3,10 @@ namespace at::native::xpu { -void multinomial_kernel( +TORCH_XPU_API void multinomial_kernel( Tensor& result, const Tensor& self, const int64_t n_sample, c10::optional generator); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/NMSKernel.cpp b/src/ATen/native/xpu/sycl/NMSKernel.cpp index 3bf712582..78e885307 100644 --- a/src/ATen/native/xpu/sycl/NMSKernel.cpp +++ b/src/ATen/native/xpu/sycl/NMSKernel.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/NMSKernel.h b/src/ATen/native/xpu/sycl/NMSKernel.h index 17aec3f7b..198223731 100644 --- a/src/ATen/native/xpu/sycl/NMSKernel.h +++ b/src/ATen/native/xpu/sycl/NMSKernel.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -Tensor nms_kernel(const Tensor& dets_sorted, float iou_threshold); +TORCH_XPU_API Tensor nms_kernel(const Tensor& dets_sorted, float iou_threshold); } } // namespace native diff --git a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp index e1c50a263..0cb77d52f 100644 --- a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp +++ b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { struct FlattenIdxtoRealIdxKernelFunctor { diff --git a/src/ATen/native/xpu/sycl/NonzeroKernel.h 
b/src/ATen/native/xpu/sycl/NonzeroKernel.h index 1306dfa28..de92eeddb 100644 --- a/src/ATen/native/xpu/sycl/NonzeroKernel.h +++ b/src/ATen/native/xpu/sycl/NonzeroKernel.h @@ -3,6 +3,6 @@ namespace at::native::xpu { -void nonzero_kernel(const Tensor& self, Tensor& out); +TORCH_XPU_API void nonzero_kernel(const Tensor& self, Tensor& out); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp index 822a83e99..76e8469eb 100644 --- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h index 613c3cca6..f4572ae70 100644 --- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h +++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h @@ -4,15 +4,20 @@ namespace at::native::xpu { -void addcmul_kernel(TensorIterator& iter, Scalar value); +TORCH_XPU_API void addcmul_kernel(TensorIterator& iter, Scalar value); -void addcdiv_kernel(TensorIterator& iter, Scalar value); +TORCH_XPU_API void addcdiv_kernel(TensorIterator& iter, Scalar value); -void mse_backward_kernel(TensorIterator& iter, const Scalar& value); +TORCH_XPU_API void mse_backward_kernel( + TensorIterator& iter, + const Scalar& value); -void smooth_l1_backward_kernel(TensorIterator& iter, Scalar norm, double beta); +TORCH_XPU_API void smooth_l1_backward_kernel( + TensorIterator& iter, + Scalar norm, + double beta); -void huber_backward_kernel( +TORCH_XPU_API void huber_backward_kernel( TensorIterator& iter, const Scalar& norm, double delta); diff --git a/src/ATen/native/xpu/sycl/PowKernels.cpp b/src/ATen/native/xpu/sycl/PowKernels.cpp index 5b6769d1f..c0fad9f27 100644 --- a/src/ATen/native/xpu/sycl/PowKernels.cpp +++ b/src/ATen/native/xpu/sycl/PowKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/PowKernels.h b/src/ATen/native/xpu/sycl/PowKernels.h index 31a31c420..d21d0d1a7 100644 --- a/src/ATen/native/xpu/sycl/PowKernels.h +++ b/src/ATen/native/xpu/sycl/PowKernels.h @@ -6,11 +6,11 @@ namespace at { namespace native { namespace xpu { -void pow_tensor_scalar_kernel( +TORCH_XPU_API void pow_tensor_scalar_kernel( TensorIteratorBase& iter, const Scalar& exp_scalar); -void pow_tensor_tensor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void pow_tensor_tensor_kernel(TensorIteratorBase& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/RandpermKernel.cpp b/src/ATen/native/xpu/sycl/RandpermKernel.cpp index d151de28c..d30fa3007 100644 --- a/src/ATen/native/xpu/sycl/RandpermKernel.cpp +++ b/src/ATen/native/xpu/sycl/RandpermKernel.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/RandpermKernel.h b/src/ATen/native/xpu/sycl/RandpermKernel.h index 3a25c311f..2fa5f8e4c 100644 --- a/src/ATen/native/xpu/sycl/RandpermKernel.h +++ b/src/ATen/native/xpu/sycl/RandpermKernel.h @@ -2,8 +2,8 @@ #include namespace at::native::xpu { -Tensor randperm_kernel( - Tensor& result, - int64_t n, - c10::optional generator); -} // namespace at::native::xpu \ No newline at end of file + +TORCH_XPU_API Tensor +randperm_kernel(Tensor& result, int64_t n, c10::optional generator); + +} // namespace at::native::xpu diff --git 
a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp index 54b6c6328..f50ce1abf 100644 --- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp +++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h index 3cf08ca5d..71dc4f5fb 100644 --- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h +++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h @@ -6,13 +6,13 @@ namespace at { namespace native { namespace xpu { -Tensor& arange_kernel( +TORCH_XPU_API Tensor& arange_kernel( const Scalar& start, const Scalar& end, const Scalar& step, Tensor& result); -Tensor& range_kernel( +TORCH_XPU_API Tensor& range_kernel( const Scalar& start, const Scalar& end, const Scalar& step, diff --git a/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp b/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp index 121761053..a2df2bd63 100644 --- a/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp b/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp index 1e18b2e5b..c522dbe7c 100644 --- a/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp b/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp index 3c9f8453d..60f92ef3b 100644 --- a/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ReduceLogicKernels.cpp b/src/ATen/native/xpu/sycl/ReduceLogicKernels.cpp index 8eabf3353..ec753c66a 100644 --- a/src/ATen/native/xpu/sycl/ReduceLogicKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceLogicKernels.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp index 16095056c..3dfbb05c2 100644 --- a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.h b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.h index 6503f8842..9a669ec7a 100644 --- a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.h +++ b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void max_values_kernel(TensorIterator& iter); +TORCH_XPU_API void max_values_kernel(TensorIterator& iter); -void max_kernel(TensorIterator& iter); +TORCH_XPU_API void max_kernel(TensorIterator& iter); -void max_all_kernel(TensorIterator& iter); +TORCH_XPU_API void max_all_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp index 2a0ce889c..f73df51a0 100644 --- a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp +++ 
b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.h b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.h index d5600f5d4..65f3acf8e 100644 --- a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.h +++ b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void min_values_kernel(TensorIterator& iter); +TORCH_XPU_API void min_values_kernel(TensorIterator& iter); -void min_kernel(TensorIterator& iter); +TORCH_XPU_API void min_kernel(TensorIterator& iter); -void min_all_kernel(TensorIterator& iter); +TORCH_XPU_API void min_all_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp b/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp index 6d0e75680..7e24f7ecc 100644 --- a/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp b/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp index 658f2e21b..565f5868e 100644 --- a/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp @@ -1,7 +1,9 @@ #include -#include #include +#include + +#include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceNormKernel.h b/src/ATen/native/xpu/sycl/ReduceNormKernel.h index 5e255dd5b..bc14826ec 100644 --- a/src/ATen/native/xpu/sycl/ReduceNormKernel.h +++ b/src/ATen/native/xpu/sycl/ReduceNormKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void norm_kernel(TensorIterator& iter, const Scalar& val); +TORCH_XPU_API void norm_kernel(TensorIterator& iter, const Scalar& val); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReduceOpsKernels.h b/src/ATen/native/xpu/sycl/ReduceOpsKernels.h index d85754efb..8b03fb137 100644 --- a/src/ATen/native/xpu/sycl/ReduceOpsKernels.h +++ b/src/ATen/native/xpu/sycl/ReduceOpsKernels.h @@ -4,26 +4,29 @@ namespace at::native::xpu { -void argmax_kernel(TensorIterator& iter); +TORCH_XPU_API void argmax_kernel(TensorIterator& iter); -void argmin_kernel(TensorIterator& iter); +TORCH_XPU_API void argmin_kernel(TensorIterator& iter); -void and_kernel(TensorIterator& iter); +TORCH_XPU_API void and_kernel(TensorIterator& iter); -void or_kernel(TensorIterator& iter); +TORCH_XPU_API void or_kernel(TensorIterator& iter); -void mean_kernel(TensorIterator& iter); +TORCH_XPU_API void mean_kernel(TensorIterator& iter); -void sum_kernel(TensorIterator& iter); +TORCH_XPU_API void sum_kernel(TensorIterator& iter); -void prod_kernel(TensorIterator& iter); +TORCH_XPU_API void prod_kernel(TensorIterator& iter); -void nansum_kernel(TensorIterator& iter); +TORCH_XPU_API void nansum_kernel(TensorIterator& iter); -void std_var_kernel(TensorIterator& iter, double correction, bool take_sqrt); +TORCH_XPU_API void std_var_kernel( + TensorIterator& iter, + double correction, + bool take_sqrt); -void aminmax_kernel(TensorIterator& iter); +TORCH_XPU_API void aminmax_kernel(TensorIterator& iter); -void aminmax_allreduce_kernel(TensorIterator& iter); +TORCH_XPU_API void aminmax_allreduce_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp index 
f250b34ad..7bdc3a188 100644 --- a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp b/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp index 1627a5def..efddb53e6 100644 --- a/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp @@ -14,6 +14,8 @@ #include #include +#include + namespace at::native::xpu { inline std::pair get_index_mapping1d( @@ -874,4 +876,4 @@ void reflection_pad3d_backward_kernel( } // namespace at::native::xpu #pragma GCC diagnostic pop -#pragma clang diagnostic pop \ No newline at end of file +#pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/ReflectionPadKernels.h b/src/ATen/native/xpu/sycl/ReflectionPadKernels.h index a21f6c8ee..093c5d9b5 100644 --- a/src/ATen/native/xpu/sycl/ReflectionPadKernels.h +++ b/src/ATen/native/xpu/sycl/ReflectionPadKernels.h @@ -4,37 +4,37 @@ namespace at::native::xpu { -void reflection_pad1d_kernel( +TORCH_XPU_API void reflection_pad1d_kernel( Tensor& output, const Tensor& input_, IntArrayRef padding); -void reflection_pad1d_backward_kernel( +TORCH_XPU_API void reflection_pad1d_backward_kernel( Tensor& grad_input, const Tensor& grad_output_, const Tensor& input, IntArrayRef padding); -void reflection_pad2d_kernel( +TORCH_XPU_API void reflection_pad2d_kernel( Tensor& output, const Tensor& input_, IntArrayRef padding); -void reflection_pad2d_backward_kernel( +TORCH_XPU_API void reflection_pad2d_backward_kernel( Tensor& grad_input, const Tensor& grad_output_, const Tensor& input, IntArrayRef padding); -void reflection_pad3d_kernel( +TORCH_XPU_API void reflection_pad3d_kernel( Tensor& output, const Tensor& input_, IntArrayRef padding); -void reflection_pad3d_backward_kernel( +TORCH_XPU_API void reflection_pad3d_backward_kernel( Tensor& grad_input, const Tensor& grad_output, const Tensor& input, IntArrayRef padding); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/RenormKernel.cpp b/src/ATen/native/xpu/sycl/RenormKernel.cpp index 0a1696d73..0f08a6566 100644 --- a/src/ATen/native/xpu/sycl/RenormKernel.cpp +++ b/src/ATen/native/xpu/sycl/RenormKernel.cpp @@ -2,6 +2,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/RenormKernel.h b/src/ATen/native/xpu/sycl/RenormKernel.h index a619cf708..488e8365a 100644 --- a/src/ATen/native/xpu/sycl/RenormKernel.h +++ b/src/ATen/native/xpu/sycl/RenormKernel.h @@ -3,6 +3,8 @@ namespace at::native::xpu { -void renorm_scale_factor_kernel(TensorIteratorBase& iter, double maxnorm); +TORCH_XPU_API void renorm_scale_factor_kernel( + TensorIteratorBase& iter, + double maxnorm); } diff --git a/src/ATen/native/xpu/sycl/RepeatKernel.cpp b/src/ATen/native/xpu/sycl/RepeatKernel.cpp index 0b10f982a..f1eeb426b 100644 --- a/src/ATen/native/xpu/sycl/RepeatKernel.cpp +++ b/src/ATen/native/xpu/sycl/RepeatKernel.cpp @@ -2,6 +2,9 @@ #include #include #include + +#include + namespace at::native::xpu { template struct RepeatInterleaveKernelFunctor { @@ -72,4 +75,4 @@ Tensor repeat_interleave_kernel( }); return output; } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/RepeatKernel.h b/src/ATen/native/xpu/sycl/RepeatKernel.h index 
3df41a956..9856c4305 100644 --- a/src/ATen/native/xpu/sycl/RepeatKernel.h +++ b/src/ATen/native/xpu/sycl/RepeatKernel.h @@ -2,8 +2,8 @@ #include namespace at::native::xpu { -Tensor repeat_interleave_kernel( +TORCH_XPU_API Tensor repeat_interleave_kernel( const Tensor& repeats, c10::optional output_size); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp index 4ba4eafb9..2371ad695 100644 --- a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp @@ -13,6 +13,8 @@ #include #include +#include + namespace at::native::xpu { inline int imin(int a, int b) { @@ -763,4 +765,4 @@ void replication_pad3d_backward_kernel( } // namespace at::native::xpu #pragma GCC diagnostic pop -#pragma clang diagnostic pop \ No newline at end of file +#pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h index 85d19d5bf..0269b962e 100644 --- a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h +++ b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h @@ -4,37 +4,37 @@ namespace at::native::xpu { -void replication_pad1d_kernel( +TORCH_XPU_API void replication_pad1d_kernel( Tensor& output, const Tensor& input, IntArrayRef padding); -void replication_pad1d_backward_kernel( +TORCH_XPU_API void replication_pad1d_backward_kernel( Tensor& grad_input, const Tensor& grad_output, const Tensor& input, IntArrayRef padding); -void replication_pad2d_kernel( +TORCH_XPU_API void replication_pad2d_kernel( Tensor& output, const Tensor& input, IntArrayRef padding); -void replication_pad2d_backward_kernel( +TORCH_XPU_API void replication_pad2d_backward_kernel( Tensor& grad_input, const Tensor& grad_output, const Tensor& input, IntArrayRef padding); -void replication_pad3d_kernel( +TORCH_XPU_API void replication_pad3d_kernel( Tensor& output, const Tensor& input, IntArrayRef padding); -void replication_pad3d_backward_kernel( +TORCH_XPU_API void replication_pad3d_backward_kernel( Tensor& grad_input, const Tensor& grad_output, const Tensor& input, IntArrayRef padding); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ResizeKernel.cpp b/src/ATen/native/xpu/sycl/ResizeKernel.cpp index 195d526d8..0835b53ff 100644 --- a/src/ATen/native/xpu/sycl/ResizeKernel.cpp +++ b/src/ATen/native/xpu/sycl/ResizeKernel.cpp @@ -3,6 +3,8 @@ #include #include +#include + namespace at::native::xpu { void resize_bytes_xpu(StorageImpl* storage, size_t size_bytes) { @@ -58,7 +60,7 @@ TensorImpl* resize_impl_xpu_( TensorImpl* self, IntArrayRef size, at::OptionalIntArrayRef stride, - bool device_guard = true) { + bool device_guard) { if (self->sizes() == size && (!stride || self->strides() == stride)) { return self; } diff --git a/src/ATen/native/xpu/sycl/ResizeKernel.h b/src/ATen/native/xpu/sycl/ResizeKernel.h index 5cef196ed..c5c113194 100644 --- a/src/ATen/native/xpu/sycl/ResizeKernel.h +++ b/src/ATen/native/xpu/sycl/ResizeKernel.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -TensorImpl* resize_impl_xpu_( +TORCH_XPU_API TensorImpl* resize_impl_xpu_( TensorImpl* self, IntArrayRef size, at::OptionalIntArrayRef stride, diff --git a/src/ATen/native/xpu/sycl/ScanKernels.h b/src/ATen/native/xpu/sycl/ScanKernels.h index af632fa83..ea20e4460 100644 --- 
a/src/ATen/native/xpu/sycl/ScanKernels.h +++ b/src/ATen/native/xpu/sycl/ScanKernels.h @@ -3,8 +3,14 @@ namespace at::native::xpu { -void cumsum_kernel(const Tensor& result, const Tensor& self, int64_t dim); +TORCH_XPU_API void cumsum_kernel( + const Tensor& result, + const Tensor& self, + int64_t dim); -void cumprod_kernel(const Tensor& result, const Tensor& self, int64_t dim); +TORCH_XPU_API void cumprod_kernel( + const Tensor& result, + const Tensor& self, + int64_t dim); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ScatterGatherKernels.cpp b/src/ATen/native/xpu/sycl/ScatterGatherKernels.cpp index 81d4b9e9a..597be8553 100644 --- a/src/ATen/native/xpu/sycl/ScatterGatherKernels.cpp +++ b/src/ATen/native/xpu/sycl/ScatterGatherKernels.cpp @@ -14,6 +14,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ScatterGatherKernels.h b/src/ATen/native/xpu/sycl/ScatterGatherKernels.h index 0f8b7e5be..4d7d375b7 100644 --- a/src/ATen/native/xpu/sycl/ScatterGatherKernels.h +++ b/src/ATen/native/xpu/sycl/ScatterGatherKernels.h @@ -6,45 +6,45 @@ namespace at { namespace native { namespace xpu { -void gather_kernel( +TORCH_XPU_API void gather_kernel( const Tensor& result, const Tensor& self, int64_t dim, const Tensor& index); -void scatter_kernel( +TORCH_XPU_API void scatter_kernel( const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src); -void scatter_fill_kernel( +TORCH_XPU_API void scatter_fill_kernel( const Tensor& self, int64_t dim, const Tensor& index, const Scalar& src); -void scatter_add_kernel( +TORCH_XPU_API void scatter_add_kernel( const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src); -void scatter_reduce_kernel( +TORCH_XPU_API void scatter_reduce_kernel( const Tensor& self, const int64_t dim, const Tensor& index, const Tensor& src, const ReductionType& reduce); -void scatter_reduce_two_kernel( +TORCH_XPU_API void scatter_reduce_two_kernel( const Tensor& self, const int64_t dim, const Tensor& index, const Tensor& src, const ReductionType& reduce); -void scatter_scalar_reduce_kernel( +TORCH_XPU_API void scatter_scalar_reduce_kernel( const Tensor& self, const int64_t dim, const Tensor& index, diff --git a/src/ATen/native/xpu/sycl/Shape.cpp b/src/ATen/native/xpu/sycl/Shape.cpp index 1c9c1b9d5..98089995c 100644 --- a/src/ATen/native/xpu/sycl/Shape.cpp +++ b/src/ATen/native/xpu/sycl/Shape.cpp @@ -8,6 +8,8 @@ #include #include +#include + namespace at::native::xpu { // The best performance is achieved for parallel computing with 1024 batch sizes diff --git a/src/ATen/native/xpu/sycl/ShapeKernels.h b/src/ATen/native/xpu/sycl/ShapeKernels.h index 494f37020..8f758bfb2 100644 --- a/src/ATen/native/xpu/sycl/ShapeKernels.h +++ b/src/ATen/native/xpu/sycl/ShapeKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void cat_out_kernel( +TORCH_XPU_API void cat_out_kernel( const ITensorListRef& tensors, int64_t dim, int64_t valid, diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index eb72795fc..7c0dd11db 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -8,6 +8,8 @@ #include #include +#include + using namespace xpu::sycl; namespace at { diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.h b/src/ATen/native/xpu/sycl/SoftMaxKernels.h index 7ed23aeb2..66c4c3228 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.h +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.h @@ -6,26 
+6,26 @@ namespace at { namespace native { namespace xpu { -Tensor& _softmax_kernel( +TORCH_XPU_API Tensor& _softmax_kernel( const Tensor& input, const int64_t dim, const bool half_to_float, Tensor& output); -Tensor& _log_softmax_kernel( +TORCH_XPU_API Tensor& _log_softmax_kernel( const Tensor& input, const int64_t dim, const bool half_to_float, Tensor& output); -Tensor& _softmax_backward_kernel( +TORCH_XPU_API Tensor& _softmax_backward_kernel( const Tensor& grad, const Tensor& output, int64_t dim, bool half_to_float, Tensor& grad_input); -Tensor& _log_softmax_backward_kernel( +TORCH_XPU_API Tensor& _log_softmax_backward_kernel( const Tensor& grad, const Tensor& output, int64_t dim, diff --git a/src/ATen/native/xpu/sycl/Sorting.cpp b/src/ATen/native/xpu/sycl/Sorting.cpp index 6e3a4f49e..a4cf160a2 100644 --- a/src/ATen/native/xpu/sycl/Sorting.cpp +++ b/src/ATen/native/xpu/sycl/Sorting.cpp @@ -15,6 +15,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/Sorting.h b/src/ATen/native/xpu/sycl/Sorting.h index 6abf7a475..813fe9ed3 100644 --- a/src/ATen/native/xpu/sycl/Sorting.h +++ b/src/ATen/native/xpu/sycl/Sorting.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -std::tuple sort_stable_kernel( +TORCH_XPU_API std::tuple sort_stable_kernel( const Tensor& self, c10::optional stable, Tensor& values, @@ -12,7 +12,7 @@ std::tuple sort_stable_kernel( int dim, bool descending); -void launch_median_kernel( +TORCH_XPU_API void launch_median_kernel( const TensorBase& vals, const TensorBase& inds, const TensorBase& self, diff --git a/src/ATen/native/xpu/sycl/StepKernels.cpp b/src/ATen/native/xpu/sycl/StepKernels.cpp index d3d76926b..34f703591 100644 --- a/src/ATen/native/xpu/sycl/StepKernels.cpp +++ b/src/ATen/native/xpu/sycl/StepKernels.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/StepKernels.h b/src/ATen/native/xpu/sycl/StepKernels.h index 90f949d9a..c026a6b65 100644 --- a/src/ATen/native/xpu/sycl/StepKernels.h +++ b/src/ATen/native/xpu/sycl/StepKernels.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void nextafter_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void nextafter_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp b/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp index 265170821..6e88a987f 100644 --- a/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp @@ -10,6 +10,8 @@ #include #include +#include + namespace at::native::xpu { using namespace at::native; using namespace at::xpu::detail; diff --git a/src/ATen/native/xpu/sycl/SummaryOpsKernels.h b/src/ATen/native/xpu/sycl/SummaryOpsKernels.h index becf3ac54..1a6465ab6 100644 --- a/src/ATen/native/xpu/sycl/SummaryOpsKernels.h +++ b/src/ATen/native/xpu/sycl/SummaryOpsKernels.h @@ -3,9 +3,7 @@ namespace at::native::xpu { -Tensor bincount_kernel( - const Tensor& self, - const Tensor& weights, - int64_t minlength); +TORCH_XPU_API Tensor +bincount_kernel(const Tensor& self, const Tensor& weights, int64_t minlength); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp b/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp index 8159db75f..1d2d7f772 100644 --- a/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp +++ b/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp @@ -6,6 +6,8 @@ #include +#include + namespace at { namespace native { namespace xpu { diff --git 
a/src/ATen/native/xpu/sycl/TensorCompareKernels.h b/src/ATen/native/xpu/sycl/TensorCompareKernels.h index 51cb74a40..0e83f33e9 100644 --- a/src/ATen/native/xpu/sycl/TensorCompareKernels.h +++ b/src/ATen/native/xpu/sycl/TensorCompareKernels.h @@ -4,20 +4,24 @@ namespace at::native::xpu { -void where_kernel(TensorIterator& iter); +TORCH_XPU_API void where_kernel(TensorIterator& iter); -void clamp_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void clamp_kernel(TensorIteratorBase& iter); -void clamp_scalar_kernel( +TORCH_XPU_API void clamp_scalar_kernel( TensorIteratorBase& iter, const Scalar& min, const Scalar& max); -void clamp_min_scalar_kernel(TensorIteratorBase& iter, Scalar min); +TORCH_XPU_API void clamp_min_scalar_kernel( + TensorIteratorBase& iter, + Scalar min); -void clamp_max_scalar_kernel(TensorIteratorBase& iter, Scalar max); +TORCH_XPU_API void clamp_max_scalar_kernel( + TensorIteratorBase& iter, + Scalar max); -void isin_kernel( +TORCH_XPU_API void isin_kernel( const Tensor& elements, const Tensor& test_elements, bool invert, diff --git a/src/ATen/native/xpu/sycl/TensorTopKKernel.cpp b/src/ATen/native/xpu/sycl/TensorTopKKernel.cpp index 2de5fbb8b..d2ec40bb9 100644 --- a/src/ATen/native/xpu/sycl/TensorTopKKernel.cpp +++ b/src/ATen/native/xpu/sycl/TensorTopKKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/TensorTopKKernel.h b/src/ATen/native/xpu/sycl/TensorTopKKernel.h index cb5c5a654..44db5de70 100644 --- a/src/ATen/native/xpu/sycl/TensorTopKKernel.h +++ b/src/ATen/native/xpu/sycl/TensorTopKKernel.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -std::tuple topk_kernel( +TORCH_XPU_API std::tuple topk_kernel( const at::Tensor& input, int64_t k, int64_t dim, diff --git a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp index ecf832de3..4a1ddcb74 100644 --- a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp +++ b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + #ifdef _WIN32 #define RESTRICT __restrict #else diff --git a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.h b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.h index b43bd52a8..b5fdbb661 100644 --- a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.h +++ b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.h @@ -4,9 +4,9 @@ namespace at::native::xpu { -void flip_kernel(TensorIterator& iter); +TORCH_XPU_API void flip_kernel(TensorIterator& iter); -void roll_kernel( +TORCH_XPU_API void roll_kernel( const Tensor& input, Tensor& output, IntArrayRef shifts, diff --git a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp index 65c165c2c..4e646bcdd 100644 --- a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at::native::xpu { using namespace at::xpu; diff --git a/src/ATen/native/xpu/sycl/TriangularOpsKernels.h b/src/ATen/native/xpu/sycl/TriangularOpsKernels.h index 289fd0bd7..8ae777429 100644 --- a/src/ATen/native/xpu/sycl/TriangularOpsKernels.h +++ b/src/ATen/native/xpu/sycl/TriangularOpsKernels.h @@ -3,8 +3,14 @@ namespace at::native::xpu { -Tensor& tril_kernel(Tensor& result, const Tensor& self, int64_t k); +TORCH_XPU_API Tensor& tril_kernel( + Tensor& result, + const Tensor& self, + int64_t k); 
-Tensor& triu_kernel(Tensor& result, const Tensor& self, int64_t k); +TORCH_XPU_API Tensor& triu_kernel( + Tensor& result, + const Tensor& self, + int64_t k); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp b/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp index 87de57a3a..d7a49f7d7 100644 --- a/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryComplexKernels.h b/src/ATen/native/xpu/sycl/UnaryComplexKernels.h index d3ad4fe15..7dd91771b 100644 --- a/src/ATen/native/xpu/sycl/UnaryComplexKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryComplexKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void conj_kernel(TensorIterator& iter); +TORCH_XPU_API void conj_kernel(TensorIterator& iter); -void conj_physical_kernel(TensorIterator& iter); +TORCH_XPU_API void conj_physical_kernel(TensorIterator& iter); -void neg_conj_kernel(TensorIterator& iter); +TORCH_XPU_API void neg_conj_kernel(TensorIterator& iter); -void neg_kernel(TensorIterator& iter); +TORCH_XPU_API void neg_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp b/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp index 96510a0e0..a8551c262 100644 --- a/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryFractionKernels.h b/src/ATen/native/xpu/sycl/UnaryFractionKernels.h index 10f4889b0..a3a2015df 100644 --- a/src/ATen/native/xpu/sycl/UnaryFractionKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryFractionKernels.h @@ -4,16 +4,18 @@ namespace at::native::xpu { -void reciprocal_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void reciprocal_kernel(TensorIteratorBase& iter); -void floor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void floor_kernel(TensorIteratorBase& iter); -void ceil_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void ceil_kernel(TensorIteratorBase& iter); -void round_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void round_kernel(TensorIteratorBase& iter); -void round_decimals_kernel(TensorIteratorBase& iter, int64_t decimals); +TORCH_XPU_API void round_decimals_kernel( + TensorIteratorBase& iter, + int64_t decimals); -void frac_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void frac_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGammaKernels.cpp b/src/ATen/native/xpu/sycl/UnaryGammaKernels.cpp index 4a8a4a55e..d2546e842 100644 --- a/src/ATen/native/xpu/sycl/UnaryGammaKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGammaKernels.cpp @@ -5,10 +5,12 @@ #include #include #include -#include +#include #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGammaKernels.h b/src/ATen/native/xpu/sycl/UnaryGammaKernels.h index 5eda977f6..2908a4b13 100644 --- a/src/ATen/native/xpu/sycl/UnaryGammaKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryGammaKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void digamma_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void digamma_kernel(TensorIteratorBase& iter); -void polygamma_kernel(TensorIteratorBase& iter, int64_t n); +TORCH_XPU_API void 
polygamma_kernel(TensorIteratorBase& iter, int64_t n); -void lgamma_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void lgamma_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.cpp index e1b8ff80f..d35dedf96 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { template struct AcosFunctor { diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.h index 4ff06d256..17acb1061 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.h @@ -3,6 +3,7 @@ #include namespace at::native::xpu { -void acos_kernel(TensorIteratorBase& iter); -} // namespace at::native::xpu \ No newline at end of file +TORCH_XPU_API void acos_kernel(TensorIteratorBase& iter); + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.cpp index cd9864275..9062ee17a 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.h index 98dc2eed5..732d307ec 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void acosh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void acosh_kernel(TensorIteratorBase& iter); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp index c8b1aaca6..1819b47dc 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h index 194ce6479..ff449e2cd 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void asin_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void asin_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp index 009a68b47..24169a1dc 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h index 4d37288de..2668ee963 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void asinh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void asinh_kernel(TensorIteratorBase& iter); } 
// namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp index f241e9da3..c6ce4b1b0 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h index 022720223..6ee127539 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void atan_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void atan_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp index 630a64d39..3033ff8f8 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h index 5536641e7..267b59eb6 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void atanh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void atanh_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.cpp index 21552d99d..e4eee5e7a 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.h index 35123a666..fc31580b7 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void cos_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void cos_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp index 11440b3d3..16cffe893 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h index 7f031e3ff..bbd10e1aa 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void cosh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void cosh_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.cpp index 2a17cefcb..27e3e2906 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.cpp @@ -5,6 +5,8 @@ #include +#include + namespace 
at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.h index 76dca73c8..ecd4da1d4 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void sin_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sin_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.cpp index 2155aafad..06309f45c 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.h index e3ceaf643..a5005e1d0 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void sinh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sinh_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.cpp index 57c0049b1..996da809b 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.h index b96a35f93..9446ff058 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void tan_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void tan_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.cpp index dcbe8a970..1b71b3974 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.h index 6cecc99e2..831421a12 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void tanh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void tanh_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryKernels.cpp b/src/ATen/native/xpu/sycl/UnaryKernels.cpp index ef404a4fd..2f0645c4d 100644 --- a/src/ATen/native/xpu/sycl/UnaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryKernels.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryKernels.h b/src/ATen/native/xpu/sycl/UnaryKernels.h index cc394de6b..db7005523 100644 --- a/src/ATen/native/xpu/sycl/UnaryKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryKernels.h @@ -4,17 +4,17 @@ namespace at::native::xpu { -void sqrt_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sqrt_kernel(TensorIteratorBase& iter); -void 
rsqrt_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void rsqrt_kernel(TensorIteratorBase& iter); -void bitwise_not_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void bitwise_not_kernel(TensorIteratorBase& iter); -void exp_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void exp_kernel(TensorIteratorBase& iter); -void expm1_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void expm1_kernel(TensorIteratorBase& iter); -void nan_to_num_kernel( +TORCH_XPU_API void nan_to_num_kernel( TensorIteratorBase& iter, std::optional nan, std::optional pos_inf, diff --git a/src/ATen/native/xpu/sycl/UnaryLogKernels.cpp b/src/ATen/native/xpu/sycl/UnaryLogKernels.cpp index 57ee1c706..81fc380bb 100644 --- a/src/ATen/native/xpu/sycl/UnaryLogKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryLogKernels.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryLogKernels.h b/src/ATen/native/xpu/sycl/UnaryLogKernels.h index c8d16ceae..72db9235a 100644 --- a/src/ATen/native/xpu/sycl/UnaryLogKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryLogKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void log_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log_kernel(TensorIteratorBase& iter); -void log10_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log10_kernel(TensorIteratorBase& iter); -void log1p_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log1p_kernel(TensorIteratorBase& iter); -void log2_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log2_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnarySignKernels.cpp b/src/ATen/native/xpu/sycl/UnarySignKernels.cpp index b35aba437..031a2ace2 100644 --- a/src/ATen/native/xpu/sycl/UnarySignKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnarySignKernels.cpp @@ -7,6 +7,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnarySignKernels.h b/src/ATen/native/xpu/sycl/UnarySignKernels.h index 98b6eeb5f..01759a725 100644 --- a/src/ATen/native/xpu/sycl/UnarySignKernels.h +++ b/src/ATen/native/xpu/sycl/UnarySignKernels.h @@ -4,14 +4,14 @@ namespace at::native::xpu { -void logical_not_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logical_not_kernel(TensorIteratorBase& iter); -void neg_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void neg_kernel(TensorIteratorBase& iter); -void sgn_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sgn_kernel(TensorIteratorBase& iter); -void sign_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sign_kernel(TensorIteratorBase& iter); -void signbit_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void signbit_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp index e2303634e..cb00537fe 100644 --- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp @@ -10,6 +10,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h index 6bed9c6de..f810da576 100644 --- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h +++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h @@ -4,16 +4,18 @@ namespace at::native::xpu { -void sigmoid_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sigmoid_kernel(TensorIteratorBase& iter); 
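
All of the header hunks above apply one pattern: each kernel entry point that must be callable from another library is annotated with the TORCH_XPU_API export/import macro. The macro's definition is not shown in this patch; a plausible sketch, assuming the usual Windows dllexport/dllimport convention keyed off the TORCH_XPU_BUILD_MAIN_LIB define that BuildOnWindows.cmake sets for the kernel libraries (see below), is:

    // Hypothetical definition of TORCH_XPU_API (not part of this patch).
    // When compiling a kernel DLL itself (TORCH_XPU_BUILD_MAIN_LIB defined),
    // symbols are exported; consumers of the headers import them instead.
    #ifdef _WIN32
    #if defined(TORCH_XPU_BUILD_MAIN_LIB)
    #define TORCH_XPU_API __declspec(dllexport)
    #else
    #define TORCH_XPU_API __declspec(dllimport)
    #endif
    #else
    // On Linux, counter the default hidden symbol visibility instead.
    #define TORCH_XPU_API __attribute__((visibility("default")))
    #endif

Without such an annotation, the MSVC linker cannot see the kernel symbols across DLL boundaries, which is the "symbols visibility" issue this patch fixes.
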
-void erf_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void erf_kernel(TensorIteratorBase& iter); -void erfc_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void erfc_kernel(TensorIteratorBase& iter); -void erfinv_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void erfinv_kernel(TensorIteratorBase& iter); -void exp2_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void exp2_kernel(TensorIteratorBase& iter); -void logit_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar); +TORCH_XPU_API void logit_kernel( + TensorIteratorBase& iter, + const Scalar& eps_scalar); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.cpp b/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.cpp index ad188f39b..4b95fc553 100644 --- a/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.cpp @@ -8,6 +8,8 @@ #include #include +#include + namespace at::native::xpu { constexpr int n_elems_per_work_item = 4; // UNROLLED_ELEM_PER_WORK_ITEM; diff --git a/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.h b/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.h index 35c2f83c1..136f88482 100644 --- a/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.h +++ b/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void unfold_backward_kernel( +TORCH_XPU_API void unfold_backward_kernel( Tensor& grad_out, const Tensor& grad_in, int64_t dim, diff --git a/src/ATen/native/xpu/sycl/UniqueKernels.cpp b/src/ATen/native/xpu/sycl/UniqueKernels.cpp index b57274a8e..359712d1f 100644 --- a/src/ATen/native/xpu/sycl/UniqueKernels.cpp +++ b/src/ATen/native/xpu/sycl/UniqueKernels.cpp @@ -3,6 +3,8 @@ #include #include +#include + namespace at::native::xpu { struct UniqueNotEqualFunctor { diff --git a/src/ATen/native/xpu/sycl/UniqueKernels.h b/src/ATen/native/xpu/sycl/UniqueKernels.h index b7a7cd1ae..25b36cc64 100644 --- a/src/ATen/native/xpu/sycl/UniqueKernels.h +++ b/src/ATen/native/xpu/sycl/UniqueKernels.h @@ -3,29 +3,30 @@ #include namespace at::native::xpu { -std::tuple unique_consecutive_kernel( + +TORCH_XPU_API std::tuple unique_consecutive_kernel( const Tensor& self, const bool return_inverse, const bool return_counts, c10::optional dim); -std::tuple unique_dim_consecutive_kernel( +TORCH_XPU_API std::tuple unique_dim_consecutive_kernel( const Tensor& self, const int64_t dim, const bool return_inverse, const bool return_counts); -std::tuple unique_dim_kernel( +TORCH_XPU_API std::tuple unique_dim_kernel( const Tensor& self, const int64_t dim, const bool return_inverse, const bool return_counts); -std::tuple _unique_kernel( +TORCH_XPU_API std::tuple _unique_kernel( const Tensor& self, const bool return_inverse); -std::tuple _unique2_kernel( +TORCH_XPU_API std::tuple _unique2_kernel( const Tensor& self, const bool return_inverse, const bool return_counts); diff --git a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp index 8c6c33afd..a60136e30 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.h b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.h index 835e25def..e4d6d9485 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu 
{ -void upsample_bicubic2d_kernel( +TORCH_XPU_API void upsample_bicubic2d_kernel( Tensor& output, const Tensor& input, IntArrayRef output_size, @@ -12,4 +12,4 @@ void upsample_bicubic2d_kernel( std::optional scales_h, std::optional scales_w); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp index 1f578af13..e7c81e29a 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp @@ -14,6 +14,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h index 3f75f79cf..fa0e8601a 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void upsample_bilinear2d_out_kernel( +TORCH_XPU_API void upsample_bilinear2d_out_kernel( Tensor& output, const Tensor& input, IntArrayRef output_size, @@ -12,7 +12,7 @@ void upsample_bilinear2d_out_kernel( std::optional scales_h, std::optional scales_w); -void upsample_bilinear2d_backward_out_kernel( +TORCH_XPU_API void upsample_bilinear2d_backward_out_kernel( Tensor& grad_input, const Tensor& grad_output_, IntArrayRef output_size, @@ -21,4 +21,4 @@ void upsample_bilinear2d_backward_out_kernel( c10::optional scales_h, c10::optional scales_w); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.cpp index 71fc04ab1..b0ac8cc54 100644 --- a/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.cpp @@ -14,6 +14,8 @@ #include "ATen/Context.h" #include "ATen/core/TensorBase.h" +#include + namespace at::native::xpu { template struct UpsampleLinear1dKernelFunctor { @@ -213,4 +215,4 @@ void upsample_linear1d_backward_kernel( } // namespace at::native::xpu #pragma GCC diagnostic pop -#pragma clang diagnostic pop \ No newline at end of file +#pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.h b/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.h index 941de1638..5beeeebc7 100644 --- a/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.h @@ -3,14 +3,15 @@ #include namespace at::native::xpu { -void upsample_linear1d_kernel( + +TORCH_XPU_API void upsample_linear1d_kernel( const Tensor& input, IntArrayRef output_size, bool align_corners, std::optional scales, Tensor& output); -void upsample_linear1d_backward_kernel( +TORCH_XPU_API void upsample_linear1d_backward_kernel( const Tensor& grad_output_, IntArrayRef output_size, IntArrayRef input_size, @@ -18,4 +19,4 @@ void upsample_linear1d_backward_kernel( std::optional scales, Tensor& grad_input); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp index 3c4a01d91..4ceda372c 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native { namespace xpu { diff --git 
a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h index bb6dd83ff..173302bc6 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h @@ -5,14 +5,14 @@ namespace at::native::xpu { -void upsample_nearest1d_kernel( +TORCH_XPU_API void upsample_nearest1d_kernel( Tensor& output, const Tensor& input_, IntArrayRef output_size, c10::optional scales, bool is_exact); -void upsample_nearest1d_backward_kernel( +TORCH_XPU_API void upsample_nearest1d_backward_kernel( Tensor& grad_input, const Tensor& grad_output_, IntArrayRef output_size, diff --git a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp index 62e2ff9e0..682a73cc6 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp @@ -707,4 +707,4 @@ void upsample_nearest2d_kernel( } } } // namespace xpu -} // namespace at::native \ No newline at end of file +} // namespace at::native diff --git a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h index 7d11e03af..8f8f6fff7 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h @@ -5,7 +5,7 @@ namespace at::native::xpu { -void upsample_nearest2d_kernel( +TORCH_XPU_API void upsample_nearest2d_kernel( Tensor& output, const Tensor& input_, IntArrayRef output_size, @@ -13,7 +13,7 @@ void upsample_nearest2d_kernel( c10::optional scales_w, bool is_exact); -void upsample_nearest2d_backward_kernel( +TORCH_XPU_API void upsample_nearest2d_backward_kernel( Tensor& grad_input, const Tensor& grad_output_, IntArrayRef output_size, diff --git a/src/ATen/native/xpu/sycl/WeightNormKernels.cpp b/src/ATen/native/xpu/sycl/WeightNormKernels.cpp index dd93f68c3..ac67d5d34 100644 --- a/src/ATen/native/xpu/sycl/WeightNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/WeightNormKernels.cpp @@ -7,6 +7,8 @@ #include #include "comm/Runtime.h" +#include + namespace at::native::xpu { template @@ -1024,4 +1026,4 @@ std::tuple weight_norm_backward_kernel( return {grad_v, grad_g}; } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/WeightNormKernels.h b/src/ATen/native/xpu/sycl/WeightNormKernels.h index 5b0d7afd2..88a45b298 100644 --- a/src/ATen/native/xpu/sycl/WeightNormKernels.h +++ b/src/ATen/native/xpu/sycl/WeightNormKernels.h @@ -2,15 +2,17 @@ #include namespace at::native::xpu { -std::tuple weight_norm_kernel( + +TORCH_XPU_API std::tuple weight_norm_kernel( const Tensor& v, const Tensor& g, int64_t dim); -std::tuple weight_norm_backward_kernel( +TORCH_XPU_API std::tuple weight_norm_backward_kernel( const Tensor& grad_w, const Tensor& saved_v, const Tensor& saved_g, const Tensor& saved_norms, int64_t dim); -} // namespace at::native::xpu \ No newline at end of file + +} // namespace at::native::xpu diff --git a/src/BuildOnLinux.cmake b/src/BuildOnLinux.cmake new file mode 100644 index 000000000..3e304b31f --- /dev/null +++ b/src/BuildOnLinux.cmake @@ -0,0 +1,79 @@ +# Build on Linux + +set(TORCH_XPU_OPS_LIBRARIES) +set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE) + +add_library( + torch_xpu_ops + STATIC + ${ATen_XPU_CPP_SRCS} + ${ATen_XPU_NATIVE_CPP_SRCS} + ${ATen_XPU_GEN_SRCS}) + +if(BUILD_SEPARATE_OPS) + foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + get_filename_component(name ${sycl_src} 
NAME_WLE REALPATH) + set(sycl_lib torch-xpu-ops-sycl-${name}) + sycl_add_library( + ${sycl_lib} + SHARED + SYCL_SOURCES ${sycl_src}) + target_link_libraries(torch_xpu_ops PUBLIC ${sycl_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) + + # Decouple from PyTorch cmake definition. + install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + endforeach() +else() + # Split SYCL kernels into 2 libraries by category: 1) Unary+Binary 2) Others. + set(ATen_XPU_SYCL_UNARY_BINARY_SRCS) + set(ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS) + + foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + string(REGEX MATCH "Binary" IS_BINARY ${sycl_src}) + string(REGEX MATCH "Unary" IS_UNARY ${sycl_src}) + if(IS_BINARY STREQUAL "" AND IS_UNARY STREQUAL "") + list(APPEND ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS ${sycl_src}) + else() + list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src}) + endif() + endforeach() + + set(sycl_unary_binary_lib torch_xpu_ops_sycl_unary_binary_kernels) + sycl_add_library( + ${sycl_unary_binary_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_BINARY_SRCS}) + target_link_libraries(torch_xpu_ops PUBLIC ${sycl_unary_binary_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_binary_lib}) + + # Decouple from PyTorch cmake definition. + install(TARGETS ${sycl_unary_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + set(sycl_lib torch_xpu_ops_sycl_kernels) + sycl_add_library( + ${sycl_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS}) + target_link_libraries(torch_xpu_ops PUBLIC ${sycl_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) + + # Decouple from PyTorch cmake definition. + install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") +endif() +set(SYCL_LINK_LIBRARIES_KEYWORD) + +list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) + +foreach(lib ${TORCH_XPU_OPS_LIBRARIES}) + # Align with PyTorch compile options in PYTORCH_SRC_DIR/cmake/public/utils.cmake + torch_compile_options(${lib}) + target_compile_options_if_supported(${lib} "-Wno-deprecated-copy") + target_compile_options(${lib} PRIVATE ${TORCH_XPU_OPS_FLAGS}) + + target_include_directories(${lib} PUBLIC ${TORCH_XPU_OPS_INCLUDE_DIRS}) + target_include_directories(${lib} PUBLIC ${ATen_XPU_INCLUDE_DIRS}) + target_include_directories(${lib} PUBLIC ${SYCL_INCLUDE_DIR}) + + target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARY}) +endforeach() diff --git a/src/BuildOnWindows.cmake b/src/BuildOnWindows.cmake new file mode 100644 index 000000000..a8e00cb55 --- /dev/null +++ b/src/BuildOnWindows.cmake @@ -0,0 +1,106 @@ +# Build on Windows + +set(TORCH_XPU_OPS_LIBRARIES) +set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE) + +# Work around the cyclic dependency: +# libtorch_xpu.so links to libtorch_xpu_ops.a +# libtorch_xpu_ops.a dlopens libtorch_xpu_ops_aten.so (Break cycle) +# libtorch_xpu_ops_aten.so links to libtorch_xpu_ops_sycl_unary_binary_kernels.so and libtorch_xpu_ops_sycl_kernels.so +# libtorch_xpu_ops_sycl_unary_binary_kernels.so and libtorch_xpu_ops_sycl_kernels.so link to libtorch_xpu.so +add_library( + torch_xpu_ops + STATIC + ${ATen_XPU_CPP_SRCS} + "bridge.cpp") +set(PATH_TO_TORCH_XPU_OPS_ATEN_LIB \"torch_xpu_ops_aten.dll\") +target_compile_options(torch_xpu_ops PRIVATE -DPATH_TO_TORCH_XPU_OPS_ATEN_LIB=${PATH_TO_TORCH_XPU_OPS_ATEN_LIB}) + +add_library( + torch_xpu_ops_aten + SHARED + ${ATen_XPU_NATIVE_CPP_SRCS} + ${ATen_XPU_GEN_SRCS}) +install(TARGETS torch_xpu_ops_aten DESTINATION "${TORCH_INSTALL_LIB_DIR}") +target_link_libraries(torch_xpu_ops_aten PUBLIC torch_xpu) + +if(BUILD_SEPARATE_OPS) +
foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + get_filename_component(name ${sycl_src} NAME_WLE REALPATH) + set(sycl_lib torch-xpu-ops-sycl-${name}) + sycl_add_library( + ${sycl_lib} + SHARED + SYCL_SOURCES ${sycl_src}) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) + + # Decouple from PyTorch cmake definition. + install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + endforeach() +else() + # Split SYCL kernels into 2 libraries by category: 1) Unary+Binary 2) Others. + set(ATen_XPU_SYCL_UNARY_BINARY_SRCS) + set(ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS) + + foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + string(REGEX MATCH "Binary" IS_BINARY ${sycl_src}) + string(REGEX MATCH "Unary" IS_UNARY ${sycl_src}) + # Resolve cyclic dependencies between + # torch_xpu_ops_sycl_unary_binary_kernels.dll and + # torch_xpu_ops_sycl_kernels.dll: keep the definition and invocation of a + # kernel in the same kernel library. Here we move the elementwise Pow and + # Copy kernels into torch_xpu_ops_sycl_unary_binary_kernels.dll. + string(REGEX MATCH "Pow" IS_POW ${sycl_src}) + string(REGEX MATCH "Copy" IS_COPY ${sycl_src}) + if(IS_BINARY STREQUAL "" AND IS_UNARY STREQUAL "" AND IS_POW STREQUAL "" AND IS_COPY STREQUAL "") + list(APPEND ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS ${sycl_src}) + else() + list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src}) + endif() + endforeach() + + set(sycl_unary_binary_lib torch_xpu_ops_sycl_unary_binary_kernels) + sycl_add_library( + ${sycl_unary_binary_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_BINARY_SRCS}) + target_compile_definitions(${sycl_unary_binary_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_unary_binary_lib}) + target_link_libraries(${sycl_unary_binary_lib} PUBLIC torch_xpu) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_binary_lib}) + + # Decouple from PyTorch cmake definition. + install(TARGETS ${sycl_unary_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + set(sycl_lib torch_xpu_ops_sycl_kernels) + sycl_add_library( + ${sycl_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS}) + target_compile_definitions(${sycl_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_lib}) + target_link_libraries(${sycl_lib} PUBLIC torch_xpu) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) + + # Decouple from PyTorch cmake definition.
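
The string(REGEX MATCH) checks above classify sources purely by substrings of the file path. A self-contained C++ restatement of that predicate (the file names below are illustrative, not taken from the source tree) can help sanity-check which DLL a given kernel source lands in:

    // Mirrors the CMake classification: any source whose path contains
    // "Binary", "Unary", "Pow", or "Copy" goes into
    // torch_xpu_ops_sycl_unary_binary_kernels; everything else goes into
    // torch_xpu_ops_sycl_kernels.
    #include <iostream>
    #include <string>
    #include <vector>

    static bool in_unary_binary_lib(const std::string& src) {
      for (const char* tag : {"Binary", "Unary", "Pow", "Copy"}) {
        if (src.find(tag) != std::string::npos) return true;
      }
      return false;
    }

    int main() {
      const std::vector<std::string> srcs = {
          "UnaryKernels.cpp", "SoftMaxKernels.cpp", "Shape.cpp",
          "PowKernels.cpp" /* hypothetical name */};
      for (const auto& s : srcs)
        std::cout << s << " -> "
                  << (in_unary_binary_lib(s) ? "unary_binary" : "others")
                  << "\n";
    }
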
+ install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") +endif() +set(SYCL_LINK_LIBRARIES_KEYWORD) + +list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) +list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten) + +foreach(lib ${TORCH_XPU_OPS_LIBRARIES}) + # Align with PyTorch compile options in PYTORCH_SRC_DIR/cmake/public/utils.cmake + torch_compile_options(${lib}) + target_compile_options_if_supported(${lib} "-Wno-deprecated-copy") + target_compile_options(${lib} PRIVATE ${TORCH_XPU_OPS_FLAGS}) + + target_include_directories(${lib} PUBLIC ${TORCH_XPU_OPS_INCLUDE_DIRS}) + target_include_directories(${lib} PUBLIC ${ATen_XPU_INCLUDE_DIRS}) + target_include_directories(${lib} PUBLIC ${SYCL_INCLUDE_DIR}) + + target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARY}) + target_link_libraries(${lib} PUBLIC c10_xpu) +endforeach() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ba40a4b8c..0716ca5af 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,88 +2,24 @@ include(${TORCH_XPU_OPS_ROOT}/cmake/Codegen.cmake) set(ATen_XPU_CPP_SRCS) +set(ATen_XPU_NATIVE_CPP_SRCS) set(ATen_XPU_SYCL_SRCS) set(ATen_XPU_INCLUDE_DIRS ${TORCH_XPU_OPS_ROOT}/src CACHE STRING "ATen XPU Include directory") add_subdirectory(ATen) -set(TORCH_XPU_OPS_LIBRARIES) -set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE) - -add_library( - torch_xpu_ops - STATIC - ${ATen_XPU_CPP_SRCS} - ${ATen_XPU_GEN_SRCS}) - -if(BUILD_SEPARATE_OPS) - foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) - get_filename_component(name ${sycl_src} NAME_WLE REALPATH) - set(sycl_lib torch-xpu-ops-sycl-${name}) - sycl_add_library( - ${sycl_lib} - SHARED - SYCL_SOURCES ${sycl_src}) - target_link_libraries(torch_xpu_ops PUBLIC ${sycl_lib}) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - endforeach() +# With the increase in binary size, we have to split libtorch_xpu.so into +# multiple libraries. Because of strict linkage requirements on Windows, +# we add extra logic to resolve 1) cyclic dependency and 2) symbol visibility. +# To isolate the impact and keep both paths maintainable, separate code pieces +# set up the CMake targets. We will revert the library split and unify these +# once compression of device code is supported by the SYCL/backend compiler. +if(WIN32) + include(BuildOnWindows.cmake) else() - # Split SYCL kernels into 2 libraries as categories 1) Unary+Binary 2) Others. - set(ATen_XPU_SYCL_UNARY_BINARY_SRCS) - set(ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS) - - foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) - string(REGEX MATCH "Binary" IS_BINARY ${sycl_src}) - string(REGEX MATCH "Unary" IS_UNARY ${sycl_src}) - if(IS_BINARY STREQUAL "" AND IS_UNARY STREQUAL "") - list(APPEND ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS ${sycl_src}) - else() - list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src}) - endif() - endforeach() - - set(sycl_unary_binary_lib torch_xpu_ops_sycl_unary_binary_kernels) - sycl_add_library( - ${sycl_unary_binary_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_BINARY_SRCS}) - target_link_libraries(torch_xpu_ops PUBLIC ${sycl_unary_binary_lib}) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_binary_lib}) - - # Decouple with PyTorch cmake definition.
- install(TARGETS ${sycl_unary_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - - set(sycl_lib torch_xpu_ops_sycl_kernels) - sycl_add_library( - ${sycl_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS}) - target_link_libraries(torch_xpu_ops PUBLIC ${sycl_lib}) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + include(BuildOnLinux.cmake) endif() -set(SYCL_LINK_LIBRARIES_KEYWORD) - -list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) - -foreach(lib ${TORCH_XPU_OPS_LIBRARIES}) - # Align with PyTorch compile options PYTORCH_SRC_DIR/cmake/public/utils.cmake - torch_compile_options(${lib}) - target_compile_options_if_supported(${lib} "-Wno-deprecated-copy") - target_compile_options(${lib} PRIVATE ${TORCH_XPU_OPS_FLAGS}) - - target_include_directories(${lib} PUBLIC ${TORCH_XPU_OPS_INCLUDE_DIRS}) - target_include_directories(${lib} PUBLIC ${ATen_XPU_INCLUDE_DIRS}) - target_include_directories(${lib} PUBLIC ${SYCL_INCLUDE_DIR}) - - target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARY}) -endforeach() include(${TORCH_XPU_OPS_ROOT}/cmake/ClangFormat.cmake) if(CLANG_FORMAT) diff --git a/src/bridge.cpp b/src/bridge.cpp new file mode 100644 index 000000000..c19ce2554 --- /dev/null +++ b/src/bridge.cpp @@ -0,0 +1,34 @@ +#include <windows.h> +#include <sstream> +#include <stdexcept> + +// This implementation helps work around the cyclic dependency introduced when +// we separate kernels into multiple DLLs/SOs to avoid a large binary (>2GB). +// The file is built into libtorch_xpu.so. libtorch_xpu.so won't link against +// libtorch_xpu_ops_aten.so but dlopens the library instead, breaking the +// cyclic dependency during linkage. The cycle being broken is: +// libtorch_xpu.so -> (dlopen) libtorch_xpu_ops_aten.so -> (link) +// libtorch_xpu_ops_kernels.so +// -> (link) +// libtorch_xpu_ops_unary_binary_kernels.so +// libtorch_xpu_ops_kernels.so -> (link) libtorch_xpu.so +// libtorch_xpu_ops_unary_binary_kernels.so -> (link) libtorch_xpu.so +namespace { + +class LoadTorchXPUOps { + public: + LoadTorchXPUOps() { + if (NULL == LoadLibrary(PATH_TO_TORCH_XPU_OPS_ATEN_LIB)) { + std::ostringstream error; + error << "PyTorch XPU operators library is not loaded (ERROR: " + << GetLastError() + << "). Please check if PyTorch is installed correctly," + << " or file an issue at https://github.com/intel/torch-xpu-ops/issues."; + throw std::runtime_error(error.str()); + } + } +}; + +static LoadTorchXPUOps init; + +} // namespace
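
For reference, PATH_TO_TORCH_XPU_OPS_ATEN_LIB reaches bridge.cpp through the -DPATH_TO_TORCH_XPU_OPS_ATEN_LIB=... compile option set in BuildOnWindows.cmake, where it is defined as the escaped string \"torch_xpu_ops_aten.dll\". Assuming a non-UNICODE build (so LoadLibrary resolves to LoadLibraryA), the constructor body above effectively compiles to the following sketch:

    #include <sstream>
    #include <stdexcept>
    #include <windows.h>

    // Effective form of LoadTorchXPUOps::LoadTorchXPUOps() after the macro is
    // substituted with the string literal "torch_xpu_ops_aten.dll".
    void load_torch_xpu_ops_aten() {
      if (NULL == LoadLibraryA("torch_xpu_ops_aten.dll")) {
        std::ostringstream error;
        error << "PyTorch XPU operators library is not loaded (ERROR: "
              << GetLastError() << ").";
        throw std::runtime_error(error.str());
      }
    }

Because `init` is a namespace-scope static object compiled into libtorch_xpu, its constructor runs during that library's static initialization, so torch_xpu_ops_aten.dll is loaded before any XPU operator can be dispatched.
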