From 6636a0686268548fd4e3b2f0dac0c07e731f98c2 Mon Sep 17 00:00:00 2001
From: Feng Yuan
Date: Thu, 5 Sep 2024 01:35:14 -0700
Subject: [PATCH] Windows: Enable building for latest torch-xpu-ops (#865)

In the latest torch-xpu-ops, we have split libtorch_xpu.so into multiple libraries to avoid a single binary of excessive size. This change to the build system leads to two issues on Windows, which this PR resolves: 1) cyclic dependence, and 2) symbol visibility.

Signed-off-by: Feng Yuan
---
 cmake/Modules/FindSYCL/run_sycl.cmake | 6 + src/ATen/CMakeLists.txt | 5 +- src/ATen/native/xpu/RangeFactories.cpp | 4 +- src/ATen/native/xpu/sycl/AbsKernel.cpp | 2 + src/ATen/native/xpu/sycl/AbsKernel.h | 2 +- .../native/xpu/sycl/ActivationEluKernels.cpp | 2 + .../native/xpu/sycl/ActivationEluKernels.h | 4 +- .../native/xpu/sycl/ActivationGeluKernel.cpp | 2 + .../native/xpu/sycl/ActivationGeluKernel.h | 6 +- .../native/xpu/sycl/ActivationGluKernels.cpp | 3 + .../native/xpu/sycl/ActivationGluKernels.h | 4 +- .../xpu/sycl/ActivationHardsigmoidKernels.cpp | 2 + .../xpu/sycl/ActivationHardsigmoidKernels.h | 4 +- .../xpu/sycl/ActivationHardswishKernels.cpp | 2 + .../xpu/sycl/ActivationHardswishKernels.h | 4 +- .../xpu/sycl/ActivationHardtanhKernels.cpp | 2 + .../xpu/sycl/ActivationHardtanhKernels.h | 2 +- .../xpu/sycl/ActivationLeakyReluKernels.cpp | 2 + .../xpu/sycl/ActivationLeakyReluKernels.h | 6 +- .../xpu/sycl/ActivationLogSigmoidKernels.cpp | 2 + .../xpu/sycl/ActivationLogSigmoidKernels.h | 4 +- .../native/xpu/sycl/ActivationMishKernels.cpp | 2 + .../native/xpu/sycl/ActivationMishKernels.h | 4 +- .../xpu/sycl/ActivationPreluKernels.cpp | 4 +- .../native/xpu/sycl/ActivationPreluKernels.h | 4 +- .../native/xpu/sycl/ActivationSiluKernels.cpp | 2 + .../native/xpu/sycl/ActivationSiluKernels.h | 4 +- .../xpu/sycl/ActivationSoftplusKernels.cpp | 2 + .../xpu/sycl/ActivationSoftplusKernels.h | 4 +- .../xpu/sycl/ActivationSoftshrinkKernels.cpp | 2 + .../xpu/sycl/ActivationSoftshrinkKernels.h | 8 +- .../xpu/sycl/ActivationThresholdKernel.cpp | 2 + .../xpu/sycl/ActivationThresholdKernel.h | 2 +- .../sycl/AdaptiveAveragePooling2dKernels.cpp | 2 + .../sycl/AdaptiveAveragePooling2dKernels.h | 4 +- .../xpu/sycl/AdaptiveMaxPooling2dKernels.cpp | 2 + .../xpu/sycl/AdaptiveMaxPooling2dKernels.h | 4 +- src/ATen/native/xpu/sycl/AmpKernels.cpp | 2 + src/ATen/native/xpu/sycl/AmpKernels.h | 6 +- .../native/xpu/sycl/AveragePool2dKernels.cpp | 2 + .../native/xpu/sycl/AveragePool2dKernels.h | 4 +- src/ATen/native/xpu/sycl/BatchNormKernels.cpp | 120 ++++++++++-------- src/ATen/native/xpu/sycl/BatchNormKernels.h | 15 ++- .../xpu/sycl/BinaryBitwiseOpsKernels.cpp | 2 + .../native/xpu/sycl/BinaryBitwiseOpsKernels.h | 6 +- .../native/xpu/sycl/BinaryDivTrueKernel.cpp | 2 + .../xpu/sycl/BinaryGeometricKernels.cpp | 2 + .../native/xpu/sycl/BinaryGeometricKernels.h | 4 +- src/ATen/native/xpu/sycl/BinaryKernels.cpp | 2 + src/ATen/native/xpu/sycl/BinaryKernels.h | 12 +- .../xpu/sycl/BinaryLogicalOpsKernels.cpp | 2 + .../native/xpu/sycl/BinaryLogicalOpsKernels.h | 6 +- .../xpu/sycl/BinaryMiscBackwardOpsKernels.cpp | 2 + .../xpu/sycl/BinaryMiscBackwardOpsKernels.h | 8 +- .../native/xpu/sycl/BinaryMiscOpsKernels.cpp | 2 + .../native/xpu/sycl/BinaryMiscOpsKernels.h | 6 +- .../native/xpu/sycl/BinaryRemainderKernel.cpp | 2 + .../native/xpu/sycl/BinaryRemainderKernel.h | 4 +- .../native/xpu/sycl/BinaryShiftOpsKernels.cpp | 4 +- .../native/xpu/sycl/BinaryShiftOpsKernels.h | 4 +- .../native/xpu/sycl/BucketizationKernels.cpp | 2 + 
.../native/xpu/sycl/BucketizationKernels.h | 6 +- src/ATen/native/xpu/sycl/Col2ImKernel.cpp | 2 + src/ATen/native/xpu/sycl/Col2ImKernel.h | 2 +- src/ATen/native/xpu/sycl/CompareKernels.cpp | 2 + src/ATen/native/xpu/sycl/CompareKernels.h | 12 +- src/ATen/native/xpu/sycl/ComplexKernels.cpp | 2 + src/ATen/native/xpu/sycl/ComplexKernels.h | 4 +- src/ATen/native/xpu/sycl/CopyKernel.cpp | 2 + src/ATen/native/xpu/sycl/CopyKernel.h | 2 +- src/ATen/native/xpu/sycl/CopysignKernel.cpp | 2 + src/ATen/native/xpu/sycl/CopysignKernel.h | 2 +- src/ATen/native/xpu/sycl/CrossKernel.cpp | 2 + src/ATen/native/xpu/sycl/CrossKernel.h | 5 +- src/ATen/native/xpu/sycl/CumprodKernel.cpp | 3 + src/ATen/native/xpu/sycl/CumprodKernel.h | 2 +- src/ATen/native/xpu/sycl/CumsumKernel.cpp | 3 + src/ATen/native/xpu/sycl/CumsumKernel.h | 2 +- src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp | 2 + src/ATen/native/xpu/sycl/DilatedMaxPool2d.h | 4 +- src/ATen/native/xpu/sycl/DistanceKernels.cpp | 2 + src/ATen/native/xpu/sycl/DistanceKernels.h | 4 +- .../native/xpu/sycl/DistributionBernoulli.cpp | 2 + .../sycl/DistributionExponentialKernel.cpp | 2 + .../native/xpu/sycl/DistributionKernels.h | 18 +-- .../native/xpu/sycl/DistributionNormal.cpp | 2 + .../xpu/sycl/DistributionRandomKernel.cpp | 2 + .../native/xpu/sycl/DistributionUniform.cpp | 2 + src/ATen/native/xpu/sycl/Dropout.cpp | 2 + src/ATen/native/xpu/sycl/DropoutKernels.h | 8 +- src/ATen/native/xpu/sycl/Embedding.cpp | 2 + src/ATen/native/xpu/sycl/EmbeddingBag.cpp | 2 + .../native/xpu/sycl/EmbeddingBagKernels.h | 2 +- src/ATen/native/xpu/sycl/EmbeddingKernels.h | 2 +- src/ATen/native/xpu/sycl/FillKernel.cpp | 2 + src/ATen/native/xpu/sycl/FillKernel.h | 2 +- .../xpu/sycl/ForeachBinaryOpListKernels.h | 12 +- .../xpu/sycl/ForeachBinaryOpScalarKernels.h | 12 +- .../sycl/ForeachBinaryOpScalarListKernels.h | 12 +- .../xpu/sycl/ForeachPointwiseOpListKernels.h | 8 +- .../sycl/ForeachPointwiseOpScalarKernels.h | 8 +- .../ForeachPointwiseOpScalarListKernels.h | 8 +- .../native/xpu/sycl/ForeachReduceKernels.cpp | 2 + .../native/xpu/sycl/ForeachReduceKernels.h | 2 +- .../native/xpu/sycl/ForeachTernaryKernels.cpp | 3 + .../xpu/sycl/ForeachTernaryOpListKernels.h | 4 +- .../xpu/sycl/ForeachTernaryOpScalarKernels.h | 4 +- .../native/xpu/sycl/ForeachUnaryKernels.cpp | 2 + .../native/xpu/sycl/ForeachUnaryKernels.h | 4 +- src/ATen/native/xpu/sycl/GcdLcmKernels.cpp | 4 +- src/ATen/native/xpu/sycl/GcdLcmKernels.h | 2 +- src/ATen/native/xpu/sycl/GridSampler.cpp | 2 + src/ATen/native/xpu/sycl/GridSamplerKernels.h | 6 +- src/ATen/native/xpu/sycl/GroupNormKernels.cpp | 2 + src/ATen/native/xpu/sycl/GroupNormKernels.h | 4 +- src/ATen/native/xpu/sycl/HistogramKernels.h | 6 +- .../native/xpu/sycl/HistogramddKernels.cpp | 2 + src/ATen/native/xpu/sycl/Im2ColKernel.cpp | 2 + src/ATen/native/xpu/sycl/Im2ColKernel.h | 2 +- src/ATen/native/xpu/sycl/Indexing.cpp | 2 + src/ATen/native/xpu/sycl/IndexingKernels.h | 18 +-- src/ATen/native/xpu/sycl/LayerNormKernels.cpp | 2 + src/ATen/native/xpu/sycl/LayerNormKernels.h | 4 +- src/ATen/native/xpu/sycl/LerpKernels.cpp | 2 + src/ATen/native/xpu/sycl/LerpKernels.h | 6 +- .../native/xpu/sycl/LinearAlgebraKernels.cpp | 2 + .../native/xpu/sycl/LinearAlgebraKernels.h | 5 +- src/ATen/native/xpu/sycl/LogAddExpKernels.cpp | 3 + src/ATen/native/xpu/sycl/LogAddExpKernels.h | 4 +- src/ATen/native/xpu/sycl/LossKernels.cpp | 2 + src/ATen/native/xpu/sycl/LossKernels.h | 4 +- src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp | 14 +- src/ATen/native/xpu/sycl/LossNLL2dKernels.h | 8 +- 
src/ATen/native/xpu/sycl/LossNLLKernel.cpp | 2 + src/ATen/native/xpu/sycl/LossNLLKernel.h | 6 +- .../xpu/sycl/{Math.h => MathExtensions.h} | 0 .../xpu/sycl/MaxMinElementwiseKernels.cpp | 2 + .../xpu/sycl/MaxMinElementwiseKernels.h | 8 +- .../native/xpu/sycl/MultinomialKernel.cpp | 3 + src/ATen/native/xpu/sycl/MultinomialKernel.h | 4 +- src/ATen/native/xpu/sycl/NMSKernel.cpp | 2 + src/ATen/native/xpu/sycl/NMSKernel.h | 2 +- src/ATen/native/xpu/sycl/NonzeroKernel.cpp | 2 + src/ATen/native/xpu/sycl/NonzeroKernel.h | 2 +- .../native/xpu/sycl/PointwiseOpsKernels.cpp | 2 + .../native/xpu/sycl/PointwiseOpsKernels.h | 15 ++- src/ATen/native/xpu/sycl/PowKernels.cpp | 2 + src/ATen/native/xpu/sycl/PowKernels.h | 4 +- src/ATen/native/xpu/sycl/RandpermKernel.cpp | 2 + src/ATen/native/xpu/sycl/RandpermKernel.h | 10 +- .../native/xpu/sycl/RangeFactoriesKernel.cpp | 2 + .../native/xpu/sycl/RangeFactoriesKernel.h | 4 +- .../native/xpu/sycl/ReduceAMinMaxKernel.cpp | 2 + .../native/xpu/sycl/ReduceArgMaxKernel.cpp | 2 + .../native/xpu/sycl/ReduceArgMinKernel.cpp | 2 + .../native/xpu/sycl/ReduceLogicKernels.cpp | 2 + .../xpu/sycl/ReduceMaxValuesKernels.cpp | 2 + .../native/xpu/sycl/ReduceMaxValuesKernels.h | 6 +- .../xpu/sycl/ReduceMinValuesKernels.cpp | 2 + .../native/xpu/sycl/ReduceMinValuesKernels.h | 6 +- .../native/xpu/sycl/ReduceMomentKernels.cpp | 2 + src/ATen/native/xpu/sycl/ReduceNormKernel.cpp | 4 +- src/ATen/native/xpu/sycl/ReduceNormKernel.h | 2 +- src/ATen/native/xpu/sycl/ReduceOpsKernels.h | 25 ++-- .../native/xpu/sycl/ReduceSumProdKernels.cpp | 2 + .../native/xpu/sycl/ReflectionPadKernels.cpp | 4 +- .../native/xpu/sycl/ReflectionPadKernels.h | 14 +- src/ATen/native/xpu/sycl/RenormKernel.cpp | 2 + src/ATen/native/xpu/sycl/RenormKernel.h | 4 +- src/ATen/native/xpu/sycl/RepeatKernel.cpp | 5 +- src/ATen/native/xpu/sycl/RepeatKernel.h | 4 +- .../xpu/sycl/ReplicationPaddingKernels.cpp | 4 +- .../xpu/sycl/ReplicationPaddingKernels.h | 14 +- src/ATen/native/xpu/sycl/ResizeKernel.cpp | 4 +- src/ATen/native/xpu/sycl/ResizeKernel.h | 2 +- src/ATen/native/xpu/sycl/ScanKernels.h | 10 +- .../native/xpu/sycl/ScatterGatherKernels.cpp | 2 + .../native/xpu/sycl/ScatterGatherKernels.h | 14 +- src/ATen/native/xpu/sycl/Shape.cpp | 2 + src/ATen/native/xpu/sycl/ShapeKernels.h | 2 +- src/ATen/native/xpu/sycl/SoftMaxKernels.cpp | 2 + src/ATen/native/xpu/sycl/SoftMaxKernels.h | 8 +- src/ATen/native/xpu/sycl/Sorting.cpp | 2 + src/ATen/native/xpu/sycl/Sorting.h | 4 +- src/ATen/native/xpu/sycl/StepKernels.cpp | 2 + src/ATen/native/xpu/sycl/StepKernels.h | 2 +- .../native/xpu/sycl/SummaryOpsKernels.cpp | 2 + src/ATen/native/xpu/sycl/SummaryOpsKernels.h | 6 +- .../native/xpu/sycl/TensorCompareKernels.cpp | 2 + .../native/xpu/sycl/TensorCompareKernels.h | 16 ++- src/ATen/native/xpu/sycl/TensorTopKKernel.cpp | 2 + src/ATen/native/xpu/sycl/TensorTopKKernel.h | 2 +- .../xpu/sycl/TensorTransformationsKernels.cpp | 2 + .../xpu/sycl/TensorTransformationsKernels.h | 4 +- .../native/xpu/sycl/TriangularOpsKernels.cpp | 2 + .../native/xpu/sycl/TriangularOpsKernels.h | 12 +- .../native/xpu/sycl/UnaryComplexKernels.cpp | 2 + .../native/xpu/sycl/UnaryComplexKernels.h | 8 +- .../native/xpu/sycl/UnaryFractionKernels.cpp | 2 + .../native/xpu/sycl/UnaryFractionKernels.h | 14 +- .../native/xpu/sycl/UnaryGammaKernels.cpp | 4 +- src/ATen/native/xpu/sycl/UnaryGammaKernels.h | 6 +- .../xpu/sycl/UnaryGeometricAcosKernel.cpp | 2 + .../xpu/sycl/UnaryGeometricAcosKernel.h | 5 +- .../xpu/sycl/UnaryGeometricAcoshKernel.cpp | 2 + 
.../xpu/sycl/UnaryGeometricAcoshKernel.h | 4 +- .../xpu/sycl/UnaryGeometricAsinKernel.cpp | 2 + .../xpu/sycl/UnaryGeometricAsinKernel.h | 2 +- .../xpu/sycl/UnaryGeometricAsinhKernel.cpp | 2 + .../xpu/sycl/UnaryGeometricAsinhKernel.h | 2 +- .../xpu/sycl/UnaryGeometricAtanKernel.cpp | 2 + .../xpu/sycl/UnaryGeometricAtanKernel.h | 2 +- .../xpu/sycl/UnaryGeometricAtanhKernel.cpp | 2 + .../xpu/sycl/UnaryGeometricAtanhKernel.h | 2 +- .../xpu/sycl/UnaryGeometricCosKernel.cpp | 2 + .../native/xpu/sycl/UnaryGeometricCosKernel.h | 2 +- .../xpu/sycl/UnaryGeometricCoshKernel.cpp | 2 + .../xpu/sycl/UnaryGeometricCoshKernel.h | 2 +- .../xpu/sycl/UnaryGeometricSinKernel.cpp | 2 + .../native/xpu/sycl/UnaryGeometricSinKernel.h | 2 +- .../xpu/sycl/UnaryGeometricSinhKernel.cpp | 2 + .../xpu/sycl/UnaryGeometricSinhKernel.h | 2 +- .../xpu/sycl/UnaryGeometricTanKernel.cpp | 2 + .../native/xpu/sycl/UnaryGeometricTanKernel.h | 2 +- .../xpu/sycl/UnaryGeometricTanhKernel.cpp | 2 + .../xpu/sycl/UnaryGeometricTanhKernel.h | 2 +- src/ATen/native/xpu/sycl/UnaryKernels.cpp | 2 + src/ATen/native/xpu/sycl/UnaryKernels.h | 12 +- src/ATen/native/xpu/sycl/UnaryLogKernels.cpp | 2 + src/ATen/native/xpu/sycl/UnaryLogKernels.h | 8 +- src/ATen/native/xpu/sycl/UnarySignKernels.cpp | 2 + src/ATen/native/xpu/sycl/UnarySignKernels.h | 10 +- .../xpu/sycl/UnarySpecialOpsKernels.cpp | 2 + .../native/xpu/sycl/UnarySpecialOpsKernels.h | 14 +- .../native/xpu/sycl/UnfoldBackwardKernels.cpp | 2 + .../native/xpu/sycl/UnfoldBackwardKernels.h | 2 +- src/ATen/native/xpu/sycl/UniqueKernels.cpp | 2 + src/ATen/native/xpu/sycl/UniqueKernels.h | 11 +- .../xpu/sycl/UpSampleBicubic2dKernels.cpp | 2 + .../xpu/sycl/UpSampleBicubic2dKernels.h | 4 +- .../xpu/sycl/UpSampleBilinear2dKernels.cpp | 2 + .../xpu/sycl/UpSampleBilinear2dKernels.h | 6 +- .../xpu/sycl/UpSampleLinear1dKernels.cpp | 4 +- .../native/xpu/sycl/UpSampleLinear1dKernels.h | 7 +- .../xpu/sycl/UpSampleNearest1dKernels.cpp | 2 + .../xpu/sycl/UpSampleNearest1dKernels.h | 4 +- .../xpu/sycl/UpSampleNearest2dKernels.cpp | 2 +- .../xpu/sycl/UpSampleNearest2dKernels.h | 4 +- .../native/xpu/sycl/WeightNormKernels.cpp | 4 +- src/ATen/native/xpu/sycl/WeightNormKernels.h | 8 +- src/BuildOnLinux.cmake | 79 ++++++++++++ src/BuildOnWindows.cmake | 106 ++++++++++++++++ src/CMakeLists.txt | 84 ++---------- src/bridge.cpp | 34 +++++ 254 files changed, 970 insertions(+), 479 deletions(-) rename src/ATen/native/xpu/sycl/{Math.h => MathExtensions.h} (100%) create mode 100644 src/BuildOnLinux.cmake create mode 100644 src/BuildOnWindows.cmake create mode 100644 src/bridge.cpp diff --git a/cmake/Modules/FindSYCL/run_sycl.cmake index dcc99c18f..f56d49768 100644 --- a/cmake/Modules/FindSYCL/run_sycl.cmake +++ b/cmake/Modules/FindSYCL/run_sycl.cmake @@ -56,6 +56,12 @@ endforeach() # Choose host flags in FindSYCL.cmake @SYCL_host_flags@ +# Add the /permissive- flag for MSVC builds to overcome ambiguous symbol errors. +if(WIN32) + string(APPEND SYCL_host_compiler_flags "/permissive- ") +endif() + + list(REMOVE_DUPLICATES CMAKE_HOST_FLAGS) foreach(flag ${CMAKE_HOST_FLAGS}) # Extra quotes are added around each flag to help SYCL parse out flags with spaces. 
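Note on the hunk above: /permissive- puts MSVC into standards-conforming mode, which among other things enforces two-phase name lookup in templates; that stricter binding of names is what typically clears up MSVC "ambiguous symbol" diagnostics of the kind the comment mentions (the PR does not show the exact failing code). A small, hypothetical C++ illustration of how the flag changes name lookup, not code from this PR:

// Hypothetical example: accepted under legacy MSVC lookup (/permissive),
// rejected under /permissive- because conformant two-phase lookup applies.
template <typename T>
int call_g(T v) {
  return g(v); // dependent call: g must be visible here, or found via ADL
               // at the point of instantiation
}

int g(int v) { // declared only *after* the template definition
  return v;
}

int main() {
  // Legacy lookup defers resolution to the instantiation point and finds g;
  // under /permissive-, g is not visible at the template definition and int
  // has no associated namespaces for ADL, so this fails to compile.
  return call_g(42);
}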
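The header diffs that follow attach TORCH_XPU_API to every kernel entry point. This is the symbol-visibility half of the fix: once libtorch_xpu is split into several libraries, every symbol referenced across a library boundary must be explicitly exported on Windows, where DLL symbols are hidden by default. A minimal sketch of the usual export/import macro pattern such a macro needs to follow; the guard name TORCH_XPU_BUILD_MAIN_LIB is illustrative, not taken from this PR:

// Sketch of a Windows-aware export/import macro. The library that defines
// the symbols is built with the guard set and dllexports them; every
// consumer sees dllimport. On non-Windows builds, default ELF visibility
// is sufficient.
#if defined(_WIN32)
#  if defined(TORCH_XPU_BUILD_MAIN_LIB)
#    define TORCH_XPU_API __declspec(dllexport) // building the library itself
#  else
#    define TORCH_XPU_API __declspec(dllimport) // linking against it
#  endif
#else
#  define TORCH_XPU_API __attribute__((visibility("default")))
#endif

// A declaration like the ones patched below then resolves correctly across
// DLL boundaries:
//   TORCH_XPU_API void abs_kernel(TensorIteratorBase& iter);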
diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt index 815ad018f..57c4783cd 100644 --- a/src/ATen/CMakeLists.txt +++ b/src/ATen/CMakeLists.txt @@ -1,10 +1,13 @@ # ATen XPU sources -file(GLOB xpu_cpp "xpu/*.cpp" "native/xpu/*.cpp" "native/sparse/*.cpp") +file(GLOB xpu_cpp "xpu/*.cpp") +file(GLOB xpu_native_cpp "native/xpu/*.cpp" "native/sparse/*.cpp") file(GLOB xpu_sycl "native/xpu/sycl/*.cpp") list(APPEND ATen_XPU_CPP_SRCS ${xpu_cpp}) +list(APPEND ATen_XPU_NATIVE_CPP_SRCS ${xpu_native_cpp}) list(APPEND ATen_XPU_SYCL_SRCS ${xpu_sycl}) set(ATen_XPU_CPP_SRCS ${ATen_XPU_CPP_SRCS} PARENT_SCOPE) +set(ATen_XPU_NATIVE_CPP_SRCS ${ATen_XPU_NATIVE_CPP_SRCS} PARENT_SCOPE) set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE) diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp index f32a89a75..3aca0d5c7 100644 --- a/src/ATen/native/xpu/RangeFactories.cpp +++ b/src/ATen/native/xpu/RangeFactories.cpp @@ -29,7 +29,7 @@ Tensor& XPUNativeFunctions::arange_out( TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); TORCH_CHECK( - std::isfinite(xstart) && std::isfinite(xend), + std::isfinite(static_cast(xstart)) && std::isfinite(static_cast(xend)), "unsupported range: ", xstart, " -> ", @@ -97,7 +97,7 @@ Tensor& XPUNativeFunctions::range_out( TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); TORCH_CHECK( - std::isfinite(xstart) && std::isfinite(xend), + std::isfinite(static_cast(xstart)) && std::isfinite(static_cast(xend)), "unsupported range: ", xstart, " -> ", diff --git a/src/ATen/native/xpu/sycl/AbsKernel.cpp b/src/ATen/native/xpu/sycl/AbsKernel.cpp index 596000ecb..3df01baee 100644 --- a/src/ATen/native/xpu/sycl/AbsKernel.cpp +++ b/src/ATen/native/xpu/sycl/AbsKernel.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/AbsKernel.h b/src/ATen/native/xpu/sycl/AbsKernel.h index 3b51e993e..3423c32cf 100644 --- a/src/ATen/native/xpu/sycl/AbsKernel.h +++ b/src/ATen/native/xpu/sycl/AbsKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void abs_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void abs_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp index de1e6447b..57b5e34b3 100644 --- a/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationEluKernels.h b/src/ATen/native/xpu/sycl/ActivationEluKernels.h index 555bfb5f4..a15eaa590 100644 --- a/src/ATen/native/xpu/sycl/ActivationEluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationEluKernels.h @@ -4,13 +4,13 @@ namespace at::native::xpu { -void elu_kernel( +TORCH_XPU_API void elu_kernel( TensorIteratorBase& iter, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale); -void elu_backward_kernel( +TORCH_XPU_API void elu_backward_kernel( TensorIteratorBase& iter, const Scalar& alpha, const Scalar& scale, diff --git a/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp b/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp index 19a5de78c..5bbaa1ab0 100644 --- a/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp +++ b/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ActivationGeluKernel.h 
b/src/ATen/native/xpu/sycl/ActivationGeluKernel.h index 6c373a7cd..660fae9ff 100644 --- a/src/ATen/native/xpu/sycl/ActivationGeluKernel.h +++ b/src/ATen/native/xpu/sycl/ActivationGeluKernel.h @@ -6,9 +6,11 @@ namespace at { namespace native { namespace xpu { -void gelu_kernel(TensorIteratorBase& iter, c10::string_view approximate); +TORCH_XPU_API void gelu_kernel( + TensorIteratorBase& iter, + c10::string_view approximate); -void gelu_backward_kernel( +TORCH_XPU_API void gelu_backward_kernel( TensorIteratorBase& iter, c10::string_view approximate); diff --git a/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp index 056e8c332..f60613ec7 100644 --- a/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp @@ -2,9 +2,12 @@ #include #include +#include #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationGluKernels.h b/src/ATen/native/xpu/sycl/ActivationGluKernels.h index c0fd6e22c..a4b11dbb4 100644 --- a/src/ATen/native/xpu/sycl/ActivationGluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationGluKernels.h @@ -4,9 +4,9 @@ namespace at::native::xpu { -void glu_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void glu_kernel(TensorIteratorBase& iter); -void glu_backward_kernel( +TORCH_XPU_API void glu_backward_kernel( const TensorIteratorBase& iter, int64_t gI_stride, int64_t I_stride); diff --git a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp index d2c53e2b0..6399e6b23 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h index ee5f0e87c..47bfa636c 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void hardsigmoid_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void hardsigmoid_kernel(TensorIteratorBase& iter); -void hardsigmoid_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void hardsigmoid_backward_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp index 40099584a..7dadbf3aa 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.h b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.h index ea4e81fc9..6b741e744 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.h @@ -6,9 +6,9 @@ namespace at { namespace native { namespace xpu { -void hardswish_kernel(TensorIterator& iter); +TORCH_XPU_API void hardswish_kernel(TensorIterator& iter); -void hardswish_backward_kernel(TensorIterator& iter); +TORCH_XPU_API void hardswish_backward_kernel(TensorIterator& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp 
b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp index f712b951d..2f009757b 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h index c2d110bae..745c0f69b 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -void hardtanh_backward_kernel( +TORCH_XPU_API void hardtanh_backward_kernel( TensorIterator& iter, const Scalar& min, const Scalar& max); diff --git a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp index 9da5a275b..81e2cb5ba 100644 --- a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp @@ -6,6 +6,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h index 758701ce9..5656c661e 100644 --- a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h @@ -4,9 +4,11 @@ namespace at::native::xpu { -void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_); +TORCH_XPU_API void leaky_relu_kernel( + TensorIteratorBase& iter, + const Scalar& negval_); -void leaky_relu_backward_kernel( +TORCH_XPU_API void leaky_relu_backward_kernel( TensorIteratorBase& iter, const Scalar& negval_); diff --git a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp index a61694d32..09487462e 100644 --- a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp @@ -6,6 +6,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h index 1cc6c2e44..be8e7266c 100644 --- a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void log_sigmoid_forward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log_sigmoid_forward_kernel(TensorIteratorBase& iter); -void log_sigmoid_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log_sigmoid_backward_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp b/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp index fe0154326..5e517911b 100644 --- a/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp @@ -8,6 +8,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationMishKernels.h b/src/ATen/native/xpu/sycl/ActivationMishKernels.h index bd68197cd..a2c7e499d 100644 --- a/src/ATen/native/xpu/sycl/ActivationMishKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationMishKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void mish_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void mish_kernel(TensorIteratorBase& iter); -void 
mish_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void mish_backward_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationPreluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationPreluKernels.cpp index 8a2c7012f..9ec4e598b 100644 --- a/src/ATen/native/xpu/sycl/ActivationPreluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationPreluKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template @@ -40,4 +42,4 @@ void prelu_backward_kernel(TensorIterator& iter) { }); } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationPreluKernels.h b/src/ATen/native/xpu/sycl/ActivationPreluKernels.h index 23f60c242..4d33e9064 100644 --- a/src/ATen/native/xpu/sycl/ActivationPreluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationPreluKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void prelu_kernel(TensorIterator& iter); +TORCH_XPU_API void prelu_kernel(TensorIterator& iter); -void prelu_backward_kernel(TensorIterator& iter); +TORCH_XPU_API void prelu_backward_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp index 093a9a79b..924b75d81 100644 --- a/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationSiluKernels.h b/src/ATen/native/xpu/sycl/ActivationSiluKernels.h index c31acdc9a..eaec67986 100644 --- a/src/ATen/native/xpu/sycl/ActivationSiluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationSiluKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void silu_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void silu_kernel(TensorIteratorBase& iter); -void silu_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void silu_backward_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.cpp b/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.cpp index 99b1c5716..8dc3684e3 100644 --- a/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h b/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h index 8a5e5ef2b..a9b6027bc 100644 --- a/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void softplus_kernel( +TORCH_XPU_API void softplus_kernel( TensorIteratorBase& iter, const Scalar& beta_, const Scalar& threshold_); -void softplus_backward_kernel( +TORCH_XPU_API void softplus_backward_kernel( TensorIteratorBase& iter, const Scalar& beta_, const Scalar& threshold_); diff --git a/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp b/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp index 4393576e9..b96ab461e 100644 --- a/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h 
b/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h index 481d1e5a1..398e3f4a4 100644 --- a/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h @@ -4,8 +4,12 @@ namespace at::native::xpu { -void softshrink_kernel(TensorIteratorBase& iter, const Scalar& value); +TORCH_XPU_API void softshrink_kernel( + TensorIteratorBase& iter, + const Scalar& value); -void softshrink_backward_kernel(TensorIteratorBase& iter, const Scalar& value); +TORCH_XPU_API void softshrink_backward_kernel( + TensorIteratorBase& iter, + const Scalar& value); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp index 055aa262d..175db7753 100644 --- a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp +++ b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.h b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.h index c22a6b18c..df38c6d3d 100644 --- a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.h +++ b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -void threshold_kernel( +TORCH_XPU_API void threshold_kernel( TensorIteratorBase& iter, const Scalar& threshold, const Scalar& value); diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp index 4d7ef286d..0221ceec0 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at::native::xpu { using namespace at::xpu; diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h index 9b6d9a046..0d58a6e7f 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void adaptive_avg_pool2d_backward_kernel( +TORCH_XPU_API void adaptive_avg_pool2d_backward_kernel( Tensor& gradInput, const Tensor& gradOutput, const Tensor& input); -void adaptive_avg_pool2d_kernel( +TORCH_XPU_API void adaptive_avg_pool2d_kernel( Tensor& output, const Tensor& input, IntArrayRef output_size); diff --git a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp index 9da231b0d..cb2c6b083 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp @@ -12,6 +12,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h index 028fed1d2..abba4e354 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h +++ b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h @@ -4,13 +4,13 @@ namespace at::native::xpu { -void adaptive_max_pool2d_kernel( +TORCH_XPU_API void adaptive_max_pool2d_kernel( const Tensor& input, IntArrayRef output_size, Tensor& output, Tensor& indices); -void adaptive_max_pool2d_backward_kernel( +TORCH_XPU_API void adaptive_max_pool2d_backward_kernel( const Tensor& grad_output, const Tensor& input, 
const Tensor& indices, diff --git a/src/ATen/native/xpu/sycl/AmpKernels.cpp b/src/ATen/native/xpu/sycl/AmpKernels.cpp index 87354eea9..137ae3902 100644 --- a/src/ATen/native/xpu/sycl/AmpKernels.cpp +++ b/src/ATen/native/xpu/sycl/AmpKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/AmpKernels.h b/src/ATen/native/xpu/sycl/AmpKernels.h index 4bf863597..4f828c1f2 100644 --- a/src/ATen/native/xpu/sycl/AmpKernels.h +++ b/src/ATen/native/xpu/sycl/AmpKernels.h @@ -3,17 +3,17 @@ namespace at::native::xpu { -void amp_non_finite_check_and_unscale_kernel( +TORCH_XPU_API void amp_non_finite_check_and_unscale_kernel( Tensor& scaled_grad, Tensor& found_inf, const Tensor& inv_scale); -void amp_foreach_non_finite_check_and_unscale_kernel( +TORCH_XPU_API void amp_foreach_non_finite_check_and_unscale_kernel( std::vector> scaled_grads, Tensor& found_inf, const Tensor& inv_scale); -Tensor& amp_update_scale_kernel( +TORCH_XPU_API Tensor& amp_update_scale_kernel( Tensor& current_scale, Tensor& growth_tracker, const Tensor& found_inf, diff --git a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp index 4e2328641..7373935aa 100644 --- a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at::native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/AveragePool2dKernels.h b/src/ATen/native/xpu/sycl/AveragePool2dKernels.h index 75ac83a4e..84842355d 100644 --- a/src/ATen/native/xpu/sycl/AveragePool2dKernels.h +++ b/src/ATen/native/xpu/sycl/AveragePool2dKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void avg_pool2d_kernel( +TORCH_XPU_API void avg_pool2d_kernel( const Tensor& input_, IntArrayRef kernel_size, IntArrayRef stride, @@ -14,7 +14,7 @@ void avg_pool2d_kernel( c10::optional divisor_override, Tensor& output); -void avg_pool2d_backward_kernel( +TORCH_XPU_API void avg_pool2d_backward_kernel( const Tensor& gradOutput_, const Tensor& input_, IntArrayRef kernel_size, diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp index 25acd873a..084a9b65f 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp @@ -13,6 +13,14 @@ #include #include +#include + +#ifdef _WIN32 +#define RESTRICT __restrict +#else +#define RESTRICT __restrict__ +#endif + namespace at { namespace native { namespace xpu { @@ -1008,9 +1016,9 @@ struct BatchNormCollectStatisticsChannelsLastKernelFunctor } BatchNormCollectStatisticsChannelsLastKernelFunctor( - const scalar_t* __restrict__ input, - accscalar_t* __restrict__ out_mean, - accscalar_t* __restrict__ out_invstd, + const scalar_t* RESTRICT input, + accscalar_t* RESTRICT out_mean, + accscalar_t* RESTRICT out_invstd, volatile accscalar_t* staging_data, int* semaphores, const int reduction_size, @@ -1028,9 +1036,9 @@ struct BatchNormCollectStatisticsChannelsLastKernelFunctor wg_size_(wg_size) {} private: - const scalar_t* __restrict__ input_; - accscalar_t* __restrict__ out_mean_; - accscalar_t* __restrict__ out_invstd_; + const scalar_t* RESTRICT input_; + accscalar_t* RESTRICT out_mean_; + accscalar_t* RESTRICT out_invstd_; volatile accscalar_t* staging_data_; int* semaphores_; const int reduction_size_; @@ -1375,13 +1383,13 @@ struct BatchNormTransformInputChannelsLastKernelFunctor { } BatchNormTransformInputChannelsLastKernelFunctor( 
- const scalar_t* __restrict__ input, - const scalar_t* __restrict__ z, - const accscalar_t* __restrict__ mean, - const accscalar_t* __restrict__ inv_std, - const layerscalar_t* __restrict__ weight, - const layerscalar_t* __restrict__ shift, - scalar_t* __restrict__ out, + const scalar_t* RESTRICT input, + const scalar_t* RESTRICT z, + const accscalar_t* RESTRICT mean, + const accscalar_t* RESTRICT inv_std, + const layerscalar_t* RESTRICT weight, + const layerscalar_t* RESTRICT shift, + scalar_t* RESTRICT out, const int reduction_size, const int stride, const bool fuse_relu) @@ -1397,13 +1405,13 @@ struct BatchNormTransformInputChannelsLastKernelFunctor { fuse_relu_(fuse_relu) {} private: - const scalar_t* __restrict__ input_; - const scalar_t* __restrict__ z_; - const accscalar_t* __restrict__ mean_; - const accscalar_t* __restrict__ inv_std_; - const layerscalar_t* __restrict__ weight_; - const layerscalar_t* __restrict__ shift_; - scalar_t* __restrict__ out_; + const scalar_t* RESTRICT input_; + const scalar_t* RESTRICT z_; + const accscalar_t* RESTRICT mean_; + const accscalar_t* RESTRICT inv_std_; + const layerscalar_t* RESTRICT weight_; + const layerscalar_t* RESTRICT shift_; + scalar_t* RESTRICT out_; const int reduction_size_; const int stride_; const bool fuse_relu_; @@ -2034,14 +2042,14 @@ struct BatchNormBackwardReduceChannelsLastKernelFunctor } BatchNormBackwardReduceChannelsLastKernelFunctor( - const scalar_t* __restrict__ input, - const scalar_t* __restrict__ grad_output, - const accscalar_t* __restrict__ mean, - const accscalar_t* __restrict__ inv_std, - accscalar_t* __restrict__ sum_dy_o, - accscalar_t* __restrict__ sum_dy_xmu_o, - layerscalar_t* __restrict__ grad_weight, - layerscalar_t* __restrict__ grad_bias, + const scalar_t* RESTRICT input, + const scalar_t* RESTRICT grad_output, + const accscalar_t* RESTRICT mean, + const accscalar_t* RESTRICT inv_std, + accscalar_t* RESTRICT sum_dy_o, + accscalar_t* RESTRICT sum_dy_xmu_o, + layerscalar_t* RESTRICT grad_weight, + layerscalar_t* RESTRICT grad_bias, volatile accscalar_t* staging_data, int* semaphores, const int reduction_size, @@ -2062,14 +2070,14 @@ struct BatchNormBackwardReduceChannelsLastKernelFunctor wg_size_(wg_size) {} private: - const scalar_t* __restrict__ input_; - const scalar_t* __restrict__ grad_output_; - const accscalar_t* __restrict__ mean_; - const accscalar_t* __restrict__ inv_std_; - accscalar_t* __restrict__ sum_dy_o_; - accscalar_t* __restrict__ sum_dy_xmu_o_; - layerscalar_t* __restrict__ grad_weight_; - layerscalar_t* __restrict__ grad_bias_; + const scalar_t* RESTRICT input_; + const scalar_t* RESTRICT grad_output_; + const accscalar_t* RESTRICT mean_; + const accscalar_t* RESTRICT inv_std_; + accscalar_t* RESTRICT sum_dy_o_; + accscalar_t* RESTRICT sum_dy_xmu_o_; + layerscalar_t* RESTRICT grad_weight_; + layerscalar_t* RESTRICT grad_bias_; volatile accscalar_t* staging_data_; int* semaphores_; const int reduction_size_; @@ -2395,7 +2403,7 @@ struct BatchNormBackwardElemtKernelFunctor { GenericPackedTensorAccessor grad_input, const stat_accscalar_t norm_fct, - const int* __restrict__ numel = nullptr, + const int* RESTRICT numel = nullptr, const int world_size = 0) : input_(input), grad_output_(grad_output), @@ -2451,7 +2459,7 @@ struct BatchNormBackwardElemtKernelFunctor { GenericPackedTensorAccessor grad_input_; const stat_accscalar_t norm_fct_; - const int* __restrict__ numel_; + const int* RESTRICT numel_; const int world_size_; }; @@ -2680,18 +2688,18 @@ struct 
BatchNormBackwardElemtChannelsLastKernelFunctor { } BatchNormBackwardElemtChannelsLastKernelFunctor( - const scalar_t* __restrict__ grad_output, - const scalar_t* __restrict__ input, - const accscalar_t* __restrict__ mean, - const accscalar_t* __restrict__ inv_std, - const layerscalar_t* __restrict__ weight, - const accscalar_t* __restrict__ sum_dy, - const accscalar_t* __restrict__ sum_dy_xmu, - scalar_t* __restrict__ grad_input, + const scalar_t* RESTRICT grad_output, + const scalar_t* RESTRICT input, + const accscalar_t* RESTRICT mean, + const accscalar_t* RESTRICT inv_std, + const layerscalar_t* RESTRICT weight, + const accscalar_t* RESTRICT sum_dy, + const accscalar_t* RESTRICT sum_dy_xmu, + scalar_t* RESTRICT grad_input, const accscalar_t norm_fct, const int reduction_size, const int stride, - const int* __restrict__ numel = nullptr, + const int* RESTRICT numel = nullptr, const int64_t world_size = 0) : grad_output_(grad_output), input_(input), @@ -2708,18 +2716,18 @@ struct BatchNormBackwardElemtChannelsLastKernelFunctor { world_size_(world_size) {} private: - const scalar_t* __restrict__ grad_output_; - const scalar_t* __restrict__ input_; - const accscalar_t* __restrict__ mean_; - const accscalar_t* __restrict__ inv_std_; - const layerscalar_t* __restrict__ weight_; - const accscalar_t* __restrict__ sum_dy_; - const accscalar_t* __restrict__ sum_dy_xmu_; - scalar_t* __restrict__ grad_input_; + const scalar_t* RESTRICT grad_output_; + const scalar_t* RESTRICT input_; + const accscalar_t* RESTRICT mean_; + const accscalar_t* RESTRICT inv_std_; + const layerscalar_t* RESTRICT weight_; + const accscalar_t* RESTRICT sum_dy_; + const accscalar_t* RESTRICT sum_dy_xmu_; + scalar_t* RESTRICT grad_input_; const accscalar_t norm_fct_; const int reduction_size_; const int stride_; - const int* __restrict__ numel_; + const int* RESTRICT numel_; const int64_t world_size_; }; diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.h b/src/ATen/native/xpu/sycl/BatchNormKernels.h index 3bc559b38..9d9ae780b 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.h +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.h @@ -6,11 +6,11 @@ namespace at { namespace native { namespace xpu { -std::tuple batch_norm_stats_kernel( +TORCH_XPU_API std::tuple batch_norm_stats_kernel( const Tensor& self, double epsilon); -void batch_norm_elemt_kernel( +TORCH_XPU_API void batch_norm_elemt_kernel( Tensor& out, const Tensor& self, const c10::optional& weight_opt, @@ -18,7 +18,8 @@ void batch_norm_elemt_kernel( const Tensor& mean_, const Tensor& invstd_); -std::tuple batch_norm_backward_reduce_kernel( +TORCH_XPU_API std::tuple +batch_norm_backward_reduce_kernel( const Tensor& grad_output, const Tensor& input, const Tensor& mean, @@ -28,7 +29,7 @@ std::tuple batch_norm_backward_reduce_kernel( bool weight_g, bool bias_g); -Tensor batch_norm_backward_elemt_kernel( +TORCH_XPU_API Tensor batch_norm_backward_elemt_kernel( const Tensor& self, const Tensor& input, const Tensor& mean, @@ -38,13 +39,13 @@ Tensor batch_norm_backward_elemt_kernel( const Tensor& sum_dy_xmu, const Tensor& count); -std::tuple batch_norm_update_stats_kernel( +TORCH_XPU_API std::tuple batch_norm_update_stats_kernel( const Tensor& self, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, double momentum); -std::tuple batch_norm_kernel( +TORCH_XPU_API std::tuple batch_norm_kernel( const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, @@ -57,7 +58,7 @@ std::tuple batch_norm_kernel( Tensor& save_mean, 
Tensor& save_invstd); -std::tuple batch_norm_backward_kernel( +TORCH_XPU_API std::tuple batch_norm_backward_kernel( const Tensor& grad_out, const Tensor& input, const c10::optional& weight_opt, diff --git a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp index 7e2906e8b..34e50222e 100644 --- a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h index 4247c0a79..f9712b85d 100644 --- a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h @@ -6,11 +6,11 @@ namespace at { namespace native { namespace xpu { -void bitwise_and_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void bitwise_and_kernel(TensorIteratorBase& iter); -void bitwise_or_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void bitwise_or_kernel(TensorIteratorBase& iter); -void bitwise_xor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void bitwise_xor_kernel(TensorIteratorBase& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/BinaryDivTrueKernel.cpp b/src/ATen/native/xpu/sycl/BinaryDivTrueKernel.cpp index e10d25dd5..6f35c0cb6 100644 --- a/src/ATen/native/xpu/sycl/BinaryDivTrueKernel.cpp +++ b/src/ATen/native/xpu/sycl/BinaryDivTrueKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { void div_true_kernel(TensorIteratorBase& iter) { diff --git a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp index e170760e8..3224ba54f 100644 --- a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp @@ -3,6 +3,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h index 588d52c4f..61af8b511 100644 --- a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void atan2_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void atan2_kernel(TensorIteratorBase& iter); -void hypot_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void hypot_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryKernels.cpp b/src/ATen/native/xpu/sycl/BinaryKernels.cpp index 327eb9486..2902486bf 100644 --- a/src/ATen/native/xpu/sycl/BinaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/BinaryKernels.h b/src/ATen/native/xpu/sycl/BinaryKernels.h index 78dfaa0da..f862595eb 100644 --- a/src/ATen/native/xpu/sycl/BinaryKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryKernels.h @@ -4,16 +4,16 @@ namespace at::native::xpu { -void add_kernel(TensorIteratorBase& iter, const Scalar& alpha); +TORCH_XPU_API void add_kernel(TensorIteratorBase& iter, const Scalar& alpha); -void sub_kernel(TensorIteratorBase& iter, const Scalar& alpha); +TORCH_XPU_API void sub_kernel(TensorIteratorBase& iter, const Scalar& alpha); -void mul_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void mul_kernel(TensorIteratorBase& iter); 
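Aside on the RESTRICT macro introduced in BatchNormKernels.cpp above: it exists because MSVC spells the no-alias extension __restrict, while GCC/Clang-style compilers (including icpx) spell it __restrict__. A self-contained sketch of the same pattern and of what the qualifier buys; the scale() function is an illustrative stand-in, not code from this PR:

#include <cstddef>

// Portable restrict qualifier, mirroring the macro added in
// BatchNormKernels.cpp: MSVC accepts __restrict, GCC/Clang accept
// __restrict__.
#ifdef _WIN32
#define RESTRICT __restrict
#else
#define RESTRICT __restrict__
#endif

// RESTRICT promises the compiler that out and in never alias, letting it
// vectorize the loop without emitting runtime overlap checks.
void scale(float* RESTRICT out, const float* RESTRICT in, float s, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = s * in[i];
  }
}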
-void div_true_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void div_true_kernel(TensorIteratorBase& iter); -void div_trunc_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void div_trunc_kernel(TensorIteratorBase& iter); -void div_floor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void div_floor_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp index 23146b47d..18bcffca6 100644 --- a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h index ee641d9fb..82271e6a1 100644 --- a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void logical_and_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logical_and_kernel(TensorIteratorBase& iter); -void logical_or_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logical_or_kernel(TensorIteratorBase& iter); -void logical_xor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logical_xor_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp index 667236123..a7a676675 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.h index 37553191d..8bea1f44c 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.h @@ -4,10 +4,12 @@ namespace at::native::xpu { -void sigmoid_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sigmoid_backward_kernel(TensorIteratorBase& iter); -void tanh_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void tanh_backward_kernel(TensorIteratorBase& iter); -void logit_backward_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar); +TORCH_XPU_API void logit_backward_kernel( + TensorIteratorBase& iter, + const Scalar& eps_scalar); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp index 5ac71c163..3e0989b24 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h index 17672ec29..ffe08be3e 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void mse_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void mse_kernel(TensorIteratorBase& iter); -void smooth_l1_kernel(TensorIteratorBase& iter, double beta); +TORCH_XPU_API void smooth_l1_kernel(TensorIteratorBase& iter, double beta); -void huber_kernel(TensorIterator& iter, double delta); +TORCH_XPU_API void 
huber_kernel(TensorIterator& iter, double delta); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp index 4f06a94fc..9a5320c68 100644 --- a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp +++ b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp @@ -3,6 +3,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.h b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.h index 1d400951f..f3d3e2390 100644 --- a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.h +++ b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.h @@ -6,9 +6,9 @@ namespace at { namespace native { namespace xpu { -void remainder_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void remainder_kernel(TensorIteratorBase& iter); -void fmod_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void fmod_kernel(TensorIteratorBase& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.cpp index 64adba17e..1a0027d2f 100644 --- a/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template @@ -45,4 +47,4 @@ void rshift_kernel(TensorIteratorBase& iter) { }); } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.h index 7e661d919..64ded1705 100644 --- a/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void lshift_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void lshift_kernel(TensorIteratorBase& iter); -void rshift_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void rshift_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BucketizationKernels.cpp b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp index d56eff222..c5686babb 100644 --- a/src/ATen/native/xpu/sycl/BucketizationKernels.cpp +++ b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { // customized lower_bound func to ensure the low bound of 'nan', 'inf' etc. 
be diff --git a/src/ATen/native/xpu/sycl/BucketizationKernels.h b/src/ATen/native/xpu/sycl/BucketizationKernels.h index f47cea2af..1c86ece08 100644 --- a/src/ATen/native/xpu/sycl/BucketizationKernels.h +++ b/src/ATen/native/xpu/sycl/BucketizationKernels.h @@ -2,11 +2,13 @@ #include namespace at::native::xpu { -void searchsorted_kernel( + +TORCH_XPU_API void searchsorted_kernel( Tensor& result, const Tensor& input, const Tensor& sorted_sequence, bool out_int32, bool right, const Tensor& sorter); -} // namespace at::native::xpu \ No newline at end of file + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/Col2ImKernel.cpp b/src/ATen/native/xpu/sycl/Col2ImKernel.cpp index 299711162..1f9321692 100644 --- a/src/ATen/native/xpu/sycl/Col2ImKernel.cpp +++ b/src/ATen/native/xpu/sycl/Col2ImKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/Col2ImKernel.h b/src/ATen/native/xpu/sycl/Col2ImKernel.h index db220a6db..a910112e0 100644 --- a/src/ATen/native/xpu/sycl/Col2ImKernel.h +++ b/src/ATen/native/xpu/sycl/Col2ImKernel.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void col2im_kernel( +TORCH_XPU_API void col2im_kernel( Tensor& output, const Tensor& input_, IntArrayRef output_size, diff --git a/src/ATen/native/xpu/sycl/CompareKernels.cpp b/src/ATen/native/xpu/sycl/CompareKernels.cpp index b9af47cb7..1096dfa40 100644 --- a/src/ATen/native/xpu/sycl/CompareKernels.cpp +++ b/src/ATen/native/xpu/sycl/CompareKernels.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/CompareKernels.h b/src/ATen/native/xpu/sycl/CompareKernels.h index d68827631..52cbc151c 100644 --- a/src/ATen/native/xpu/sycl/CompareKernels.h +++ b/src/ATen/native/xpu/sycl/CompareKernels.h @@ -6,17 +6,17 @@ namespace at { namespace native { namespace xpu { -void eq_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void eq_kernel(TensorIteratorBase& iter); -void ne_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void ne_kernel(TensorIteratorBase& iter); -void lt_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void lt_kernel(TensorIteratorBase& iter); -void le_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void le_kernel(TensorIteratorBase& iter); -void gt_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void gt_kernel(TensorIteratorBase& iter); -void ge_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void ge_kernel(TensorIteratorBase& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.cpp b/src/ATen/native/xpu/sycl/ComplexKernels.cpp index 87504bd5e..4e44d9064 100644 --- a/src/ATen/native/xpu/sycl/ComplexKernels.cpp +++ b/src/ATen/native/xpu/sycl/ComplexKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.h b/src/ATen/native/xpu/sycl/ComplexKernels.h index d51556b4f..900bbaca6 100644 --- a/src/ATen/native/xpu/sycl/ComplexKernels.h +++ b/src/ATen/native/xpu/sycl/ComplexKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void complex_kernel(TensorIterator& iter); +TORCH_XPU_API void complex_kernel(TensorIterator& iter); -void polar_kernel(TensorIterator& iter); +TORCH_XPU_API void polar_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp index c2dde5056..bdddd3f44 100644 --- a/src/ATen/native/xpu/sycl/CopyKernel.cpp +++ 
b/src/ATen/native/xpu/sycl/CopyKernel.cpp @@ -7,6 +7,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/CopyKernel.h b/src/ATen/native/xpu/sycl/CopyKernel.h index d02a1a45b..30232b27a 100644 --- a/src/ATen/native/xpu/sycl/CopyKernel.h +++ b/src/ATen/native/xpu/sycl/CopyKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void copy_kernel(TensorIterator& iter); +TORCH_XPU_API void copy_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/CopysignKernel.cpp b/src/ATen/native/xpu/sycl/CopysignKernel.cpp index 3b8351abf..327828adc 100644 --- a/src/ATen/native/xpu/sycl/CopysignKernel.cpp +++ b/src/ATen/native/xpu/sycl/CopysignKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/CopysignKernel.h b/src/ATen/native/xpu/sycl/CopysignKernel.h index cf856728c..86b6a40d9 100644 --- a/src/ATen/native/xpu/sycl/CopysignKernel.h +++ b/src/ATen/native/xpu/sycl/CopysignKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void copysign_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void copysign_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/CrossKernel.cpp b/src/ATen/native/xpu/sycl/CrossKernel.cpp index 0ed5f11eb..313261c9d 100644 --- a/src/ATen/native/xpu/sycl/CrossKernel.cpp +++ b/src/ATen/native/xpu/sycl/CrossKernel.cpp @@ -4,6 +4,8 @@ #include #include +#include + namespace at::native::xpu { template struct CrossKernelFunctor { diff --git a/src/ATen/native/xpu/sycl/CrossKernel.h b/src/ATen/native/xpu/sycl/CrossKernel.h index cc77229c4..8d382b97e 100644 --- a/src/ATen/native/xpu/sycl/CrossKernel.h +++ b/src/ATen/native/xpu/sycl/CrossKernel.h @@ -1,9 +1,12 @@ #pragma once #include + namespace at::native::xpu { -void linalg_cross_kernel( + +TORCH_XPU_API void linalg_cross_kernel( const Tensor& result, const Tensor& x1, const Tensor& x2, int64_t dim); + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/CumprodKernel.cpp b/src/ATen/native/xpu/sycl/CumprodKernel.cpp index f35cc7979..375e6a80c 100644 --- a/src/ATen/native/xpu/sycl/CumprodKernel.cpp +++ b/src/ATen/native/xpu/sycl/CumprodKernel.cpp @@ -3,6 +3,9 @@ #include +#include +#include + namespace at::native::xpu { void launch_cumprod_kernel( diff --git a/src/ATen/native/xpu/sycl/CumprodKernel.h b/src/ATen/native/xpu/sycl/CumprodKernel.h index 71e8c7693..d78301f04 100644 --- a/src/ATen/native/xpu/sycl/CumprodKernel.h +++ b/src/ATen/native/xpu/sycl/CumprodKernel.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void launch_cumprod_kernel( +TORCH_XPU_API void launch_cumprod_kernel( const Tensor& result, const Tensor& self, int64_t dim); diff --git a/src/ATen/native/xpu/sycl/CumsumKernel.cpp b/src/ATen/native/xpu/sycl/CumsumKernel.cpp index c2754869b..0523db84b 100644 --- a/src/ATen/native/xpu/sycl/CumsumKernel.cpp +++ b/src/ATen/native/xpu/sycl/CumsumKernel.cpp @@ -3,6 +3,9 @@ #include +#include +#include + namespace at::native::xpu { void launch_cumsum_kernel( diff --git a/src/ATen/native/xpu/sycl/CumsumKernel.h b/src/ATen/native/xpu/sycl/CumsumKernel.h index 79c299608..77ce62b61 100644 --- a/src/ATen/native/xpu/sycl/CumsumKernel.h +++ b/src/ATen/native/xpu/sycl/CumsumKernel.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void launch_cumsum_kernel( +TORCH_XPU_API void launch_cumsum_kernel( const Tensor& result, const Tensor& self, int64_t dim); diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp 
b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index eab6f4c48..8c130a0b9 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -15,6 +15,8 @@ #include #include +#include + namespace at::native::xpu { static inline int p_start( diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h index de550f4be..9d827c642 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void max_pool2d_with_indices_kernel( +TORCH_XPU_API void max_pool2d_with_indices_kernel( const Tensor& input, IntArrayRef kernel_size, IntArrayRef stride, @@ -14,7 +14,7 @@ void max_pool2d_with_indices_kernel( Tensor& output, Tensor& indices); -Tensor& max_pool2d_with_indices_backward_kernel( +TORCH_XPU_API Tensor& max_pool2d_with_indices_backward_kernel( Tensor& gradInput, const Tensor& gradOutput, const Tensor& input, diff --git a/src/ATen/native/xpu/sycl/DistanceKernels.cpp b/src/ATen/native/xpu/sycl/DistanceKernels.cpp index eb0f1f50e..446466a3e 100644 --- a/src/ATen/native/xpu/sycl/DistanceKernels.cpp +++ b/src/ATen/native/xpu/sycl/DistanceKernels.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { template static double device_sqrt(scalar_t val) { diff --git a/src/ATen/native/xpu/sycl/DistanceKernels.h b/src/ATen/native/xpu/sycl/DistanceKernels.h index 446a6b7c2..a53b84cdf 100644 --- a/src/ATen/native/xpu/sycl/DistanceKernels.h +++ b/src/ATen/native/xpu/sycl/DistanceKernels.h @@ -2,9 +2,11 @@ #include namespace at::native::xpu { -void cdist_kernel( + +TORCH_XPU_API void cdist_kernel( Tensor& result, const Tensor& x1_expanded, const Tensor& x2_expanded, double p); + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp b/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp index ab7bc609c..1a01a7cfc 100644 --- a/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp +++ b/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp b/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp index 4c1b83689..cfc3e1dfa 100644 --- a/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp +++ b/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at::native::xpu { void exponential_kernel( diff --git a/src/ATen/native/xpu/sycl/DistributionKernels.h b/src/ATen/native/xpu/sycl/DistributionKernels.h index e5700f26e..88d62933f 100644 --- a/src/ATen/native/xpu/sycl/DistributionKernels.h +++ b/src/ATen/native/xpu/sycl/DistributionKernels.h @@ -4,41 +4,43 @@ namespace at::native::xpu { -void random_from_to_kernel( +TORCH_XPU_API void random_from_to_kernel( TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen_); -void random_full_64_bits_range_kernel( +TORCH_XPU_API void random_full_64_bits_range_kernel( TensorIteratorBase& iter, c10::optional gen_); -void random_kernel(TensorIteratorBase& iter, c10::optional gen_); +TORCH_XPU_API void random_kernel( + TensorIteratorBase& iter, + c10::optional gen_); -void uniform_kernel( +TORCH_XPU_API void uniform_kernel( TensorIteratorBase& iter, double from, double to, c10::optional gen); -void normal_kernel( +TORCH_XPU_API void normal_kernel( const TensorBase& self, double mean, double std, 
c10::optional gen); -void bernoulli_tensor_kernel( +TORCH_XPU_API void bernoulli_tensor_kernel( const TensorBase& self, const TensorBase& p_, c10::optional gen_); -void bernoulli_scalar_kernel( +TORCH_XPU_API void bernoulli_scalar_kernel( const TensorBase& self, double p, c10::optional gen); -void exponential_kernel( +TORCH_XPU_API void exponential_kernel( TensorIteratorBase& iter, double lambda, c10::optional gen); diff --git a/src/ATen/native/xpu/sycl/DistributionNormal.cpp b/src/ATen/native/xpu/sycl/DistributionNormal.cpp index 41b7c6df5..93b938210 100644 --- a/src/ATen/native/xpu/sycl/DistributionNormal.cpp +++ b/src/ATen/native/xpu/sycl/DistributionNormal.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp b/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp index 138d81454..3c15de1d9 100644 --- a/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp +++ b/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/DistributionUniform.cpp b/src/ATen/native/xpu/sycl/DistributionUniform.cpp index 91261e18d..c38626909 100644 --- a/src/ATen/native/xpu/sycl/DistributionUniform.cpp +++ b/src/ATen/native/xpu/sycl/DistributionUniform.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/Dropout.cpp b/src/ATen/native/xpu/sycl/Dropout.cpp index 3886a2a72..ddaee4d4c 100644 --- a/src/ATen/native/xpu/sycl/Dropout.cpp +++ b/src/ATen/native/xpu/sycl/Dropout.cpp @@ -10,6 +10,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/DropoutKernels.h b/src/ATen/native/xpu/sycl/DropoutKernels.h index a72e8e2f8..922b2838c 100644 --- a/src/ATen/native/xpu/sycl/DropoutKernels.h +++ b/src/ATen/native/xpu/sycl/DropoutKernels.h @@ -6,15 +6,13 @@ namespace at { namespace native { namespace xpu { -std::tuple dropout_kernel( +TORCH_XPU_API std::tuple dropout_kernel( const Tensor& self, double p, c10::optional train); -Tensor dropout_backward_kernel( - const Tensor& grad, - const Tensor& mask, - double scale); +TORCH_XPU_API Tensor +dropout_backward_kernel(const Tensor& grad, const Tensor& mask, double scale); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/Embedding.cpp b/src/ATen/native/xpu/sycl/Embedding.cpp index b75494ca2..d905a4d97 100644 --- a/src/ATen/native/xpu/sycl/Embedding.cpp +++ b/src/ATen/native/xpu/sycl/Embedding.cpp @@ -4,6 +4,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp index 0c7338d5d..f4f35061f 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp +++ b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { std::pair promoteIndicesAndOffsets( diff --git a/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h b/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h index 047981b57..97ad6f0d0 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h +++ b/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -std::tuple _embedding_bag_kernel( +TORCH_XPU_API std::tuple _embedding_bag_kernel( const Tensor& weight_t, const Tensor& indices_t, const Tensor& 
offsets_t, diff --git a/src/ATen/native/xpu/sycl/EmbeddingKernels.h b/src/ATen/native/xpu/sycl/EmbeddingKernels.h index 5aaa3257a..f2e75b37d 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingKernels.h +++ b/src/ATen/native/xpu/sycl/EmbeddingKernels.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -Tensor embedding_dense_backward_kernel( +TORCH_XPU_API Tensor embedding_dense_backward_kernel( const Tensor& grad_, const Tensor& indices_, int64_t num_weights, diff --git a/src/ATen/native/xpu/sycl/FillKernel.cpp b/src/ATen/native/xpu/sycl/FillKernel.cpp index a0ee93b81..22e845364 100644 --- a/src/ATen/native/xpu/sycl/FillKernel.cpp +++ b/src/ATen/native/xpu/sycl/FillKernel.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/FillKernel.h b/src/ATen/native/xpu/sycl/FillKernel.h index 82bdd5f39..11c813cc0 100644 --- a/src/ATen/native/xpu/sycl/FillKernel.h +++ b/src/ATen/native/xpu/sycl/FillKernel.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -void fill_kernel(TensorIterator& iter, const Scalar& scalar); +TORCH_XPU_API void fill_kernel(TensorIterator& iter, const Scalar& scalar); } } // namespace native diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h b/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h index 56afbf630..d4448ca1b 100644 --- a/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h @@ -31,11 +31,11 @@ namespace at::native::xpu { std::vector FOREACH_BINARY_LIST_ALPHA_KERNEL_NAME(NAME)( \ TensorList tensor1, TensorList tensor2, const Scalar& alpha) -FOREACH_BINARY_LIST_ALPHA_INPLACE_KERNEL(add); -FOREACH_BINARY_LIST_ALPHA_KERNEL(add); -FOREACH_BINARY_LIST_INPLACE_KERNEL(mul); -FOREACH_BINARY_LIST_KERNEL(mul); -FOREACH_BINARY_LIST_INPLACE_KERNEL(div); -FOREACH_BINARY_LIST_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_LIST_ALPHA_INPLACE_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_LIST_ALPHA_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_LIST_INPLACE_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_LIST_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_LIST_INPLACE_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_LIST_KERNEL(div); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h index 1f3ab4afc..4aac91b47 100644 --- a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h @@ -17,11 +17,11 @@ namespace at::native::xpu { std::vector FOREACH_BINARY_SCALAR_KERNEL_NAME(NAME)( \ TensorList tensors, const Scalar& scalar) -FOREACH_BINARY_SCALAR_INPLACE_KERNEL(add); -FOREACH_BINARY_SCALAR_KERNEL(add); -FOREACH_BINARY_SCALAR_INPLACE_KERNEL(mul); -FOREACH_BINARY_SCALAR_KERNEL(mul); -FOREACH_BINARY_SCALAR_INPLACE_KERNEL(div); -FOREACH_BINARY_SCALAR_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_SCALAR_INPLACE_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_SCALAR_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_SCALAR_INPLACE_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_SCALAR_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_SCALAR_INPLACE_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_SCALAR_KERNEL(div); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h index 331f5c0ba..a863983dd 100644 --- a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h +++ 
b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h @@ -17,11 +17,11 @@ namespace at::native::xpu { std::vector FOREACH_BINARY_SCALARLIST_KERNEL_NAME(NAME)( \ TensorList tensors, at::ArrayRef scalars) -FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(add); -FOREACH_BINARY_SCALARLIST_KERNEL(add); -FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(mul); -FOREACH_BINARY_SCALARLIST_KERNEL(mul); -FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(div); -FOREACH_BINARY_SCALARLIST_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_KERNEL(div); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h b/src/ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h index 89e3b0ea2..4f2eb478d 100644 --- a/src/ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h @@ -11,9 +11,9 @@ namespace at::native::xpu { #define FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(NAME) \ FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(NAME) -FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcmul); -FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcmul); -FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcdiv); -FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcdiv); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h b/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h index ed8b7f95b..cac1d235f 100644 --- a/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h @@ -17,9 +17,9 @@ namespace at::native::xpu { TensorList tensors2, \ const Scalar& scalar) -FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcmul); -FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcmul); -FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcdiv); -FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcdiv); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h b/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h index e1afdf6fb..69b89796d 100644 --- a/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h @@ -17,9 +17,9 @@ namespace at::native::xpu { TensorList tensors2, \ at::ArrayRef scalars) -FOREACH_POINTWISE_OP_SCALARLIST_KERNEL(addcmul); -FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(addcmul); -FOREACH_POINTWISE_OP_SCALARLIST_KERNEL(addcdiv); -FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALARLIST_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALARLIST_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(addcdiv); 
} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp index 849a77443..cc90fa893 100644 --- a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp +++ b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + enum class NormType { L1, L2 }; namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/ForeachReduceKernels.h b/src/ATen/native/xpu/sycl/ForeachReduceKernels.h index a5cd8b448..187246aca 100644 --- a/src/ATen/native/xpu/sycl/ForeachReduceKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachReduceKernels.h @@ -3,7 +3,7 @@ namespace at::native::xpu { -std::vector foreach_norm_kernel( +TORCH_XPU_API std::vector foreach_norm_kernel( TensorList tensors, const Scalar& ord, double p, diff --git a/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp b/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp index 297662928..44b11cfd9 100644 --- a/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp @@ -5,6 +5,9 @@ #include #include +#include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h b/src/ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h index f8a725a5d..67d90a9db 100644 --- a/src/ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h @@ -3,13 +3,13 @@ namespace at::native::xpu { -void foreach_lerp_list_kernel( +TORCH_XPU_API void foreach_lerp_list_kernel( TensorList tensors1, TensorList tensors2, TensorList tensors3, TensorList result); -void foreach_lerp_list_kernel_( +TORCH_XPU_API void foreach_lerp_list_kernel_( TensorList tensors1, TensorList tensors2, TensorList tensors3); diff --git a/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h b/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h index f6b8a71b7..2c52b99f4 100644 --- a/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h @@ -3,13 +3,13 @@ namespace at::native::xpu { -void foreach_lerp_scalar_kernel( +TORCH_XPU_API void foreach_lerp_scalar_kernel( TensorList tensors1, TensorList tensors2, const Scalar& weight, TensorList result); -void foreach_lerp_scalar_kernel_( +TORCH_XPU_API void foreach_lerp_scalar_kernel_( TensorList tensors1, TensorList tensors2, const Scalar& weight); diff --git a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp index 9b046472c..791345e54 100644 --- a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at::native::xpu { template class Op> diff --git a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h index 3c910b7df..7d7b79d5c 100644 --- a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h @@ -3,8 +3,8 @@ namespace at::native::xpu { -std::vector foreach_sqrt_kernel(TensorList tensors); +TORCH_XPU_API std::vector foreach_sqrt_kernel(TensorList tensors); -void foreach_sqrt_kernel_(TensorList tensors); +TORCH_XPU_API void foreach_sqrt_kernel_(TensorList tensors); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/GcdLcmKernels.cpp b/src/ATen/native/xpu/sycl/GcdLcmKernels.cpp index 2c257d4d2..a3b70383b 100644 --- 
a/src/ATen/native/xpu/sycl/GcdLcmKernels.cpp +++ b/src/ATen/native/xpu/sycl/GcdLcmKernels.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template @@ -20,4 +22,4 @@ void gcd_kernel(TensorIteratorBase& iter) { }); } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/GcdLcmKernels.h b/src/ATen/native/xpu/sycl/GcdLcmKernels.h index acf855d40..ba429d762 100644 --- a/src/ATen/native/xpu/sycl/GcdLcmKernels.h +++ b/src/ATen/native/xpu/sycl/GcdLcmKernels.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -void gcd_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void gcd_kernel(TensorIteratorBase& iter); } } // namespace native diff --git a/src/ATen/native/xpu/sycl/GridSampler.cpp b/src/ATen/native/xpu/sycl/GridSampler.cpp index 9427fd4ce..e0a51293f 100644 --- a/src/ATen/native/xpu/sycl/GridSampler.cpp +++ b/src/ATen/native/xpu/sycl/GridSampler.cpp @@ -14,6 +14,8 @@ #include #include +#include + namespace at::native::xpu { using namespace at::xpu::detail; diff --git a/src/ATen/native/xpu/sycl/GridSamplerKernels.h b/src/ATen/native/xpu/sycl/GridSamplerKernels.h index cc9ba4be4..28bfc6972 100644 --- a/src/ATen/native/xpu/sycl/GridSamplerKernels.h +++ b/src/ATen/native/xpu/sycl/GridSamplerKernels.h @@ -4,14 +4,14 @@ namespace at::native::xpu { -Tensor grid_sampler_2d_kernel( +TORCH_XPU_API Tensor grid_sampler_2d_kernel( const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners); -void grid_sampler_2d_backward_kernel( +TORCH_XPU_API void grid_sampler_2d_backward_kernel( const Tensor& grad_input, const Tensor& grad_grid, const Tensor& grad_output, @@ -22,4 +22,4 @@ void grid_sampler_2d_backward_kernel( bool align_corners, std::array output_mask); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp index 8dafdc8d2..66b3275e1 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp @@ -10,6 +10,8 @@ #include #include +#include + namespace at::native::xpu { template < diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.h b/src/ATen/native/xpu/sycl/GroupNormKernels.h index 97dac5332..3cc680c0b 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.h +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void group_norm_kernel( +TORCH_XPU_API void group_norm_kernel( const Tensor& X, const Tensor& gamma, const Tensor& beta, @@ -18,7 +18,7 @@ void group_norm_kernel( Tensor& rstd, ScalarType dtype); -void group_norm_backward_kernel( +TORCH_XPU_API void group_norm_backward_kernel( const Tensor& dY, const Tensor& X, const Tensor& mean, diff --git a/src/ATen/native/xpu/sycl/HistogramKernels.h b/src/ATen/native/xpu/sycl/HistogramKernels.h index f05e91e25..e51f0fd10 100644 --- a/src/ATen/native/xpu/sycl/HistogramKernels.h +++ b/src/ATen/native/xpu/sycl/HistogramKernels.h @@ -4,14 +4,14 @@ namespace at::native::xpu { -void histogramdd_kernel( +TORCH_XPU_API void histogramdd_kernel( const Tensor& self, const std::optional& weight, bool density, Tensor& hist, const TensorList& bin_edges_); -void histogramdd_linear_kernel( +TORCH_XPU_API void histogramdd_linear_kernel( const Tensor& self, const std::optional& weight, bool density, @@ -19,4 +19,4 @@ void histogramdd_linear_kernel( const TensorList& bin_edges_, bool 
local_search); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/HistogramddKernels.cpp b/src/ATen/native/xpu/sycl/HistogramddKernels.cpp index 8a8a9beaf..4ffa24e48 100644 --- a/src/ATen/native/xpu/sycl/HistogramddKernels.cpp +++ b/src/ATen/native/xpu/sycl/HistogramddKernels.cpp @@ -17,6 +17,8 @@ #include #endif +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/Im2ColKernel.cpp b/src/ATen/native/xpu/sycl/Im2ColKernel.cpp index aa511e6df..996a85126 100644 --- a/src/ATen/native/xpu/sycl/Im2ColKernel.cpp +++ b/src/ATen/native/xpu/sycl/Im2ColKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/Im2ColKernel.h b/src/ATen/native/xpu/sycl/Im2ColKernel.h index e18fa063b..4b7090231 100644 --- a/src/ATen/native/xpu/sycl/Im2ColKernel.h +++ b/src/ATen/native/xpu/sycl/Im2ColKernel.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void im2col_kernel( +TORCH_XPU_API void im2col_kernel( Tensor& output, const Tensor& input_, IntArrayRef kernel_size, diff --git a/src/ATen/native/xpu/sycl/Indexing.cpp b/src/ATen/native/xpu/sycl/Indexing.cpp index fcee372d1..e9b2a0b00 100644 --- a/src/ATen/native/xpu/sycl/Indexing.cpp +++ b/src/ATen/native/xpu/sycl/Indexing.cpp @@ -19,6 +19,8 @@ #include #include +#include + using namespace at::xpu::detail; using namespace at::xpu; diff --git a/src/ATen/native/xpu/sycl/IndexingKernels.h b/src/ATen/native/xpu/sycl/IndexingKernels.h index 8f32f49f9..288cd5391 100644 --- a/src/ATen/native/xpu/sycl/IndexingKernels.h +++ b/src/ATen/native/xpu/sycl/IndexingKernels.h @@ -3,22 +3,24 @@ namespace at::native::xpu { -void index_kernel( +TORCH_XPU_API void index_kernel( TensorIterator& iter, at::IntArrayRef index_size, at::IntArrayRef index_stride, at::IntArrayRef non_index_size, at::IntArrayRef non_index_stride); -void index_select_kernel( +TORCH_XPU_API void index_select_kernel( const Tensor& self, int64_t dim, const Tensor& index, const Tensor& out); -void masked_fill_kernel(TensorIterator& iter, const Scalar& value); +TORCH_XPU_API void masked_fill_kernel( + TensorIterator& iter, + const Scalar& value); -void index_add_kernel( +TORCH_XPU_API void index_add_kernel( const Tensor& self, int64_t dim, const Tensor& index, @@ -26,13 +28,13 @@ void index_add_kernel( const Scalar& alpha, const Tensor& out); -void index_fill_kernel( +TORCH_XPU_API void index_fill_kernel( Tensor& self, int64_t dim, const Tensor& index, const Scalar& source); -void index_put_kernel( +TORCH_XPU_API void index_put_kernel( TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, @@ -40,14 +42,14 @@ void index_put_kernel( IntArrayRef non_index_stride, bool accumulate); -void index_put_deterministic_kernel( +TORCH_XPU_API void index_put_deterministic_kernel( Tensor& self, const c10::List>& indices, const Tensor& value, bool accumulate, bool unsafe); -void masked_scatter_kernel( +TORCH_XPU_API void masked_scatter_kernel( const TensorBase& self, const TensorBase& mask, const TensorBase& maskPrefixSum, diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp index b21ec1589..60d40b71d 100644 --- a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.h 
b/src/ATen/native/xpu/sycl/LayerNormKernels.h index 0797d50fd..0c57a61ba 100644 --- a/src/ATen/native/xpu/sycl/LayerNormKernels.h +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -std::tuple layer_norm_kernel( +TORCH_XPU_API std::tuple layer_norm_kernel( const Tensor& X, const Tensor& gamma, const Tensor& beta, @@ -17,7 +17,7 @@ std::tuple layer_norm_kernel( Tensor& mean, Tensor& rstd); -std::tuple layer_norm_backward_kernel( +TORCH_XPU_API std::tuple layer_norm_backward_kernel( const Tensor& dY, const Tensor& X, const Tensor& mean, diff --git a/src/ATen/native/xpu/sycl/LerpKernels.cpp b/src/ATen/native/xpu/sycl/LerpKernels.cpp index b0f480ac3..1648f193b 100644 --- a/src/ATen/native/xpu/sycl/LerpKernels.cpp +++ b/src/ATen/native/xpu/sycl/LerpKernels.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/LerpKernels.h b/src/ATen/native/xpu/sycl/LerpKernels.h index c455adee8..6f13ec6eb 100644 --- a/src/ATen/native/xpu/sycl/LerpKernels.h +++ b/src/ATen/native/xpu/sycl/LerpKernels.h @@ -4,8 +4,10 @@ namespace at::native::xpu { -void lerp_tensor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void lerp_tensor_kernel(TensorIteratorBase& iter); -void lerp_scalar_kernel(TensorIteratorBase& iter, const c10::Scalar& weight); +TORCH_XPU_API void lerp_scalar_kernel( + TensorIteratorBase& iter, + const c10::Scalar& weight); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp index 8d3128e9e..0f478eceb 100644 --- a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp +++ b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp @@ -1,6 +1,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h index 32f987a2e..ca61016b9 100644 --- a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h +++ b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h @@ -3,6 +3,9 @@ namespace at::native::xpu { -void addr_kernel(TensorIterator& iter, const Scalar& beta, const Scalar& alpha); +TORCH_XPU_API void addr_kernel( + TensorIterator& iter, + const Scalar& beta, + const Scalar& alpha); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LogAddExpKernels.cpp b/src/ATen/native/xpu/sycl/LogAddExpKernels.cpp index 69b6a73a1..d79964295 100644 --- a/src/ATen/native/xpu/sycl/LogAddExpKernels.cpp +++ b/src/ATen/native/xpu/sycl/LogAddExpKernels.cpp @@ -5,6 +5,9 @@ #include #include #include + +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/LogAddExpKernels.h b/src/ATen/native/xpu/sycl/LogAddExpKernels.h index 01b9f400f..3a5ae4c34 100644 --- a/src/ATen/native/xpu/sycl/LogAddExpKernels.h +++ b/src/ATen/native/xpu/sycl/LogAddExpKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void logaddexp_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logaddexp_kernel(TensorIteratorBase& iter); -void logaddexp2_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logaddexp2_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LossKernels.cpp b/src/ATen/native/xpu/sycl/LossKernels.cpp index f7c260535..69315adf5 100644 --- a/src/ATen/native/xpu/sycl/LossKernels.cpp +++ b/src/ATen/native/xpu/sycl/LossKernels.cpp @@ -4,6 +4,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git 
a/src/ATen/native/xpu/sycl/LossKernels.h b/src/ATen/native/xpu/sycl/LossKernels.h index 17f61afbb..560eb2c44 100644 --- a/src/ATen/native/xpu/sycl/LossKernels.h +++ b/src/ATen/native/xpu/sycl/LossKernels.h @@ -3,14 +3,14 @@ namespace at::native::xpu { -Tensor& binary_cross_entropy_kernel( +TORCH_XPU_API Tensor& binary_cross_entropy_kernel( const Tensor& input, const Tensor& target, const Tensor& weight, int64_t reduction, Tensor& loss); -Tensor& binary_cross_entropy_backward_kernel( +TORCH_XPU_API Tensor& binary_cross_entropy_backward_kernel( const Tensor& grad, const Tensor& input, const Tensor& target, diff --git a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp index c84613a4f..4b93cb3c3 100644 --- a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp @@ -11,6 +11,8 @@ #include #include +#include + namespace at::native::xpu { inline Tensor optional_contiguous(const Tensor& source) { return source.defined() ? source.contiguous() : source; @@ -251,7 +253,15 @@ void nll_loss2d_forward_kernel( at::ScalarType::BFloat16, input.scalar_type(), "nll_loss2d_forward_kernel", - [&] { + [&input_, + &weight_, + &target_, + &output, + &total_weight, + &input, + &target, + &reduction, + &ignore_index] { using accscalar_t = acc_type_device; AT_DISPATCH_INDEX_TYPES( at::native::canUse32BitIndexMath(input_, INT_MAX) @@ -527,4 +537,4 @@ void nll_loss2d_backward_kernel( } // namespace at::native::xpu #pragma GCC diagnostic pop -#pragma clang diagnostic pop \ No newline at end of file +#pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/LossNLL2dKernels.h b/src/ATen/native/xpu/sycl/LossNLL2dKernels.h index cb2084649..87519b5e7 100644 --- a/src/ATen/native/xpu/sycl/LossNLL2dKernels.h +++ b/src/ATen/native/xpu/sycl/LossNLL2dKernels.h @@ -3,7 +3,8 @@ #include namespace at::native::xpu { -void nll_loss2d_forward_kernel( + +TORCH_XPU_API void nll_loss2d_forward_kernel( Tensor& output, Tensor& total_weight, const Tensor& input, @@ -12,7 +13,7 @@ void nll_loss2d_forward_kernel( int64_t reduction, int64_t ignore_index); -void nll_loss2d_backward_kernel( +TORCH_XPU_API void nll_loss2d_backward_kernel( Tensor& grad_input, const Tensor& grad_output, const Tensor& input, @@ -21,4 +22,5 @@ void nll_loss2d_backward_kernel( int64_t reduction, int64_t ignore_index, const Tensor& total_weight); -} // namespace at::native::xpu \ No newline at end of file + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LossNLLKernel.cpp b/src/ATen/native/xpu/sycl/LossNLLKernel.cpp index f3d6c1f94..2f8e2a790 100644 --- a/src/ATen/native/xpu/sycl/LossNLLKernel.cpp +++ b/src/ATen/native/xpu/sycl/LossNLLKernel.cpp @@ -4,6 +4,8 @@ #include #include +#include + namespace at::native::xpu { using namespace at::xpu; diff --git a/src/ATen/native/xpu/sycl/LossNLLKernel.h b/src/ATen/native/xpu/sycl/LossNLLKernel.h index 301cf53bf..fdbd7a29c 100644 --- a/src/ATen/native/xpu/sycl/LossNLLKernel.h +++ b/src/ATen/native/xpu/sycl/LossNLLKernel.h @@ -3,7 +3,7 @@ namespace at::native::xpu { -std::tuple nll_loss_forward_kernel( +TORCH_XPU_API std::tuple nll_loss_forward_kernel( const Tensor& self, const Tensor& target, const OptionalTensorRef weight_opt, @@ -12,7 +12,7 @@ std::tuple nll_loss_forward_kernel( Tensor& output, Tensor& total_weight); -Tensor& nll_loss_backward_kernel( +TORCH_XPU_API Tensor& nll_loss_backward_kernel( const Tensor& grad_output, const Tensor& self, const Tensor& target, @@ -22,4 +22,4 @@ Tensor& 
nll_loss_backward_kernel( const Tensor& total_weight, Tensor& grad_input); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/Math.h b/src/ATen/native/xpu/sycl/MathExtensions.h similarity index 100% rename from src/ATen/native/xpu/sycl/Math.h rename to src/ATen/native/xpu/sycl/MathExtensions.h diff --git a/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.cpp b/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.cpp index 0d38a4882..6d535a40d 100644 --- a/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.cpp +++ b/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.h b/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.h index 063a086cd..b54c69717 100644 --- a/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.h +++ b/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void maximum_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void maximum_kernel(TensorIteratorBase& iter); -void minimum_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void minimum_kernel(TensorIteratorBase& iter); -void fmax_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void fmax_kernel(TensorIteratorBase& iter); -void fmin_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void fmin_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/MultinomialKernel.cpp b/src/ATen/native/xpu/sycl/MultinomialKernel.cpp index 866f62a1a..d8418fef9 100644 --- a/src/ATen/native/xpu/sycl/MultinomialKernel.cpp +++ b/src/ATen/native/xpu/sycl/MultinomialKernel.cpp @@ -15,6 +15,9 @@ #include #include #include + +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/MultinomialKernel.h b/src/ATen/native/xpu/sycl/MultinomialKernel.h index d400e51cd..5adede6f6 100644 --- a/src/ATen/native/xpu/sycl/MultinomialKernel.h +++ b/src/ATen/native/xpu/sycl/MultinomialKernel.h @@ -3,10 +3,10 @@ namespace at::native::xpu { -void multinomial_kernel( +TORCH_XPU_API void multinomial_kernel( Tensor& result, const Tensor& self, const int64_t n_sample, c10::optional generator); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/NMSKernel.cpp b/src/ATen/native/xpu/sycl/NMSKernel.cpp index 3bf712582..78e885307 100644 --- a/src/ATen/native/xpu/sycl/NMSKernel.cpp +++ b/src/ATen/native/xpu/sycl/NMSKernel.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/NMSKernel.h b/src/ATen/native/xpu/sycl/NMSKernel.h index 17aec3f7b..198223731 100644 --- a/src/ATen/native/xpu/sycl/NMSKernel.h +++ b/src/ATen/native/xpu/sycl/NMSKernel.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -Tensor nms_kernel(const Tensor& dets_sorted, float iou_threshold); +TORCH_XPU_API Tensor nms_kernel(const Tensor& dets_sorted, float iou_threshold); } } // namespace native diff --git a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp index e1c50a263..0cb77d52f 100644 --- a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp +++ b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { struct FlattenIdxtoRealIdxKernelFunctor { diff --git a/src/ATen/native/xpu/sycl/NonzeroKernel.h 
b/src/ATen/native/xpu/sycl/NonzeroKernel.h index 1306dfa28..de92eeddb 100644 --- a/src/ATen/native/xpu/sycl/NonzeroKernel.h +++ b/src/ATen/native/xpu/sycl/NonzeroKernel.h @@ -3,6 +3,6 @@ namespace at::native::xpu { -void nonzero_kernel(const Tensor& self, Tensor& out); +TORCH_XPU_API void nonzero_kernel(const Tensor& self, Tensor& out); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp index 822a83e99..76e8469eb 100644 --- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h index 613c3cca6..f4572ae70 100644 --- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h +++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h @@ -4,15 +4,20 @@ namespace at::native::xpu { -void addcmul_kernel(TensorIterator& iter, Scalar value); +TORCH_XPU_API void addcmul_kernel(TensorIterator& iter, Scalar value); -void addcdiv_kernel(TensorIterator& iter, Scalar value); +TORCH_XPU_API void addcdiv_kernel(TensorIterator& iter, Scalar value); -void mse_backward_kernel(TensorIterator& iter, const Scalar& value); +TORCH_XPU_API void mse_backward_kernel( + TensorIterator& iter, + const Scalar& value); -void smooth_l1_backward_kernel(TensorIterator& iter, Scalar norm, double beta); +TORCH_XPU_API void smooth_l1_backward_kernel( + TensorIterator& iter, + Scalar norm, + double beta); -void huber_backward_kernel( +TORCH_XPU_API void huber_backward_kernel( TensorIterator& iter, const Scalar& norm, double delta); diff --git a/src/ATen/native/xpu/sycl/PowKernels.cpp b/src/ATen/native/xpu/sycl/PowKernels.cpp index 5b6769d1f..c0fad9f27 100644 --- a/src/ATen/native/xpu/sycl/PowKernels.cpp +++ b/src/ATen/native/xpu/sycl/PowKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/PowKernels.h b/src/ATen/native/xpu/sycl/PowKernels.h index 31a31c420..d21d0d1a7 100644 --- a/src/ATen/native/xpu/sycl/PowKernels.h +++ b/src/ATen/native/xpu/sycl/PowKernels.h @@ -6,11 +6,11 @@ namespace at { namespace native { namespace xpu { -void pow_tensor_scalar_kernel( +TORCH_XPU_API void pow_tensor_scalar_kernel( TensorIteratorBase& iter, const Scalar& exp_scalar); -void pow_tensor_tensor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void pow_tensor_tensor_kernel(TensorIteratorBase& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/RandpermKernel.cpp b/src/ATen/native/xpu/sycl/RandpermKernel.cpp index d151de28c..d30fa3007 100644 --- a/src/ATen/native/xpu/sycl/RandpermKernel.cpp +++ b/src/ATen/native/xpu/sycl/RandpermKernel.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/RandpermKernel.h b/src/ATen/native/xpu/sycl/RandpermKernel.h index 3a25c311f..2fa5f8e4c 100644 --- a/src/ATen/native/xpu/sycl/RandpermKernel.h +++ b/src/ATen/native/xpu/sycl/RandpermKernel.h @@ -2,8 +2,8 @@ #include namespace at::native::xpu { -Tensor randperm_kernel( - Tensor& result, - int64_t n, - c10::optional generator); -} // namespace at::native::xpu \ No newline at end of file + +TORCH_XPU_API Tensor +randperm_kernel(Tensor& result, int64_t n, c10::optional generator); + +} // namespace at::native::xpu diff --git 
a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp index 54b6c6328..f50ce1abf 100644 --- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp +++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h index 3cf08ca5d..71dc4f5fb 100644 --- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h +++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h @@ -6,13 +6,13 @@ namespace at { namespace native { namespace xpu { -Tensor& arange_kernel( +TORCH_XPU_API Tensor& arange_kernel( const Scalar& start, const Scalar& end, const Scalar& step, Tensor& result); -Tensor& range_kernel( +TORCH_XPU_API Tensor& range_kernel( const Scalar& start, const Scalar& end, const Scalar& step, diff --git a/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp b/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp index 121761053..a2df2bd63 100644 --- a/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp b/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp index 1e18b2e5b..c522dbe7c 100644 --- a/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp b/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp index 3c9f8453d..60f92ef3b 100644 --- a/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ReduceLogicKernels.cpp b/src/ATen/native/xpu/sycl/ReduceLogicKernels.cpp index 8eabf3353..ec753c66a 100644 --- a/src/ATen/native/xpu/sycl/ReduceLogicKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceLogicKernels.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp index 16095056c..3dfbb05c2 100644 --- a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.h b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.h index 6503f8842..9a669ec7a 100644 --- a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.h +++ b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void max_values_kernel(TensorIterator& iter); +TORCH_XPU_API void max_values_kernel(TensorIterator& iter); -void max_kernel(TensorIterator& iter); +TORCH_XPU_API void max_kernel(TensorIterator& iter); -void max_all_kernel(TensorIterator& iter); +TORCH_XPU_API void max_all_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp index 2a0ce889c..f73df51a0 100644 --- a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp +++ 
b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.h b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.h index d5600f5d4..65f3acf8e 100644 --- a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.h +++ b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void min_values_kernel(TensorIterator& iter); +TORCH_XPU_API void min_values_kernel(TensorIterator& iter); -void min_kernel(TensorIterator& iter); +TORCH_XPU_API void min_kernel(TensorIterator& iter); -void min_all_kernel(TensorIterator& iter); +TORCH_XPU_API void min_all_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp b/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp index 6d0e75680..7e24f7ecc 100644 --- a/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp b/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp index 658f2e21b..565f5868e 100644 --- a/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp @@ -1,7 +1,9 @@ #include -#include #include +#include + +#include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceNormKernel.h b/src/ATen/native/xpu/sycl/ReduceNormKernel.h index 5e255dd5b..bc14826ec 100644 --- a/src/ATen/native/xpu/sycl/ReduceNormKernel.h +++ b/src/ATen/native/xpu/sycl/ReduceNormKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void norm_kernel(TensorIterator& iter, const Scalar& val); +TORCH_XPU_API void norm_kernel(TensorIterator& iter, const Scalar& val); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReduceOpsKernels.h b/src/ATen/native/xpu/sycl/ReduceOpsKernels.h index d85754efb..8b03fb137 100644 --- a/src/ATen/native/xpu/sycl/ReduceOpsKernels.h +++ b/src/ATen/native/xpu/sycl/ReduceOpsKernels.h @@ -4,26 +4,29 @@ namespace at::native::xpu { -void argmax_kernel(TensorIterator& iter); +TORCH_XPU_API void argmax_kernel(TensorIterator& iter); -void argmin_kernel(TensorIterator& iter); +TORCH_XPU_API void argmin_kernel(TensorIterator& iter); -void and_kernel(TensorIterator& iter); +TORCH_XPU_API void and_kernel(TensorIterator& iter); -void or_kernel(TensorIterator& iter); +TORCH_XPU_API void or_kernel(TensorIterator& iter); -void mean_kernel(TensorIterator& iter); +TORCH_XPU_API void mean_kernel(TensorIterator& iter); -void sum_kernel(TensorIterator& iter); +TORCH_XPU_API void sum_kernel(TensorIterator& iter); -void prod_kernel(TensorIterator& iter); +TORCH_XPU_API void prod_kernel(TensorIterator& iter); -void nansum_kernel(TensorIterator& iter); +TORCH_XPU_API void nansum_kernel(TensorIterator& iter); -void std_var_kernel(TensorIterator& iter, double correction, bool take_sqrt); +TORCH_XPU_API void std_var_kernel( + TensorIterator& iter, + double correction, + bool take_sqrt); -void aminmax_kernel(TensorIterator& iter); +TORCH_XPU_API void aminmax_kernel(TensorIterator& iter); -void aminmax_allreduce_kernel(TensorIterator& iter); +TORCH_XPU_API void aminmax_allreduce_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp index 
f250b34ad..7bdc3a188 100644 --- a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp b/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp index 1627a5def..efddb53e6 100644 --- a/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp @@ -14,6 +14,8 @@ #include #include +#include + namespace at::native::xpu { inline std::pair get_index_mapping1d( @@ -874,4 +876,4 @@ void reflection_pad3d_backward_kernel( } // namespace at::native::xpu #pragma GCC diagnostic pop -#pragma clang diagnostic pop \ No newline at end of file +#pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/ReflectionPadKernels.h b/src/ATen/native/xpu/sycl/ReflectionPadKernels.h index a21f6c8ee..093c5d9b5 100644 --- a/src/ATen/native/xpu/sycl/ReflectionPadKernels.h +++ b/src/ATen/native/xpu/sycl/ReflectionPadKernels.h @@ -4,37 +4,37 @@ namespace at::native::xpu { -void reflection_pad1d_kernel( +TORCH_XPU_API void reflection_pad1d_kernel( Tensor& output, const Tensor& input_, IntArrayRef padding); -void reflection_pad1d_backward_kernel( +TORCH_XPU_API void reflection_pad1d_backward_kernel( Tensor& grad_input, const Tensor& grad_output_, const Tensor& input, IntArrayRef padding); -void reflection_pad2d_kernel( +TORCH_XPU_API void reflection_pad2d_kernel( Tensor& output, const Tensor& input_, IntArrayRef padding); -void reflection_pad2d_backward_kernel( +TORCH_XPU_API void reflection_pad2d_backward_kernel( Tensor& grad_input, const Tensor& grad_output_, const Tensor& input, IntArrayRef padding); -void reflection_pad3d_kernel( +TORCH_XPU_API void reflection_pad3d_kernel( Tensor& output, const Tensor& input_, IntArrayRef padding); -void reflection_pad3d_backward_kernel( +TORCH_XPU_API void reflection_pad3d_backward_kernel( Tensor& grad_input, const Tensor& grad_output, const Tensor& input, IntArrayRef padding); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/RenormKernel.cpp b/src/ATen/native/xpu/sycl/RenormKernel.cpp index 0a1696d73..0f08a6566 100644 --- a/src/ATen/native/xpu/sycl/RenormKernel.cpp +++ b/src/ATen/native/xpu/sycl/RenormKernel.cpp @@ -2,6 +2,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/RenormKernel.h b/src/ATen/native/xpu/sycl/RenormKernel.h index a619cf708..488e8365a 100644 --- a/src/ATen/native/xpu/sycl/RenormKernel.h +++ b/src/ATen/native/xpu/sycl/RenormKernel.h @@ -3,6 +3,8 @@ namespace at::native::xpu { -void renorm_scale_factor_kernel(TensorIteratorBase& iter, double maxnorm); +TORCH_XPU_API void renorm_scale_factor_kernel( + TensorIteratorBase& iter, + double maxnorm); } diff --git a/src/ATen/native/xpu/sycl/RepeatKernel.cpp b/src/ATen/native/xpu/sycl/RepeatKernel.cpp index 0b10f982a..f1eeb426b 100644 --- a/src/ATen/native/xpu/sycl/RepeatKernel.cpp +++ b/src/ATen/native/xpu/sycl/RepeatKernel.cpp @@ -2,6 +2,9 @@ #include #include #include + +#include + namespace at::native::xpu { template struct RepeatInterleaveKernelFunctor { @@ -72,4 +75,4 @@ Tensor repeat_interleave_kernel( }); return output; } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/RepeatKernel.h b/src/ATen/native/xpu/sycl/RepeatKernel.h index 
3df41a956..9856c4305 100644 --- a/src/ATen/native/xpu/sycl/RepeatKernel.h +++ b/src/ATen/native/xpu/sycl/RepeatKernel.h @@ -2,8 +2,8 @@ #include namespace at::native::xpu { -Tensor repeat_interleave_kernel( +TORCH_XPU_API Tensor repeat_interleave_kernel( const Tensor& repeats, c10::optional output_size); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp index 4ba4eafb9..2371ad695 100644 --- a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp @@ -13,6 +13,8 @@ #include #include +#include + namespace at::native::xpu { inline int imin(int a, int b) { @@ -763,4 +765,4 @@ void replication_pad3d_backward_kernel( } // namespace at::native::xpu #pragma GCC diagnostic pop -#pragma clang diagnostic pop \ No newline at end of file +#pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h index 85d19d5bf..0269b962e 100644 --- a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h +++ b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h @@ -4,37 +4,37 @@ namespace at::native::xpu { -void replication_pad1d_kernel( +TORCH_XPU_API void replication_pad1d_kernel( Tensor& output, const Tensor& input, IntArrayRef padding); -void replication_pad1d_backward_kernel( +TORCH_XPU_API void replication_pad1d_backward_kernel( Tensor& grad_input, const Tensor& grad_output, const Tensor& input, IntArrayRef padding); -void replication_pad2d_kernel( +TORCH_XPU_API void replication_pad2d_kernel( Tensor& output, const Tensor& input, IntArrayRef padding); -void replication_pad2d_backward_kernel( +TORCH_XPU_API void replication_pad2d_backward_kernel( Tensor& grad_input, const Tensor& grad_output, const Tensor& input, IntArrayRef padding); -void replication_pad3d_kernel( +TORCH_XPU_API void replication_pad3d_kernel( Tensor& output, const Tensor& input, IntArrayRef padding); -void replication_pad3d_backward_kernel( +TORCH_XPU_API void replication_pad3d_backward_kernel( Tensor& grad_input, const Tensor& grad_output, const Tensor& input, IntArrayRef padding); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ResizeKernel.cpp b/src/ATen/native/xpu/sycl/ResizeKernel.cpp index 195d526d8..0835b53ff 100644 --- a/src/ATen/native/xpu/sycl/ResizeKernel.cpp +++ b/src/ATen/native/xpu/sycl/ResizeKernel.cpp @@ -3,6 +3,8 @@ #include #include +#include + namespace at::native::xpu { void resize_bytes_xpu(StorageImpl* storage, size_t size_bytes) { @@ -58,7 +60,7 @@ TensorImpl* resize_impl_xpu_( TensorImpl* self, IntArrayRef size, at::OptionalIntArrayRef stride, - bool device_guard = true) { + bool device_guard) { if (self->sizes() == size && (!stride || self->strides() == stride)) { return self; } diff --git a/src/ATen/native/xpu/sycl/ResizeKernel.h b/src/ATen/native/xpu/sycl/ResizeKernel.h index 5cef196ed..c5c113194 100644 --- a/src/ATen/native/xpu/sycl/ResizeKernel.h +++ b/src/ATen/native/xpu/sycl/ResizeKernel.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -TensorImpl* resize_impl_xpu_( +TORCH_XPU_API TensorImpl* resize_impl_xpu_( TensorImpl* self, IntArrayRef size, at::OptionalIntArrayRef stride, diff --git a/src/ATen/native/xpu/sycl/ScanKernels.h b/src/ATen/native/xpu/sycl/ScanKernels.h index af632fa83..ea20e4460 100644 --- 
a/src/ATen/native/xpu/sycl/ScanKernels.h +++ b/src/ATen/native/xpu/sycl/ScanKernels.h @@ -3,8 +3,14 @@ namespace at::native::xpu { -void cumsum_kernel(const Tensor& result, const Tensor& self, int64_t dim); +TORCH_XPU_API void cumsum_kernel( + const Tensor& result, + const Tensor& self, + int64_t dim); -void cumprod_kernel(const Tensor& result, const Tensor& self, int64_t dim); +TORCH_XPU_API void cumprod_kernel( + const Tensor& result, + const Tensor& self, + int64_t dim); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ScatterGatherKernels.cpp b/src/ATen/native/xpu/sycl/ScatterGatherKernels.cpp index 81d4b9e9a..597be8553 100644 --- a/src/ATen/native/xpu/sycl/ScatterGatherKernels.cpp +++ b/src/ATen/native/xpu/sycl/ScatterGatherKernels.cpp @@ -14,6 +14,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ScatterGatherKernels.h b/src/ATen/native/xpu/sycl/ScatterGatherKernels.h index 0f8b7e5be..4d7d375b7 100644 --- a/src/ATen/native/xpu/sycl/ScatterGatherKernels.h +++ b/src/ATen/native/xpu/sycl/ScatterGatherKernels.h @@ -6,45 +6,45 @@ namespace at { namespace native { namespace xpu { -void gather_kernel( +TORCH_XPU_API void gather_kernel( const Tensor& result, const Tensor& self, int64_t dim, const Tensor& index); -void scatter_kernel( +TORCH_XPU_API void scatter_kernel( const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src); -void scatter_fill_kernel( +TORCH_XPU_API void scatter_fill_kernel( const Tensor& self, int64_t dim, const Tensor& index, const Scalar& src); -void scatter_add_kernel( +TORCH_XPU_API void scatter_add_kernel( const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src); -void scatter_reduce_kernel( +TORCH_XPU_API void scatter_reduce_kernel( const Tensor& self, const int64_t dim, const Tensor& index, const Tensor& src, const ReductionType& reduce); -void scatter_reduce_two_kernel( +TORCH_XPU_API void scatter_reduce_two_kernel( const Tensor& self, const int64_t dim, const Tensor& index, const Tensor& src, const ReductionType& reduce); -void scatter_scalar_reduce_kernel( +TORCH_XPU_API void scatter_scalar_reduce_kernel( const Tensor& self, const int64_t dim, const Tensor& index, diff --git a/src/ATen/native/xpu/sycl/Shape.cpp b/src/ATen/native/xpu/sycl/Shape.cpp index 1c9c1b9d5..98089995c 100644 --- a/src/ATen/native/xpu/sycl/Shape.cpp +++ b/src/ATen/native/xpu/sycl/Shape.cpp @@ -8,6 +8,8 @@ #include #include +#include + namespace at::native::xpu { // The best performance is achieved for parallel computing with 1024 batch sizes diff --git a/src/ATen/native/xpu/sycl/ShapeKernels.h b/src/ATen/native/xpu/sycl/ShapeKernels.h index 494f37020..8f758bfb2 100644 --- a/src/ATen/native/xpu/sycl/ShapeKernels.h +++ b/src/ATen/native/xpu/sycl/ShapeKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void cat_out_kernel( +TORCH_XPU_API void cat_out_kernel( const ITensorListRef& tensors, int64_t dim, int64_t valid, diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index eb72795fc..7c0dd11db 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -8,6 +8,8 @@ #include #include +#include + using namespace xpu::sycl; namespace at { diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.h b/src/ATen/native/xpu/sycl/SoftMaxKernels.h index 7ed23aeb2..66c4c3228 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.h +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.h @@ -6,26 
+6,26 @@ namespace at { namespace native { namespace xpu { -Tensor& _softmax_kernel( +TORCH_XPU_API Tensor& _softmax_kernel( const Tensor& input, const int64_t dim, const bool half_to_float, Tensor& output); -Tensor& _log_softmax_kernel( +TORCH_XPU_API Tensor& _log_softmax_kernel( const Tensor& input, const int64_t dim, const bool half_to_float, Tensor& output); -Tensor& _softmax_backward_kernel( +TORCH_XPU_API Tensor& _softmax_backward_kernel( const Tensor& grad, const Tensor& output, int64_t dim, bool half_to_float, Tensor& grad_input); -Tensor& _log_softmax_backward_kernel( +TORCH_XPU_API Tensor& _log_softmax_backward_kernel( const Tensor& grad, const Tensor& output, int64_t dim, diff --git a/src/ATen/native/xpu/sycl/Sorting.cpp b/src/ATen/native/xpu/sycl/Sorting.cpp index 6e3a4f49e..a4cf160a2 100644 --- a/src/ATen/native/xpu/sycl/Sorting.cpp +++ b/src/ATen/native/xpu/sycl/Sorting.cpp @@ -15,6 +15,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/Sorting.h b/src/ATen/native/xpu/sycl/Sorting.h index 6abf7a475..813fe9ed3 100644 --- a/src/ATen/native/xpu/sycl/Sorting.h +++ b/src/ATen/native/xpu/sycl/Sorting.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -std::tuple sort_stable_kernel( +TORCH_XPU_API std::tuple sort_stable_kernel( const Tensor& self, c10::optional stable, Tensor& values, @@ -12,7 +12,7 @@ std::tuple sort_stable_kernel( int dim, bool descending); -void launch_median_kernel( +TORCH_XPU_API void launch_median_kernel( const TensorBase& vals, const TensorBase& inds, const TensorBase& self, diff --git a/src/ATen/native/xpu/sycl/StepKernels.cpp b/src/ATen/native/xpu/sycl/StepKernels.cpp index d3d76926b..34f703591 100644 --- a/src/ATen/native/xpu/sycl/StepKernels.cpp +++ b/src/ATen/native/xpu/sycl/StepKernels.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/StepKernels.h b/src/ATen/native/xpu/sycl/StepKernels.h index 90f949d9a..c026a6b65 100644 --- a/src/ATen/native/xpu/sycl/StepKernels.h +++ b/src/ATen/native/xpu/sycl/StepKernels.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void nextafter_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void nextafter_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp b/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp index 265170821..6e88a987f 100644 --- a/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp @@ -10,6 +10,8 @@ #include #include +#include + namespace at::native::xpu { using namespace at::native; using namespace at::xpu::detail; diff --git a/src/ATen/native/xpu/sycl/SummaryOpsKernels.h b/src/ATen/native/xpu/sycl/SummaryOpsKernels.h index becf3ac54..1a6465ab6 100644 --- a/src/ATen/native/xpu/sycl/SummaryOpsKernels.h +++ b/src/ATen/native/xpu/sycl/SummaryOpsKernels.h @@ -3,9 +3,7 @@ namespace at::native::xpu { -Tensor bincount_kernel( - const Tensor& self, - const Tensor& weights, - int64_t minlength); +TORCH_XPU_API Tensor +bincount_kernel(const Tensor& self, const Tensor& weights, int64_t minlength); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp b/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp index 8159db75f..1d2d7f772 100644 --- a/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp +++ b/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp @@ -6,6 +6,8 @@ #include +#include + namespace at { namespace native { namespace xpu { diff --git 
a/src/ATen/native/xpu/sycl/TensorCompareKernels.h b/src/ATen/native/xpu/sycl/TensorCompareKernels.h index 51cb74a40..0e83f33e9 100644 --- a/src/ATen/native/xpu/sycl/TensorCompareKernels.h +++ b/src/ATen/native/xpu/sycl/TensorCompareKernels.h @@ -4,20 +4,24 @@ namespace at::native::xpu { -void where_kernel(TensorIterator& iter); +TORCH_XPU_API void where_kernel(TensorIterator& iter); -void clamp_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void clamp_kernel(TensorIteratorBase& iter); -void clamp_scalar_kernel( +TORCH_XPU_API void clamp_scalar_kernel( TensorIteratorBase& iter, const Scalar& min, const Scalar& max); -void clamp_min_scalar_kernel(TensorIteratorBase& iter, Scalar min); +TORCH_XPU_API void clamp_min_scalar_kernel( + TensorIteratorBase& iter, + Scalar min); -void clamp_max_scalar_kernel(TensorIteratorBase& iter, Scalar max); +TORCH_XPU_API void clamp_max_scalar_kernel( + TensorIteratorBase& iter, + Scalar max); -void isin_kernel( +TORCH_XPU_API void isin_kernel( const Tensor& elements, const Tensor& test_elements, bool invert, diff --git a/src/ATen/native/xpu/sycl/TensorTopKKernel.cpp b/src/ATen/native/xpu/sycl/TensorTopKKernel.cpp index 2de5fbb8b..d2ec40bb9 100644 --- a/src/ATen/native/xpu/sycl/TensorTopKKernel.cpp +++ b/src/ATen/native/xpu/sycl/TensorTopKKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/TensorTopKKernel.h b/src/ATen/native/xpu/sycl/TensorTopKKernel.h index cb5c5a654..44db5de70 100644 --- a/src/ATen/native/xpu/sycl/TensorTopKKernel.h +++ b/src/ATen/native/xpu/sycl/TensorTopKKernel.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -std::tuple topk_kernel( +TORCH_XPU_API std::tuple topk_kernel( const at::Tensor& input, int64_t k, int64_t dim, diff --git a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp index ecf832de3..4a1ddcb74 100644 --- a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp +++ b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + #ifdef _WIN32 #define RESTRICT __restrict #else diff --git a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.h b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.h index b43bd52a8..b5fdbb661 100644 --- a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.h +++ b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.h @@ -4,9 +4,9 @@ namespace at::native::xpu { -void flip_kernel(TensorIterator& iter); +TORCH_XPU_API void flip_kernel(TensorIterator& iter); -void roll_kernel( +TORCH_XPU_API void roll_kernel( const Tensor& input, Tensor& output, IntArrayRef shifts, diff --git a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp index 65c165c2c..4e646bcdd 100644 --- a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at::native::xpu { using namespace at::xpu; diff --git a/src/ATen/native/xpu/sycl/TriangularOpsKernels.h b/src/ATen/native/xpu/sycl/TriangularOpsKernels.h index 289fd0bd7..8ae777429 100644 --- a/src/ATen/native/xpu/sycl/TriangularOpsKernels.h +++ b/src/ATen/native/xpu/sycl/TriangularOpsKernels.h @@ -3,8 +3,14 @@ namespace at::native::xpu { -Tensor& tril_kernel(Tensor& result, const Tensor& self, int64_t k); +TORCH_XPU_API Tensor& tril_kernel( + Tensor& result, + const Tensor& self, + int64_t k); 
-Tensor& triu_kernel(Tensor& result, const Tensor& self, int64_t k); +TORCH_XPU_API Tensor& triu_kernel( + Tensor& result, + const Tensor& self, + int64_t k); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp b/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp index 87de57a3a..d7a49f7d7 100644 --- a/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryComplexKernels.h b/src/ATen/native/xpu/sycl/UnaryComplexKernels.h index d3ad4fe15..7dd91771b 100644 --- a/src/ATen/native/xpu/sycl/UnaryComplexKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryComplexKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void conj_kernel(TensorIterator& iter); +TORCH_XPU_API void conj_kernel(TensorIterator& iter); -void conj_physical_kernel(TensorIterator& iter); +TORCH_XPU_API void conj_physical_kernel(TensorIterator& iter); -void neg_conj_kernel(TensorIterator& iter); +TORCH_XPU_API void neg_conj_kernel(TensorIterator& iter); -void neg_kernel(TensorIterator& iter); +TORCH_XPU_API void neg_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp b/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp index 96510a0e0..a8551c262 100644 --- a/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryFractionKernels.h b/src/ATen/native/xpu/sycl/UnaryFractionKernels.h index 10f4889b0..a3a2015df 100644 --- a/src/ATen/native/xpu/sycl/UnaryFractionKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryFractionKernels.h @@ -4,16 +4,18 @@ namespace at::native::xpu { -void reciprocal_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void reciprocal_kernel(TensorIteratorBase& iter); -void floor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void floor_kernel(TensorIteratorBase& iter); -void ceil_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void ceil_kernel(TensorIteratorBase& iter); -void round_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void round_kernel(TensorIteratorBase& iter); -void round_decimals_kernel(TensorIteratorBase& iter, int64_t decimals); +TORCH_XPU_API void round_decimals_kernel( + TensorIteratorBase& iter, + int64_t decimals); -void frac_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void frac_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGammaKernels.cpp b/src/ATen/native/xpu/sycl/UnaryGammaKernels.cpp index 4a8a4a55e..d2546e842 100644 --- a/src/ATen/native/xpu/sycl/UnaryGammaKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGammaKernels.cpp @@ -5,10 +5,12 @@ #include #include #include -#include +#include #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGammaKernels.h b/src/ATen/native/xpu/sycl/UnaryGammaKernels.h index 5eda977f6..2908a4b13 100644 --- a/src/ATen/native/xpu/sycl/UnaryGammaKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryGammaKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void digamma_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void digamma_kernel(TensorIteratorBase& iter); -void polygamma_kernel(TensorIteratorBase& iter, int64_t n); +TORCH_XPU_API void 
polygamma_kernel(TensorIteratorBase& iter, int64_t n); -void lgamma_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void lgamma_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.cpp index e1b8ff80f..d35dedf96 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { template struct AcosFunctor { diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.h index 4ff06d256..17acb1061 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.h @@ -3,6 +3,7 @@ #include namespace at::native::xpu { -void acos_kernel(TensorIteratorBase& iter); -} // namespace at::native::xpu \ No newline at end of file +TORCH_XPU_API void acos_kernel(TensorIteratorBase& iter); + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.cpp index cd9864275..9062ee17a 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.h index 98dc2eed5..732d307ec 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void acosh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void acosh_kernel(TensorIteratorBase& iter); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp index c8b1aaca6..1819b47dc 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h index 194ce6479..ff449e2cd 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void asin_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void asin_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp index 009a68b47..24169a1dc 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h index 4d37288de..2668ee963 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void asinh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void asinh_kernel(TensorIteratorBase& iter); } 
// namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp index f241e9da3..c6ce4b1b0 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h index 022720223..6ee127539 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void atan_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void atan_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp index 630a64d39..3033ff8f8 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h index 5536641e7..267b59eb6 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void atanh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void atanh_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.cpp index 21552d99d..e4eee5e7a 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.h index 35123a666..fc31580b7 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void cos_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void cos_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp index 11440b3d3..16cffe893 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h index 7f031e3ff..bbd10e1aa 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void cosh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void cosh_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.cpp index 2a17cefcb..27e3e2906 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.cpp @@ -5,6 +5,8 @@ #include +#include + namespace 
at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.h index 76dca73c8..ecd4da1d4 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void sin_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sin_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.cpp index 2155aafad..06309f45c 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.h index e3ceaf643..a5005e1d0 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void sinh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sinh_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.cpp index 57c0049b1..996da809b 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.h index b96a35f93..9446ff058 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void tan_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void tan_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.cpp index dcbe8a970..1b71b3974 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.h index 6cecc99e2..831421a12 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void tanh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void tanh_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryKernels.cpp b/src/ATen/native/xpu/sycl/UnaryKernels.cpp index ef404a4fd..2f0645c4d 100644 --- a/src/ATen/native/xpu/sycl/UnaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryKernels.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryKernels.h b/src/ATen/native/xpu/sycl/UnaryKernels.h index cc394de6b..db7005523 100644 --- a/src/ATen/native/xpu/sycl/UnaryKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryKernels.h @@ -4,17 +4,17 @@ namespace at::native::xpu { -void sqrt_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sqrt_kernel(TensorIteratorBase& iter); -void 
rsqrt_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void rsqrt_kernel(TensorIteratorBase& iter); -void bitwise_not_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void bitwise_not_kernel(TensorIteratorBase& iter); -void exp_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void exp_kernel(TensorIteratorBase& iter); -void expm1_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void expm1_kernel(TensorIteratorBase& iter); -void nan_to_num_kernel( +TORCH_XPU_API void nan_to_num_kernel( TensorIteratorBase& iter, std::optional nan, std::optional pos_inf, diff --git a/src/ATen/native/xpu/sycl/UnaryLogKernels.cpp b/src/ATen/native/xpu/sycl/UnaryLogKernels.cpp index 57ee1c706..81fc380bb 100644 --- a/src/ATen/native/xpu/sycl/UnaryLogKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryLogKernels.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryLogKernels.h b/src/ATen/native/xpu/sycl/UnaryLogKernels.h index c8d16ceae..72db9235a 100644 --- a/src/ATen/native/xpu/sycl/UnaryLogKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryLogKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void log_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log_kernel(TensorIteratorBase& iter); -void log10_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log10_kernel(TensorIteratorBase& iter); -void log1p_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log1p_kernel(TensorIteratorBase& iter); -void log2_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log2_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnarySignKernels.cpp b/src/ATen/native/xpu/sycl/UnarySignKernels.cpp index b35aba437..031a2ace2 100644 --- a/src/ATen/native/xpu/sycl/UnarySignKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnarySignKernels.cpp @@ -7,6 +7,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnarySignKernels.h b/src/ATen/native/xpu/sycl/UnarySignKernels.h index 98b6eeb5f..01759a725 100644 --- a/src/ATen/native/xpu/sycl/UnarySignKernels.h +++ b/src/ATen/native/xpu/sycl/UnarySignKernels.h @@ -4,14 +4,14 @@ namespace at::native::xpu { -void logical_not_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logical_not_kernel(TensorIteratorBase& iter); -void neg_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void neg_kernel(TensorIteratorBase& iter); -void sgn_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sgn_kernel(TensorIteratorBase& iter); -void sign_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sign_kernel(TensorIteratorBase& iter); -void signbit_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void signbit_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp index e2303634e..cb00537fe 100644 --- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp @@ -10,6 +10,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h index 6bed9c6de..f810da576 100644 --- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h +++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h @@ -4,16 +4,18 @@ namespace at::native::xpu { -void sigmoid_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sigmoid_kernel(TensorIteratorBase& iter); 
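
All of the header hunks above apply one pattern: each kernel entry point that must be callable from another library is annotated with the TORCH_XPU_API export/import macro. The macro's definition is not shown in this patch; a plausible sketch, assuming the usual Windows dllexport/dllimport convention keyed off the TORCH_XPU_BUILD_MAIN_LIB define that BuildOnWindows.cmake sets for the kernel libraries (see below), is:

    // Hypothetical definition of TORCH_XPU_API (not part of this patch).
    // When compiling a kernel DLL itself (TORCH_XPU_BUILD_MAIN_LIB defined),
    // symbols are exported; consumers of the headers import them instead.
    #ifdef _WIN32
    #if defined(TORCH_XPU_BUILD_MAIN_LIB)
    #define TORCH_XPU_API __declspec(dllexport)
    #else
    #define TORCH_XPU_API __declspec(dllimport)
    #endif
    #else
    // On Linux, counter the default hidden symbol visibility instead.
    #define TORCH_XPU_API __attribute__((visibility("default")))
    #endif

Without such an annotation, the MSVC linker cannot see the kernel symbols across DLL boundaries, which is the "symbols visibility" issue this patch fixes.
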
-void erf_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void erf_kernel(TensorIteratorBase& iter); -void erfc_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void erfc_kernel(TensorIteratorBase& iter); -void erfinv_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void erfinv_kernel(TensorIteratorBase& iter); -void exp2_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void exp2_kernel(TensorIteratorBase& iter); -void logit_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar); +TORCH_XPU_API void logit_kernel( + TensorIteratorBase& iter, + const Scalar& eps_scalar); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.cpp b/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.cpp index ad188f39b..4b95fc553 100644 --- a/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.cpp @@ -8,6 +8,8 @@ #include #include +#include + namespace at::native::xpu { constexpr int n_elems_per_work_item = 4; // UNROLLED_ELEM_PER_WORK_ITEM; diff --git a/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.h b/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.h index 35c2f83c1..136f88482 100644 --- a/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.h +++ b/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void unfold_backward_kernel( +TORCH_XPU_API void unfold_backward_kernel( Tensor& grad_out, const Tensor& grad_in, int64_t dim, diff --git a/src/ATen/native/xpu/sycl/UniqueKernels.cpp b/src/ATen/native/xpu/sycl/UniqueKernels.cpp index b57274a8e..359712d1f 100644 --- a/src/ATen/native/xpu/sycl/UniqueKernels.cpp +++ b/src/ATen/native/xpu/sycl/UniqueKernels.cpp @@ -3,6 +3,8 @@ #include #include +#include + namespace at::native::xpu { struct UniqueNotEqualFunctor { diff --git a/src/ATen/native/xpu/sycl/UniqueKernels.h b/src/ATen/native/xpu/sycl/UniqueKernels.h index b7a7cd1ae..25b36cc64 100644 --- a/src/ATen/native/xpu/sycl/UniqueKernels.h +++ b/src/ATen/native/xpu/sycl/UniqueKernels.h @@ -3,29 +3,30 @@ #include namespace at::native::xpu { -std::tuple unique_consecutive_kernel( + +TORCH_XPU_API std::tuple unique_consecutive_kernel( const Tensor& self, const bool return_inverse, const bool return_counts, c10::optional dim); -std::tuple unique_dim_consecutive_kernel( +TORCH_XPU_API std::tuple unique_dim_consecutive_kernel( const Tensor& self, const int64_t dim, const bool return_inverse, const bool return_counts); -std::tuple unique_dim_kernel( +TORCH_XPU_API std::tuple unique_dim_kernel( const Tensor& self, const int64_t dim, const bool return_inverse, const bool return_counts); -std::tuple _unique_kernel( +TORCH_XPU_API std::tuple _unique_kernel( const Tensor& self, const bool return_inverse); -std::tuple _unique2_kernel( +TORCH_XPU_API std::tuple _unique2_kernel( const Tensor& self, const bool return_inverse, const bool return_counts); diff --git a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp index 8c6c33afd..a60136e30 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.h b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.h index 835e25def..e4d6d9485 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu 
{ -void upsample_bicubic2d_kernel( +TORCH_XPU_API void upsample_bicubic2d_kernel( Tensor& output, const Tensor& input, IntArrayRef output_size, @@ -12,4 +12,4 @@ void upsample_bicubic2d_kernel( std::optional scales_h, std::optional scales_w); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp index 1f578af13..e7c81e29a 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp @@ -14,6 +14,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h index 3f75f79cf..fa0e8601a 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void upsample_bilinear2d_out_kernel( +TORCH_XPU_API void upsample_bilinear2d_out_kernel( Tensor& output, const Tensor& input, IntArrayRef output_size, @@ -12,7 +12,7 @@ void upsample_bilinear2d_out_kernel( std::optional scales_h, std::optional scales_w); -void upsample_bilinear2d_backward_out_kernel( +TORCH_XPU_API void upsample_bilinear2d_backward_out_kernel( Tensor& grad_input, const Tensor& grad_output_, IntArrayRef output_size, @@ -21,4 +21,4 @@ void upsample_bilinear2d_backward_out_kernel( c10::optional scales_h, c10::optional scales_w); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.cpp index 71fc04ab1..b0ac8cc54 100644 --- a/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.cpp @@ -14,6 +14,8 @@ #include "ATen/Context.h" #include "ATen/core/TensorBase.h" +#include + namespace at::native::xpu { template struct UpsampleLinear1dKernelFunctor { @@ -213,4 +215,4 @@ void upsample_linear1d_backward_kernel( } // namespace at::native::xpu #pragma GCC diagnostic pop -#pragma clang diagnostic pop \ No newline at end of file +#pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.h b/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.h index 941de1638..5beeeebc7 100644 --- a/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.h @@ -3,14 +3,15 @@ #include namespace at::native::xpu { -void upsample_linear1d_kernel( + +TORCH_XPU_API void upsample_linear1d_kernel( const Tensor& input, IntArrayRef output_size, bool align_corners, std::optional scales, Tensor& output); -void upsample_linear1d_backward_kernel( +TORCH_XPU_API void upsample_linear1d_backward_kernel( const Tensor& grad_output_, IntArrayRef output_size, IntArrayRef input_size, @@ -18,4 +19,4 @@ void upsample_linear1d_backward_kernel( std::optional scales, Tensor& grad_input); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp index 3c4a01d91..4ceda372c 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native { namespace xpu { diff --git 
a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h index bb6dd83ff..173302bc6 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h @@ -5,14 +5,14 @@ namespace at::native::xpu { -void upsample_nearest1d_kernel( +TORCH_XPU_API void upsample_nearest1d_kernel( Tensor& output, const Tensor& input_, IntArrayRef output_size, c10::optional scales, bool is_exact); -void upsample_nearest1d_backward_kernel( +TORCH_XPU_API void upsample_nearest1d_backward_kernel( Tensor& grad_input, const Tensor& grad_output_, IntArrayRef output_size, diff --git a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp index 62e2ff9e0..682a73cc6 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp @@ -707,4 +707,4 @@ void upsample_nearest2d_kernel( } } } // namespace xpu -} // namespace at::native \ No newline at end of file +} // namespace at::native diff --git a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h index 7d11e03af..8f8f6fff7 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h @@ -5,7 +5,7 @@ namespace at::native::xpu { -void upsample_nearest2d_kernel( +TORCH_XPU_API void upsample_nearest2d_kernel( Tensor& output, const Tensor& input_, IntArrayRef output_size, @@ -13,7 +13,7 @@ void upsample_nearest2d_kernel( c10::optional scales_w, bool is_exact); -void upsample_nearest2d_backward_kernel( +TORCH_XPU_API void upsample_nearest2d_backward_kernel( Tensor& grad_input, const Tensor& grad_output_, IntArrayRef output_size, diff --git a/src/ATen/native/xpu/sycl/WeightNormKernels.cpp b/src/ATen/native/xpu/sycl/WeightNormKernels.cpp index dd93f68c3..ac67d5d34 100644 --- a/src/ATen/native/xpu/sycl/WeightNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/WeightNormKernels.cpp @@ -7,6 +7,8 @@ #include #include "comm/Runtime.h" +#include + namespace at::native::xpu { template @@ -1024,4 +1026,4 @@ std::tuple weight_norm_backward_kernel( return {grad_v, grad_g}; } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/WeightNormKernels.h b/src/ATen/native/xpu/sycl/WeightNormKernels.h index 5b0d7afd2..88a45b298 100644 --- a/src/ATen/native/xpu/sycl/WeightNormKernels.h +++ b/src/ATen/native/xpu/sycl/WeightNormKernels.h @@ -2,15 +2,17 @@ #include namespace at::native::xpu { -std::tuple weight_norm_kernel( + +TORCH_XPU_API std::tuple weight_norm_kernel( const Tensor& v, const Tensor& g, int64_t dim); -std::tuple weight_norm_backward_kernel( +TORCH_XPU_API std::tuple weight_norm_backward_kernel( const Tensor& grad_w, const Tensor& saved_v, const Tensor& saved_g, const Tensor& saved_norms, int64_t dim); -} // namespace at::native::xpu \ No newline at end of file + +} // namespace at::native::xpu diff --git a/src/BuildOnLinux.cmake b/src/BuildOnLinux.cmake new file mode 100644 index 000000000..3e304b31f --- /dev/null +++ b/src/BuildOnLinux.cmake @@ -0,0 +1,79 @@ +# Build on Linux + +set(TORCH_XPU_OPS_LIBRARIES) +set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE) + +add_library( + torch_xpu_ops + STATIC + ${ATen_XPU_CPP_SRCS} + ${ATen_XPU_NATIVE_CPP_SRCS} + ${ATen_XPU_GEN_SRCS}) + +if(BUILD_SEPARATE_OPS) + foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + get_filename_component(name ${sycl_src} 
NAME_WLE REALPATH) + set(sycl_lib torch-xpu-ops-sycl-${name}) + sycl_add_library( + ${sycl_lib} + SHARED + SYCL_SOURCES ${sycl_src}) + target_link_libraries(torch_xpu_ops PUBLIC ${sycl_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) + + # Decouple from PyTorch cmake definition. + install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + endforeach() +else() + # Split SYCL kernels into 2 libraries by category: 1) Unary+Binary 2) Others. + set(ATen_XPU_SYCL_UNARY_BINARY_SRCS) + set(ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS) + + foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + string(REGEX MATCH "Binary" IS_BINARY ${sycl_src}) + string(REGEX MATCH "Unary" IS_UNARY ${sycl_src}) + if(IS_BINARY STREQUAL "" AND IS_UNARY STREQUAL "") + list(APPEND ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS ${sycl_src}) + else() + list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src}) + endif() + endforeach() + + set(sycl_unary_binary_lib torch_xpu_ops_sycl_unary_binary_kernels) + sycl_add_library( + ${sycl_unary_binary_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_BINARY_SRCS}) + target_link_libraries(torch_xpu_ops PUBLIC ${sycl_unary_binary_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_binary_lib}) + + # Decouple from PyTorch cmake definition. + install(TARGETS ${sycl_unary_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + set(sycl_lib torch_xpu_ops_sycl_kernels) + sycl_add_library( + ${sycl_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS}) + target_link_libraries(torch_xpu_ops PUBLIC ${sycl_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) + + # Decouple from PyTorch cmake definition. + install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") +endif() +set(SYCL_LINK_LIBRARIES_KEYWORD) + +list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) + +foreach(lib ${TORCH_XPU_OPS_LIBRARIES}) + # Align with PyTorch compile options in PYTORCH_SRC_DIR/cmake/public/utils.cmake + torch_compile_options(${lib}) + target_compile_options_if_supported(${lib} "-Wno-deprecated-copy") + target_compile_options(${lib} PRIVATE ${TORCH_XPU_OPS_FLAGS}) + + target_include_directories(${lib} PUBLIC ${TORCH_XPU_OPS_INCLUDE_DIRS}) + target_include_directories(${lib} PUBLIC ${ATen_XPU_INCLUDE_DIRS}) + target_include_directories(${lib} PUBLIC ${SYCL_INCLUDE_DIR}) + + target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARY}) +endforeach() diff --git a/src/BuildOnWindows.cmake b/src/BuildOnWindows.cmake new file mode 100644 index 000000000..a8e00cb55 --- /dev/null +++ b/src/BuildOnWindows.cmake @@ -0,0 +1,106 @@ +# Build on Windows + +set(TORCH_XPU_OPS_LIBRARIES) +set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE) + +# Work around the cyclic dependency: +# libtorch_xpu.so links to libtorch_xpu_ops.a +# libtorch_xpu_ops.a dlopens libtorch_xpu_ops_aten.so (Break cycle) +# libtorch_xpu_ops_aten.so links to libtorch_xpu_ops_sycl_unary_binary_kernels.so and libtorch_xpu_ops_sycl_kernels.so +# libtorch_xpu_ops_sycl_unary_binary_kernels.so and libtorch_xpu_ops_sycl_kernels.so link to libtorch_xpu.so +add_library( + torch_xpu_ops + STATIC + ${ATen_XPU_CPP_SRCS} + "bridge.cpp") +set(PATH_TO_TORCH_XPU_OPS_ATEN_LIB \"torch_xpu_ops_aten.dll\") +target_compile_options(torch_xpu_ops PRIVATE -DPATH_TO_TORCH_XPU_OPS_ATEN_LIB=${PATH_TO_TORCH_XPU_OPS_ATEN_LIB}) + +add_library( + torch_xpu_ops_aten + SHARED + ${ATen_XPU_NATIVE_CPP_SRCS} + ${ATen_XPU_GEN_SRCS}) +install(TARGETS torch_xpu_ops_aten DESTINATION "${TORCH_INSTALL_LIB_DIR}") +target_link_libraries(torch_xpu_ops_aten PUBLIC torch_xpu) + +if(BUILD_SEPARATE_OPS) +
foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + get_filename_component(name ${sycl_src} NAME_WLE REALPATH) + set(sycl_lib torch-xpu-ops-sycl-${name}) + sycl_add_library( + ${sycl_lib} + SHARED + SYCL_SOURCES ${sycl_src}) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) + + # Decouple from PyTorch cmake definition. + install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + endforeach() +else() + # Split SYCL kernels into 2 libraries by category: 1) Unary+Binary 2) Others. + set(ATen_XPU_SYCL_UNARY_BINARY_SRCS) + set(ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS) + + foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + string(REGEX MATCH "Binary" IS_BINARY ${sycl_src}) + string(REGEX MATCH "Unary" IS_UNARY ${sycl_src}) + # Resolve cyclic dependencies between + # torch_xpu_ops_sycl_unary_binary_kernels.dll and + # torch_xpu_ops_sycl_kernels.dll: keep the definition and invocation of a + # kernel in the same kernel library. Here we move the elementwise Pow and + # Copy kernels into torch_xpu_ops_sycl_unary_binary_kernels.dll. + string(REGEX MATCH "Pow" IS_POW ${sycl_src}) + string(REGEX MATCH "Copy" IS_COPY ${sycl_src}) + if(IS_BINARY STREQUAL "" AND IS_UNARY STREQUAL "" AND IS_POW STREQUAL "" AND IS_COPY STREQUAL "") + list(APPEND ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS ${sycl_src}) + else() + list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src}) + endif() + endforeach() + + set(sycl_unary_binary_lib torch_xpu_ops_sycl_unary_binary_kernels) + sycl_add_library( + ${sycl_unary_binary_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_BINARY_SRCS}) + target_compile_definitions(${sycl_unary_binary_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_unary_binary_lib}) + target_link_libraries(${sycl_unary_binary_lib} PUBLIC torch_xpu) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_binary_lib}) + + # Decouple from PyTorch cmake definition. + install(TARGETS ${sycl_unary_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + set(sycl_lib torch_xpu_ops_sycl_kernels) + sycl_add_library( + ${sycl_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS}) + target_compile_definitions(${sycl_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_lib}) + target_link_libraries(${sycl_lib} PUBLIC torch_xpu) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) + + # Decouple from PyTorch cmake definition.
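
The string(REGEX MATCH) checks above classify sources purely by substrings of the file path. A self-contained C++ restatement of that predicate (the file names below are illustrative, not taken from the source tree) can help sanity-check which DLL a given kernel source lands in:

    // Mirrors the CMake classification: any source whose path contains
    // "Binary", "Unary", "Pow", or "Copy" goes into
    // torch_xpu_ops_sycl_unary_binary_kernels; everything else goes into
    // torch_xpu_ops_sycl_kernels.
    #include <iostream>
    #include <string>
    #include <vector>

    static bool in_unary_binary_lib(const std::string& src) {
      for (const char* tag : {"Binary", "Unary", "Pow", "Copy"}) {
        if (src.find(tag) != std::string::npos) return true;
      }
      return false;
    }

    int main() {
      const std::vector<std::string> srcs = {
          "UnaryKernels.cpp", "SoftMaxKernels.cpp", "Shape.cpp",
          "PowKernels.cpp" /* hypothetical name */};
      for (const auto& s : srcs)
        std::cout << s << " -> "
                  << (in_unary_binary_lib(s) ? "unary_binary" : "others")
                  << "\n";
    }
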
+ install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") +endif() +set(SYCL_LINK_LIBRARIES_KEYWORD) + +list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) +list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten) + +foreach(lib ${TORCH_XPU_OPS_LIBRARIES}) + # Align with PyTorch compile options in PYTORCH_SRC_DIR/cmake/public/utils.cmake + torch_compile_options(${lib}) + target_compile_options_if_supported(${lib} "-Wno-deprecated-copy") + target_compile_options(${lib} PRIVATE ${TORCH_XPU_OPS_FLAGS}) + + target_include_directories(${lib} PUBLIC ${TORCH_XPU_OPS_INCLUDE_DIRS}) + target_include_directories(${lib} PUBLIC ${ATen_XPU_INCLUDE_DIRS}) + target_include_directories(${lib} PUBLIC ${SYCL_INCLUDE_DIR}) + + target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARY}) + target_link_libraries(${lib} PUBLIC c10_xpu) +endforeach() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ba40a4b8c..0716ca5af 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,88 +2,24 @@ include(${TORCH_XPU_OPS_ROOT}/cmake/Codegen.cmake) set(ATen_XPU_CPP_SRCS) +set(ATen_XPU_NATIVE_CPP_SRCS) set(ATen_XPU_SYCL_SRCS) set(ATen_XPU_INCLUDE_DIRS ${TORCH_XPU_OPS_ROOT}/src CACHE STRING "ATen XPU Include directory") add_subdirectory(ATen) -set(TORCH_XPU_OPS_LIBRARIES) -set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE) - -add_library( - torch_xpu_ops - STATIC - ${ATen_XPU_CPP_SRCS} - ${ATen_XPU_GEN_SRCS}) - -if(BUILD_SEPARATE_OPS) - foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) - get_filename_component(name ${sycl_src} NAME_WLE REALPATH) - set(sycl_lib torch-xpu-ops-sycl-${name}) - sycl_add_library( - ${sycl_lib} - SHARED - SYCL_SOURCES ${sycl_src}) - target_link_libraries(torch_xpu_ops PUBLIC ${sycl_lib}) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - endforeach() +# With the increase in binary size, we have to split libtorch_xpu.so into +# multiple libraries. Because of strict linkage requirements on Windows, +# we add extra logic to resolve 1) cyclic dependency and 2) symbol visibility. +# To isolate the impact and keep both paths maintainable, separate code pieces +# set up the CMake targets. We will revert the library split and unify these +# once compression of device code is supported by the SYCL/backend compiler. +if(WIN32) + include(BuildOnWindows.cmake) else() - # Split SYCL kernels into 2 libraries as categories 1) Unary+Binary 2) Others. - set(ATen_XPU_SYCL_UNARY_BINARY_SRCS) - set(ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS) - - foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) - string(REGEX MATCH "Binary" IS_BINARY ${sycl_src}) - string(REGEX MATCH "Unary" IS_UNARY ${sycl_src}) - if(IS_BINARY STREQUAL "" AND IS_UNARY STREQUAL "") - list(APPEND ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS ${sycl_src}) - else() - list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src}) - endif() - endforeach() - - set(sycl_unary_binary_lib torch_xpu_ops_sycl_unary_binary_kernels) - sycl_add_library( - ${sycl_unary_binary_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_BINARY_SRCS}) - target_link_libraries(torch_xpu_ops PUBLIC ${sycl_unary_binary_lib}) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_binary_lib}) - - # Decouple with PyTorch cmake definition.
- install(TARGETS ${sycl_unary_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - - set(sycl_lib torch_xpu_ops_sycl_kernels) - sycl_add_library( - ${sycl_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS}) - target_link_libraries(torch_xpu_ops PUBLIC ${sycl_lib}) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + include(BuildOnLinux.cmake) endif() -set(SYCL_LINK_LIBRARIES_KEYWORD) - -list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) - -foreach(lib ${TORCH_XPU_OPS_LIBRARIES}) - # Align with PyTorch compile options PYTORCH_SRC_DIR/cmake/public/utils.cmake - torch_compile_options(${lib}) - target_compile_options_if_supported(${lib} "-Wno-deprecated-copy") - target_compile_options(${lib} PRIVATE ${TORCH_XPU_OPS_FLAGS}) - - target_include_directories(${lib} PUBLIC ${TORCH_XPU_OPS_INCLUDE_DIRS}) - target_include_directories(${lib} PUBLIC ${ATen_XPU_INCLUDE_DIRS}) - target_include_directories(${lib} PUBLIC ${SYCL_INCLUDE_DIR}) - - target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARY}) -endforeach() include(${TORCH_XPU_OPS_ROOT}/cmake/ClangFormat.cmake) if(CLANG_FORMAT) diff --git a/src/bridge.cpp b/src/bridge.cpp new file mode 100644 index 000000000..c19ce2554 --- /dev/null +++ b/src/bridge.cpp @@ -0,0 +1,34 @@ +#include <windows.h> +#include <sstream> +#include <stdexcept> + +// This implementation helps work around the cyclic dependency introduced when +// we separate kernels into multiple DLLs/SOs to avoid a large binary (>2GB). +// The file is built into libtorch_xpu.so. libtorch_xpu.so won't link against +// libtorch_xpu_ops_aten.so but dlopens the library instead, breaking the +// cyclic dependency during linkage. The cycle being broken is: +// libtorch_xpu.so -> (dlopen) libtorch_xpu_ops_aten.so -> (link) +// libtorch_xpu_ops_kernels.so +// -> (link) +// libtorch_xpu_ops_unary_binary_kernels.so +// libtorch_xpu_ops_kernels.so -> (link) libtorch_xpu.so +// libtorch_xpu_ops_unary_binary_kernels.so -> (link) libtorch_xpu.so +namespace { + +class LoadTorchXPUOps { + public: + LoadTorchXPUOps() { + if (NULL == LoadLibrary(PATH_TO_TORCH_XPU_OPS_ATEN_LIB)) { + std::ostringstream error; + error << "PyTorch XPU operators library is not loaded (ERROR: " + << GetLastError() + << "). Please check if PyTorch is installed correctly," + << " or file an issue at https://github.com/intel/torch-xpu-ops/issues."; + throw std::runtime_error(error.str()); + } + } +}; + +static LoadTorchXPUOps init; + +} // namespace
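
For reference, PATH_TO_TORCH_XPU_OPS_ATEN_LIB reaches bridge.cpp through the -DPATH_TO_TORCH_XPU_OPS_ATEN_LIB=... compile option set in BuildOnWindows.cmake, where it is defined as the escaped string \"torch_xpu_ops_aten.dll\". Assuming a non-UNICODE build (so LoadLibrary resolves to LoadLibraryA), the constructor body above effectively compiles to the following sketch:

    #include <sstream>
    #include <stdexcept>
    #include <windows.h>

    // Effective form of LoadTorchXPUOps::LoadTorchXPUOps() after the macro is
    // substituted with the string literal "torch_xpu_ops_aten.dll".
    void load_torch_xpu_ops_aten() {
      if (NULL == LoadLibraryA("torch_xpu_ops_aten.dll")) {
        std::ostringstream error;
        error << "PyTorch XPU operators library is not loaded (ERROR: "
              << GetLastError() << ").";
        throw std::runtime_error(error.str());
      }
    }

Because `init` is a namespace-scope static object compiled into libtorch_xpu, its constructor runs during that library's static initialization, so torch_xpu_ops_aten.dll is loaded before any XPU operator can be dispatched.
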