diff --git a/cmake/BuildFlags.cmake b/cmake/BuildFlags.cmake
index 0b5052f7b..0ee38df7f 100644
--- a/cmake/BuildFlags.cmake
+++ b/cmake/BuildFlags.cmake
@@ -120,14 +120,17 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
   set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "${SYCL_OFFLINE_COMPILER_CG_OPTIONS} -cl-fp32-correctly-rounded-divide-sqrt")
   set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "-options '${SYCL_OFFLINE_COMPILER_CG_OPTIONS}'")
 
-  if((DEFINED ENV{TORCH_XPU_ARCH_LIST}) AND NOT ("$ENV{TORCH_XPU_ARCH_LIST}" STREQUAL ""))
-    set(SYCL_OFFLINE_COMPILER_AOT_OPTIONS "-device $ENV{TORCH_XPU_ARCH_LIST}")
+  if(WIN32)
+    set(AOT_TARGETS "ats-m150,lnl-m,mtl-u,mtl-h")
   else()
-    set(SYCL_OFFLINE_COMPILER_AOT_OPTIONS "-device pvc,xe-lpg,ats-m150")
-    message(STATUS "'TORCH_XPU_ARCH_LIST' not set. Using default configuration for a full AOT build."
-      "Try specifying from 'pvc,xe-lpg,ats-m150' if you don't need.")
+    set(AOT_TARGETS "pvc,xe-lpg,ats-m150")
+  endif()
+  if((DEFINED ENV{TORCH_XPU_ARCH_LIST}) AND NOT ("$ENV{TORCH_XPU_ARCH_LIST}" STREQUAL ""))
+    set(AOT_TARGETS "$ENV{TORCH_XPU_ARCH_LIST}")
   endif()
-  message(STATUS " SYCL_OFFLINE_COMPILER_AOT_OPTIONS: ${SYCL_OFFLINE_COMPILER_AOT_OPTIONS}")
+
+  set(SYCL_OFFLINE_COMPILER_AOT_OPTIONS "-device ${AOT_TARGETS}")
+  message(STATUS "Compile Intel GPU AOT Targets for ${AOT_TARGETS}")
 
   set(SYCL_OFFLINE_COMPILER_FLAGS "${SYCL_OFFLINE_COMPILER_AOT_OPTIONS} ${SYCL_OFFLINE_COMPILER_CG_OPTIONS}")
 else()
diff --git a/cmake/Modules/FindSYCL/run_sycl.cmake b/cmake/Modules/FindSYCL/run_sycl.cmake
index dcc99c18f..f56d49768 100644
--- a/cmake/Modules/FindSYCL/run_sycl.cmake
+++ b/cmake/Modules/FindSYCL/run_sycl.cmake
@@ -56,6 +56,12 @@ endforeach()
 # Choose host flags in FindSYCL.cmake
 @SYCL_host_flags@
 
+# Adding permissive flag for MSVC build to overcome ambiguous symbol error.
+if(WIN32)
+  string(APPEND SYCL_host_compiler_flags "/permissive- ")
+endif()
+
+
 list(REMOVE_DUPLICATES CMAKE_HOST_FLAGS)
 foreach(flag ${CMAKE_HOST_FLAGS})
   # Extra quotes are added around each flag to help SYCL parse out flags with spaces.
diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt index 815ad018f..57c4783cd 100644 --- a/src/ATen/CMakeLists.txt +++ b/src/ATen/CMakeLists.txt @@ -1,10 +1,13 @@ # ATen XPU sources -file(GLOB xpu_cpp "xpu/*.cpp" "native/xpu/*.cpp" "native/sparse/*.cpp") +file(GLOB xpu_cpp "xpu/*.cpp") +file(GLOB xpu_native_cpp "native/xpu/*.cpp" "native/sparse/*.cpp") file(GLOB xpu_sycl "native/xpu/sycl/*.cpp") list(APPEND ATen_XPU_CPP_SRCS ${xpu_cpp}) +list(APPEND ATen_XPU_NATIVE_CPP_SRCS ${xpu_native_cpp}) list(APPEND ATen_XPU_SYCL_SRCS ${xpu_sycl}) set(ATen_XPU_CPP_SRCS ${ATen_XPU_CPP_SRCS} PARENT_SCOPE) +set(ATen_XPU_NATIVE_CPP_SRCS ${ATen_XPU_NATIVE_CPP_SRCS} PARENT_SCOPE) set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE) diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp index ba3d3a58b..22a031bc6 100644 --- a/src/ATen/native/xpu/RangeFactories.cpp +++ b/src/ATen/native/xpu/RangeFactories.cpp @@ -31,7 +31,7 @@ Tensor& arange_out( TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); TORCH_CHECK( - std::isfinite(xstart) && std::isfinite(xend), + std::isfinite(static_cast(xstart)) && std::isfinite(static_cast(xend)), "unsupported range: ", xstart, " -> ", @@ -87,5 +87,33 @@ Tensor& arange_out( return xpu::arange_kernel(start, end, step, out); } + +Tensor& range_xpu_out( + const Scalar& start, + const Scalar& end, + const Scalar& step, + Tensor& out) { + auto xstart = start.to(); + auto xend = end.to(); + auto xstep = step.to(); + + TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); + TORCH_CHECK( + std::isfinite(static_cast(xstart)) && std::isfinite(static_cast(xend)), + "unsupported range: ", + xstart, + " -> ", + xend); + TORCH_CHECK( + ((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), + "upper bound and larger bound inconsistent with step sign"); + int64_t size = static_cast(((xend - xstart) / xstep) + 1); + if (out.numel() != size) { + out.resize_({size}); + } + + return at::native::xpu::range_kernel(start, end, step, out); +} + } // namespace native } // namespace at diff --git a/src/ATen/native/xpu/sycl/AbsKernel.cpp b/src/ATen/native/xpu/sycl/AbsKernel.cpp index 596000ecb..3df01baee 100644 --- a/src/ATen/native/xpu/sycl/AbsKernel.cpp +++ b/src/ATen/native/xpu/sycl/AbsKernel.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/AbsKernel.h b/src/ATen/native/xpu/sycl/AbsKernel.h index 3b51e993e..3423c32cf 100644 --- a/src/ATen/native/xpu/sycl/AbsKernel.h +++ b/src/ATen/native/xpu/sycl/AbsKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void abs_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void abs_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp index bfb6eb58c..ee73856ee 100644 --- a/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationEluKernels.h b/src/ATen/native/xpu/sycl/ActivationEluKernels.h index 555bfb5f4..a15eaa590 100644 --- a/src/ATen/native/xpu/sycl/ActivationEluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationEluKernels.h @@ -4,13 +4,13 @@ namespace at::native::xpu { -void elu_kernel( +TORCH_XPU_API void elu_kernel( TensorIteratorBase& iter, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale); 
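The recurring change across the kernel headers above and below is the TORCH_XPU_API annotation on every public kernel entry point, so the symbols stay visible when the kernels are built into a separate library (in particular a Windows DLL). The macro's definition is not part of this diff; as an assumption, it follows the usual shared-library export/import pattern, roughly:

```cpp
// Hypothetical sketch of a TORCH_XPU_API-style export macro; the real
// definition lives elsewhere in the tree and may differ. On Windows, DLL
// symbols are hidden unless explicitly exported, so kernels called from
// another module need the annotation; on Linux the equivalent is default
// ELF visibility.
#ifdef _WIN32
#  if defined(TORCH_XPU_BUILD_MAIN_LIB) // assumed flag set while building the library itself
#    define TORCH_XPU_API __declspec(dllexport)
#  else
#    define TORCH_XPU_API __declspec(dllimport)
#  endif
#else
#  define TORCH_XPU_API __attribute__((visibility("default")))
#endif

// Usage then mirrors the header hunks in this patch, e.g.:
// TORCH_XPU_API void abs_kernel(TensorIteratorBase& iter);
```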
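Further up, the new range_xpu_out entry point in RangeFactories.cpp keeps range's inclusive upper bound, so the output is resized to ((end - start) / step) + 1 elements. A minimal standalone sketch of that count logic (plain C++, not the ATen code):

```cpp
#include <cassert>
#include <cstdint>

// Element count of an inclusive range [start, end] walked with `step`,
// matching the resize_({size}) computation in range_xpu_out above.
// Callers are assumed to have passed the TORCH_CHECKs in the hunk
// (nonzero step, finite bounds, step sign consistent with the bounds).
static int64_t range_numel(double start, double end, double step) {
  return static_cast<int64_t>(((end - start) / step) + 1);
}

int main() {
  assert(range_numel(0.0, 1.0, 0.25) == 5); // {0, 0.25, 0.5, 0.75, 1.0}
  assert(range_numel(5.0, 1.0, -2.0) == 3); // {5, 3, 1}
  return 0;
}
```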
-void elu_backward_kernel( +TORCH_XPU_API void elu_backward_kernel( TensorIteratorBase& iter, const Scalar& alpha, const Scalar& scale, diff --git a/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp b/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp index 29d6c5d09..2bfcccb69 100644 --- a/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp +++ b/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ActivationGeluKernel.h b/src/ATen/native/xpu/sycl/ActivationGeluKernel.h index 6c373a7cd..660fae9ff 100644 --- a/src/ATen/native/xpu/sycl/ActivationGeluKernel.h +++ b/src/ATen/native/xpu/sycl/ActivationGeluKernel.h @@ -6,9 +6,11 @@ namespace at { namespace native { namespace xpu { -void gelu_kernel(TensorIteratorBase& iter, c10::string_view approximate); +TORCH_XPU_API void gelu_kernel( + TensorIteratorBase& iter, + c10::string_view approximate); -void gelu_backward_kernel( +TORCH_XPU_API void gelu_backward_kernel( TensorIteratorBase& iter, c10::string_view approximate); diff --git a/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp index 056e8c332..f60613ec7 100644 --- a/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp @@ -2,9 +2,12 @@ #include #include +#include #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationGluKernels.h b/src/ATen/native/xpu/sycl/ActivationGluKernels.h index c0fd6e22c..a4b11dbb4 100644 --- a/src/ATen/native/xpu/sycl/ActivationGluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationGluKernels.h @@ -4,9 +4,9 @@ namespace at::native::xpu { -void glu_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void glu_kernel(TensorIteratorBase& iter); -void glu_backward_kernel( +TORCH_XPU_API void glu_backward_kernel( const TensorIteratorBase& iter, int64_t gI_stride, int64_t I_stride); diff --git a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp index 376d8d7ca..c3fe41fd8 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h index ee5f0e87c..47bfa636c 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void hardsigmoid_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void hardsigmoid_kernel(TensorIteratorBase& iter); -void hardsigmoid_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void hardsigmoid_backward_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp index 09270a5c0..8c6e47f77 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.h b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.h index ea4e81fc9..6b741e744 100644 --- 
a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.h @@ -6,9 +6,9 @@ namespace at { namespace native { namespace xpu { -void hardswish_kernel(TensorIterator& iter); +TORCH_XPU_API void hardswish_kernel(TensorIterator& iter); -void hardswish_backward_kernel(TensorIterator& iter); +TORCH_XPU_API void hardswish_backward_kernel(TensorIterator& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp index 730d65350..1dcd78b88 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h index c2d110bae..745c0f69b 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -void hardtanh_backward_kernel( +TORCH_XPU_API void hardtanh_backward_kernel( TensorIterator& iter, const Scalar& min, const Scalar& max); diff --git a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp index 9db04defc..88d170352 100644 --- a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp @@ -6,6 +6,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h index 758701ce9..5656c661e 100644 --- a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h @@ -4,9 +4,11 @@ namespace at::native::xpu { -void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_); +TORCH_XPU_API void leaky_relu_kernel( + TensorIteratorBase& iter, + const Scalar& negval_); -void leaky_relu_backward_kernel( +TORCH_XPU_API void leaky_relu_backward_kernel( TensorIteratorBase& iter, const Scalar& negval_); diff --git a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp index 113ea32ed..f4051184e 100644 --- a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp @@ -6,6 +6,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h index eb009d0cf..42ee9dbea 100644 --- a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void log_sigmoid_forward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log_sigmoid_forward_kernel(TensorIteratorBase& iter); -void log_sigmoid_backward_kernel(TensorIterator& iter); +TORCH_XPU_API void log_sigmoid_backward_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp b/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp index d7a78bb0f..9033b103c 100644 --- a/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp @@ -8,6 +8,8 @@ 
#include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationMishKernels.h b/src/ATen/native/xpu/sycl/ActivationMishKernels.h index 1c663dfe5..7c4efc0af 100644 --- a/src/ATen/native/xpu/sycl/ActivationMishKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationMishKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void mish_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void mish_kernel(TensorIteratorBase& iter); -void mish_backward_kernel(TensorIterator& iter); +TORCH_XPU_API void mish_backward_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationPreluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationPreluKernels.cpp index 8a2c7012f..9ec4e598b 100644 --- a/src/ATen/native/xpu/sycl/ActivationPreluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationPreluKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template @@ -40,4 +42,4 @@ void prelu_backward_kernel(TensorIterator& iter) { }); } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationPreluKernels.h b/src/ATen/native/xpu/sycl/ActivationPreluKernels.h index 23f60c242..4d33e9064 100644 --- a/src/ATen/native/xpu/sycl/ActivationPreluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationPreluKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void prelu_kernel(TensorIterator& iter); +TORCH_XPU_API void prelu_kernel(TensorIterator& iter); -void prelu_backward_kernel(TensorIterator& iter); +TORCH_XPU_API void prelu_backward_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp index da656dc00..7d9f0872a 100644 --- a/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationSiluKernels.h b/src/ATen/native/xpu/sycl/ActivationSiluKernels.h index c31acdc9a..eaec67986 100644 --- a/src/ATen/native/xpu/sycl/ActivationSiluKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationSiluKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void silu_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void silu_kernel(TensorIteratorBase& iter); -void silu_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void silu_backward_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.cpp b/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.cpp index 99b1c5716..8dc3684e3 100644 --- a/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h b/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h index 8a5e5ef2b..a9b6027bc 100644 --- a/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void softplus_kernel( +TORCH_XPU_API void softplus_kernel( TensorIteratorBase& iter, const Scalar& beta_, const Scalar& threshold_); -void softplus_backward_kernel( +TORCH_XPU_API void softplus_backward_kernel( TensorIteratorBase& iter, const Scalar& beta_, const Scalar& threshold_); diff --git 
a/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp b/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp index 4393576e9..b96ab461e 100644 --- a/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h b/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h index 481d1e5a1..398e3f4a4 100644 --- a/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h @@ -4,8 +4,12 @@ namespace at::native::xpu { -void softshrink_kernel(TensorIteratorBase& iter, const Scalar& value); +TORCH_XPU_API void softshrink_kernel( + TensorIteratorBase& iter, + const Scalar& value); -void softshrink_backward_kernel(TensorIteratorBase& iter, const Scalar& value); +TORCH_XPU_API void softshrink_backward_kernel( + TensorIteratorBase& iter, + const Scalar& value); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp index 0b34c013f..3915a5560 100644 --- a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp +++ b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp @@ -4,6 +4,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.h b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.h index c22a6b18c..df38c6d3d 100644 --- a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.h +++ b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -void threshold_kernel( +TORCH_XPU_API void threshold_kernel( TensorIteratorBase& iter, const Scalar& threshold, const Scalar& value); diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp index ab7dd1396..86fd7edfe 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at::native::xpu { using namespace at::xpu; diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h index 9b6d9a046..0d58a6e7f 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void adaptive_avg_pool2d_backward_kernel( +TORCH_XPU_API void adaptive_avg_pool2d_backward_kernel( Tensor& gradInput, const Tensor& gradOutput, const Tensor& input); -void adaptive_avg_pool2d_kernel( +TORCH_XPU_API void adaptive_avg_pool2d_kernel( Tensor& output, const Tensor& input, IntArrayRef output_size); diff --git a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp index 4a8540bde..dacae3e68 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp @@ -12,6 +12,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h index 1a77f69cd..2714e6627 100644 --- 
a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h +++ b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h @@ -4,13 +4,13 @@ namespace at::native::xpu { -void adaptive_max_pool2d_kernel( +TORCH_XPU_API void adaptive_max_pool2d_kernel( const Tensor& input, IntArrayRef output_size, const Tensor& output, const Tensor& indices); -void adaptive_max_pool2d_backward_kernel( +TORCH_XPU_API void adaptive_max_pool2d_backward_kernel( const Tensor& grad_output, const Tensor& input, const Tensor& indices, diff --git a/src/ATen/native/xpu/sycl/AmpKernels.cpp b/src/ATen/native/xpu/sycl/AmpKernels.cpp index 87354eea9..137ae3902 100644 --- a/src/ATen/native/xpu/sycl/AmpKernels.cpp +++ b/src/ATen/native/xpu/sycl/AmpKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/AmpKernels.h b/src/ATen/native/xpu/sycl/AmpKernels.h index bfc17487f..7c703e70e 100644 --- a/src/ATen/native/xpu/sycl/AmpKernels.h +++ b/src/ATen/native/xpu/sycl/AmpKernels.h @@ -3,17 +3,17 @@ namespace at::native::xpu { -void amp_non_finite_check_and_unscale_kernel( +TORCH_XPU_API void amp_non_finite_check_and_unscale_kernel( Tensor& scaled_grad, Tensor& found_inf, const Tensor& inv_scale); -void amp_foreach_non_finite_check_and_unscale_kernel( +TORCH_XPU_API void amp_foreach_non_finite_check_and_unscale_kernel( std::vector> scaled_grads, Tensor& found_inf, const Tensor& inv_scale); -Tensor& amp_update_scale_kernel( +TORCH_XPU_API Tensor& amp_update_scale_kernel( Tensor& current_scale, Tensor& growth_tracker, const Tensor& found_inf, diff --git a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp index c2a0458c9..e7a2c2e1f 100644 --- a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at::native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/AveragePool2dKernels.h b/src/ATen/native/xpu/sycl/AveragePool2dKernels.h index 3229d306a..7667fe021 100644 --- a/src/ATen/native/xpu/sycl/AveragePool2dKernels.h +++ b/src/ATen/native/xpu/sycl/AveragePool2dKernels.h @@ -2,7 +2,7 @@ namespace at::native::xpu { -void avg_pool2d_kernel( +TORCH_XPU_API void avg_pool2d_kernel( const Tensor& input_, int64_t kH_, int64_t kW_, @@ -15,7 +15,7 @@ void avg_pool2d_kernel( c10::optional divisor_override, const Tensor& output); -void avg_pool2d_backward_kernel( +TORCH_XPU_API void avg_pool2d_backward_kernel( const Tensor& gradOutput_, const Tensor& input_, IntArrayRef kernel_size, diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp index 417b0e24a..5e14a0a94 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp @@ -14,6 +14,14 @@ #include #include +#include + +#ifdef _WIN32 +#define RESTRICT __restrict +#else +#define RESTRICT __restrict__ +#endif + namespace at { namespace native { namespace xpu { @@ -1009,9 +1017,9 @@ struct BatchNormCollectStatisticsChannelsLastKernelFunctor } BatchNormCollectStatisticsChannelsLastKernelFunctor( - const scalar_t* __restrict__ input, - accscalar_t* __restrict__ out_mean, - accscalar_t* __restrict__ out_invstd, + const scalar_t* RESTRICT input, + accscalar_t* RESTRICT out_mean, + accscalar_t* RESTRICT out_invstd, volatile accscalar_t* staging_data, int* semaphores, const int reduction_size, @@ -1029,9 +1037,9 @@ struct 
BatchNormCollectStatisticsChannelsLastKernelFunctor wg_size_(wg_size) {} private: - const scalar_t* __restrict__ input_; - accscalar_t* __restrict__ out_mean_; - accscalar_t* __restrict__ out_invstd_; + const scalar_t* RESTRICT input_; + accscalar_t* RESTRICT out_mean_; + accscalar_t* RESTRICT out_invstd_; volatile accscalar_t* staging_data_; int* semaphores_; const int reduction_size_; @@ -1376,13 +1384,13 @@ struct BatchNormTransformInputChannelsLastKernelFunctor { } BatchNormTransformInputChannelsLastKernelFunctor( - const scalar_t* __restrict__ input, - const scalar_t* __restrict__ z, - const accscalar_t* __restrict__ mean, - const accscalar_t* __restrict__ inv_std, - const layerscalar_t* __restrict__ weight, - const layerscalar_t* __restrict__ shift, - scalar_t* __restrict__ out, + const scalar_t* RESTRICT input, + const scalar_t* RESTRICT z, + const accscalar_t* RESTRICT mean, + const accscalar_t* RESTRICT inv_std, + const layerscalar_t* RESTRICT weight, + const layerscalar_t* RESTRICT shift, + scalar_t* RESTRICT out, const int reduction_size, const int stride, const bool fuse_relu) @@ -1398,13 +1406,13 @@ struct BatchNormTransformInputChannelsLastKernelFunctor { fuse_relu_(fuse_relu) {} private: - const scalar_t* __restrict__ input_; - const scalar_t* __restrict__ z_; - const accscalar_t* __restrict__ mean_; - const accscalar_t* __restrict__ inv_std_; - const layerscalar_t* __restrict__ weight_; - const layerscalar_t* __restrict__ shift_; - scalar_t* __restrict__ out_; + const scalar_t* RESTRICT input_; + const scalar_t* RESTRICT z_; + const accscalar_t* RESTRICT mean_; + const accscalar_t* RESTRICT inv_std_; + const layerscalar_t* RESTRICT weight_; + const layerscalar_t* RESTRICT shift_; + scalar_t* RESTRICT out_; const int reduction_size_; const int stride_; const bool fuse_relu_; @@ -2035,14 +2043,14 @@ struct BatchNormBackwardReduceChannelsLastKernelFunctor } BatchNormBackwardReduceChannelsLastKernelFunctor( - const scalar_t* __restrict__ input, - const scalar_t* __restrict__ grad_output, - const accscalar_t* __restrict__ mean, - const accscalar_t* __restrict__ inv_std, - accscalar_t* __restrict__ sum_dy_o, - accscalar_t* __restrict__ sum_dy_xmu_o, - layerscalar_t* __restrict__ grad_weight, - layerscalar_t* __restrict__ grad_bias, + const scalar_t* RESTRICT input, + const scalar_t* RESTRICT grad_output, + const accscalar_t* RESTRICT mean, + const accscalar_t* RESTRICT inv_std, + accscalar_t* RESTRICT sum_dy_o, + accscalar_t* RESTRICT sum_dy_xmu_o, + layerscalar_t* RESTRICT grad_weight, + layerscalar_t* RESTRICT grad_bias, volatile accscalar_t* staging_data, int* semaphores, const int reduction_size, @@ -2063,14 +2071,14 @@ struct BatchNormBackwardReduceChannelsLastKernelFunctor wg_size_(wg_size) {} private: - const scalar_t* __restrict__ input_; - const scalar_t* __restrict__ grad_output_; - const accscalar_t* __restrict__ mean_; - const accscalar_t* __restrict__ inv_std_; - accscalar_t* __restrict__ sum_dy_o_; - accscalar_t* __restrict__ sum_dy_xmu_o_; - layerscalar_t* __restrict__ grad_weight_; - layerscalar_t* __restrict__ grad_bias_; + const scalar_t* RESTRICT input_; + const scalar_t* RESTRICT grad_output_; + const accscalar_t* RESTRICT mean_; + const accscalar_t* RESTRICT inv_std_; + accscalar_t* RESTRICT sum_dy_o_; + accscalar_t* RESTRICT sum_dy_xmu_o_; + layerscalar_t* RESTRICT grad_weight_; + layerscalar_t* RESTRICT grad_bias_; volatile accscalar_t* staging_data_; int* semaphores_; const int reduction_size_; @@ -2396,7 +2404,7 @@ struct 
BatchNormBackwardElemtKernelFunctor { GenericPackedTensorAccessor grad_input, const stat_accscalar_t norm_fct, - const int* __restrict__ numel = nullptr, + const int* RESTRICT numel = nullptr, const int world_size = 0) : input_(input), grad_output_(grad_output), @@ -2452,7 +2460,7 @@ struct BatchNormBackwardElemtKernelFunctor { GenericPackedTensorAccessor grad_input_; const stat_accscalar_t norm_fct_; - const int* __restrict__ numel_; + const int* RESTRICT numel_; const int world_size_; }; @@ -2681,18 +2689,18 @@ struct BatchNormBackwardElemtChannelsLastKernelFunctor { } BatchNormBackwardElemtChannelsLastKernelFunctor( - const scalar_t* __restrict__ grad_output, - const scalar_t* __restrict__ input, - const accscalar_t* __restrict__ mean, - const accscalar_t* __restrict__ inv_std, - const layerscalar_t* __restrict__ weight, - const accscalar_t* __restrict__ sum_dy, - const accscalar_t* __restrict__ sum_dy_xmu, - scalar_t* __restrict__ grad_input, + const scalar_t* RESTRICT grad_output, + const scalar_t* RESTRICT input, + const accscalar_t* RESTRICT mean, + const accscalar_t* RESTRICT inv_std, + const layerscalar_t* RESTRICT weight, + const accscalar_t* RESTRICT sum_dy, + const accscalar_t* RESTRICT sum_dy_xmu, + scalar_t* RESTRICT grad_input, const accscalar_t norm_fct, const int reduction_size, const int stride, - const int* __restrict__ numel = nullptr, + const int* RESTRICT numel = nullptr, const int64_t world_size = 0) : grad_output_(grad_output), input_(input), @@ -2709,18 +2717,18 @@ struct BatchNormBackwardElemtChannelsLastKernelFunctor { world_size_(world_size) {} private: - const scalar_t* __restrict__ grad_output_; - const scalar_t* __restrict__ input_; - const accscalar_t* __restrict__ mean_; - const accscalar_t* __restrict__ inv_std_; - const layerscalar_t* __restrict__ weight_; - const accscalar_t* __restrict__ sum_dy_; - const accscalar_t* __restrict__ sum_dy_xmu_; - scalar_t* __restrict__ grad_input_; + const scalar_t* RESTRICT grad_output_; + const scalar_t* RESTRICT input_; + const accscalar_t* RESTRICT mean_; + const accscalar_t* RESTRICT inv_std_; + const layerscalar_t* RESTRICT weight_; + const accscalar_t* RESTRICT sum_dy_; + const accscalar_t* RESTRICT sum_dy_xmu_; + scalar_t* RESTRICT grad_input_; const accscalar_t norm_fct_; const int reduction_size_; const int stride_; - const int* __restrict__ numel_; + const int* RESTRICT numel_; const int64_t world_size_; }; diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.h b/src/ATen/native/xpu/sycl/BatchNormKernels.h index 3bc559b38..9d9ae780b 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.h +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.h @@ -6,11 +6,11 @@ namespace at { namespace native { namespace xpu { -std::tuple batch_norm_stats_kernel( +TORCH_XPU_API std::tuple batch_norm_stats_kernel( const Tensor& self, double epsilon); -void batch_norm_elemt_kernel( +TORCH_XPU_API void batch_norm_elemt_kernel( Tensor& out, const Tensor& self, const c10::optional& weight_opt, @@ -18,7 +18,8 @@ void batch_norm_elemt_kernel( const Tensor& mean_, const Tensor& invstd_); -std::tuple batch_norm_backward_reduce_kernel( +TORCH_XPU_API std::tuple +batch_norm_backward_reduce_kernel( const Tensor& grad_output, const Tensor& input, const Tensor& mean, @@ -28,7 +29,7 @@ std::tuple batch_norm_backward_reduce_kernel( bool weight_g, bool bias_g); -Tensor batch_norm_backward_elemt_kernel( +TORCH_XPU_API Tensor batch_norm_backward_elemt_kernel( const Tensor& self, const Tensor& input, const Tensor& mean, @@ -38,13 +39,13 @@ Tensor 
batch_norm_backward_elemt_kernel( const Tensor& sum_dy_xmu, const Tensor& count); -std::tuple batch_norm_update_stats_kernel( +TORCH_XPU_API std::tuple batch_norm_update_stats_kernel( const Tensor& self, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, double momentum); -std::tuple batch_norm_kernel( +TORCH_XPU_API std::tuple batch_norm_kernel( const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, @@ -57,7 +58,7 @@ std::tuple batch_norm_kernel( Tensor& save_mean, Tensor& save_invstd); -std::tuple batch_norm_backward_kernel( +TORCH_XPU_API std::tuple batch_norm_backward_kernel( const Tensor& grad_out, const Tensor& input, const c10::optional& weight_opt, diff --git a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp index 7634921d2..2c2dd1550 100644 --- a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h index 4247c0a79..f9712b85d 100644 --- a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h @@ -6,11 +6,11 @@ namespace at { namespace native { namespace xpu { -void bitwise_and_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void bitwise_and_kernel(TensorIteratorBase& iter); -void bitwise_or_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void bitwise_or_kernel(TensorIteratorBase& iter); -void bitwise_xor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void bitwise_xor_kernel(TensorIteratorBase& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/BinaryDivTrueKernel.cpp b/src/ATen/native/xpu/sycl/BinaryDivTrueKernel.cpp index e10d25dd5..6f35c0cb6 100644 --- a/src/ATen/native/xpu/sycl/BinaryDivTrueKernel.cpp +++ b/src/ATen/native/xpu/sycl/BinaryDivTrueKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { void div_true_kernel(TensorIteratorBase& iter) { diff --git a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp index 862162807..e0e8487f6 100644 --- a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h index 588d52c4f..61af8b511 100644 --- a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void atan2_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void atan2_kernel(TensorIteratorBase& iter); -void hypot_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void hypot_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryKernels.cpp b/src/ATen/native/xpu/sycl/BinaryKernels.cpp index f6ab24de9..daafadd23 100644 --- a/src/ATen/native/xpu/sycl/BinaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/BinaryKernels.h b/src/ATen/native/xpu/sycl/BinaryKernels.h index 78dfaa0da..f862595eb 100644 --- 
a/src/ATen/native/xpu/sycl/BinaryKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryKernels.h @@ -4,16 +4,16 @@ namespace at::native::xpu { -void add_kernel(TensorIteratorBase& iter, const Scalar& alpha); +TORCH_XPU_API void add_kernel(TensorIteratorBase& iter, const Scalar& alpha); -void sub_kernel(TensorIteratorBase& iter, const Scalar& alpha); +TORCH_XPU_API void sub_kernel(TensorIteratorBase& iter, const Scalar& alpha); -void mul_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void mul_kernel(TensorIteratorBase& iter); -void div_true_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void div_true_kernel(TensorIteratorBase& iter); -void div_trunc_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void div_trunc_kernel(TensorIteratorBase& iter); -void div_floor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void div_floor_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp index be4a25539..3268ab90f 100644 --- a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h index 552f961c3..dce1a1a5e 100644 --- a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void logical_and_kernel(TensorIterator& iter); +TORCH_XPU_API void logical_and_kernel(TensorIterator& iter); -void logical_or_kernel(TensorIterator& iter); +TORCH_XPU_API void logical_or_kernel(TensorIterator& iter); -void logical_xor_kernel(TensorIterator& iter); +TORCH_XPU_API void logical_xor_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp index fde7f5186..73732e02f 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.h index 37553191d..8bea1f44c 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.h @@ -4,10 +4,12 @@ namespace at::native::xpu { -void sigmoid_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sigmoid_backward_kernel(TensorIteratorBase& iter); -void tanh_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void tanh_backward_kernel(TensorIteratorBase& iter); -void logit_backward_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar); +TORCH_XPU_API void logit_backward_kernel( + TensorIteratorBase& iter, + const Scalar& eps_scalar); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp index 8d63e48b5..d96e5064e 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h index 
17672ec29..ffe08be3e 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void mse_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void mse_kernel(TensorIteratorBase& iter); -void smooth_l1_kernel(TensorIteratorBase& iter, double beta); +TORCH_XPU_API void smooth_l1_kernel(TensorIteratorBase& iter, double beta); -void huber_kernel(TensorIterator& iter, double delta); +TORCH_XPU_API void huber_kernel(TensorIterator& iter, double delta); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp index 7a3e7908f..92eac3da2 100644 --- a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp +++ b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.h b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.h index 1d400951f..f3d3e2390 100644 --- a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.h +++ b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.h @@ -6,9 +6,9 @@ namespace at { namespace native { namespace xpu { -void remainder_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void remainder_kernel(TensorIteratorBase& iter); -void fmod_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void fmod_kernel(TensorIteratorBase& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.cpp index 64adba17e..1a0027d2f 100644 --- a/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template @@ -45,4 +47,4 @@ void rshift_kernel(TensorIteratorBase& iter) { }); } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.h index 7e661d919..64ded1705 100644 --- a/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void lshift_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void lshift_kernel(TensorIteratorBase& iter); -void rshift_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void rshift_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BucketizationKernels.cpp b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp index d56eff222..c5686babb 100644 --- a/src/ATen/native/xpu/sycl/BucketizationKernels.cpp +++ b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { // customized lower_bound func to ensure the low bound of 'nan', 'inf' etc. 
be diff --git a/src/ATen/native/xpu/sycl/BucketizationKernels.h b/src/ATen/native/xpu/sycl/BucketizationKernels.h index f47cea2af..1c86ece08 100644 --- a/src/ATen/native/xpu/sycl/BucketizationKernels.h +++ b/src/ATen/native/xpu/sycl/BucketizationKernels.h @@ -2,11 +2,13 @@ #include namespace at::native::xpu { -void searchsorted_kernel( + +TORCH_XPU_API void searchsorted_kernel( Tensor& result, const Tensor& input, const Tensor& sorted_sequence, bool out_int32, bool right, const Tensor& sorter); -} // namespace at::native::xpu \ No newline at end of file + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/Col2ImKernel.cpp b/src/ATen/native/xpu/sycl/Col2ImKernel.cpp index 2ff7cbc96..9d1396209 100644 --- a/src/ATen/native/xpu/sycl/Col2ImKernel.cpp +++ b/src/ATen/native/xpu/sycl/Col2ImKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/Col2ImKernel.h b/src/ATen/native/xpu/sycl/Col2ImKernel.h index 337aec367..62d87b406 100644 --- a/src/ATen/native/xpu/sycl/Col2ImKernel.h +++ b/src/ATen/native/xpu/sycl/Col2ImKernel.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void col2im_kernel( +TORCH_XPU_API void col2im_kernel( Tensor& output, const Tensor& input_, IntArrayRef output_size, diff --git a/src/ATen/native/xpu/sycl/CompareKernels.cpp b/src/ATen/native/xpu/sycl/CompareKernels.cpp index 8c403a819..91c3ac614 100644 --- a/src/ATen/native/xpu/sycl/CompareKernels.cpp +++ b/src/ATen/native/xpu/sycl/CompareKernels.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/CompareKernels.h b/src/ATen/native/xpu/sycl/CompareKernels.h index d68827631..52cbc151c 100644 --- a/src/ATen/native/xpu/sycl/CompareKernels.h +++ b/src/ATen/native/xpu/sycl/CompareKernels.h @@ -6,17 +6,17 @@ namespace at { namespace native { namespace xpu { -void eq_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void eq_kernel(TensorIteratorBase& iter); -void ne_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void ne_kernel(TensorIteratorBase& iter); -void lt_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void lt_kernel(TensorIteratorBase& iter); -void le_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void le_kernel(TensorIteratorBase& iter); -void gt_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void gt_kernel(TensorIteratorBase& iter); -void ge_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void ge_kernel(TensorIteratorBase& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.cpp b/src/ATen/native/xpu/sycl/ComplexKernels.cpp index 87504bd5e..4e44d9064 100644 --- a/src/ATen/native/xpu/sycl/ComplexKernels.cpp +++ b/src/ATen/native/xpu/sycl/ComplexKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.h b/src/ATen/native/xpu/sycl/ComplexKernels.h index d51556b4f..900bbaca6 100644 --- a/src/ATen/native/xpu/sycl/ComplexKernels.h +++ b/src/ATen/native/xpu/sycl/ComplexKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void complex_kernel(TensorIterator& iter); +TORCH_XPU_API void complex_kernel(TensorIterator& iter); -void polar_kernel(TensorIterator& iter); +TORCH_XPU_API void polar_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp index 552d148ae..dc2991cc5 100644 --- a/src/ATen/native/xpu/sycl/CopyKernel.cpp +++ 
b/src/ATen/native/xpu/sycl/CopyKernel.cpp @@ -7,6 +7,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/CopyKernel.h b/src/ATen/native/xpu/sycl/CopyKernel.h index da078f3ea..3a8e4d263 100644 --- a/src/ATen/native/xpu/sycl/CopyKernel.h +++ b/src/ATen/native/xpu/sycl/CopyKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void copy_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void copy_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/CopysignKernel.cpp b/src/ATen/native/xpu/sycl/CopysignKernel.cpp index 3b8351abf..327828adc 100644 --- a/src/ATen/native/xpu/sycl/CopysignKernel.cpp +++ b/src/ATen/native/xpu/sycl/CopysignKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/CopysignKernel.h b/src/ATen/native/xpu/sycl/CopysignKernel.h index cf856728c..86b6a40d9 100644 --- a/src/ATen/native/xpu/sycl/CopysignKernel.h +++ b/src/ATen/native/xpu/sycl/CopysignKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void copysign_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void copysign_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/CrossKernel.cpp b/src/ATen/native/xpu/sycl/CrossKernel.cpp index 0ed5f11eb..313261c9d 100644 --- a/src/ATen/native/xpu/sycl/CrossKernel.cpp +++ b/src/ATen/native/xpu/sycl/CrossKernel.cpp @@ -4,6 +4,8 @@ #include #include +#include + namespace at::native::xpu { template struct CrossKernelFunctor { diff --git a/src/ATen/native/xpu/sycl/CrossKernel.h b/src/ATen/native/xpu/sycl/CrossKernel.h index cc77229c4..8d382b97e 100644 --- a/src/ATen/native/xpu/sycl/CrossKernel.h +++ b/src/ATen/native/xpu/sycl/CrossKernel.h @@ -1,9 +1,12 @@ #pragma once #include + namespace at::native::xpu { -void linalg_cross_kernel( + +TORCH_XPU_API void linalg_cross_kernel( const Tensor& result, const Tensor& x1, const Tensor& x2, int64_t dim); + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/CumprodKernel.cpp b/src/ATen/native/xpu/sycl/CumprodKernel.cpp index f35cc7979..375e6a80c 100644 --- a/src/ATen/native/xpu/sycl/CumprodKernel.cpp +++ b/src/ATen/native/xpu/sycl/CumprodKernel.cpp @@ -3,6 +3,9 @@ #include +#include +#include + namespace at::native::xpu { void launch_cumprod_kernel( diff --git a/src/ATen/native/xpu/sycl/CumprodKernel.h b/src/ATen/native/xpu/sycl/CumprodKernel.h index 71e8c7693..d78301f04 100644 --- a/src/ATen/native/xpu/sycl/CumprodKernel.h +++ b/src/ATen/native/xpu/sycl/CumprodKernel.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void launch_cumprod_kernel( +TORCH_XPU_API void launch_cumprod_kernel( const Tensor& result, const Tensor& self, int64_t dim); diff --git a/src/ATen/native/xpu/sycl/CumsumKernel.cpp b/src/ATen/native/xpu/sycl/CumsumKernel.cpp index c2754869b..0523db84b 100644 --- a/src/ATen/native/xpu/sycl/CumsumKernel.cpp +++ b/src/ATen/native/xpu/sycl/CumsumKernel.cpp @@ -3,6 +3,9 @@ #include +#include +#include + namespace at::native::xpu { void launch_cumsum_kernel( diff --git a/src/ATen/native/xpu/sycl/CumsumKernel.h b/src/ATen/native/xpu/sycl/CumsumKernel.h index 79c299608..77ce62b61 100644 --- a/src/ATen/native/xpu/sycl/CumsumKernel.h +++ b/src/ATen/native/xpu/sycl/CumsumKernel.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void launch_cumsum_kernel( +TORCH_XPU_API void launch_cumsum_kernel( const Tensor& result, const Tensor& self, int64_t dim); diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp 
b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index 86283fba6..ba0283b8b 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -15,6 +15,8 @@ #include #include +#include + namespace at::native::xpu { static inline int p_start( diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h index 578b43ae7..d530560e6 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void max_pool2d_with_indices_kernel( +TORCH_XPU_API void max_pool2d_with_indices_kernel( const Tensor& input, IntArrayRef kernel_size, IntArrayRef stride, @@ -14,7 +14,7 @@ void max_pool2d_with_indices_kernel( const Tensor& output, const Tensor& indices); -void max_pool2d_with_indices_backward_kernel( +TORCH_XPU_API void max_pool2d_with_indices_backward_kernel( const Tensor& gradInput, const Tensor& gradOutput, const Tensor& input, diff --git a/src/ATen/native/xpu/sycl/DistanceKernels.cpp b/src/ATen/native/xpu/sycl/DistanceKernels.cpp index 2f0fc68c1..5006dd8b4 100644 --- a/src/ATen/native/xpu/sycl/DistanceKernels.cpp +++ b/src/ATen/native/xpu/sycl/DistanceKernels.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { template static double device_sqrt(scalar_t val) { @@ -177,6 +179,8 @@ static inline scalar_t group_reduce_agg_without_broadcast( do { agg = subgroup_reduce_agg_without_broadcast( item, agg, sg_size); + if (num_active_sg == 1) + return agg; item.barrier(sycl_local_fence); if (0 == lane_id) { local_shared_mem[sg_id] = agg; @@ -184,14 +188,13 @@ static inline scalar_t group_reduce_agg_without_broadcast( item.barrier(sycl_local_fence); agg = local_id < num_active_sg ? local_shared_mem[local_id] : (scalar_t)0.0f; - num_active_sg = (num_active_sg + sg_size - 1) / sg_size; + if (num_active_sg > sg_size) + num_active_sg = (num_active_sg + sg_size - 1) / sg_size; } while (num_active_sg > sg_size); // num of active sgs < sg_size item.barrier(sycl_local_fence); if (0 == sg_id) { - agg = - local_id < num_active_sg ? 
local_shared_mem[local_id] : (scalar_t)0.0f; agg = subgroup_reduce_agg_without_broadcast( item, agg, sg_size); } diff --git a/src/ATen/native/xpu/sycl/DistanceKernels.h b/src/ATen/native/xpu/sycl/DistanceKernels.h index ea263d94c..46a34d031 100644 --- a/src/ATen/native/xpu/sycl/DistanceKernels.h +++ b/src/ATen/native/xpu/sycl/DistanceKernels.h @@ -2,9 +2,11 @@ #include namespace at::native::xpu { -void cdist_kernel( + +TORCH_XPU_API void cdist_kernel( Tensor& result, const Tensor& x1_expanded, const Tensor& x2_expanded, double p); + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp b/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp index 0c17910be..c3de4c593 100644 --- a/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp +++ b/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp b/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp index 4c1b83689..cfc3e1dfa 100644 --- a/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp +++ b/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace at::native::xpu { void exponential_kernel( diff --git a/src/ATen/native/xpu/sycl/DistributionKernels.h b/src/ATen/native/xpu/sycl/DistributionKernels.h index e5700f26e..88d62933f 100644 --- a/src/ATen/native/xpu/sycl/DistributionKernels.h +++ b/src/ATen/native/xpu/sycl/DistributionKernels.h @@ -4,41 +4,43 @@ namespace at::native::xpu { -void random_from_to_kernel( +TORCH_XPU_API void random_from_to_kernel( TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen_); -void random_full_64_bits_range_kernel( +TORCH_XPU_API void random_full_64_bits_range_kernel( TensorIteratorBase& iter, c10::optional gen_); -void random_kernel(TensorIteratorBase& iter, c10::optional gen_); +TORCH_XPU_API void random_kernel( + TensorIteratorBase& iter, + c10::optional gen_); -void uniform_kernel( +TORCH_XPU_API void uniform_kernel( TensorIteratorBase& iter, double from, double to, c10::optional gen); -void normal_kernel( +TORCH_XPU_API void normal_kernel( const TensorBase& self, double mean, double std, c10::optional gen); -void bernoulli_tensor_kernel( +TORCH_XPU_API void bernoulli_tensor_kernel( const TensorBase& self, const TensorBase& p_, c10::optional gen_); -void bernoulli_scalar_kernel( +TORCH_XPU_API void bernoulli_scalar_kernel( const TensorBase& self, double p, c10::optional gen); -void exponential_kernel( +TORCH_XPU_API void exponential_kernel( TensorIteratorBase& iter, double lambda, c10::optional gen); diff --git a/src/ATen/native/xpu/sycl/DistributionNormal.cpp b/src/ATen/native/xpu/sycl/DistributionNormal.cpp index 9630eaf7f..3aacf6639 100644 --- a/src/ATen/native/xpu/sycl/DistributionNormal.cpp +++ b/src/ATen/native/xpu/sycl/DistributionNormal.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp b/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp index 995f4979c..e4698a723 100644 --- a/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp +++ b/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/DistributionUniform.cpp b/src/ATen/native/xpu/sycl/DistributionUniform.cpp 
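The DistanceKernels.cpp hunk above changes group_reduce_agg_without_broadcast so the ladder of subgroup reductions returns as soon as a single subgroup holds all remaining partials, and only shrinks num_active_sg while more than one subgroup's worth of partials is left. A serial, plain-C++ schematic of that control flow (illustrative names, not the SYCL kernel):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Each "subgroup" of width sg_size reduces its own slice, the per-subgroup
// partials are staged, and the ladder repeats until one subgroup can finish.
// The early return models the num_active_sg == 1 case added in the diff.
static double reduce_ladder(const std::vector<double>& vals, std::size_t sg_size) {
  std::vector<double> level(vals);
  while (level.size() > 1) {
    std::vector<double> next;
    for (std::size_t i = 0; i < level.size(); i += sg_size) {
      double agg = 0.0;
      for (std::size_t j = i; j < std::min(i + sg_size, level.size()); ++j)
        agg += level[j];       // "subgroup" reduction over one slice
      next.push_back(agg);     // staged partial, one per subgroup
    }
    if (next.size() == 1)      // a single subgroup covered everything:
      return next.front();     // nothing left to combine, exit early
    level.swap(next);          // otherwise run another pass over the partials
  }
  return level.front();
}

int main() {
  std::vector<double> v(100, 1.0);
  assert(reduce_ladder(v, 16) == 100.0);
  return 0;
}
```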
index 321dffd58..17ff4d698 100644 --- a/src/ATen/native/xpu/sycl/DistributionUniform.cpp +++ b/src/ATen/native/xpu/sycl/DistributionUniform.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/Dropout.cpp b/src/ATen/native/xpu/sycl/Dropout.cpp index 2432c804a..54b1b4d4c 100644 --- a/src/ATen/native/xpu/sycl/Dropout.cpp +++ b/src/ATen/native/xpu/sycl/Dropout.cpp @@ -13,6 +13,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/DropoutKernels.h b/src/ATen/native/xpu/sycl/DropoutKernels.h index a72e8e2f8..922b2838c 100644 --- a/src/ATen/native/xpu/sycl/DropoutKernels.h +++ b/src/ATen/native/xpu/sycl/DropoutKernels.h @@ -6,15 +6,13 @@ namespace at { namespace native { namespace xpu { -std::tuple dropout_kernel( +TORCH_XPU_API std::tuple dropout_kernel( const Tensor& self, double p, c10::optional train); -Tensor dropout_backward_kernel( - const Tensor& grad, - const Tensor& mask, - double scale); +TORCH_XPU_API Tensor +dropout_backward_kernel(const Tensor& grad, const Tensor& mask, double scale); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/Embedding.cpp b/src/ATen/native/xpu/sycl/Embedding.cpp index 0ed27196b..4ea45b4c2 100644 --- a/src/ATen/native/xpu/sycl/Embedding.cpp +++ b/src/ATen/native/xpu/sycl/Embedding.cpp @@ -3,6 +3,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp index c4cf312fd..0e2d02b6a 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp +++ b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { std::pair promoteIndicesAndOffsets( diff --git a/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h b/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h index dd8812f97..f73dabb6b 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h +++ b/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -std::tuple _embedding_bag_kernel( +TORCH_XPU_API std::tuple _embedding_bag_kernel( const Tensor& weight_t, const Tensor& indices_t, const Tensor& offsets_t, diff --git a/src/ATen/native/xpu/sycl/EmbeddingKernels.h b/src/ATen/native/xpu/sycl/EmbeddingKernels.h index 5aaa3257a..f2e75b37d 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingKernels.h +++ b/src/ATen/native/xpu/sycl/EmbeddingKernels.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -Tensor embedding_dense_backward_kernel( +TORCH_XPU_API Tensor embedding_dense_backward_kernel( const Tensor& grad_, const Tensor& indices_, int64_t num_weights, diff --git a/src/ATen/native/xpu/sycl/FillKernel.cpp b/src/ATen/native/xpu/sycl/FillKernel.cpp index 483177210..6dbb55c67 100644 --- a/src/ATen/native/xpu/sycl/FillKernel.cpp +++ b/src/ATen/native/xpu/sycl/FillKernel.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/FillKernel.h b/src/ATen/native/xpu/sycl/FillKernel.h index 82bdd5f39..11c813cc0 100644 --- a/src/ATen/native/xpu/sycl/FillKernel.h +++ b/src/ATen/native/xpu/sycl/FillKernel.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -void fill_kernel(TensorIterator& iter, const Scalar& scalar); +TORCH_XPU_API void fill_kernel(TensorIterator& iter, const Scalar& scalar); } } // namespace native diff --git 
a/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h b/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h index 1fc19d34d..2c501a2bc 100644 --- a/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h @@ -31,11 +31,11 @@ namespace at::native::xpu { std::vector FOREACH_BINARY_LIST_ALPHA_KERNEL_NAME(NAME)( \ TensorList tensor1, TensorList tensor2, const Scalar& alpha) -FOREACH_BINARY_LIST_ALPHA_INPLACE_KERNEL(add); -FOREACH_BINARY_LIST_ALPHA_KERNEL(add); -FOREACH_BINARY_LIST_INPLACE_KERNEL(mul); -FOREACH_BINARY_LIST_KERNEL(mul); -FOREACH_BINARY_LIST_INPLACE_KERNEL(div); -FOREACH_BINARY_LIST_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_LIST_ALPHA_INPLACE_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_LIST_ALPHA_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_LIST_INPLACE_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_LIST_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_LIST_INPLACE_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_LIST_KERNEL(div); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h index fe63fb8de..00044fa7d 100644 --- a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h @@ -17,11 +17,11 @@ namespace at::native::xpu { std::vector FOREACH_BINARY_SCALAR_KERNEL_NAME(NAME)( \ TensorList tensors, const Scalar& scalar) -FOREACH_BINARY_SCALAR_INPLACE_KERNEL(add); -FOREACH_BINARY_SCALAR_KERNEL(add); -FOREACH_BINARY_SCALAR_INPLACE_KERNEL(mul); -FOREACH_BINARY_SCALAR_KERNEL(mul); -FOREACH_BINARY_SCALAR_INPLACE_KERNEL(div); -FOREACH_BINARY_SCALAR_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_SCALAR_INPLACE_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_SCALAR_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_SCALAR_INPLACE_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_SCALAR_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_SCALAR_INPLACE_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_SCALAR_KERNEL(div); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h index 723b07619..bafd220c2 100644 --- a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h @@ -17,11 +17,11 @@ namespace at::native::xpu { std::vector FOREACH_BINARY_SCALARLIST_KERNEL_NAME(NAME)( \ TensorList tensors, at::ArrayRef scalars) -FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(add); -FOREACH_BINARY_SCALARLIST_KERNEL(add); -FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(mul); -FOREACH_BINARY_SCALARLIST_KERNEL(mul); -FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(div); -FOREACH_BINARY_SCALARLIST_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_KERNEL(add); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_KERNEL(mul); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL(div); +TORCH_XPU_API FOREACH_BINARY_SCALARLIST_KERNEL(div); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h b/src/ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h index 4fc55fcf3..9623b7370 100644 --- a/src/ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h @@ -11,9 +11,9 @@ namespace at::native::xpu { #define FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(NAME) \ 
FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(NAME) -FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcmul); -FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcmul); -FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcdiv); -FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcdiv); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h b/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h index 94445b86d..f15795ef5 100644 --- a/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h @@ -17,9 +17,9 @@ namespace at::native::xpu { TensorList tensors2, \ const Scalar& scalar) -FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcmul); -FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcmul); -FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcdiv); -FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcdiv); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h b/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h index 8df3aa4e0..8a5bf444d 100644 --- a/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h @@ -17,9 +17,9 @@ namespace at::native::xpu { TensorList tensors2, \ at::ArrayRef scalars) -FOREACH_POINTWISE_OP_SCALARLIST_KERNEL(addcmul); -FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(addcmul); -FOREACH_POINTWISE_OP_SCALARLIST_KERNEL(addcdiv); -FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALARLIST_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(addcmul); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALARLIST_KERNEL(addcdiv); +TORCH_XPU_API FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(addcdiv); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp index 849a77443..cc90fa893 100644 --- a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp +++ b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + enum class NormType { L1, L2 }; namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/ForeachReduceKernels.h b/src/ATen/native/xpu/sycl/ForeachReduceKernels.h index a7dceefec..cefb3d840 100644 --- a/src/ATen/native/xpu/sycl/ForeachReduceKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachReduceKernels.h @@ -3,7 +3,7 @@ namespace at::native::xpu { -std::vector foreach_norm_kernel( +TORCH_XPU_API std::vector foreach_norm_kernel( TensorList tensors, const Scalar& ord, double p, diff --git a/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp b/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp index fa7062cfa..21168f8a3 100644 --- a/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp @@ -5,6 +5,9 @@ #include #include +#include +#include + namespace at::native::xpu { template diff --git 
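The foreach declarations above are generated by macros, so the export attribute is attached by prefixing the macro invocation; after expansion the attribute sits directly in front of the declared function. A reduced stand-in (macro name and simplified signature are illustrative only):

#include <ATen/ATen.h>
#include <vector>

#ifndef TORCH_XPU_API
#define TORCH_XPU_API  // fallback so this sketch is self-contained
#endif

// Stand-in for the FOREACH_* declaration macros: each invocation expands to a
// complete function declaration.
#define FOREACH_BINARY_LIST_KERNEL_DECL(NAME)      \
  std::vector<at::Tensor> foreach_##NAME##_kernel( \
      at::TensorList tensors1, at::TensorList tensors2)

// Expands to:
//   TORCH_XPU_API std::vector<at::Tensor> foreach_mul_kernel(
//       at::TensorList tensors1, at::TensorList tensors2);
TORCH_XPU_API FOREACH_BINARY_LIST_KERNEL_DECL(mul);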
a/src/ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h b/src/ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h index 50392598e..7b776b1eb 100644 --- a/src/ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h @@ -3,13 +3,13 @@ namespace at::native::xpu { -void foreach_lerp_list_kernel( +TORCH_XPU_API void foreach_lerp_list_kernel( TensorList tensors1, TensorList tensors2, TensorList tensors3, TensorList result); -void foreach_lerp_list_kernel_( +TORCH_XPU_API void foreach_lerp_list_kernel_( TensorList tensors1, TensorList tensors2, TensorList tensors3); diff --git a/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h b/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h index 0f36a5215..9d5e252eb 100644 --- a/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h @@ -3,13 +3,13 @@ namespace at::native::xpu { -void foreach_lerp_scalar_kernel( +TORCH_XPU_API void foreach_lerp_scalar_kernel( TensorList tensors1, TensorList tensors2, const Scalar& weight, TensorList result); -void foreach_lerp_scalar_kernel_( +TORCH_XPU_API void foreach_lerp_scalar_kernel_( TensorList tensors1, TensorList tensors2, const Scalar& weight); diff --git a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp index db0ac1769..9341c0ab6 100644 --- a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp @@ -8,6 +8,8 @@ #include #include +#include + namespace at::native::xpu { template class Op> diff --git a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h index cd5312106..26ab95fa8 100644 --- a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h +++ b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h @@ -3,8 +3,8 @@ namespace at::native::xpu { -std::vector foreach_sqrt_kernel(TensorList tensors); +TORCH_XPU_API std::vector foreach_sqrt_kernel(TensorList tensors); -void foreach_sqrt_kernel_(TensorList tensors); +TORCH_XPU_API void foreach_sqrt_kernel_(TensorList tensors); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/GcdLcmKernels.cpp b/src/ATen/native/xpu/sycl/GcdLcmKernels.cpp index 6c832270c..fb74e19bc 100644 --- a/src/ATen/native/xpu/sycl/GcdLcmKernels.cpp +++ b/src/ATen/native/xpu/sycl/GcdLcmKernels.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template @@ -20,4 +22,4 @@ void gcd_kernel(TensorIteratorBase& iter) { }); } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/GcdLcmKernels.h b/src/ATen/native/xpu/sycl/GcdLcmKernels.h index acf855d40..ba429d762 100644 --- a/src/ATen/native/xpu/sycl/GcdLcmKernels.h +++ b/src/ATen/native/xpu/sycl/GcdLcmKernels.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -void gcd_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void gcd_kernel(TensorIteratorBase& iter); } } // namespace native diff --git a/src/ATen/native/xpu/sycl/GridSampler.cpp b/src/ATen/native/xpu/sycl/GridSampler.cpp index affd6b02b..2bfd0505a 100644 --- a/src/ATen/native/xpu/sycl/GridSampler.cpp +++ b/src/ATen/native/xpu/sycl/GridSampler.cpp @@ -14,6 +14,8 @@ #include #include +#include + namespace at::native::xpu { using namespace at::xpu::detail; diff --git a/src/ATen/native/xpu/sycl/GridSamplerKernels.h b/src/ATen/native/xpu/sycl/GridSamplerKernels.h index deb35006f..b56ed8dcd 100644 --- 
a/src/ATen/native/xpu/sycl/GridSamplerKernels.h +++ b/src/ATen/native/xpu/sycl/GridSamplerKernels.h @@ -4,14 +4,14 @@ namespace at::native::xpu { -Tensor grid_sampler_2d_kernel( +TORCH_XPU_API Tensor grid_sampler_2d_kernel( const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners); -void grid_sampler_2d_backward_kernel( +TORCH_XPU_API void grid_sampler_2d_backward_kernel( const Tensor& grad_input, const Tensor& grad_grid, const Tensor& grad_output, @@ -22,4 +22,4 @@ void grid_sampler_2d_backward_kernel( bool align_corners, std::array output_mask); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp index c7d0818ee..572b4d62b 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp @@ -10,6 +10,8 @@ #include #include +#include + namespace at::native::xpu { template < diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.h b/src/ATen/native/xpu/sycl/GroupNormKernels.h index 7f0cf38d3..943d6bcb7 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.h +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void group_norm_kernel( +TORCH_XPU_API void group_norm_kernel( const Tensor& X, const Tensor& gamma, const Tensor& beta, @@ -17,7 +17,7 @@ void group_norm_kernel( Tensor& mean, Tensor& rstd); -void group_norm_backward_kernel( +TORCH_XPU_API void group_norm_backward_kernel( const Tensor& dY, const Tensor& X, const Tensor& mean, diff --git a/src/ATen/native/xpu/sycl/HistogramKernels.h b/src/ATen/native/xpu/sycl/HistogramKernels.h index bb865da20..581ee04c5 100644 --- a/src/ATen/native/xpu/sycl/HistogramKernels.h +++ b/src/ATen/native/xpu/sycl/HistogramKernels.h @@ -4,14 +4,14 @@ namespace at::native::xpu { -void histogramdd_kernel( +TORCH_XPU_API void histogramdd_kernel( const Tensor& self, const std::optional& weight, bool density, Tensor& hist, const TensorList& bin_edges_); -void histogramdd_linear_kernel( +TORCH_XPU_API void histogramdd_linear_kernel( const Tensor& self, const std::optional& weight, bool density, @@ -19,10 +19,10 @@ void histogramdd_linear_kernel( const TensorList& bin_edges_, bool local_search); -void histogram_select_outer_bin_edges_kernel( +TORCH_XPU_API void histogram_select_outer_bin_edges_kernel( const Tensor& input, const int64_t N, std::vector& leftmost_edges, std::vector& rightmost_edges); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/HistogramddKernels.cpp b/src/ATen/native/xpu/sycl/HistogramddKernels.cpp index 3b5accfb9..d269d94ae 100644 --- a/src/ATen/native/xpu/sycl/HistogramddKernels.cpp +++ b/src/ATen/native/xpu/sycl/HistogramddKernels.cpp @@ -17,6 +17,8 @@ #include #endif +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/Im2ColKernel.cpp b/src/ATen/native/xpu/sycl/Im2ColKernel.cpp index 1922d9498..dc6394b39 100644 --- a/src/ATen/native/xpu/sycl/Im2ColKernel.cpp +++ b/src/ATen/native/xpu/sycl/Im2ColKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/Im2ColKernel.h b/src/ATen/native/xpu/sycl/Im2ColKernel.h index 4efae219e..3ca9a852b 100644 --- a/src/ATen/native/xpu/sycl/Im2ColKernel.h +++ b/src/ATen/native/xpu/sycl/Im2ColKernel.h @@ -4,7 +4,7 @@ namespace 
at::native::xpu { -void im2col_kernel( +TORCH_XPU_API void im2col_kernel( Tensor& output, const Tensor& input_, IntArrayRef kernel_size, diff --git a/src/ATen/native/xpu/sycl/Indexing.cpp b/src/ATen/native/xpu/sycl/Indexing.cpp index dd588e7e1..8c755a64b 100644 --- a/src/ATen/native/xpu/sycl/Indexing.cpp +++ b/src/ATen/native/xpu/sycl/Indexing.cpp @@ -18,6 +18,8 @@ #include #include +#include + using namespace at::xpu::detail; using namespace at::xpu; diff --git a/src/ATen/native/xpu/sycl/IndexingKernels.h b/src/ATen/native/xpu/sycl/IndexingKernels.h index 89f7c2ca5..ca2b866b8 100644 --- a/src/ATen/native/xpu/sycl/IndexingKernels.h +++ b/src/ATen/native/xpu/sycl/IndexingKernels.h @@ -3,20 +3,22 @@ namespace at::native::xpu { -void index_kernel( +TORCH_XPU_API void index_kernel( TensorIteratorBase& iter, at::IntArrayRef index_size, at::IntArrayRef index_stride); -void index_select_kernel( +TORCH_XPU_API void index_select_kernel( const Tensor& self, int64_t dim, const Tensor& index, const Tensor& out); -void masked_fill_kernel(TensorIterator& iter, const Scalar& value); +TORCH_XPU_API void masked_fill_kernel( + TensorIterator& iter, + const Scalar& value); -void index_add_kernel( +TORCH_XPU_API void index_add_kernel( const Tensor& self, int64_t dim, const Tensor& index, @@ -24,27 +26,27 @@ void index_add_kernel( const Scalar& alpha, const Tensor& out); -void index_fill_kernel( +TORCH_XPU_API void index_fill_kernel( TensorIterator& iter, const int64_t dim, const int64_t self_dim_size, const int64_t self_dim_stride, const Scalar& source); -void index_put_kernel( +TORCH_XPU_API void index_put_kernel( TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, bool accumulate); -void index_put_deterministic_kernel( +TORCH_XPU_API void index_put_deterministic_kernel( Tensor& self, const c10::List>& indices, const Tensor& value, bool accumulate, bool unsafe); -void masked_scatter_kernel( +TORCH_XPU_API void masked_scatter_kernel( const TensorBase& self, const TensorBase& mask, const TensorBase& maskPrefixSum, diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp index b73ee96b3..6a4efc440 100644 --- a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.h b/src/ATen/native/xpu/sycl/LayerNormKernels.h index 0797d50fd..0c57a61ba 100644 --- a/src/ATen/native/xpu/sycl/LayerNormKernels.h +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -std::tuple layer_norm_kernel( +TORCH_XPU_API std::tuple layer_norm_kernel( const Tensor& X, const Tensor& gamma, const Tensor& beta, @@ -17,7 +17,7 @@ std::tuple layer_norm_kernel( Tensor& mean, Tensor& rstd); -std::tuple layer_norm_backward_kernel( +TORCH_XPU_API std::tuple layer_norm_backward_kernel( const Tensor& dY, const Tensor& X, const Tensor& mean, diff --git a/src/ATen/native/xpu/sycl/LerpKernels.cpp b/src/ATen/native/xpu/sycl/LerpKernels.cpp index b0f480ac3..1648f193b 100644 --- a/src/ATen/native/xpu/sycl/LerpKernels.cpp +++ b/src/ATen/native/xpu/sycl/LerpKernels.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/LerpKernels.h b/src/ATen/native/xpu/sycl/LerpKernels.h index c455adee8..6f13ec6eb 100644 --- a/src/ATen/native/xpu/sycl/LerpKernels.h +++ 
b/src/ATen/native/xpu/sycl/LerpKernels.h @@ -4,8 +4,10 @@ namespace at::native::xpu { -void lerp_tensor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void lerp_tensor_kernel(TensorIteratorBase& iter); -void lerp_scalar_kernel(TensorIteratorBase& iter, const c10::Scalar& weight); +TORCH_XPU_API void lerp_scalar_kernel( + TensorIteratorBase& iter, + const c10::Scalar& weight); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp index 8d3128e9e..0f478eceb 100644 --- a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp +++ b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp @@ -1,6 +1,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h index 32f987a2e..ca61016b9 100644 --- a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h +++ b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h @@ -3,6 +3,9 @@ namespace at::native::xpu { -void addr_kernel(TensorIterator& iter, const Scalar& beta, const Scalar& alpha); +TORCH_XPU_API void addr_kernel( + TensorIterator& iter, + const Scalar& beta, + const Scalar& alpha); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LogAddExpKernels.cpp b/src/ATen/native/xpu/sycl/LogAddExpKernels.cpp index 69b6a73a1..d79964295 100644 --- a/src/ATen/native/xpu/sycl/LogAddExpKernels.cpp +++ b/src/ATen/native/xpu/sycl/LogAddExpKernels.cpp @@ -5,6 +5,9 @@ #include #include #include + +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/LogAddExpKernels.h b/src/ATen/native/xpu/sycl/LogAddExpKernels.h index 01b9f400f..3a5ae4c34 100644 --- a/src/ATen/native/xpu/sycl/LogAddExpKernels.h +++ b/src/ATen/native/xpu/sycl/LogAddExpKernels.h @@ -4,8 +4,8 @@ namespace at::native::xpu { -void logaddexp_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logaddexp_kernel(TensorIteratorBase& iter); -void logaddexp2_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logaddexp2_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LossKernels.cpp b/src/ATen/native/xpu/sycl/LossKernels.cpp index f7c260535..69315adf5 100644 --- a/src/ATen/native/xpu/sycl/LossKernels.cpp +++ b/src/ATen/native/xpu/sycl/LossKernels.cpp @@ -4,6 +4,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/LossKernels.h b/src/ATen/native/xpu/sycl/LossKernels.h index 17f61afbb..560eb2c44 100644 --- a/src/ATen/native/xpu/sycl/LossKernels.h +++ b/src/ATen/native/xpu/sycl/LossKernels.h @@ -3,14 +3,14 @@ namespace at::native::xpu { -Tensor& binary_cross_entropy_kernel( +TORCH_XPU_API Tensor& binary_cross_entropy_kernel( const Tensor& input, const Tensor& target, const Tensor& weight, int64_t reduction, Tensor& loss); -Tensor& binary_cross_entropy_backward_kernel( +TORCH_XPU_API Tensor& binary_cross_entropy_backward_kernel( const Tensor& grad, const Tensor& input, const Tensor& target, diff --git a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp index c84613a4f..4b93cb3c3 100644 --- a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp @@ -11,6 +11,8 @@ #include #include +#include + namespace at::native::xpu { inline Tensor optional_contiguous(const Tensor& source) { return source.defined() ? 
source.contiguous() : source; @@ -251,7 +253,15 @@ void nll_loss2d_forward_kernel( at::ScalarType::BFloat16, input.scalar_type(), "nll_loss2d_forward_kernel", - [&] { + [&input_, + &weight_, + &target_, + &output, + &total_weight, + &input, + &target, + &reduction, + &ignore_index] { using accscalar_t = acc_type_device; AT_DISPATCH_INDEX_TYPES( at::native::canUse32BitIndexMath(input_, INT_MAX) @@ -527,4 +537,4 @@ void nll_loss2d_backward_kernel( } // namespace at::native::xpu #pragma GCC diagnostic pop -#pragma clang diagnostic pop \ No newline at end of file +#pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/LossNLL2dKernels.h b/src/ATen/native/xpu/sycl/LossNLL2dKernels.h index cb2084649..87519b5e7 100644 --- a/src/ATen/native/xpu/sycl/LossNLL2dKernels.h +++ b/src/ATen/native/xpu/sycl/LossNLL2dKernels.h @@ -3,7 +3,8 @@ #include namespace at::native::xpu { -void nll_loss2d_forward_kernel( + +TORCH_XPU_API void nll_loss2d_forward_kernel( Tensor& output, Tensor& total_weight, const Tensor& input, @@ -12,7 +13,7 @@ void nll_loss2d_forward_kernel( int64_t reduction, int64_t ignore_index); -void nll_loss2d_backward_kernel( +TORCH_XPU_API void nll_loss2d_backward_kernel( Tensor& grad_input, const Tensor& grad_output, const Tensor& input, @@ -21,4 +22,5 @@ void nll_loss2d_backward_kernel( int64_t reduction, int64_t ignore_index, const Tensor& total_weight); -} // namespace at::native::xpu \ No newline at end of file + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LossNLLKernel.cpp b/src/ATen/native/xpu/sycl/LossNLLKernel.cpp index 27610c814..d45d06545 100644 --- a/src/ATen/native/xpu/sycl/LossNLLKernel.cpp +++ b/src/ATen/native/xpu/sycl/LossNLLKernel.cpp @@ -4,6 +4,8 @@ #include #include +#include + namespace at::native::xpu { using namespace at::xpu; diff --git a/src/ATen/native/xpu/sycl/LossNLLKernel.h b/src/ATen/native/xpu/sycl/LossNLLKernel.h index bde50aa7b..f680aeb29 100644 --- a/src/ATen/native/xpu/sycl/LossNLLKernel.h +++ b/src/ATen/native/xpu/sycl/LossNLLKernel.h @@ -3,7 +3,7 @@ namespace at::native::xpu { -void nll_loss_forward_kernel( +TORCH_XPU_API void nll_loss_forward_kernel( const Tensor& self, const Tensor& target, const OptionalTensorRef weight_opt, @@ -12,7 +12,7 @@ void nll_loss_forward_kernel( const Tensor& output, const Tensor& total_weight); -void nll_loss_backward_kernel( +TORCH_XPU_API void nll_loss_backward_kernel( const Tensor& grad_output, const Tensor& self, const Tensor& target, @@ -22,4 +22,4 @@ void nll_loss_backward_kernel( const Tensor& total_weight, const Tensor& grad_input); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/Math.h b/src/ATen/native/xpu/sycl/MathExtensions.h similarity index 93% rename from src/ATen/native/xpu/sycl/Math.h rename to src/ATen/native/xpu/sycl/MathExtensions.h index 71e49d902..3a66e5983 100644 --- a/src/ATen/native/xpu/sycl/Math.h +++ b/src/ATen/native/xpu/sycl/MathExtensions.h @@ -9,12 +9,12 @@ namespace at::native::xpu { * For licensing information, please refer to the cpu implementation located in * "ATen/native/Math.h". 
*/ -template +template static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) { // [C++ Standard Reference: Gamma Function] // https://en.cppreference.com/w/cpp/numeric/math/tgamma using accscalar_t = at::acc_type_device; - static const double PI_f64 = 3.14159265358979323846; + static const pi_t PI_f64 = 3.14159265358979323846; const accscalar_t PSI_10 = 2.25175258906672110764; const accscalar_t A[] = { 8.33333333333333333333E-2, @@ -27,7 +27,7 @@ static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) { }; accscalar_t x = static_cast(in); - if (x == 0) { + if (x == accscalar_t(0)) { // As per C++ standard for gamma related functions and SciPy, // If the argument is ±0, ±∞ is returned return std::copysign(static_cast(INFINITY), -x); @@ -35,7 +35,7 @@ static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) { bool x_is_integer = x == std::trunc(x); accscalar_t result = 0; - if (x < 0) { + if (x < accscalar_t(0)) { if (x_is_integer) { // As per C++ standard for gamma related functions and SciPy, // If the argument is a negative integer, NaN is returned @@ -46,23 +46,23 @@ static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) { // mathematically equivalent since both x and r are in radians and tan() has // a periodicity of pi, in practice the computation of pi * x is a source of // error (when |x| > 1). - double q, r; - r = std::modf(static_cast(x), &q); + pi_t q, r; + r = std::modf(static_cast(x), &q); result = static_cast(-PI_f64 / std::tan(PI_f64 * r)); x = 1 - x; } - while (x < 10) { + while (x < accscalar_t(10)) { result -= 1 / x; x += 1; } - if (x == 10) { + if (x == accscalar_t(10)) { return static_cast(result + PSI_10); } accscalar_t y = 0; - if (x < 1.0e17) { - accscalar_t z = 1 / (x * x); + if (x < accscalar_t(1.0e17)) { + accscalar_t z = accscalar_t(1) / (x * x); accscalar_t polevl_result = 0; for (int i = 0; i <= 6; i++) { @@ -82,20 +82,23 @@ static inline C10_HOST_DEVICE scalar_t calc_trigamma(scalar_t in) { accscalar_t x = static_cast(in); accscalar_t sign = +1; accscalar_t result = 0; - if (x < 0.5f) { + if (x < accscalar_t(0.5)) { sign = -1; accscalar_t sin_pi_x = std::sin(PI * x); result -= (PI * PI) / (sin_pi_x * sin_pi_x); - x = 1 - x; + x = accscalar_t(1) - x; } for (int i = 0; i < 6; ++i) { - result += 1 / (x * x); - x += 1; + result += accscalar_t(1) / (x * x); + x += accscalar_t(1); } - const accscalar_t one = static_cast(1); - const accscalar_t ixx = 1 / (x * x); - result += (1 + 1 / (2 * x) + - ixx * (one / 6 - ixx * (one / 30 - ixx * (one / 42)))) / + const accscalar_t one = accscalar_t(1); + const accscalar_t ixx = accscalar_t(1) / (x * x); + result += + (accscalar_t(1) + accscalar_t(1) / (accscalar_t(2) * x) + + ixx * + (one / accscalar_t(6) - + ixx * (one / accscalar_t(30) - ixx * (one / accscalar_t(42))))) / x; return static_cast(sign * result); } @@ -122,7 +125,7 @@ chbevl(scalar_t _x, const scalar_t array[], size_t len) { b0 = _x * b1 - b2 + array[i]; } - return (0.5 * (b0 - b2)); + return (scalar_t(0.5) * (b0 - b2)); } /* diff --git a/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.cpp b/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.cpp index 0d38a4882..6d535a40d 100644 --- a/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.cpp +++ b/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.h b/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.h index 063a086cd..b54c69717 100644 --- 
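The literal changes in calc_digamma, calc_trigamma and chbevl keep the arithmetic in the accumulate type: a bare double literal such as 0.5 or 1.0e17 promotes the whole expression to double, which a device without native FP64 cannot execute, so constants are wrapped in accscalar_t and the PI constant's type becomes the pi_t template parameter. A minimal illustration of the promotion being avoided (function name is illustrative):

template <typename accscalar_t>
accscalar_t halve(accscalar_t x) {
  // return x * 0.5;            // 0.5 is a double literal: the multiply is
  //                            // performed in double
  return x * accscalar_t(0.5);  // stays in accscalar_t (e.g. float)
}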
a/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.h +++ b/src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void maximum_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void maximum_kernel(TensorIteratorBase& iter); -void minimum_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void minimum_kernel(TensorIteratorBase& iter); -void fmax_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void fmax_kernel(TensorIteratorBase& iter); -void fmin_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void fmin_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/MultinomialKernel.cpp b/src/ATen/native/xpu/sycl/MultinomialKernel.cpp index 866f62a1a..d8418fef9 100644 --- a/src/ATen/native/xpu/sycl/MultinomialKernel.cpp +++ b/src/ATen/native/xpu/sycl/MultinomialKernel.cpp @@ -15,6 +15,9 @@ #include #include #include + +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/MultinomialKernel.h b/src/ATen/native/xpu/sycl/MultinomialKernel.h index d400e51cd..5adede6f6 100644 --- a/src/ATen/native/xpu/sycl/MultinomialKernel.h +++ b/src/ATen/native/xpu/sycl/MultinomialKernel.h @@ -3,10 +3,10 @@ namespace at::native::xpu { -void multinomial_kernel( +TORCH_XPU_API void multinomial_kernel( Tensor& result, const Tensor& self, const int64_t n_sample, c10::optional generator); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/NMSKernel.cpp b/src/ATen/native/xpu/sycl/NMSKernel.cpp index ea4d0b08f..fe42226c5 100644 --- a/src/ATen/native/xpu/sycl/NMSKernel.cpp +++ b/src/ATen/native/xpu/sycl/NMSKernel.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/NMSKernel.h b/src/ATen/native/xpu/sycl/NMSKernel.h index 17aec3f7b..198223731 100644 --- a/src/ATen/native/xpu/sycl/NMSKernel.h +++ b/src/ATen/native/xpu/sycl/NMSKernel.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -Tensor nms_kernel(const Tensor& dets_sorted, float iou_threshold); +TORCH_XPU_API Tensor nms_kernel(const Tensor& dets_sorted, float iou_threshold); } } // namespace native diff --git a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp index e1c50a263..0cb77d52f 100644 --- a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp +++ b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { struct FlattenIdxtoRealIdxKernelFunctor { diff --git a/src/ATen/native/xpu/sycl/NonzeroKernel.h b/src/ATen/native/xpu/sycl/NonzeroKernel.h index ae79d01f8..f8e602ed2 100644 --- a/src/ATen/native/xpu/sycl/NonzeroKernel.h +++ b/src/ATen/native/xpu/sycl/NonzeroKernel.h @@ -3,6 +3,6 @@ namespace at::native::xpu { -void nonzero_kernel(const Tensor& self, Tensor& out); +TORCH_XPU_API void nonzero_kernel(const Tensor& self, Tensor& out); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp index 68ef76bff..10f3765ac 100644 --- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp @@ -6,6 +6,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h index 8c8bc0d26..230b693f5 100644 --- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h 
+++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h @@ -4,18 +4,20 @@ namespace at::native::xpu { -void addcmul_kernel(TensorIteratorBase& iter, const Scalar& value); +TORCH_XPU_API void addcmul_kernel(TensorIteratorBase& iter, const Scalar& value); -void addcdiv_kernel(TensorIteratorBase& iter, const Scalar& value); +TORCH_XPU_API void addcdiv_kernel(TensorIteratorBase& iter, const Scalar& value); -void mse_backward_kernel(TensorIterator& iter, const Scalar& value); +TORCH_XPU_API void mse_backward_kernel( + TensorIterator& iter, + const Scalar& value); -void smooth_l1_backward_kernel( +TORCH_XPU_API void smooth_l1_backward_kernel( TensorIterator& iter, const Scalar& norm, double beta); -void huber_backward_kernel( +TORCH_XPU_API void huber_backward_kernel( TensorIterator& iter, const Scalar& norm, double delta); diff --git a/src/ATen/native/xpu/sycl/PowKernels.cpp b/src/ATen/native/xpu/sycl/PowKernels.cpp index f96b3f482..e080511d2 100644 --- a/src/ATen/native/xpu/sycl/PowKernels.cpp +++ b/src/ATen/native/xpu/sycl/PowKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { @@ -35,6 +37,16 @@ static inline c10::complex pow_(c10::complex base, c10::complex exp) { } // namespace impl +#ifdef _MSC_VER +// Divergence for MSVC due to accuracy issue. https://github.com/intel/torch-xpu-ops/issues/842. +template +struct PowTensorTensorCastFunctor { + using opmath_t = at::opmath_type; + opmath_t operator()(opmath_t base, opmath_t exp) const { + return impl::pow_(base, exp); + } +}; +#else template struct PowTensorTensorCastFunctor { scalar_t operator()(scalar_t base, scalar_t exp) const { @@ -42,6 +54,7 @@ struct PowTensorTensorCastFunctor { return impl::pow_(opmath_t{base}, opmath_t{exp}); } }; +#endif template struct PowTensorTensorFunctor { diff --git a/src/ATen/native/xpu/sycl/PowKernels.h b/src/ATen/native/xpu/sycl/PowKernels.h index 31a31c420..d21d0d1a7 100644 --- a/src/ATen/native/xpu/sycl/PowKernels.h +++ b/src/ATen/native/xpu/sycl/PowKernels.h @@ -6,11 +6,11 @@ namespace at { namespace native { namespace xpu { -void pow_tensor_scalar_kernel( +TORCH_XPU_API void pow_tensor_scalar_kernel( TensorIteratorBase& iter, const Scalar& exp_scalar); -void pow_tensor_tensor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void pow_tensor_tensor_kernel(TensorIteratorBase& iter); } // namespace xpu } // namespace native diff --git a/src/ATen/native/xpu/sycl/RandpermKernel.cpp b/src/ATen/native/xpu/sycl/RandpermKernel.cpp index cfc595fb5..824cd2d0e 100644 --- a/src/ATen/native/xpu/sycl/RandpermKernel.cpp +++ b/src/ATen/native/xpu/sycl/RandpermKernel.cpp @@ -9,6 +9,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/RandpermKernel.h b/src/ATen/native/xpu/sycl/RandpermKernel.h index 3547c74ad..ef73145d4 100644 --- a/src/ATen/native/xpu/sycl/RandpermKernel.h +++ b/src/ATen/native/xpu/sycl/RandpermKernel.h @@ -2,8 +2,8 @@ #include namespace at::native::xpu { -Tensor randperm_kernel( - Tensor& result, - int64_t n, - c10::optional generator); -} // namespace at::native::xpu \ No newline at end of file + +TORCH_XPU_API Tensor +randperm_kernel(Tensor& result, int64_t n, c10::optional generator); + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp index b7d542b48..152d75eae 100644 --- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp +++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp @@ -5,6 +5,8 @@ #include 
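The MSVC-only PowTensorTensorCastFunctor takes and returns the wider opmath type instead of scalar_t, sidestepping the accuracy issue referenced in the comment; on other compilers the functor converts to opmath_t internally and narrows on return. A self-contained sketch of that compute-in-opmath pattern, using std::pow in place of the in-tree impl::pow_:

#include <ATen/OpMathType.h>
#include <cmath>

// Non-MSVC shape of the functor: widen the operands, compute in opmath_t,
// narrow once at the end.
template <typename scalar_t>
struct PowInOpmathSketch {
  using opmath_t = at::opmath_type<scalar_t>;
  scalar_t operator()(scalar_t base, scalar_t exp) const {
    return static_cast<scalar_t>(std::pow(opmath_t(base), opmath_t(exp)));
  }
};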
#include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h index 3cf08ca5d..71dc4f5fb 100644 --- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h +++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h @@ -6,13 +6,13 @@ namespace at { namespace native { namespace xpu { -Tensor& arange_kernel( +TORCH_XPU_API Tensor& arange_kernel( const Scalar& start, const Scalar& end, const Scalar& step, Tensor& result); -Tensor& range_kernel( +TORCH_XPU_API Tensor& range_kernel( const Scalar& start, const Scalar& end, const Scalar& step, diff --git a/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp b/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp index 121761053..a2df2bd63 100644 --- a/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp b/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp index 1e18b2e5b..c522dbe7c 100644 --- a/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp b/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp index 3c9f8453d..60f92ef3b 100644 --- a/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/ReduceLogicKernels.cpp b/src/ATen/native/xpu/sycl/ReduceLogicKernels.cpp index b9148494a..3c70c8951 100644 --- a/src/ATen/native/xpu/sycl/ReduceLogicKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceLogicKernels.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { template struct AndFunctor { diff --git a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp index 16095056c..3dfbb05c2 100644 --- a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.h b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.h index 6503f8842..9a669ec7a 100644 --- a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.h +++ b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void max_values_kernel(TensorIterator& iter); +TORCH_XPU_API void max_values_kernel(TensorIterator& iter); -void max_kernel(TensorIterator& iter); +TORCH_XPU_API void max_kernel(TensorIterator& iter); -void max_all_kernel(TensorIterator& iter); +TORCH_XPU_API void max_all_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp index 2a0ce889c..f73df51a0 100644 --- a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.h b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.h index d5600f5d4..65f3acf8e 100644 
--- a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.h +++ b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void min_values_kernel(TensorIterator& iter); +TORCH_XPU_API void min_values_kernel(TensorIterator& iter); -void min_kernel(TensorIterator& iter); +TORCH_XPU_API void min_kernel(TensorIterator& iter); -void min_all_kernel(TensorIterator& iter); +TORCH_XPU_API void min_all_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp b/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp index 6d0e75680..7e24f7ecc 100644 --- a/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp b/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp index 074e8022d..ef405be49 100644 --- a/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp @@ -1,8 +1,10 @@ #include -#include #include #include +#include + +#include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/ReduceNormKernel.h b/src/ATen/native/xpu/sycl/ReduceNormKernel.h index 5e255dd5b..bc14826ec 100644 --- a/src/ATen/native/xpu/sycl/ReduceNormKernel.h +++ b/src/ATen/native/xpu/sycl/ReduceNormKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void norm_kernel(TensorIterator& iter, const Scalar& val); +TORCH_XPU_API void norm_kernel(TensorIterator& iter, const Scalar& val); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReduceOpsKernels.h b/src/ATen/native/xpu/sycl/ReduceOpsKernels.h index d85754efb..8b03fb137 100644 --- a/src/ATen/native/xpu/sycl/ReduceOpsKernels.h +++ b/src/ATen/native/xpu/sycl/ReduceOpsKernels.h @@ -4,26 +4,29 @@ namespace at::native::xpu { -void argmax_kernel(TensorIterator& iter); +TORCH_XPU_API void argmax_kernel(TensorIterator& iter); -void argmin_kernel(TensorIterator& iter); +TORCH_XPU_API void argmin_kernel(TensorIterator& iter); -void and_kernel(TensorIterator& iter); +TORCH_XPU_API void and_kernel(TensorIterator& iter); -void or_kernel(TensorIterator& iter); +TORCH_XPU_API void or_kernel(TensorIterator& iter); -void mean_kernel(TensorIterator& iter); +TORCH_XPU_API void mean_kernel(TensorIterator& iter); -void sum_kernel(TensorIterator& iter); +TORCH_XPU_API void sum_kernel(TensorIterator& iter); -void prod_kernel(TensorIterator& iter); +TORCH_XPU_API void prod_kernel(TensorIterator& iter); -void nansum_kernel(TensorIterator& iter); +TORCH_XPU_API void nansum_kernel(TensorIterator& iter); -void std_var_kernel(TensorIterator& iter, double correction, bool take_sqrt); +TORCH_XPU_API void std_var_kernel( + TensorIterator& iter, + double correction, + bool take_sqrt); -void aminmax_kernel(TensorIterator& iter); +TORCH_XPU_API void aminmax_kernel(TensorIterator& iter); -void aminmax_allreduce_kernel(TensorIterator& iter); +TORCH_XPU_API void aminmax_allreduce_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp index f250b34ad..7bdc3a188 100644 --- a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp 
b/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp index d90f63e18..ef838a969 100644 --- a/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp @@ -13,6 +13,8 @@ #include #include +#include + namespace at::native::xpu { inline std::pair get_index_mapping1d( @@ -613,7 +615,7 @@ void reflection_pad1d_backward_kernel( } void reflection_pad2d_kernel( - Tensor& output, + const Tensor& output, const Tensor& input_, IntArrayRef padding) { TORCH_CHECK( @@ -710,7 +712,7 @@ void reflection_pad2d_kernel( } void reflection_pad2d_backward_kernel( - Tensor& grad_input, + const Tensor& grad_input, const Tensor& grad_output_, const Tensor& input, IntArrayRef padding) { @@ -873,4 +875,4 @@ void reflection_pad3d_backward_kernel( } // namespace at::native::xpu #pragma GCC diagnostic pop -#pragma clang diagnostic pop \ No newline at end of file +#pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/ReflectionPadKernels.h b/src/ATen/native/xpu/sycl/ReflectionPadKernels.h index efd61f208..fc41b0b75 100644 --- a/src/ATen/native/xpu/sycl/ReflectionPadKernels.h +++ b/src/ATen/native/xpu/sycl/ReflectionPadKernels.h @@ -4,37 +4,37 @@ namespace at::native::xpu { -void reflection_pad1d_kernel( +TORCH_XPU_API void reflection_pad1d_kernel( const Tensor& output, const Tensor& input_, IntArrayRef padding); -void reflection_pad1d_backward_kernel( +TORCH_XPU_API void reflection_pad1d_backward_kernel( const Tensor& grad_input, const Tensor& grad_output_, const Tensor& input, IntArrayRef padding); -void reflection_pad2d_kernel( - Tensor& output, +TORCH_XPU_API void reflection_pad2d_kernel( + const Tensor& output, const Tensor& input_, IntArrayRef padding); -void reflection_pad2d_backward_kernel( - Tensor& grad_input, +TORCH_XPU_API void reflection_pad2d_backward_kernel( + const Tensor& grad_input, const Tensor& grad_output_, const Tensor& input, IntArrayRef padding); -void reflection_pad3d_kernel( +TORCH_XPU_API void reflection_pad3d_kernel( const Tensor& output, const Tensor& input_, IntArrayRef padding); -void reflection_pad3d_backward_kernel( +TORCH_XPU_API void reflection_pad3d_backward_kernel( const Tensor& grad_input, const Tensor& grad_output, const Tensor& input, IntArrayRef padding); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/RenormKernel.cpp b/src/ATen/native/xpu/sycl/RenormKernel.cpp index 0a1696d73..0f08a6566 100644 --- a/src/ATen/native/xpu/sycl/RenormKernel.cpp +++ b/src/ATen/native/xpu/sycl/RenormKernel.cpp @@ -2,6 +2,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/RenormKernel.h b/src/ATen/native/xpu/sycl/RenormKernel.h index a619cf708..488e8365a 100644 --- a/src/ATen/native/xpu/sycl/RenormKernel.h +++ b/src/ATen/native/xpu/sycl/RenormKernel.h @@ -3,6 +3,8 @@ namespace at::native::xpu { -void renorm_scale_factor_kernel(TensorIteratorBase& iter, double maxnorm); +TORCH_XPU_API void renorm_scale_factor_kernel( + TensorIteratorBase& iter, + double maxnorm); } diff --git a/src/ATen/native/xpu/sycl/RepeatKernel.cpp b/src/ATen/native/xpu/sycl/RepeatKernel.cpp index 0b10f982a..f1eeb426b 100644 --- a/src/ATen/native/xpu/sycl/RepeatKernel.cpp +++ b/src/ATen/native/xpu/sycl/RepeatKernel.cpp @@ -2,6 +2,9 @@ #include #include #include + +#include + namespace at::native::xpu { template struct RepeatInterleaveKernelFunctor { @@ -72,4 +75,4 @@ Tensor repeat_interleave_kernel( }); return output; } -} // namespace 
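reflection_pad2d_kernel and reflection_pad2d_backward_kernel now take the tensors they write to as const Tensor&, in line with the convention used for out arguments elsewhere in ATen: const binds to the Tensor handle, not to the storage behind it, so the kernel can still write into the tensor. A small illustration (the function name is made up for the example):

#include <ATen/ATen.h>

// Writing through a const Tensor& is allowed: Tensor is a handle type and its
// mutating methods (fill_, copy_, ...) are const member functions.
void write_ones(const at::Tensor& out) {
  out.fill_(1);  // mutates the underlying storage despite the const reference
}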
at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/RepeatKernel.h b/src/ATen/native/xpu/sycl/RepeatKernel.h index 3df41a956..9856c4305 100644 --- a/src/ATen/native/xpu/sycl/RepeatKernel.h +++ b/src/ATen/native/xpu/sycl/RepeatKernel.h @@ -2,8 +2,8 @@ #include namespace at::native::xpu { -Tensor repeat_interleave_kernel( +TORCH_XPU_API Tensor repeat_interleave_kernel( const Tensor& repeats, c10::optional output_size); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp index 6b85e9318..75b9322b3 100644 --- a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp @@ -13,6 +13,8 @@ #include #include +#include + namespace at::native::xpu { inline int imin(int a, int b) { @@ -517,7 +519,7 @@ void replication_pad2d_kernel( } void replication_pad2d_backward_kernel( - Tensor& grad_input, + const Tensor& grad_input, const Tensor& grad_output, const Tensor& input, IntArrayRef padding) { @@ -763,4 +765,4 @@ void replication_pad3d_backward_kernel( } // namespace at::native::xpu #pragma GCC diagnostic pop -#pragma clang diagnostic pop \ No newline at end of file +#pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h index 545b26da8..35e20c269 100644 --- a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h +++ b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h @@ -4,37 +4,37 @@ namespace at::native::xpu { -void replication_pad1d_kernel( +TORCH_XPU_API void replication_pad1d_kernel( const Tensor& output, const Tensor& input, IntArrayRef padding); -void replication_pad1d_backward_kernel( +TORCH_XPU_API void replication_pad1d_backward_kernel( const Tensor& grad_input, const Tensor& grad_output, const Tensor& input, IntArrayRef padding); -void replication_pad2d_kernel( +TORCH_XPU_API void replication_pad2d_kernel( const Tensor& output, const Tensor& input, IntArrayRef padding); -void replication_pad2d_backward_kernel( - Tensor& grad_input, +TORCH_XPU_API void replication_pad2d_backward_kernel( + const Tensor& grad_input, const Tensor& grad_output, const Tensor& input, IntArrayRef padding); -void replication_pad3d_kernel( +TORCH_XPU_API void replication_pad3d_kernel( const Tensor& output, const Tensor& input, IntArrayRef padding); -void replication_pad3d_backward_kernel( +TORCH_XPU_API void replication_pad3d_backward_kernel( const Tensor& grad_input, const Tensor& grad_output, const Tensor& input, IntArrayRef padding); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ResizeKernel.cpp b/src/ATen/native/xpu/sycl/ResizeKernel.cpp index 195d526d8..0835b53ff 100644 --- a/src/ATen/native/xpu/sycl/ResizeKernel.cpp +++ b/src/ATen/native/xpu/sycl/ResizeKernel.cpp @@ -3,6 +3,8 @@ #include #include +#include + namespace at::native::xpu { void resize_bytes_xpu(StorageImpl* storage, size_t size_bytes) { @@ -58,7 +60,7 @@ TensorImpl* resize_impl_xpu_( TensorImpl* self, IntArrayRef size, at::OptionalIntArrayRef stride, - bool device_guard = true) { + bool device_guard) { if (self->sizes() == size && (!stride || self->strides() == stride)) { return self; } diff --git a/src/ATen/native/xpu/sycl/ResizeKernel.h b/src/ATen/native/xpu/sycl/ResizeKernel.h index 
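In ResizeKernel.cpp the '= true' default on device_guard is dropped from the definition of resize_impl_xpu_. C++ allows a default argument to be specified only once along a declaration chain, so once the exported declaration in ResizeKernel.h carries it (an assumption here, since that part of the header is outside the hunk), repeating it on the out-of-line definition would be an error. Minimal illustration of the rule, with made-up names:

// header: the declaration owns the default argument
int resize_impl(int size, bool device_guard = true);

// source file: the definition must not repeat '= true'
int resize_impl(int size, bool device_guard) {
  return device_guard ? size : -size;
}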
5cef196ed..c5c113194 100644 --- a/src/ATen/native/xpu/sycl/ResizeKernel.h +++ b/src/ATen/native/xpu/sycl/ResizeKernel.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -TensorImpl* resize_impl_xpu_( +TORCH_XPU_API TensorImpl* resize_impl_xpu_( TensorImpl* self, IntArrayRef size, at::OptionalIntArrayRef stride, diff --git a/src/ATen/native/xpu/sycl/ScanKernels.h b/src/ATen/native/xpu/sycl/ScanKernels.h index af632fa83..ea20e4460 100644 --- a/src/ATen/native/xpu/sycl/ScanKernels.h +++ b/src/ATen/native/xpu/sycl/ScanKernels.h @@ -3,8 +3,14 @@ namespace at::native::xpu { -void cumsum_kernel(const Tensor& result, const Tensor& self, int64_t dim); +TORCH_XPU_API void cumsum_kernel( + const Tensor& result, + const Tensor& self, + int64_t dim); -void cumprod_kernel(const Tensor& result, const Tensor& self, int64_t dim); +TORCH_XPU_API void cumprod_kernel( + const Tensor& result, + const Tensor& self, + int64_t dim); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ScatterGatherKernels.cpp b/src/ATen/native/xpu/sycl/ScatterGatherKernels.cpp index 81d4b9e9a..597be8553 100644 --- a/src/ATen/native/xpu/sycl/ScatterGatherKernels.cpp +++ b/src/ATen/native/xpu/sycl/ScatterGatherKernels.cpp @@ -14,6 +14,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/ScatterGatherKernels.h b/src/ATen/native/xpu/sycl/ScatterGatherKernels.h index 0f8b7e5be..4d7d375b7 100644 --- a/src/ATen/native/xpu/sycl/ScatterGatherKernels.h +++ b/src/ATen/native/xpu/sycl/ScatterGatherKernels.h @@ -6,45 +6,45 @@ namespace at { namespace native { namespace xpu { -void gather_kernel( +TORCH_XPU_API void gather_kernel( const Tensor& result, const Tensor& self, int64_t dim, const Tensor& index); -void scatter_kernel( +TORCH_XPU_API void scatter_kernel( const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src); -void scatter_fill_kernel( +TORCH_XPU_API void scatter_fill_kernel( const Tensor& self, int64_t dim, const Tensor& index, const Scalar& src); -void scatter_add_kernel( +TORCH_XPU_API void scatter_add_kernel( const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src); -void scatter_reduce_kernel( +TORCH_XPU_API void scatter_reduce_kernel( const Tensor& self, const int64_t dim, const Tensor& index, const Tensor& src, const ReductionType& reduce); -void scatter_reduce_two_kernel( +TORCH_XPU_API void scatter_reduce_two_kernel( const Tensor& self, const int64_t dim, const Tensor& index, const Tensor& src, const ReductionType& reduce); -void scatter_scalar_reduce_kernel( +TORCH_XPU_API void scatter_scalar_reduce_kernel( const Tensor& self, const int64_t dim, const Tensor& index, diff --git a/src/ATen/native/xpu/sycl/Shape.cpp b/src/ATen/native/xpu/sycl/Shape.cpp index 23d94b037..eb1f0e090 100644 --- a/src/ATen/native/xpu/sycl/Shape.cpp +++ b/src/ATen/native/xpu/sycl/Shape.cpp @@ -11,6 +11,8 @@ #include #include +#include + namespace at::native::xpu { // The best performance is achieved for parallel computing with 1024 batch sizes diff --git a/src/ATen/native/xpu/sycl/ShapeKernels.h b/src/ATen/native/xpu/sycl/ShapeKernels.h index 494f37020..8f758bfb2 100644 --- a/src/ATen/native/xpu/sycl/ShapeKernels.h +++ b/src/ATen/native/xpu/sycl/ShapeKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void cat_out_kernel( +TORCH_XPU_API void cat_out_kernel( const ITensorListRef& tensors, int64_t dim, int64_t valid, diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index 
e0a35a295..45c9cb016 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -10,6 +10,8 @@ #include +#include + using namespace xpu::sycl; namespace at { diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.h b/src/ATen/native/xpu/sycl/SoftMaxKernels.h index 4b8741a2a..87d205442 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.h +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.h @@ -6,26 +6,26 @@ namespace at { namespace native { namespace xpu { -void _softmax_kernel( +TORCH_XPU_API void _softmax_kernel( const Tensor& input, const int64_t dim, const bool half_to_float, const Tensor& output); -void _log_softmax_kernel( +TORCH_XPU_API void _log_softmax_kernel( const Tensor& input, const int64_t dim, const bool half_to_float, const Tensor& output); -void _softmax_backward_kernel( +TORCH_XPU_API void _softmax_backward_kernel( const Tensor& grad, const Tensor& output, int64_t dim, bool half_to_float, const Tensor& grad_input); -void _log_softmax_backward_kernel( +TORCH_XPU_API void _log_softmax_backward_kernel( const Tensor& grad, const Tensor& output, int64_t dim, diff --git a/src/ATen/native/xpu/sycl/Sorting.cpp b/src/ATen/native/xpu/sycl/Sorting.cpp index f75b50ac5..cf41810dc 100644 --- a/src/ATen/native/xpu/sycl/Sorting.cpp +++ b/src/ATen/native/xpu/sycl/Sorting.cpp @@ -17,6 +17,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/Sorting.h b/src/ATen/native/xpu/sycl/Sorting.h index f4ce097bc..14164ddf9 100644 --- a/src/ATen/native/xpu/sycl/Sorting.h +++ b/src/ATen/native/xpu/sycl/Sorting.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void sort_stable_kernel( +TORCH_XPU_API void sort_stable_kernel( const TensorBase& self_base, const TensorBase& values_base, const TensorBase& indices_base, @@ -12,7 +12,7 @@ void sort_stable_kernel( bool descending, bool stable); -void launch_median_kernel( +TORCH_XPU_API void launch_median_kernel( const TensorBase& vals, const TensorBase& inds, const TensorBase& self, diff --git a/src/ATen/native/xpu/sycl/StepKernels.cpp b/src/ATen/native/xpu/sycl/StepKernels.cpp index d3d76926b..34f703591 100644 --- a/src/ATen/native/xpu/sycl/StepKernels.cpp +++ b/src/ATen/native/xpu/sycl/StepKernels.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/StepKernels.h b/src/ATen/native/xpu/sycl/StepKernels.h index 90f949d9a..c026a6b65 100644 --- a/src/ATen/native/xpu/sycl/StepKernels.h +++ b/src/ATen/native/xpu/sycl/StepKernels.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void nextafter_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void nextafter_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp b/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp index 265170821..6e88a987f 100644 --- a/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp @@ -10,6 +10,8 @@ #include #include +#include + namespace at::native::xpu { using namespace at::native; using namespace at::xpu::detail; diff --git a/src/ATen/native/xpu/sycl/SummaryOpsKernels.h b/src/ATen/native/xpu/sycl/SummaryOpsKernels.h index 4d0f06dfb..50f863212 100644 --- a/src/ATen/native/xpu/sycl/SummaryOpsKernels.h +++ b/src/ATen/native/xpu/sycl/SummaryOpsKernels.h @@ -3,9 +3,7 @@ namespace at::native::xpu { -Tensor bincount_kernel( - const Tensor& self, - const Tensor& weights, - int64_t minlength); +TORCH_XPU_API Tensor 
+bincount_kernel(const Tensor& self, const Tensor& weights, int64_t minlength); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp b/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp index 7185b6453..c6471aac2 100644 --- a/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp +++ b/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp @@ -6,6 +6,8 @@ #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/TensorCompareKernels.h b/src/ATen/native/xpu/sycl/TensorCompareKernels.h index 51cb74a40..0e83f33e9 100644 --- a/src/ATen/native/xpu/sycl/TensorCompareKernels.h +++ b/src/ATen/native/xpu/sycl/TensorCompareKernels.h @@ -4,20 +4,24 @@ namespace at::native::xpu { -void where_kernel(TensorIterator& iter); +TORCH_XPU_API void where_kernel(TensorIterator& iter); -void clamp_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void clamp_kernel(TensorIteratorBase& iter); -void clamp_scalar_kernel( +TORCH_XPU_API void clamp_scalar_kernel( TensorIteratorBase& iter, const Scalar& min, const Scalar& max); -void clamp_min_scalar_kernel(TensorIteratorBase& iter, Scalar min); +TORCH_XPU_API void clamp_min_scalar_kernel( + TensorIteratorBase& iter, + Scalar min); -void clamp_max_scalar_kernel(TensorIteratorBase& iter, Scalar max); +TORCH_XPU_API void clamp_max_scalar_kernel( + TensorIteratorBase& iter, + Scalar max); -void isin_kernel( +TORCH_XPU_API void isin_kernel( const Tensor& elements, const Tensor& test_elements, bool invert, diff --git a/src/ATen/native/xpu/sycl/TensorTopKKernel.cpp b/src/ATen/native/xpu/sycl/TensorTopKKernel.cpp index 581182ec9..8bd6c7c6b 100644 --- a/src/ATen/native/xpu/sycl/TensorTopKKernel.cpp +++ b/src/ATen/native/xpu/sycl/TensorTopKKernel.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at { namespace native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/TensorTopKKernel.h b/src/ATen/native/xpu/sycl/TensorTopKKernel.h index 7259e3f44..f04d5065e 100644 --- a/src/ATen/native/xpu/sycl/TensorTopKKernel.h +++ b/src/ATen/native/xpu/sycl/TensorTopKKernel.h @@ -6,7 +6,7 @@ namespace at { namespace native { namespace xpu { -void topk_kernel( +TORCH_XPU_API void topk_kernel( const at::Tensor& input, int64_t k, int64_t dim, diff --git a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp index 79d27b93a..9cdb0dacf 100644 --- a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp +++ b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + #ifdef _WIN32 #define RESTRICT __restrict #else diff --git a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.h b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.h index 84fa03f0a..1de970634 100644 --- a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.h +++ b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.h @@ -4,9 +4,9 @@ namespace at::native::xpu { -void flip_kernel(TensorIterator& iter, bool quantized); +TORCH_XPU_API void flip_kernel(TensorIterator& iter, bool quantized); -void roll_kernel( +TORCH_XPU_API void roll_kernel( const Tensor& input, Tensor& output, IntArrayRef shifts, diff --git a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp index c503e5b16..45fcc655e 100644 --- a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace 
at::native::xpu { using namespace at::xpu; diff --git a/src/ATen/native/xpu/sycl/TriangularOpsKernels.h b/src/ATen/native/xpu/sycl/TriangularOpsKernels.h index 85e144308..0cf9630b9 100644 --- a/src/ATen/native/xpu/sycl/TriangularOpsKernels.h +++ b/src/ATen/native/xpu/sycl/TriangularOpsKernels.h @@ -3,8 +3,14 @@ namespace at::native::xpu { -void tril_kernel(const Tensor& result, const Tensor& self, int64_t k); +TORCH_XPU_API void tril_kernel( + const Tensor& result, + const Tensor& self, + int64_t k); -void triu_kernel(const Tensor& result, const Tensor& self, int64_t k); +TORCH_XPU_API void triu_kernel( + const Tensor& result, + const Tensor& self, + int64_t k); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp b/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp index d00720cd8..31117dd17 100644 --- a/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp @@ -9,6 +9,10 @@ #include #include +#include +#include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryComplexKernels.h b/src/ATen/native/xpu/sycl/UnaryComplexKernels.h index 877a1e311..aef6cde9f 100644 --- a/src/ATen/native/xpu/sycl/UnaryComplexKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryComplexKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void conj_kernel(TensorIterator& iter); +TORCH_XPU_API void conj_kernel(TensorIterator& iter); -void conj_physical_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void conj_physical_kernel(TensorIteratorBase& iter); -void neg_conj_kernel(TensorIterator& iter); +TORCH_XPU_API void neg_conj_kernel(TensorIterator& iter); -void neg_kernel(TensorIterator& iter); +TORCH_XPU_API void neg_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp b/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp index 96510a0e0..a8551c262 100644 --- a/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp @@ -4,6 +4,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryFractionKernels.h b/src/ATen/native/xpu/sycl/UnaryFractionKernels.h index 10f4889b0..a3a2015df 100644 --- a/src/ATen/native/xpu/sycl/UnaryFractionKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryFractionKernels.h @@ -4,16 +4,18 @@ namespace at::native::xpu { -void reciprocal_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void reciprocal_kernel(TensorIteratorBase& iter); -void floor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void floor_kernel(TensorIteratorBase& iter); -void ceil_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void ceil_kernel(TensorIteratorBase& iter); -void round_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void round_kernel(TensorIteratorBase& iter); -void round_decimals_kernel(TensorIteratorBase& iter, int64_t decimals); +TORCH_XPU_API void round_decimals_kernel( + TensorIteratorBase& iter, + int64_t decimals); -void frac_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void frac_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGammaKernels.cpp b/src/ATen/native/xpu/sycl/UnaryGammaKernels.cpp index 7b23e22ed..d2546e842 100644 --- a/src/ATen/native/xpu/sycl/UnaryGammaKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGammaKernels.cpp @@ -5,16 +5,23 @@ #include #include #include -#include +#include #include 
#include +#include + namespace at::native::xpu { -template +template struct DigammaFunctor { scalar_t operator()(scalar_t a) const { - return calc_digamma(a); + if constexpr (USE_FP64_PI) { + return calc_digamma(a); + } else { + using pi_t = at::acc_type_device; + return calc_digamma(a); + } } }; @@ -24,7 +31,13 @@ void digamma_kernel(TensorIteratorBase& iter) { at::ScalarType::BFloat16, iter.common_dtype(), "digamma_xpu", - [&]() { gpu_kernel(iter, DigammaFunctor()); }); + [&]() { + if (syclHasFloat64()) { + gpu_kernel(iter, DigammaFunctor()); + } else { + gpu_kernel(iter, DigammaFunctor()); + } + }); } template diff --git a/src/ATen/native/xpu/sycl/UnaryGammaKernels.h b/src/ATen/native/xpu/sycl/UnaryGammaKernels.h index 5eda977f6..2908a4b13 100644 --- a/src/ATen/native/xpu/sycl/UnaryGammaKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryGammaKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -void digamma_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void digamma_kernel(TensorIteratorBase& iter); -void polygamma_kernel(TensorIteratorBase& iter, int64_t n); +TORCH_XPU_API void polygamma_kernel(TensorIteratorBase& iter, int64_t n); -void lgamma_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void lgamma_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.cpp index e1b8ff80f..d35dedf96 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { template struct AcosFunctor { diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.h index 4ff06d256..17acb1061 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.h @@ -3,6 +3,7 @@ #include namespace at::native::xpu { -void acos_kernel(TensorIteratorBase& iter); -} // namespace at::native::xpu \ No newline at end of file +TORCH_XPU_API void acos_kernel(TensorIteratorBase& iter); + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.cpp index cd9864275..9062ee17a 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.h index 98dc2eed5..732d307ec 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void acosh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void acosh_kernel(TensorIteratorBase& iter); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp index c8b1aaca6..1819b47dc 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h index 
194ce6479..ff449e2cd 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void asin_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void asin_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp index 009a68b47..24169a1dc 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h index 4d37288de..2668ee963 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void asinh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void asinh_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp index f241e9da3..c6ce4b1b0 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h index 022720223..6ee127539 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void atan_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void atan_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp index 630a64d39..3033ff8f8 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h index 5536641e7..267b59eb6 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void atanh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void atanh_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.cpp index 21552d99d..e4eee5e7a 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.h index 35123a666..fc31580b7 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void cos_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void cos_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git 
a/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp index 11440b3d3..16cffe893 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h index 7f031e3ff..bbd10e1aa 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void cosh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void cosh_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.cpp index 2a17cefcb..27e3e2906 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.h index 76dca73c8..ecd4da1d4 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void sin_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sin_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.cpp index 2155aafad..06309f45c 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.h index e3ceaf643..a5005e1d0 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void sinh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sinh_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.cpp index 57c0049b1..996da809b 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.cpp @@ -3,6 +3,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.h index b96a35f93..9446ff058 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void tan_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void tan_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.cpp index dcbe8a970..1b71b3974 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.cpp +++ b/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.cpp @@ -5,6 +5,8 @@ #include +#include + namespace at::native::xpu { template diff --git 
a/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.h index 6cecc99e2..831421a12 100644 --- a/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.h +++ b/src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -void tanh_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void tanh_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryKernels.cpp b/src/ATen/native/xpu/sycl/UnaryKernels.cpp index d9291022d..614838f65 100644 --- a/src/ATen/native/xpu/sycl/UnaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryKernels.cpp @@ -11,6 +11,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryKernels.h b/src/ATen/native/xpu/sycl/UnaryKernels.h index cc394de6b..db7005523 100644 --- a/src/ATen/native/xpu/sycl/UnaryKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryKernels.h @@ -4,17 +4,17 @@ namespace at::native::xpu { -void sqrt_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sqrt_kernel(TensorIteratorBase& iter); -void rsqrt_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void rsqrt_kernel(TensorIteratorBase& iter); -void bitwise_not_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void bitwise_not_kernel(TensorIteratorBase& iter); -void exp_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void exp_kernel(TensorIteratorBase& iter); -void expm1_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void expm1_kernel(TensorIteratorBase& iter); -void nan_to_num_kernel( +TORCH_XPU_API void nan_to_num_kernel( TensorIteratorBase& iter, std::optional nan, std::optional pos_inf, diff --git a/src/ATen/native/xpu/sycl/UnaryLogKernels.cpp b/src/ATen/native/xpu/sycl/UnaryLogKernels.cpp index eb531cfcc..6be7cee18 100644 --- a/src/ATen/native/xpu/sycl/UnaryLogKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryLogKernels.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnaryLogKernels.h b/src/ATen/native/xpu/sycl/UnaryLogKernels.h index c8d16ceae..72db9235a 100644 --- a/src/ATen/native/xpu/sycl/UnaryLogKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryLogKernels.h @@ -4,12 +4,12 @@ namespace at::native::xpu { -void log_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log_kernel(TensorIteratorBase& iter); -void log10_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log10_kernel(TensorIteratorBase& iter); -void log1p_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log1p_kernel(TensorIteratorBase& iter); -void log2_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log2_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnarySignKernels.cpp b/src/ATen/native/xpu/sycl/UnarySignKernels.cpp index 655c1b503..7e878fdeb 100644 --- a/src/ATen/native/xpu/sycl/UnarySignKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnarySignKernels.cpp @@ -7,6 +7,8 @@ #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnarySignKernels.h b/src/ATen/native/xpu/sycl/UnarySignKernels.h index 98b6eeb5f..01759a725 100644 --- a/src/ATen/native/xpu/sycl/UnarySignKernels.h +++ b/src/ATen/native/xpu/sycl/UnarySignKernels.h @@ -4,14 +4,14 @@ namespace at::native::xpu { -void logical_not_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logical_not_kernel(TensorIteratorBase& iter); -void neg_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void neg_kernel(TensorIteratorBase& 
iter); -void sgn_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sgn_kernel(TensorIteratorBase& iter); -void sign_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sign_kernel(TensorIteratorBase& iter); -void signbit_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void signbit_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp index e5150176a..05cdd31b2 100644 --- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp @@ -11,6 +11,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h index 6bed9c6de..f810da576 100644 --- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h +++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h @@ -4,16 +4,18 @@ namespace at::native::xpu { -void sigmoid_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void sigmoid_kernel(TensorIteratorBase& iter); -void erf_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void erf_kernel(TensorIteratorBase& iter); -void erfc_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void erfc_kernel(TensorIteratorBase& iter); -void erfinv_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void erfinv_kernel(TensorIteratorBase& iter); -void exp2_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void exp2_kernel(TensorIteratorBase& iter); -void logit_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar); +TORCH_XPU_API void logit_kernel( + TensorIteratorBase& iter, + const Scalar& eps_scalar); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.cpp b/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.cpp index d4b0d32f2..77f310029 100644 --- a/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.cpp @@ -8,6 +8,8 @@ #include #include +#include + namespace at::native::xpu { constexpr int n_elems_per_work_item = 4; // UNROLLED_ELEM_PER_WORK_ITEM; diff --git a/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.h b/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.h index 2d618ce03..c74a8a950 100644 --- a/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.h +++ b/src/ATen/native/xpu/sycl/UnfoldBackwardKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void unfold_backward_kernel( +TORCH_XPU_API void unfold_backward_kernel( Tensor& grad_out, const Tensor& grad_in, int64_t dim, diff --git a/src/ATen/native/xpu/sycl/UniqueKernels.cpp b/src/ATen/native/xpu/sycl/UniqueKernels.cpp index b57274a8e..359712d1f 100644 --- a/src/ATen/native/xpu/sycl/UniqueKernels.cpp +++ b/src/ATen/native/xpu/sycl/UniqueKernels.cpp @@ -3,6 +3,8 @@ #include #include +#include + namespace at::native::xpu { struct UniqueNotEqualFunctor { diff --git a/src/ATen/native/xpu/sycl/UniqueKernels.h b/src/ATen/native/xpu/sycl/UniqueKernels.h index b7a7cd1ae..25b36cc64 100644 --- a/src/ATen/native/xpu/sycl/UniqueKernels.h +++ b/src/ATen/native/xpu/sycl/UniqueKernels.h @@ -3,29 +3,30 @@ #include namespace at::native::xpu { -std::tuple unique_consecutive_kernel( + +TORCH_XPU_API std::tuple unique_consecutive_kernel( const Tensor& self, const bool return_inverse, const bool return_counts, c10::optional dim); -std::tuple unique_dim_consecutive_kernel( +TORCH_XPU_API std::tuple unique_dim_consecutive_kernel( const Tensor& self, const int64_t dim, const bool return_inverse, const 
bool return_counts); -std::tuple unique_dim_kernel( +TORCH_XPU_API std::tuple unique_dim_kernel( const Tensor& self, const int64_t dim, const bool return_inverse, const bool return_counts); -std::tuple _unique_kernel( +TORCH_XPU_API std::tuple _unique_kernel( const Tensor& self, const bool return_inverse); -std::tuple _unique2_kernel( +TORCH_XPU_API std::tuple _unique2_kernel( const Tensor& self, const bool return_inverse, const bool return_counts); diff --git a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp index 5023d5241..1a64e233b 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.h b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.h index a70a08c8c..fbf900238 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void upsample_bicubic2d_kernel( +TORCH_XPU_API void upsample_bicubic2d_kernel( const Tensor& output, const Tensor& input, IntArrayRef output_size, @@ -12,4 +12,4 @@ void upsample_bicubic2d_kernel( std::optional scales_h, std::optional scales_w); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp index f9075c139..653a293d8 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp @@ -14,6 +14,8 @@ #include #include +#include + namespace at::native::xpu { template diff --git a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h index 3071c51c3..aa5ee2c09 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h @@ -4,7 +4,7 @@ namespace at::native::xpu { -void upsample_bilinear2d_out_kernel( +TORCH_XPU_API void upsample_bilinear2d_out_kernel( const Tensor& output, const Tensor& input, IntArrayRef output_size, @@ -12,7 +12,7 @@ void upsample_bilinear2d_out_kernel( std::optional scales_h, std::optional scales_w); -void upsample_bilinear2d_backward_out_kernel( +TORCH_XPU_API void upsample_bilinear2d_backward_out_kernel( const Tensor& grad_input, const Tensor& grad_output_, IntArrayRef output_size, @@ -21,4 +21,4 @@ void upsample_bilinear2d_backward_out_kernel( c10::optional scales_h, c10::optional scales_w); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.cpp index f3ae596af..2bce3027e 100644 --- a/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.cpp @@ -14,6 +14,8 @@ #include "ATen/Context.h" #include "ATen/core/TensorBase.h" +#include + namespace at::native::xpu { template struct UpsampleLinear1dKernelFunctor { @@ -213,4 +215,4 @@ void upsample_linear1d_backward_kernel( } // namespace at::native::xpu #pragma GCC diagnostic pop -#pragma clang diagnostic pop \ No newline at end of file +#pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.h 
b/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.h index 8928ea906..70befad77 100644 --- a/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.h @@ -3,14 +3,15 @@ #include namespace at::native::xpu { -void upsample_linear1d_kernel( + +TORCH_XPU_API void upsample_linear1d_kernel( const Tensor& input, IntArrayRef output_size, bool align_corners, std::optional scales, const Tensor& output); -void upsample_linear1d_backward_kernel( +TORCH_XPU_API void upsample_linear1d_backward_kernel( const Tensor& grad_output_, IntArrayRef output_size, IntArrayRef input_size, @@ -18,4 +19,4 @@ void upsample_linear1d_backward_kernel( std::optional scales, const Tensor& grad_input); -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp index b482978f2..0751749fc 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace at::native { namespace xpu { diff --git a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h index 9f7b80d21..79801b4d6 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h @@ -5,14 +5,14 @@ namespace at::native::xpu { -void upsample_nearest1d_kernel( +TORCH_XPU_API void upsample_nearest1d_kernel( const Tensor& output, const Tensor& input_, IntArrayRef output_size, c10::optional scales, bool is_exact); -void upsample_nearest1d_backward_kernel( +TORCH_XPU_API void upsample_nearest1d_backward_kernel( const Tensor& grad_input, const Tensor& grad_output_, IntArrayRef output_size, diff --git a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp index 3eab9f202..76e95c1ed 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp @@ -707,4 +707,4 @@ void upsample_nearest2d_kernel( } } } // namespace xpu -} // namespace at::native \ No newline at end of file +} // namespace at::native diff --git a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h index c6383a56c..ea12e7b39 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h @@ -5,7 +5,7 @@ namespace at::native::xpu { -void upsample_nearest2d_kernel( +TORCH_XPU_API void upsample_nearest2d_kernel( const Tensor& output, const Tensor& input_, IntArrayRef output_size, @@ -13,7 +13,7 @@ void upsample_nearest2d_kernel( c10::optional scales_w, bool is_exact); -void upsample_nearest2d_backward_kernel( +TORCH_XPU_API void upsample_nearest2d_backward_kernel( const Tensor& grad_input, const Tensor& grad_output_, IntArrayRef output_size, diff --git a/src/ATen/native/xpu/sycl/WeightNormKernels.cpp b/src/ATen/native/xpu/sycl/WeightNormKernels.cpp index dd93f68c3..ac67d5d34 100644 --- a/src/ATen/native/xpu/sycl/WeightNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/WeightNormKernels.cpp @@ -7,6 +7,8 @@ #include #include "comm/Runtime.h" +#include + namespace at::native::xpu { template @@ -1024,4 +1026,4 @@ std::tuple weight_norm_backward_kernel( return {grad_v, grad_g}; } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu 
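The TORCH_XPU_API annotations added across these kernel headers are what keep the declarations usable once the kernels move into separate shared libraries (DLLs on Windows, where symbols are not exported by default). A minimal sketch of the usual export/import macro pattern is shown below; the exact definition of TORCH_XPU_API in this repository may differ, so the guard shown here (keyed off the TORCH_XPU_BUILD_MAIN_LIB definition that the Windows build logic later in this patch passes to the kernel libraries) is an assumption for illustration only.

// Hypothetical sketch of an export/import macro for the split kernel DLLs.
// Assumption: the library that implements the kernels is compiled with
// TORCH_XPU_BUILD_MAIN_LIB defined (see target_compile_definitions in
// BuildOnWindows.cmake below), so it exports the symbols; consumers import them.
#if defined(_WIN32)
#  if defined(TORCH_XPU_BUILD_MAIN_LIB)
#    define TORCH_XPU_API __declspec(dllexport)
#  else
#    define TORCH_XPU_API __declspec(dllimport)
#  endif
#else
// On Linux, make the annotated symbols visible under -fvisibility=hidden builds.
#  define TORCH_XPU_API __attribute__((visibility("default")))
#endif

// Usage then matches the header changes above, e.g.
// TORCH_XPU_API void tanh_kernel(TensorIteratorBase& iter);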
diff --git a/src/ATen/native/xpu/sycl/WeightNormKernels.h b/src/ATen/native/xpu/sycl/WeightNormKernels.h index 5b0d7afd2..88a45b298 100644 --- a/src/ATen/native/xpu/sycl/WeightNormKernels.h +++ b/src/ATen/native/xpu/sycl/WeightNormKernels.h @@ -2,15 +2,17 @@ #include namespace at::native::xpu { -std::tuple weight_norm_kernel( + +TORCH_XPU_API std::tuple weight_norm_kernel( const Tensor& v, const Tensor& g, int64_t dim); -std::tuple weight_norm_backward_kernel( +TORCH_XPU_API std::tuple weight_norm_backward_kernel( const Tensor& grad_w, const Tensor& saved_v, const Tensor& saved_g, const Tensor& saved_norms, int64_t dim); -} // namespace at::native::xpu \ No newline at end of file + +} // namespace at::native::xpu diff --git a/src/BuildOnLinux.cmake b/src/BuildOnLinux.cmake new file mode 100644 index 000000000..3e304b31f --- /dev/null +++ b/src/BuildOnLinux.cmake @@ -0,0 +1,79 @@ +# Build on Linux + +set(TORCH_XPU_OPS_LIBRARIES) +set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE) + +add_library( + torch_xpu_ops + STATIC + ${ATen_XPU_CPP_SRCS} + ${ATen_XPU_NATIVE_CPP_SRCS} + ${ATen_XPU_GEN_SRCS}) + +if(BUILD_SEPARATE_OPS) + foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + get_filename_component(name ${sycl_src} NAME_WLE REALPATH) + set(sycl_lib torch-xpu-ops-sycl-${name}) + sycl_add_library( + ${sycl_lib} + SHARED + SYCL_SOURCES ${sycl_src}) + target_link_libraries(torch_xpu_ops PUBLIC ${sycl_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + endforeach() +else() + # Split SYCL kernels into 2 libraries as categories 1) Unary+Binary 2) Others. + set(ATen_XPU_SYCL_UNARY_BINARY_SRCS) + set(ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS) + + foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + string(REGEX MATCH "Binary" IS_BINARY ${sycl_src}) + string(REGEX MATCH "Unary" IS_UNARY ${sycl_src}) + if(IS_BINARY STREQUAL "" AND IS_UNARY STREQUAL "") + list(APPEND ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS ${sycl_src}) + else() + list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src}) + endif() + endforeach() + + set(sycl_unary_binary_lib torch_xpu_ops_sycl_unary_binary_kernels) + sycl_add_library( + ${sycl_unary_binary_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_BINARY_SRCS}) + target_link_libraries(torch_xpu_ops PUBLIC ${sycl_unary_binary_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_binary_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_unary_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + set(sycl_lib torch_xpu_ops_sycl_kernels) + sycl_add_library( + ${sycl_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS}) + target_link_libraries(torch_xpu_ops PUBLIC ${sycl_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) + + # Decouple with PyTorch cmake definition. 
+ install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") +endif() +set(SYCL_LINK_LIBRARIES_KEYWORD) + +list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) + +foreach(lib ${TORCH_XPU_OPS_LIBRARIES}) + # Align with PyTorch compile options PYTORCH_SRC_DIR/cmake/public/utils.cmake + torch_compile_options(${lib}) + target_compile_options_if_supported(${lib} "-Wno-deprecated-copy") + target_compile_options(${lib} PRIVATE ${TORCH_XPU_OPS_FLAGS}) + + target_include_directories(${lib} PUBLIC ${TORCH_XPU_OPS_INCLUDE_DIRS}) + target_include_directories(${lib} PUBLIC ${ATen_XPU_INCLUDE_DIRS}) + target_include_directories(${lib} PUBLIC ${SYCL_INCLUDE_DIR}) + + target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARY}) +endforeach() diff --git a/src/BuildOnWindows.cmake b/src/BuildOnWindows.cmake new file mode 100644 index 000000000..a8e00cb55 --- /dev/null +++ b/src/BuildOnWindows.cmake @@ -0,0 +1,106 @@ +# Build on Windows + +set(TORCH_XPU_OPS_LIBRARIES) +set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE) + +# Walk around cyclic dependence +# libtorch_xpu.so links to libtorch_xpu_ops.a +# libtorch_xpu_ops.a dlopens libtorch_xpu_ops_aten.so (Break cycle) +# libtorch_xpu_ops_aten.so links to libtorch_xpu_ops_sycl_unary_binary_kernels.so and libtorch_xpu_ops_sycl_kernels.so +# libtorch_xpu_ops_sycl_unary_binary_kernels.so and libtorch_xpu_ops_sycl_kernels.so links to libtorch_xpu.so +add_library( + torch_xpu_ops + STATIC + ${ATen_XPU_CPP_SRCS} + "bridge.cpp") +set(PATH_TO_TORCH_XPU_OPS_ATEN_LIB \"torch_xpu_ops_aten.dll\") +target_compile_options(torch_xpu_ops PRIVATE -DPATH_TO_TORCH_XPU_OPS_ATEN_LIB=${PATH_TO_TORCH_XPU_OPS_ATEN_LIB}) + +add_library( + torch_xpu_ops_aten + SHARED + ${ATen_XPU_NATIVE_CPP_SRCS} + ${ATen_XPU_GEN_SRCS}) +install(TARGETS torch_xpu_ops_aten DESTINATION "${TORCH_INSTALL_LIB_DIR}") +target_link_libraries(torch_xpu_ops_aten PUBLIC torch_xpu) + +if(BUILD_SEPARATE_OPS) + foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + get_filename_component(name ${sycl_src} NAME_WLE REALPATH) + set(sycl_lib torch-xpu-ops-sycl-${name}) + sycl_add_library( + ${sycl_lib} + SHARED + SYCL_SOURCES ${sycl_src}) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_lib}) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + endforeach() +else() + # Split SYCL kernels into 2 libraries as categories 1) Unary+Binary 2) Others. + set(ATen_XPU_SYCL_UNARY_BINARY_SRCS) + set(ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS) + + foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) + string(REGEX MATCH "Binary" IS_BINARY ${sycl_src}) + string(REGEX MATCH "Unary" IS_UNARY ${sycl_src}) + # Resolve cyclic dependences between + # torch_xpu_ops_sycl_unary_binary_kernels.dll and + # torch_xpu_ops_sycl_kernels.dll. Move definition and invoke of kernels + # into a same kernel library. Here we move elementwise kernel pow and copy + # into torch_xpu_ops_sycl_unary_binary_kernels.dll. 
+ string(REGEX MATCH "Pow" IS_POW ${sycl_src}) + string(REGEX MATCH "Copy" IS_COPY ${sycl_src}) + if(IS_BINARY STREQUAL "" AND IS_UNARY STREQUAL "" AND IS_POW STREQUAL "" AND IS_COPY STREQUAL "") + list(APPEND ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS ${sycl_src}) + else() + list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src}) + endif() + endforeach() + + set(sycl_unary_binary_lib torch_xpu_ops_sycl_unary_binary_kernels) + sycl_add_library( + ${sycl_unary_binary_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_BINARY_SRCS}) + target_compile_definitions(${sycl_unary_binary_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_unary_binary_lib}) + target_link_libraries(${sycl_unary_binary_lib} PUBLIC torch_xpu) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_binary_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_unary_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + set(sycl_lib torch_xpu_ops_sycl_kernels) + sycl_add_library( + ${sycl_lib} + SHARED + SYCL_SOURCES ${ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS}) + target_compile_definitions(${sycl_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB) + target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_lib}) + target_link_libraries(${sycl_lib} PUBLIC torch_xpu) + list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) + + # Decouple with PyTorch cmake definition. + install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") +endif() +set(SYCL_LINK_LIBRARIES_KEYWORD) + +list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) +list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops_aten) + +foreach(lib ${TORCH_XPU_OPS_LIBRARIES}) + # Align with PyTorch compile options PYTORCH_SRC_DIR/cmake/public/utils.cmake + torch_compile_options(${lib}) + target_compile_options_if_supported(${lib} "-Wno-deprecated-copy") + target_compile_options(${lib} PRIVATE ${TORCH_XPU_OPS_FLAGS}) + + target_include_directories(${lib} PUBLIC ${TORCH_XPU_OPS_INCLUDE_DIRS}) + target_include_directories(${lib} PUBLIC ${ATen_XPU_INCLUDE_DIRS}) + target_include_directories(${lib} PUBLIC ${SYCL_INCLUDE_DIR}) + + target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARY}) + target_link_libraries(${lib} PUBLIC c10_xpu) +endforeach() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index aa7551ebe..0716ca5af 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,95 +2,24 @@ include(${TORCH_XPU_OPS_ROOT}/cmake/Codegen.cmake) set(ATen_XPU_CPP_SRCS) +set(ATen_XPU_NATIVE_CPP_SRCS) set(ATen_XPU_SYCL_SRCS) set(ATen_XPU_INCLUDE_DIRS ${TORCH_XPU_OPS_ROOT}/src CACHE STRING "ATen XPU Include directory") add_subdirectory(ATen) -set(TORCH_XPU_OPS_LIBRARIES) -set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE) - -add_library( - torch_xpu_ops - STATIC - ${ATen_XPU_CPP_SRCS} - ${ATen_XPU_GEN_SRCS}) - -if(BUILD_SEPARATE_OPS) - foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) - get_filename_component(name ${sycl_src} NAME_WLE REALPATH) - set(sycl_lib torch-xpu-ops-sycl-${name}) - sycl_add_library( - ${sycl_lib} - SHARED - SYCL_SOURCES ${sycl_src}) - target_link_libraries(torch_xpu_ops PUBLIC ${sycl_lib}) - if(USE_PER_OPERATOR_HEADERS) - target_compile_definitions(torch_xpu_ops PUBLIC -DAT_PER_OPERATOR_HEADERS) - endif() - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - endforeach() +# With the increasement of bin size, we have to split libtorch_xpu.so into +# multiple libraries. 
Because of strict linkage requirements on Windows, +# we add extra logic to resolve 1) cyclic dependences and 2) symbol visibility. +# To isolate the impact and keep both variants maintainable, separate code pieces +# set up the CMake targets. We will revert the library split and unify these once +# device code compression is supported by the SYCL/backend compiler. +if(WIN32) + include(BuildOnWindows.cmake) else() - # Split SYCL kernels into 2 libraries as categories 1) Unary+Binary 2) Others. - set(ATen_XPU_SYCL_UNARY_BINARY_SRCS) - set(ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS) - - foreach(sycl_src ${ATen_XPU_SYCL_SRCS}) - string(REGEX MATCH "Binary" IS_BINARY ${sycl_src}) - string(REGEX MATCH "Unary" IS_UNARY ${sycl_src}) - if(IS_BINARY STREQUAL "" AND IS_UNARY STREQUAL "") - list(APPEND ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS ${sycl_src}) - else() - list(APPEND ATen_XPU_SYCL_UNARY_BINARY_SRCS ${sycl_src}) - endif() - endforeach() - - set(sycl_unary_binary_lib torch_xpu_ops_sycl_unary_binary_kernels) - sycl_add_library( - ${sycl_unary_binary_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_UNARY_BINARY_SRCS}) - target_link_libraries(torch_xpu_ops PUBLIC ${sycl_unary_binary_lib}) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_unary_binary_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_unary_binary_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") - - set(sycl_lib torch_xpu_ops_sycl_kernels) - sycl_add_library( - ${sycl_lib} - SHARED - SYCL_SOURCES ${ATen_XPU_SYCL_NON_UNARY_BINARY_SRCS}) - target_link_libraries(torch_xpu_ops PUBLIC ${sycl_lib}) - list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_lib}) - - # Decouple with PyTorch cmake definition. - install(TARGETS ${sycl_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}") + include(BuildOnLinux.cmake) endif() -set(SYCL_LINK_LIBRARIES_KEYWORD) - -list(APPEND TORCH_XPU_OPS_LIBRARIES torch_xpu_ops) - -foreach(lib ${TORCH_XPU_OPS_LIBRARIES}) - # Align with PyTorch compile options PYTORCH_SRC_DIR/cmake/public/utils.cmake - torch_compile_options(${lib}) - target_compile_options_if_supported(${lib} "-Wno-deprecated-copy") - target_compile_options(${lib} PRIVATE ${TORCH_XPU_OPS_FLAGS}) - - if(USE_PER_OPERATOR_HEADERS) - target_compile_definitions(${lib} PUBLIC -DAT_PER_OPERATOR_HEADERS) - endif() - - target_include_directories(${lib} PUBLIC ${TORCH_XPU_OPS_INCLUDE_DIRS}) - target_include_directories(${lib} PUBLIC ${ATen_XPU_INCLUDE_DIRS}) - target_include_directories(${lib} PUBLIC ${SYCL_INCLUDE_DIR}) - - target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARY}) -endforeach() include(${TORCH_XPU_OPS_ROOT}/cmake/ClangFormat.cmake) if(CLANG_FORMAT) diff --git a/src/bridge.cpp b/src/bridge.cpp new file mode 100644 index 000000000..c19ce2554 --- /dev/null +++ b/src/bridge.cpp @@ -0,0 +1,34 @@ +#include +#include +#include + +// The implementation helps work around a cyclic dependence when we separate +// kernels into multiple dll/so files to avoid a large binary (>2GB). +// The file is built into libtorch_xpu.so. libtorch_xpu.so won't depend on +// libtorch_xpu_ops_aten.so but dlopens the library to work around the cyclic +// dependence during linkage.
To break cycle like, +// libtorch_xpu.so -> (dlopen) libtorch_xpu_ops_aten.so -> (link) +// libtorch_xpu_ops_kernels.so +// -> (link) +// libtorch_xpu_ops_unary_binary_kernels.so +// libtorch_xpu_ops_kernels.so -> (link) libtorch_xpu.so +// libtorch_xpu_ops_unary_binary_kernels.so -> (link) libtorch_xpu.so +namespace { + +class LoadTorchXPUOps { + public: + LoadTorchXPUOps() { + if (NULL == LoadLibrary(PATH_TO_TORCH_XPU_OPS_ATEN_LIB)) { + std::ostringstream error; + error << "PyTorch XPU operators library is not loaded (ERROR: " + << GetLastError() + << "). Please check if PyTorch is installed correctly." + << " Or please file an issue on https://github.com/intel/torch-xpu-ops/issues."; + throw std::runtime_error(error.str()); + } + } +}; + +static LoadTorchXPUOps init; + +} // namespace diff --git a/src/comm/DeviceProperties.h b/src/comm/DeviceProperties.h index 0f4c084c8..01beffa74 100644 --- a/src/comm/DeviceProperties.h +++ b/src/comm/DeviceProperties.h @@ -185,5 +185,11 @@ uint32_t syclNativeVectorWidth( "Invalid data type to fetch native vector width!"); } +static inline bool syclHasFloat64( + at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + auto* dev_prop = at::xpu::getDeviceProperties(dev_id); + return dev_prop->has_fp64; +} + } // namespace sycl } // namespace xpu diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index 86ef8176a..9c7b2270f 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -1,165 +1,22 @@ import os import sys +from skip_list_common import skip_dict +from skip_list_win import skip_dict as skip_dict_win -skip_list = ( - # Calculation error between XPU implementation and CPU implementation, - # 1. Compiler optimization causes failing to promote data type to higher precision. - # 2. Accumulate error is amplified by some operations in some extreme cases. (std::exp(extreme_large_num)) - # 3. Accumulate error is amplified by a large number of accumalate operations. - # 4. Accumulate error is different on different implementations due to different accumulation order. - # a. Different kernel implementations. - # b. Different std functions. (std::log, std::tanh, std::exp) - # 5. The result of division between two same float values is not 1. - # 6. std functions get different results when input is nan or inf between GCC and SYCL. 
- "test_compare_cpu_cumsum_xpu_bfloat16", - "test_compare_cpu_cumsum_xpu_float16", - "test_compare_cpu_log_xpu_complex64", - "test_compare_cpu_log10_xpu_complex64", - "test_compare_cpu_log1p_xpu_complex64", - "test_compare_cpu_log2_xpu_complex64", - "test_compare_cpu_log2_xpu_complex128", - "test_compare_cpu_mul_xpu_complex64", - "test_compare_cpu_pow_xpu_complex128", - "test_compare_cpu_pow_xpu_complex64", - "test_compare_cpu_tan_xpu_complex128", - "test_compare_cpu_tan_xpu_complex64", - "test_compare_cpu_tanh_xpu_complex128", - "test_compare_cpu_tanh_xpu_complex64", - "test_compare_cpu_rsqrt_xpu_bfloat16", - "test_compare_cpu_pow_xpu_bfloat16", - # cuda has the same issue on this case - "test_compare_cpu__refs_rsub_xpu_bfloat16", - "test_compare_cpu_add_xpu_bfloat16", - "test_compare_cpu_sub_xpu_bfloat16", - "test_compare_cpu_acos_xpu_complex128", - "test_compare_cpu_acos_xpu_complex64", - "test_compare_cpu_acosh_xpu_complex64", - "test_compare_cpu_cross_xpu_float16", - "test_compare_cpu_floor_divide_xpu_bfloat16", - "test_compare_cpu_floor_divide_xpu_float16", - "test_compare_cpu_polygamma_polygamma_n_0_xpu_bfloat16", - "test_compare_cpu_exp_xpu_bfloat16", - "test_compare_cpu_exp_xpu_complex128", - "test_compare_cpu_exp_xpu_complex64", - "test_compare_cpu_acosh_xpu_complex64", - "test_compare_cpu_asin_xpu_complex128", - "test_compare_cpu_asin_xpu_complex64", - "test_compare_cpu_asinh_xpu_complex128", - "test_compare_cpu_asinh_xpu_complex64", - "test_compare_cpu_atan_xpu_complex128", - "test_compare_cpu_atan_xpu_complex64", - "test_compare_cpu_exp2_xpu_complex128", - "test_compare_cpu_exp2_xpu_complex64", - "test_compare_cpu_nextafter_xpu_bfloat16", - # CUDA does not support the data type either - "test_non_standard_bool_values_native_dropout_backward_xpu_bool", - # Need FP64 golden ref for more accurate comparison - "test_compare_cpu_log_softmax_xpu_bfloat16", - # TestCompositeCompliance - # CPU fallback fails - # Require implementing aten::embedding_renorm_ - "test_view_replay_nn_functional_embedding_xpu_float32", - # TestCompositeCompliance::test_cow_input - # XPU Tensor fails in copy-on-write cases - # AssertionError: False is not true : Keyword argument 'output grad 0' during backward call unexpectedly materializes. Either set `supports_cow_input_no_materialize_backward=False` in this operation's OpInfo, add the arg to the OpInfo's `allow_cow_input_materialize_backward` list, or change the implementation to avoid materialization. - # https://github.com/intel/torch-xpu-ops/issues/281 - "test_cow_input", - # XPU implementation is correct. - # std::exp{-inf, nan}, the result is (±0,±0) (signs are unspecified) - # std::exp{-inf, inf}, the result is (±0,±0) (signs are unspecified) - # CPU implementation gets NaN in the cases. - # https://en.cppreference.com/w/cpp/numeric/complex/exp - "test_compare_cpu_sigmoid_xpu_complex64", - "test_compare_cpu_sigmoid_xpu_complex128", - # Special handle (different calculation order) in CPU reference impl. - # https://github.com/pytorch/pytorch/blob/c97e3ebb96d7457075b019b94411e8c2d058e68b/aten/src/ATen/native/EmbeddingBag.cpp#L300 - "test_compare_cpu_nn_functional_embedding_bag_xpu_bfloat16", - "test_compare_cpu_nn_functional_embedding_bag_xpu_float16", - # Not implemented operators, aten::embedding_renorm_. - # To retrieve cases when the operators are supported. 
- # https://github.com/intel/torch-xpu-ops/issues/380 - "test_compare_cpu_nn_functional_embedding_bag_xpu_float32", - "test_compare_cpu_nn_functional_embedding_bag_xpu_float64", - "test_view_replay_nn_functional_embedding_bag_xpu_float32", - # Double and complex datatype matmul is not supported in oneDNN - "test_compare_cpu_cdist_xpu_float64", - # bilinear interpolate includes large calculation steps, accuracy reduces in half-precision - # Not in CUDA test scope too - "test_compare_cpu_nn_functional_upsample_bilinear_xpu_bfloat16", - "test_compare_cpu_nn_functional_upsample_bilinear_xpu_float16", - # CPU result is not golden reference - "test_compare_cpu_nn_functional_group_norm_xpu_bfloat16", - "test_compare_cpu_nn_functional_group_norm_xpu_float16", - "test_compare_cpu_nn_functional_nll_loss_xpu_bfloat16", - "test_compare_cpu_nn_functional_nll_loss_xpu_float16", - "test_compare_cpu_nn_functional_batch_norm_xpu_bfloat16", - "test_compare_cpu__batch_norm_with_update_xpu_bfloat16", - "test_compare_cpu__batch_norm_with_update_xpu_float16", - "test_compare_cpu_nn_functional_huber_loss_xpu_bfloat16", - "test_compare_cpu_nansum_xpu_bfloat16", - "test_compare_cpu_nanmean_xpu_bfloat16", - # Align with CUDA impl by using accumulate type. But CPU doesn't use. - # When XPU uses original data type, the case passes. - "test_compare_cpu_logit_xpu_bfloat16", - # precison error - # Mismatched elements: 1 / 24 (4.2%) - # Greatest absolute difference: 0.03125 at index (0, 1, 0, 1) (up to 0.001 allowed) - # Greatest relative difference: 0.0048828125 at index (0, 1, 0, 1) (up to 0.001 allowed) - "test_compare_cpu_nn_functional_interpolate_bilinear_xpu_bfloat16", - # RuntimeError: "compute_index_ranges_weights" not implemented for 'Half' - "test_compare_cpu_nn_functional_interpolate_bilinear_xpu_float16", - # AssertionError: False is not true : Argument 0 during forward call unexpectedly materializes. Either set `supports_cow_input_no_materialize_forward=False... - "test_cow_input_nn_functional_interpolate_bilinear_xpu_float32", - "test_cow_input_nn_functional_interpolate_linear_xpu_float32", - "test_cow_input_nn_functional_interpolate_trilinear_xpu_float32", - #The results of XPU and CUDA are consistent, but the results of CPU and CUDA are inconsistent - "test_compare_cpu_nn_functional_interpolate_linear_xpu_bfloat16", - "test_compare_cpu_nn_functional_interpolate_linear_xpu_float16", - # bicubic interpolate includes large calculation steps, accuracy reduces in half-precision - # Not in CUDA test scope too - "test_compare_cpu_nn_functional_interpolate_bicubic_xpu_bfloat16", - "test_compare_cpu_nn_functional_interpolate_bicubic_xpu_float16", - # Not all operators are implemented for XPU tested in the case. - # Retrieve it once the operator is implemented. - # Error: The operator 'aten::glu_jvp' is not currently implemented for the XPU device. - "test_forward_ad_nn_functional_glu_xpu_float32", - # Precision error. - # Mismatched elements: 1 / 812 (0.1%) - # Greatest absolute difference: 0.03125 at index (610,) (up to 0.001 allowed) - # Greatest relative difference: 0.00396728515625 at index (610,) (up to 0.001 allowed) - "test_compare_cpu_hypot_xpu_bfloat16", - # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16. - # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error. - "test_compare_cpu_polar_xpu_bfloat16", - # Precision error. 
- # Mismatched elements: 1 / 25 (4.0%) - # Greatest absolute difference: 0.00146484375 at index (0, 0) (up to 0.001 allowed) - # Greatest relative difference: 0.0163116455078125 at index (0, 0) (up to 0.001 allowed) - "test_compare_cpu_sub_xpu_float16", - # different results for value index due to unstable sort. - # XPU and CUDA have the same result. - "test_compare_cpu_median_xpu_int16", - "test_compare_cpu_median_xpu_int32", - "test_compare_cpu_median_xpu_int64", - "test_compare_cpu_median_xpu_int8", - "test_compare_cpu_median_xpu_uint8", - "test_compare_cpu_nanmedian_xpu_int16", - "test_compare_cpu_nanmedian_xpu_int32", - "test_compare_cpu_nanmedian_xpu_int64", - "test_compare_cpu_nanmedian_xpu_int8", - "test_compare_cpu_nanmedian_xpu_uint8", -) +IS_WINDOWS = sys.platform == "win32" +skip_list = skip_dict["test_ops_xpu.py"] +if IS_WINDOWS: + skip_list += skip_dict_win["test_ops_xpu.py"] -skip_options = " -k 'not " + skip_list[0] +skip_options = " -k \"not " + skip_list[0] for skip_case in skip_list[1:]: skip_option = " and not " + skip_case skip_options += skip_option -skip_options += "'" +skip_options += "\"" -test_command = "PYTORCH_TEST_WITH_SLOW=1 pytest -v test_ops_xpu.py" +os.environ["PYTORCH_TEST_WITH_SLOW"]="1" +test_command = "pytest -v test_ops_xpu.py" test_command += skip_options - res = os.system(test_command) -exit_code = os.WEXITSTATUS(res) -sys.exit(exit_code) +sys.exit(res) diff --git a/test/xpu/extended/run_test_with_skip_arc.py b/test/xpu/extended/run_test_with_skip_arc.py new file mode 100644 index 000000000..30fd2c0e0 --- /dev/null +++ b/test/xpu/extended/run_test_with_skip_arc.py @@ -0,0 +1,24 @@ +import os +import sys +from skip_list_common import skip_dict +from skip_list_arc import skip_dict as skip_dict_specifical +from skip_list_win import skip_dict as skip_dict_win +from skip_list_win_arc import skip_dict as skip_dict_win_arc + +IS_WINDOWS = sys.platform == "win32" + +skip_list = skip_dict["test_ops_xpu.py"] + skip_dict_specifical["test_ops_xpu.py"] +if IS_WINDOWS: + skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_arc["test_ops_xpu.py"] + +skip_options = " -k \"not " + skip_list[0] +for skip_case in skip_list[1:]: + skip_option = " and not " + skip_case + skip_options += skip_option +skip_options += "\"" + +os.environ["PYTORCH_TEST_WITH_SLOW"]="1" +test_command = "pytest -v test_ops_xpu.py" +test_command += skip_options +res = os.system(test_command) +sys.exit(res) \ No newline at end of file diff --git a/test/xpu/extended/skip_list_arc.py b/test/xpu/extended/skip_list_arc.py new file mode 100644 index 000000000..e1e701b84 --- /dev/null +++ b/test/xpu/extended/skip_list_arc.py @@ -0,0 +1,11 @@ +skip_dict = { + "test_ops_xpu.py": ( + # RuntimeError: Required aspect fp64 is not supported on the device + # https://github.com/intel/torch-xpu-ops/issues/628 + "test_compare_cpu_bincount_xpu_int16", + "test_compare_cpu_bincount_xpu_int32", + "test_compare_cpu_bincount_xpu_int64", + "test_compare_cpu_bincount_xpu_int8", + "test_compare_cpu_bincount_xpu_uint8", + ), +} diff --git a/test/xpu/extended/skip_list_common.py b/test/xpu/extended/skip_list_common.py new file mode 100644 index 000000000..d754c9150 --- /dev/null +++ b/test/xpu/extended/skip_list_common.py @@ -0,0 +1,166 @@ +skip_dict = { + "test_ops_xpu.py": ( + # Calculation error between XPU implementation and CPU implementation, + # 1. Compiler optimization causes failing to promote data type to higher precision. + # 2. Accumulate error is amplified by some operations in some extreme cases. 
(std::exp(extreme_large_num)) + # 3. Accumulate error is amplified by a large number of accumalate operations. + # 4. Accumulate error is different on different implementations due to different accumulation order. + # a. Different kernel implementations. + # b. Different std functions. (std::log, std::tanh, std::exp) + # 5. The result of division between two same float values is not 1. + # 6. std functions get different results when input is nan or inf between GCC and SYCL. + "test_compare_cpu_cumsum_xpu_bfloat16", + "test_compare_cpu_cumsum_xpu_float16", + "test_compare_cpu_log_xpu_complex64", + "test_compare_cpu_log10_xpu_complex64", + "test_compare_cpu_log1p_xpu_complex64", + "test_compare_cpu_log2_xpu_complex64", + "test_compare_cpu_log2_xpu_complex128", + "test_compare_cpu_mul_xpu_complex64", + "test_compare_cpu_pow_xpu_complex128", + "test_compare_cpu_pow_xpu_complex64", + "test_compare_cpu_tan_xpu_complex128", + "test_compare_cpu_tan_xpu_complex64", + "test_compare_cpu_tanh_xpu_complex128", + "test_compare_cpu_tanh_xpu_complex64", + "test_compare_cpu_rsqrt_xpu_bfloat16", + "test_compare_cpu_pow_xpu_bfloat16", + # cuda has the same issue on this case + "test_compare_cpu__refs_rsub_xpu_bfloat16", + "test_compare_cpu_add_xpu_bfloat16", + "test_compare_cpu_sub_xpu_bfloat16", + "test_compare_cpu_acos_xpu_complex128", + "test_compare_cpu_acos_xpu_complex64", + "test_compare_cpu_acosh_xpu_complex64", + "test_compare_cpu_cross_xpu_float16", + "test_compare_cpu_floor_divide_xpu_bfloat16", + "test_compare_cpu_floor_divide_xpu_float16", + "test_compare_cpu_polygamma_polygamma_n_0_xpu_bfloat16", + "test_compare_cpu_exp_xpu_bfloat16", + "test_compare_cpu_exp_xpu_complex128", + "test_compare_cpu_exp_xpu_complex64", + "test_compare_cpu_acosh_xpu_complex64", + "test_compare_cpu_asin_xpu_complex128", + "test_compare_cpu_asin_xpu_complex64", + "test_compare_cpu_asinh_xpu_complex128", + "test_compare_cpu_asinh_xpu_complex64", + "test_compare_cpu_atan_xpu_complex128", + "test_compare_cpu_atan_xpu_complex64", + "test_compare_cpu_exp2_xpu_complex128", + "test_compare_cpu_exp2_xpu_complex64", + "test_compare_cpu_nextafter_xpu_bfloat16", + # CUDA does not support the data type either + "test_non_standard_bool_values_native_dropout_backward_xpu_bool", + # Need FP64 golden ref for more accurate comparison + "test_compare_cpu_log_softmax_xpu_bfloat16", + # TestCompositeCompliance + # CPU fallback fails + # Require implementing aten::embedding_renorm_ + "test_view_replay_nn_functional_embedding_xpu_float32", + # TestCompositeCompliance::test_cow_input + # XPU Tensor fails in copy-on-write cases + # AssertionError: False is not true : Keyword argument 'output grad 0' during backward call unexpectedly materializes. Either set `supports_cow_input_no_materialize_backward=False` in this operation's OpInfo, add the arg to the OpInfo's `allow_cow_input_materialize_backward` list, or change the implementation to avoid materialization. + # https://github.com/intel/torch-xpu-ops/issues/281 + "test_cow_input", + # XPU implementation is correct. + # std::exp{-inf, nan}, the result is (±0,±0) (signs are unspecified) + # std::exp{-inf, inf}, the result is (±0,±0) (signs are unspecified) + # CPU implementation gets NaN in the cases. + # https://en.cppreference.com/w/cpp/numeric/complex/exp + "test_compare_cpu_sigmoid_xpu_complex64", + "test_compare_cpu_sigmoid_xpu_complex128", + # Special handle (different calculation order) in CPU reference impl. 
+ # https://github.com/pytorch/pytorch/blob/c97e3ebb96d7457075b019b94411e8c2d058e68b/aten/src/ATen/native/EmbeddingBag.cpp#L300 + "test_compare_cpu_nn_functional_embedding_bag_xpu_bfloat16", + "test_compare_cpu_nn_functional_embedding_bag_xpu_float16", + # Not implemented operators, aten::embedding_renorm_. + # To retrieve cases when the operators are supported. + # https://github.com/intel/torch-xpu-ops/issues/380 + "test_compare_cpu_nn_functional_embedding_bag_xpu_float32", + "test_compare_cpu_nn_functional_embedding_bag_xpu_float64", + "test_view_replay_nn_functional_embedding_bag_xpu_float32", + # Double and complex datatype matmul is not supported in oneDNN + "test_compare_cpu_cdist_xpu_float64", + # bilinear interpolate includes large calculation steps, accuracy reduces in half-precision + # Not in CUDA test scope too + "test_compare_cpu_nn_functional_upsample_bilinear_xpu_bfloat16", + "test_compare_cpu_nn_functional_upsample_bilinear_xpu_float16", + # CPU result is not golden reference + "test_compare_cpu_nn_functional_group_norm_xpu_bfloat16", + "test_compare_cpu_nn_functional_group_norm_xpu_float16", + "test_compare_cpu_nn_functional_nll_loss_xpu_bfloat16", + "test_compare_cpu_nn_functional_nll_loss_xpu_float16", + "test_compare_cpu_nn_functional_batch_norm_xpu_bfloat16", + "test_compare_cpu__batch_norm_with_update_xpu_bfloat16", + "test_compare_cpu__batch_norm_with_update_xpu_float16", + "test_compare_cpu_nn_functional_huber_loss_xpu_bfloat16", + "test_compare_cpu_nansum_xpu_bfloat16", + "test_compare_cpu_nanmean_xpu_bfloat16", + # Align with CUDA impl by using accumulate type. But CPU doesn't use. + # When XPU uses original data type, the case passes. + "test_compare_cpu_logit_xpu_bfloat16", + # precison error + # Mismatched elements: 1 / 24 (4.2%) + # Greatest absolute difference: 0.03125 at index (0, 1, 0, 1) (up to 0.001 allowed) + # Greatest relative difference: 0.0048828125 at index (0, 1, 0, 1) (up to 0.001 allowed) + "test_compare_cpu_nn_functional_interpolate_bilinear_xpu_bfloat16", + # RuntimeError: "compute_index_ranges_weights" not implemented for 'Half' + "test_compare_cpu_nn_functional_interpolate_bilinear_xpu_float16", + # AssertionError: False is not true : Argument 0 during forward call unexpectedly materializes. Either set `supports_cow_input_no_materialize_forward=False... + "test_cow_input_nn_functional_interpolate_bilinear_xpu_float32", + "test_cow_input_nn_functional_interpolate_linear_xpu_float32", + "test_cow_input_nn_functional_interpolate_trilinear_xpu_float32", + #The results of XPU and CUDA are consistent, but the results of CPU and CUDA are inconsistent + "test_compare_cpu_nn_functional_interpolate_linear_xpu_bfloat16", + "test_compare_cpu_nn_functional_interpolate_linear_xpu_float16", + # bicubic interpolate includes large calculation steps, accuracy reduces in half-precision + # Not in CUDA test scope too + "test_compare_cpu_nn_functional_interpolate_bicubic_xpu_bfloat16", + "test_compare_cpu_nn_functional_interpolate_bicubic_xpu_float16", + # Not all operators are implemented for XPU tested in the case. + # Retrieve it once the operator is implemented. + # Error: The operator 'aten::glu_jvp' is not currently implemented for the XPU device. + "test_forward_ad_nn_functional_glu_xpu_float32", + # Precision error. 
+ # Mismatched elements: 1 / 812 (0.1%) + # Greatest absolute difference: 0.03125 at index (610,) (up to 0.001 allowed) + # Greatest relative difference: 0.00396728515625 at index (610,) (up to 0.001 allowed) + "test_compare_cpu_hypot_xpu_bfloat16", + # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16. + # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error. + "test_compare_cpu_polar_xpu_bfloat16", + # Precision error. + # Mismatched elements: 1 / 25 (4.0%) + # Greatest absolute difference: 0.00146484375 at index (0, 0) (up to 0.001 allowed) + # Greatest relative difference: 0.0163116455078125 at index (0, 0) (up to 0.001 allowed) + "test_compare_cpu_sub_xpu_float16", + # different results for value index due to unstable sort. + # XPU and CUDA have the same result. + "test_compare_cpu_median_xpu_int16", + "test_compare_cpu_median_xpu_int32", + "test_compare_cpu_median_xpu_int64", + "test_compare_cpu_median_xpu_int8", + "test_compare_cpu_median_xpu_uint8", + "test_compare_cpu_nanmedian_xpu_int16", + "test_compare_cpu_nanmedian_xpu_int32", + "test_compare_cpu_nanmedian_xpu_int64", + "test_compare_cpu_nanmedian_xpu_int8", + "test_compare_cpu_nanmedian_xpu_uint8", + + # sort algorithm is different to cpu + "test_compare_cpu_argsort_xpu_bfloat16", + "test_compare_cpu_argsort_xpu_float16", + "test_compare_cpu_argsort_xpu_int8", + "test_compare_cpu_argsort_xpu_uint8", + + # AssertionError: The values for attribute 'dtype' do not match: torch.float32 != torch.bfloat16 + # https://github.com/intel/torch-xpu-ops/issues/780 + "test_compare_cpu_native_layer_norm_xpu_bfloat16", + "test_compare_cpu_native_layer_norm_xpu_float16", + + # AssertionError: Tensor-likes are not close! 
+ # https://github.com/intel/torch-xpu-ops/issues/781 + "test_compare_cpu_square_xpu_complex64", + + ), +} diff --git a/test/xpu/extended/skip_list_win.py b/test/xpu/extended/skip_list_win.py new file mode 100644 index 000000000..e8faa008d --- /dev/null +++ b/test/xpu/extended/skip_list_win.py @@ -0,0 +1,5 @@ +skip_dict = { + "test_ops_xpu.py": ( + "test_compare_cpu_pow_xpu_bfloat16", # https://github.com/intel/torch-xpu-ops/pull/764 + ), +} diff --git a/test/xpu/extended/skip_list_win_arc.py b/test/xpu/extended/skip_list_win_arc.py new file mode 100644 index 000000000..9d9221a4d --- /dev/null +++ b/test/xpu/extended/skip_list_win_arc.py @@ -0,0 +1,8 @@ +skip_dict = { + # SYCL Compiler on Windows removed the following operations when '-cl-poison-unsupported-fp64-kernels' is on + # Hence, skip the following windows specific errors + "test_ops_xpu.py": ( + "test_compare_cpu_sqrt_xpu_complex64", + "test_backward_nn_functional_adaptive_avg_pool2d_xpu_float32", + ), +} diff --git a/test/xpu/extended/test_ops_xpu.py b/test/xpu/extended/test_ops_xpu.py index e6d46fccb..93fc8f359 100644 --- a/test/xpu/extended/test_ops_xpu.py +++ b/test/xpu/extended/test_ops_xpu.py @@ -44,7 +44,11 @@ xpu_device = torch.device("xpu") any_common_cpu_xpu_one = OpDTypes.any_common_cpu_cuda_one -cpu_xpu_all = (torch.bfloat16, torch.complex128, torch.complex64, torch.float16, torch.float32, torch.float64, torch.int16, torch.int32, torch.int64, torch.int8, torch.uint8, torch.bool) +cpu_xpu_all = (torch.bfloat16, torch.complex64, torch.float16, torch.float32, torch.int16, + torch.int32, torch.int64, torch.int8, torch.uint8, torch.bool) \ + if "has_fp64=0" in str(torch.xpu.get_device_properties(0)) else \ + (torch.bfloat16, torch.complex128, torch.complex64, torch.float16, torch.float32, + torch.float64, torch.int16, torch.int32, torch.int64, torch.int8, torch.uint8, torch.bool) _ops_and_refs_with_no_numpy_ref = [op for op in ops_and_refs if op.ref is None] _xpu_computation_ops = [ diff --git a/test/xpu/run_test_with_only.py b/test/xpu/run_test_with_only.py index 9b48b4959..28ce0a072 100644 --- a/test/xpu/run_test_with_only.py +++ b/test/xpu/run_test_with_only.py @@ -6,37 +6,27 @@ def launch_test(test_case, skip_list=None, exe_list=None): + os.environ["PYTORCH_ENABLE_XPU_FALLBACK"]="1" + os.environ["PYTORCH_TEST_WITH_SLOW"]="1" if skip_list != None: - skip_options = " -k 'not " + skip_list[0] + skip_options = " -k \"not " + skip_list[0] for skip_case in skip_list[1:]: skip_option = " and not " + skip_case skip_options += skip_option - skip_options += "'" - test_command = ( - "PYTORCH_ENABLE_XPU_FALLBACK=1 PYTORCH_TEST_WITH_SLOW=1 pytest -v " - + test_case - ) - test_command += skip_options + skip_options += "\"" + test_command = "pytest -v " + test_case + skip_options return os.system(test_command) elif exe_list != None: - exe_options = " -k '" + exe_list[0] + exe_options = " -k \"" + exe_list[0] for exe_case in exe_list[1:]: exe_option = " or " + exe_case exe_options += exe_option - exe_options += "'" - test_command = ( - "PYTORCH_ENABLE_XPU_FALLBACK=1 PYTORCH_TEST_WITH_SLOW=1 pytest -v " - + test_case - ) - test_command += exe_options + exe_options += "\"" + test_command = "pytest -v " + test_case + exe_options return os.system(test_command) else: - test_command = ( - "PYTORCH_ENABLE_XPU_FALLBACK=1 PYTORCH_TEST_WITH_SLOW=1 pytest -v " - + test_case - ) - return os.system(test_command) - + test_command = "pytest -v " + test_case + return os.system(test_command) res = 0 @@ -51,5 +41,8 @@ def launch_test(test_case, 
skip_list=None, exe_list=None): ) res += launch_test("test_decomp_xpu.py", exe_list=execute_list) -exit_code = os.WEXITSTATUS(res) -sys.exit(exit_code) +if os.name == "nt": + sys.exit(res) +else: + exit_code = os.WEXITSTATUS(res) + sys.exit(exit_code) diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index e348355b5..c321b7a9b 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -17,5 +17,9 @@ if fail_test: print(",".join(fail_test) + " have failures") -exit_code = os.WEXITSTATUS(res) -sys.exit(exit_code) + +if os.name == "nt": + sys.exit(res) +else: + exit_code = os.WEXITSTATUS(res) + sys.exit(exit_code) diff --git a/test/xpu/run_test_with_skip_arc.py b/test/xpu/run_test_with_skip_arc.py index c20b1e5ac..3c99244a1 100644 --- a/test/xpu/run_test_with_skip_arc.py +++ b/test/xpu/run_test_with_skip_arc.py @@ -20,5 +20,8 @@ skip_list += skip_dict_win_arc[key] res += launch_test(key, skip_list) -exit_code = os.WEXITSTATUS(res) -sys.exit(exit_code) +if os.name == "nt": + sys.exit(res) +else: + exit_code = os.WEXITSTATUS(res) + sys.exit(exit_code) \ No newline at end of file diff --git a/test/xpu/run_test_with_skip_mtl.py b/test/xpu/run_test_with_skip_mtl.py index 828466b11..b8f6c180e 100644 --- a/test/xpu/run_test_with_skip_mtl.py +++ b/test/xpu/run_test_with_skip_mtl.py @@ -16,5 +16,8 @@ skip_list += skip_dict_win[key] res += launch_test(key, skip_list) -exit_code = os.WEXITSTATUS(res) -sys.exit(exit_code) +if os.name == "nt": + sys.exit(res) +else: + exit_code = os.WEXITSTATUS(res) + sys.exit(exit_code) \ No newline at end of file diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py index a83fa6749..6b5df037b 100644 --- a/test/xpu/skip_list_common.py +++ b/test/xpu/skip_list_common.py @@ -1,6 +1,9 @@ skip_dict = { "test_ops_xpu.py": ( # Skip list of base line + + # Need to revisit when the ops are enabled + # AssertionError: The supported dtypes for xxx on device type xpu are incorrect! "test_dtypes___rmod___xpu", "test_dtypes_nn_functional_conv1d_xpu", "test_dtypes_nn_functional_conv2d_xpu", @@ -10,176 +13,122 @@ "test_dtypes_nn_functional_conv_transpose3d_xpu", "test_dtypes_nn_functional_softsign_xpu", "test_dtypes_sparse_sampled_addmm_xpu", - "test_compare_cpu_sparse_sampled_addmm_xpu_float32", - "test_compare_cpu_to_sparse_xpu_float32", + # AssertionError: RuntimeError not raised "test_errors_dot_xpu", - "test_errors_kthvalue_xpu", - "test_errors_sparse_mul_layout0_xpu", - "test_errors_sparse_mul_layout1_xpu", - "test_errors_sparse_mul_layout2_xpu", - "test_errors_sparse_mul_layout3_xpu", + "test_errors_kthvalue_xpu", "test_errors_take_xpu", "test_errors_vdot_xpu", - "test_non_standard_bool_values_to_sparse_xpu_bool", + # Fallback cases with skipCPUIfNoLapack, AssertionError: Tensor-likes are not close! 
"test_noncontiguous_samples_linalg_det_xpu_float32", "test_noncontiguous_samples_linalg_slogdet_xpu_float32", "test_noncontiguous_samples_linalg_solve_ex_xpu_float32", "test_noncontiguous_samples_linalg_solve_xpu_float32", "test_noncontiguous_samples_linalg_tensorsolve_xpu_float32", "test_noncontiguous_samples_logdet_xpu_float32", - "test_noncontiguous_samples_nn_functional_conv1d_xpu_int64", - "test_noncontiguous_samples_nn_functional_conv2d_xpu_int64", - "test_noncontiguous_samples_nn_functional_conv3d_xpu_int64", + "test_noncontiguous_samples_nn_functional_rrelu_xpu_float32", "test_noncontiguous_samples_nn_functional_conv3d_xpu_complex64", - "test_noncontiguous_samples_nn_functional_conv_transpose1d_xpu_int64", + "test_variant_consistency_eager_nn_functional_rrelu_xpu_float32", + + # RuntimeError: device type of values (xpu) must be CPU or CUDA or Meta + # https://github.com/intel/torch-xpu-ops/issues/357 + "test_compare_cpu_sparse_sampled_addmm_xpu_float32", + "test_errors_sparse_mul_layout0_xpu", + "test_errors_sparse_mul_layout1_xpu", + "test_errors_sparse_mul_layout2_xpu", + "test_errors_sparse_mul_layout3_xpu", + "test_out_requires_grad_error_sparse_sampled_addmm_xpu_complex64", + "test_out_requires_grad_error_sparse_sampled_addmm_xpu_float32", + + # NotImplementedError: Could not run 'aten::_to_dense' with arguments from the 'SparseXPU' backend. + # https://github.com/intel/torch-xpu-ops/issues/357 + "test_compare_cpu_to_sparse_xpu_float32", + "test_variant_consistency_eager_to_sparse_xpu_float32", + + # RuntimeError: sparse_dim expected sparse or strided tensor layout but got Sparse + # Issue https://github.com/intel/torch-xpu-ops/issues/357 + "test_variant_consistency_eager_to_sparse_xpu_complex64", + "test_non_standard_bool_values_to_sparse_xpu_bool", + + # OneDNN issues, https://github.com/intel/torch-xpu-ops/issues/253 + # RuntimeError: Long is not supported in oneDNN! + # RuntimeError: could not create a primitive descriptor for a deconvolution forward propagation primitive + # RuntimeError: Double and complex datatype matmul is not supported in oneDNN + "test_noncontiguous_samples_nn_functional_conv3d_xpu_int64", + "test_noncontiguous_samples_nn_functional_conv_transpose1d_xpu_int64", # "test_noncontiguous_samples_nn_functional_conv_transpose2d_xpu_complex64", "test_noncontiguous_samples_nn_functional_conv_transpose2d_xpu_float32", "test_noncontiguous_samples_nn_functional_conv_transpose2d_xpu_int64", "test_noncontiguous_samples_nn_functional_conv_transpose3d_xpu_complex64", "test_noncontiguous_samples_nn_functional_conv_transpose3d_xpu_float32", "test_noncontiguous_samples_nn_functional_conv_transpose3d_xpu_int64", - "test_noncontiguous_samples_nn_functional_rrelu_xpu_float32", + "test_noncontiguous_samples_nn_functional_conv1d_xpu_int64", + "test_noncontiguous_samples_nn_functional_conv2d_xpu_int64", + + # RuntimeError: mode only supports CPU AND CUDA device type, got: xpu + # Issue https://github.com/intel/torch-xpu-ops/issues/327 "test_numpy_ref_linalg_tensorinv_xpu_float64", "test_out_mode_xpu_float32", + + # RuntimeError: false INTERNAL ASSERT FAILED at "/home/gta/daisyden/pytorch4/aten/src/ATen/native/DispatchStub.cpp":220, please report a bug to PyTorch. 
DispatchStub: missing kernel for xpu "test_out_nanmean_xpu_float32", - "test_out_requires_grad_error_sparse_sampled_addmm_xpu_complex64", - "test_out_requires_grad_error_sparse_sampled_addmm_xpu_float32", - "test_out_warning_nanmean_xpu", - "test_python_ref__refs_linspace_tensor_overload_xpu_int16", - "test_python_ref__refs_linspace_tensor_overload_xpu_int32", - "test_python_ref__refs_linspace_tensor_overload_xpu_int64", - "test_python_ref__refs_linspace_tensor_overload_xpu_int8", - "test_python_ref__refs_linspace_tensor_overload_xpu_uint8", - "test_python_ref__refs_linspace_xpu_int16", - "test_python_ref__refs_linspace_xpu_int32", - "test_python_ref__refs_linspace_xpu_int64", - "test_python_ref__refs_linspace_xpu_int8", - "test_python_ref__refs_linspace_xpu_uint8", - "test_python_ref__refs_logaddexp_xpu_complex128", - "test_python_ref__refs_logaddexp_xpu_complex64", - "test_python_ref__refs_native_layer_norm_xpu_bfloat16", - "test_python_ref__refs_native_layer_norm_xpu_float16", - "test_python_ref__refs_native_layer_norm_xpu_float32", - "test_python_ref__refs_nn_functional_hinge_embedding_loss_xpu_bfloat16", - "test_python_ref__refs_nn_functional_hinge_embedding_loss_xpu_float16", - "test_python_ref__refs_nn_functional_margin_ranking_loss_xpu_bfloat16", - "test_python_ref__refs_nn_functional_margin_ranking_loss_xpu_float16", - "test_python_ref__refs_nn_functional_triplet_margin_loss_xpu_uint8", - "test_python_ref__refs_square_xpu_bool", - "test_python_ref__refs_trunc_xpu_float64", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_bfloat16", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_float16", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_float32", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_float64", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_int16", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_int32", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_int64", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_int8", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_uint8", - "test_python_ref_executor__refs_linspace_executor_aten_xpu_int16", - "test_python_ref_executor__refs_linspace_executor_aten_xpu_int32", - "test_python_ref_executor__refs_linspace_executor_aten_xpu_int64", - "test_python_ref_executor__refs_linspace_executor_aten_xpu_int8", - "test_python_ref_executor__refs_linspace_executor_aten_xpu_uint8", - "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_int16", - "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_int32", - "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_int64", - "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_int8", - "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_uint8", - "test_python_ref_executor__refs_log_normal_executor_aten_xpu_bfloat16", - "test_python_ref_executor__refs_log_normal_executor_aten_xpu_float16", - "test_python_ref_executor__refs_log_normal_executor_aten_xpu_float32", - "test_python_ref_executor__refs_log_normal_executor_aten_xpu_float64", + "test_out_warning_nanmean_xpu", + + # NameError: name 'nanj' is not defined. Did you mean: 'nan'? 
+ # https://github.com/intel/torch-xpu-ops/issues/768 "test_python_ref_executor__refs_logaddexp_executor_aten_xpu_complex128", "test_python_ref_executor__refs_logaddexp_executor_aten_xpu_complex64", - "test_python_ref_executor__refs_native_layer_norm_executor_aten_xpu_bfloat16", - "test_python_ref_executor__refs_native_layer_norm_executor_aten_xpu_float16", - "test_python_ref_executor__refs_native_layer_norm_executor_aten_xpu_float32", - "test_python_ref_executor__refs_nn_functional_alpha_dropout_executor_aten_xpu_bfloat16", - "test_python_ref_executor__refs_nn_functional_alpha_dropout_executor_aten_xpu_float16", - "test_python_ref_executor__refs_nn_functional_alpha_dropout_executor_aten_xpu_float32", - "test_python_ref_executor__refs_nn_functional_alpha_dropout_executor_aten_xpu_float64", - "test_python_ref_executor__refs_nn_functional_hinge_embedding_loss_executor_aten_xpu_bfloat16", - "test_python_ref_executor__refs_nn_functional_hinge_embedding_loss_executor_aten_xpu_float16", - "test_python_ref_executor__refs_nn_functional_margin_ranking_loss_executor_aten_xpu_bfloat16", - "test_python_ref_executor__refs_nn_functional_margin_ranking_loss_executor_aten_xpu_float16", - "test_python_ref_executor__refs_nn_functional_triplet_margin_loss_executor_aten_xpu_uint8", - "test_python_ref_executor__refs_square_executor_aten_xpu_bool", - "test_python_ref_executor__refs_vdot_executor_aten_xpu_complex128", - "test_python_ref_executor__refs_vdot_executor_aten_xpu_complex64", - "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int16", - "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int32", - "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int64", - "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int8", - "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_uint8", - "test_python_ref_torch_fallback__refs_linspace_xpu_int16", - "test_python_ref_torch_fallback__refs_linspace_xpu_int32", - "test_python_ref_torch_fallback__refs_linspace_xpu_int64", - "test_python_ref_torch_fallback__refs_linspace_xpu_int8", - "test_python_ref_torch_fallback__refs_linspace_xpu_uint8", - "test_python_ref_torch_fallback__refs_logaddexp_xpu_complex128", - "test_python_ref_torch_fallback__refs_logaddexp_xpu_complex64", - "test_python_ref_torch_fallback__refs_native_layer_norm_xpu_bfloat16", - "test_python_ref_torch_fallback__refs_native_layer_norm_xpu_float16", - "test_python_ref_torch_fallback__refs_native_layer_norm_xpu_float32", - "test_python_ref_torch_fallback__refs_nn_functional_hinge_embedding_loss_xpu_bfloat16", - "test_python_ref_torch_fallback__refs_nn_functional_hinge_embedding_loss_xpu_float16", - "test_python_ref_torch_fallback__refs_nn_functional_margin_ranking_loss_xpu_bfloat16", - "test_python_ref_torch_fallback__refs_nn_functional_margin_ranking_loss_xpu_float16", - "test_python_ref_torch_fallback__refs_sinh_xpu_complex128", - "test_python_ref_torch_fallback__refs_special_multigammaln_mvlgamma_p_5_xpu_int32", - "test_python_ref_torch_fallback__refs_square_xpu_bool", - "test_python_ref_torch_fallback__refs_vdot_xpu_complex128", - "test_python_ref_torch_fallback__refs_vdot_xpu_complex64", + + # RuntimeError: could not create a primitive descriptor for a deconvolution + # https://github.com/intel/torch-xpu-ops/issues/253 "test_variant_consistency_eager_nn_functional_conv_transpose2d_xpu_complex64", "test_variant_consistency_eager_nn_functional_conv_transpose2d_xpu_float32", 
"test_variant_consistency_eager_nn_functional_conv_transpose3d_xpu_complex64", "test_variant_consistency_eager_nn_functional_conv_transpose3d_xpu_float32", - "test_variant_consistency_eager_nn_functional_rrelu_xpu_float32", - "test_variant_consistency_eager_to_sparse_xpu_complex64", - "test_variant_consistency_eager_to_sparse_xpu_float32", - "test_compare_cpu__refs_special_zeta_xpu_float32", + + # Need revisit when the op is enabled + # Unexpected success, xpu passed because it compares to cpu "test_compare_cpu_linalg_lu_factor_ex_xpu_float32", "test_compare_cpu_linalg_lu_factor_xpu_float32", "test_compare_cpu_linalg_lu_xpu_float32", "test_compare_cpu_special_hermite_polynomial_h_xpu_float32", - "test_compare_cpu_special_zeta_xpu_float32", + "test_compare_cpu_special_zeta_xpu_float32", + + # XFAIL of CUDA and XPU, unexpected success in fallback "test_out_cholesky_inverse_xpu_float32", "test_out_geqrf_xpu_float32", "test_out_narrow_copy_xpu_float32", "test_out_ormqr_xpu_float32", - "test_out_triangular_solve_xpu_float32", - "test_python_ref__refs_heaviside_xpu_int64", - "test_python_ref__refs_special_bessel_j0_xpu_int64", - "test_python_ref_errors__refs_dstack_xpu", - "test_python_ref_errors__refs_hstack_xpu", - "test_python_ref_errors__refs_linalg_cross_xpu", - "test_python_ref_errors__refs_vstack_xpu", + + # XFAIL of CUDA, XPU got unexpected success + "test_python_ref__refs_div_no_rounding_mode_xpu_complex32", + "test_python_ref__refs_pow_xpu_complex32", "test_python_ref_executor__refs_mul_executor_aten_xpu_complex32", - "test_python_ref__refs_special_multigammaln_mvlgamma_p_5_xpu_float64", - "test_python_ref_executor__refs_special_multigammaln_mvlgamma_p_3_executor_aten_xpu_float64", - "test_python_ref__refs_square_xpu_complex128", - "test_python_ref__refs_square_xpu_complex64", - "test_python_ref_executor__refs_istft_executor_aten_xpu_complex128", - "test_python_ref_executor__refs_square_executor_aten_xpu_complex128", - "test_python_ref_torch_fallback__refs_square_xpu_complex128", - "test_python_ref_torch_fallback__refs_square_xpu_complex64", - # Skip list of new added when porting XPU operators. - # See: https://github.com/intel/torch-xpu-ops/issues/128 - "test_dtypes_view_as_complex_xpu", # Didn't align with CUDA, The following dtypes did not work in backward but are listed by the OpInfo: {torch.bfloat16} - "test_dtypes_view_as_real_xpu", # Didn't align with CUDA, The following dtypes did not work in backward but are listed by the OpInfo: {torch.bfloat16} - "test_noncontiguous_samples_native_dropout_backward_xpu_int64", # The implementation aligns with CUDA, RuntimeError: "masked_scale" not implemented for 'Long'. - "test_non_standard_bool_values_native_dropout_backward_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "masked_scale" not implemented for 'Bool'. - "test_non_standard_bool_values_scatter_reduce_amax_xpu_bool", # Align with CUDA dtypes - "scatter_gather_base_kernel_func" not implemented for 'Bool' - "test_non_standard_bool_values_scatter_reduce_amin_xpu_bool", # Align with CUDA dtypes - "scatter_gather_base_kernel_func" not implemented for 'Bool' - "test_non_standard_bool_values_scatter_reduce_prod_xpu_bool", # Align with CUDA dtypes - "scatter_gather_base_kernel_func" not implemented for 'Bool' - "test_non_standard_bool_values_argsort_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "argsort" not implemented for 'Bool'. 
- "test_non_standard_bool_values_msort_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "msort" not implemented for 'Bool'. + "test_python_ref_torch_fallback__refs_div_no_rounding_mode_xpu_complex32", + "test_python_ref__refs_pow_xpu_complex32", + "test_python_ref_executor__refs_mul_executor_aten_xpu_complex32", + "test_python_ref_torch_fallback__refs_div_no_rounding_mode_xpu_complex32", + "test_python_ref_torch_fallback__refs_pow_xpu_complex32", + + + # unexpected success because of cpu fallback + "test_out_triangular_solve_xpu_float32", + + # Newly added: + + # Cuda skipped it "test_non_standard_bool_values_sort_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "sort" not implemented for 'Bool'. + + # Cuda skipped it + "test_non_standard_bool_values_msort_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "msort" not implemented for 'Bool'. + + + # Unexpected success "test_python_ref_executor__refs_pow_executor_aten_xpu_complex32", # Didn't align with CUDA, Unexpected success + # Unexpected success - "test_errors_histogramdd_xpu", + # "test_errors_histogramdd_xpu", #XFAIL now # Jiterator is only supported on CUDA and ROCm GPUs, none are available. + # https://github.com/intel/torch-xpu-ops/issues/584 "_jiterator_", # https://github.com/intel/torch-xpu-ops/issues/157 # Segfault: @@ -473,9 +422,11 @@ "test_variant_consistency_eager_triangular_solve_xpu_complex64", # oneDNN issues # RuntimeError: value cannot be converted to type float without overflow + # https://github.com/intel/torch-xpu-ops/issues/683 "test_conj_view_addbmm_xpu_complex64", "test_neg_conj_view_addbmm_xpu_complex128", # CPU fallback error: AssertionError: Tensor-likes are not close! + # https://github.com/intel/torch-xpu-ops/issues/271 "test_neg_view_nn_functional_rrelu_xpu_float64", ### Error #0 in TestMathBitsXPU , RuntimeError: Double and complex datatype matmul is not supported in oneDNN # https://github.com/intel/torch-xpu-ops/issues/254 @@ -693,7 +644,7 @@ "test_neg_view_nn_functional_conv_transpose2d_xpu_float64", "test_neg_view_nn_functional_conv_transpose3d_xpu_float64", ### Error #2 in TestMathBitsXPU , NotImplementedError: Could not run 'aten::_sparse_coo_tensor_with_dims_and_tensors' with arguments from the 'SparseXPU' backend. - # https://github.com/intel/torch-xpu-ops/issues/242 + # https://github.com/intel/torch-xpu-ops/issues/242 and https://github.com/intel/torch-xpu-ops/issues/240 "test_conj_view_to_sparse_xpu_complex64", "test_neg_conj_view_to_sparse_xpu_complex128", "test_neg_view_to_sparse_xpu_float64", @@ -703,13 +654,14 @@ # in XPU supported operators. Then the case will work. "test_noncontiguous_samples_nn_functional_avg_pool1d_xpu_int64", "test_noncontiguous_samples_nn_functional_local_response_norm_xpu_int64", + #AssertionError: The supported dtypes for unique_consecutive on device type xpu are incorrect! #The following dtypes worked in forward but are not listed by the OpInfo: {torch.bfloat16}. - #XPU supports bfloat16, CUDA doesn't support it. + # XPU supports bfloat16, CUDA doesn't support it. "test_dtypes_unique_xpu", # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16. # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error. 
- "test_dtypes_polar_xpu", + #"test_dtypes_polar_xpu", # implemented aten::histogram to align MPS operators coverage, CUDA doesn't support # but test_dtypes infrastructure leverage CUDA supported datatypes "test_dtypes_histogram_xpu", @@ -718,32 +670,15 @@ # 1. most cases of nextafter require Half dtype. # 2. Half dtype is a common dtype in workloads. # So far CUDA doesn't support Half, so that XPU fails as we aligned claimed dtypes with CUDA in test infra. + # https://github.com/intel/torch-xpu-ops/issues/623 "test_dtypes_nextafter_xpu", - # AssertionError: The supported dtypes for unique_consecutive on device type xpu are incorrect! - # The following dtypes worked in forward but are not listed by the OpInfo: {torch.bfloat16}. - # XPU supports bfloat16, CUDA doesn't support it. - "test_dtypes_unique_consecutive_xpu", - # 0816 add - "test_dtypes__unsafe_masked_index_put_accumulate_xpu", - "test_noncontiguous_samples__unsafe_masked_index_put_accumulate_xpu_complex64", - "test_conj_view__unsafe_masked_index_put_accumulate_xpu_complex64", - "test_neg_conj_view__unsafe_masked_index_put_accumulate_xpu_complex128", - "test_neg_view__unsafe_masked_index_put_accumulate_xpu_float64", - "test_compare_cpu__unsafe_masked_index_put_accumulate_xpu_float32", - "test_dtypes__unsafe_masked_index_put_accumulate_xpu", - "test_multiple_devices__unsafe_masked_index_put_accumulate_xpu_float32", - "test_multiple_devices__unsafe_masked_index_put_accumulate_xpu_int64", - "test_non_standard_bool_values__unsafe_masked_index_put_accumulate_xpu_bool", - "test_noncontiguous_samples__unsafe_masked_index_put_accumulate_xpu_complex64", - "test_noncontiguous_samples__unsafe_masked_index_put_accumulate_xpu_float32", - "test_noncontiguous_samples__unsafe_masked_index_put_accumulate_xpu_int64", - "test_out__unsafe_masked_index_put_accumulate_xpu_float32", - "test_out_warning__unsafe_masked_index_put_accumulate_xpu", - "test_variant_consistency_eager__unsafe_masked_index_put_accumulate_xpu_complex64", - "test_variant_consistency_eager__unsafe_masked_index_put_accumulate_xpu_float32", - "test_conj_view__unsafe_masked_index_put_accumulate_xpu_complex64", - "test_neg_conj_view__unsafe_masked_index_put_accumulate_xpu_complex128", - "test_neg_view__unsafe_masked_index_put_accumulate_xpu_float64", + + # AssertionError: The supported dtypes for argsort on device type xpu are incorrect! + # The following dtypes worked in forward but are not listed by the OpInfo: {torch.bool}. + # CUDA does not have torch.bool support on argsort. + "test_dtypes_argsort_xpu", + # Unexpected success, CUDA got XFAIL because CUDA does not have historgramadd supported" + "test_errors_histogramdd_xpu", ), "test_binary_ufuncs_xpu.py": ( @@ -1185,9 +1120,9 @@ # It is kernel assert on XPU implementation not exception on host. # We are same as CUDA implementation. And CUDA skips these cases. "test_trivial_fancy_out_of_bounds_xpu", - "test_advancedindex", - # CUDA bias case - "test_index_put_accumulate_with_optional_tensors_xpu", + # index boundary should be checked. 
+ # https://github.com/intel/torch-xpu-ops/issues/783 + "test_advancedindex_xpu_float64", # XPU implementation doesn't claimn FP8 now # https://github.com/intel/torch-xpu-ops/issues/461 "test_index_put_src_datatype_xpu_float8_e5m2", @@ -1330,10 +1265,18 @@ "test_reference_numerics_normal_polygamma_polygamma_n_2_xpu_float16", "test_reference_numerics_normal_polygamma_polygamma_n_3_xpu_float16", "test_reference_numerics_normal_polygamma_polygamma_n_4_xpu_float16", + + # CUDA XFAIL + "test_reference_numerics_large__refs_rsqrt_xpu_complex32", + + # Compiler issue in handling tanh with real or imag inf. + # https://github.com/intel/torch-xpu-ops/issues/184, https://jira.devtools.intel.com/browse/CMPLRLIBS-34974 + "test_reference_numerics_large__refs_tanh_xpu_complex32", ), "test_masked_xpu.py": ( # RuntimeError: is_coalesced expected sparse coordinate tensor layout but got Sparse. + # https://github.com/intel/torch-xpu-ops/issues/357 "test_mask_layout_sparse_coo_masked_amax_xpu_bfloat16", "test_mask_layout_sparse_coo_masked_amax_xpu_float16", "test_mask_layout_sparse_coo_masked_amax_xpu_float32", @@ -1645,7 +1588,7 @@ "test_triangular_solve_xpu_complex64", "test_triangular_solve_xpu_float64", # https://github.com/intel/torch-xpu-ops/issues/317 - # addmm.out, addmv.out, linalg_lstsq, linalg_vector_norm.out, norm.out, vdot&dot lack XPU support and fallback to CPU + # addmm.out, addmv.out, linalg_lstsq, vdot&dot lack XPU support and fallback to CPU "test_addmm_sizes_xpu_complex128", "test_addmm_sizes_xpu_complex64", "test_blas_alpha_beta_empty_xpu_complex128", @@ -1654,21 +1597,18 @@ "test_linalg_lstsq_input_checks_xpu_complex64", "test_linalg_lstsq_input_checks_xpu_float32", "test_linalg_lstsq_input_checks_xpu_float64", - "test_norm_fused_type_promotion_xpu_bfloat16", - "test_norm_fused_type_promotion_xpu_float16", "test_dot_invalid_args_xpu", "test_vdot_invalid_args_xpu", + # https://github.com/intel/torch-xpu-ops/issues/821 + # RuntimeError: Fail to enable Kineto Profiler on XPU due to error code: 200 + "test_norm_fused_type_promotion_xpu_bfloat16", + # AssertionError: True is not false + "test_norm_fused_type_promotion_xpu_float16", + # https://github.com/intel/torch-xpu-ops/issues/814 # xpu does not have '_cuda_tunableop_is_enabled' API "test_matmul_small_brute_force_tunableop_xpu_float16", "test_matmul_small_brute_force_tunableop_xpu_float32", "test_matmul_small_brute_force_tunableop_xpu_float64", - # TypeError: Invalid NaN comparison. - "test_compile_int4_mm_m_32_k_32_n_48_xpu", - "test_compile_int4_mm_m_32_k_32_n_64_xpu", - "test_compile_int4_mm_m_32_k_64_n_48_xpu", - "test_compile_int4_mm_m_32_k_64_n_64_xpu", - "test_compile_int4_mm_m_64_k_32_n_48_xpu", - "test_compile_int4_mm_m_64_k_32_n_64_xpu", # XPU does not support tunable. 
"test_bmm_tunableop_rocm_xpu_float32", "test_numeric_check_leak_tunableop_rocm_xpu_float32", @@ -1991,6 +1931,7 @@ "test_maskedtensor_xpu.py": ( # RuntimeError: is_coalesced expected sparse coordinate tensor layout but got Sparse + # https://github.com/intel/torch-xpu-ops/issues/357 "test_contiguous_xpu", "test_invalid_sparse_coo_values_xpu", "test_to_dense_and_sparse_coo_xpu", @@ -2710,6 +2651,7 @@ "test_autodiff__foreach_sigmoid_outplace_xpu_complex128", "test_binary_op_with_scalar_self_support__foreach_pow_is_fastpath_True_xpu_bool", # AssertionError: RuntimeError not raised + # https://github.com/intel/torch-xpu-ops/issues/784 "test_0dim_tensor_overload_exception_xpu", # RuntimeError: Tried to instantiate dummy base class CUDAGraph "test_big_num_tensors__foreach_max_use_cuda_graph_True_xpu_float32", diff --git a/test/xpu/skip_list_win.py b/test/xpu/skip_list_win.py index 35bf3014a..f6f74c817 100644 --- a/test/xpu/skip_list_win.py +++ b/test/xpu/skip_list_win.py @@ -68,8 +68,4 @@ "test_reference_numerics_large_sin_xpu_complex64", "test_reference_numerics_small_acos_xpu_complex32", ), - "test_ops_xpu.py": ( - "test_compare_cpu_pow_xpu_bfloat16", # https://github.com/intel/torch-xpu-ops/pull/764 - ), - } diff --git a/test/xpu/skip_list_win_arc.py b/test/xpu/skip_list_win_arc.py index 0b1d079e1..de5d03891 100644 --- a/test/xpu/skip_list_win_arc.py +++ b/test/xpu/skip_list_win_arc.py @@ -1,10 +1,4 @@ skip_dict = { - # SYCL Compiler on Windows removed the following operations when '-cl-poison-unsupported-fp64-kernels' is on - # Hence, skip the following windows specific errors - "test_ops_xpu.py": ( - "test_compare_cpu_sqrt_xpu_complex64", - "test_backward_nn_functional_adaptive_avg_pool2d_xpu_float32", - ), "test_binary_ufuncs_xpu": ( "test_batch_vs_slicing___rpow___xpu_complex64", "test_batch_vs_slicing__refs_pow_xpu_complex64", diff --git a/test/xpu/test_indexing_xpu.py b/test/xpu/test_indexing_xpu.py index c3637b0fe..d57567318 100644 --- a/test/xpu/test_indexing_xpu.py +++ b/test/xpu/test_indexing_xpu.py @@ -11,6 +11,37 @@ with XPUPatchForImport(False): from test_indexing import NumpyTests,TestIndexing + import torch + + + def __test_index_put_accumulate_with_optional_tensors(self, device): + # TODO: replace with a better solution. + # Currently, here using torchscript to put None into indices. + # on C++ it gives indices as a list of 2 optional tensors: first is null and + # the second is a valid tensor. 
+ @torch.jit.script + def func(x, i, v): + idx = [None, i] + x.index_put_(idx, v, accumulate=True) + return x + + n = 4 + t = torch.arange(n * 2, dtype=torch.float32).reshape(n, 2) + t_dev = t.to(device) + indices = torch.tensor([1, 0]) + indices_dev = indices.to(device) + value0d = torch.tensor(10.0) + value1d = torch.tensor([1.0, 2.0]) + + out_cuda = func(t_dev, indices_dev, value0d.xpu()) + out_cpu = func(t, indices, value0d) + self.assertEqual(out_cuda.cpu(), out_cpu) + + out_cuda = func(t_dev, indices_dev, value1d.xpu()) + out_cpu = func(t, indices, value1d) + self.assertEqual(out_cuda.cpu(), out_cpu) + + TestIndexing.test_index_put_accumulate_with_optional_tensors = __test_index_put_accumulate_with_optional_tensors instantiate_device_type_tests(NumpyTests, globals(), only_for=("xpu"), allow_xpu=True) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index debab066a..01d140858 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -237,6 +237,11 @@ "nan_to_num", "scatter_reduce", "nanmean", + "native_layer_norm", + "native_layer_norm_backward", + "square", + "heaviside", + "argsort", ] _ops_without_cuda_support = [ @@ -415,6 +420,7 @@ def ModuleTest_test_xpu(self, test_case): xpu_gradInput = test_case._backward( xpu_module, xpu_input_tuple, xpu_output, xpu_gradOutput ) + test_case.assertEqual( cpu_gradInput, xpu_gradInput, @@ -570,7 +576,7 @@ def convert_dtype(obj, dtype, requires_grad=False): CriterionTest.test_cuda = CriterionTest_test_xpu from torch.testing._internal.common_methods_invocations import sample_inputs_cat_concat, S, M -from torch.testing._internal.common_methods_invocations import make_tensor +from torch.testing._internal.common_methods_invocations import make_tensor, mask_not_all_zeros from functools import partial from torch.testing._internal.opinfo.core import SampleInput @@ -604,6 +610,21 @@ def index_variable_nofp64(shape, max_indices, device=torch.device('cpu')): index = torch.rand(*shape, dtype=torch.float32, device=device).mul_(max_indices).floor_().long() return index +def sample_inputs_index_put_nofp64(op_info, device, dtype, requires_grad, **kwargs): + make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) + + for accumulate in [False, True]: + # Test with indices arg + yield SampleInput( + make_arg((S, S,)), + (index_variable_nofp64(2, S, device=device),), + make_arg((2, S)), + accumulate=accumulate) + + # Test with mask arg + mask = torch.zeros(S, dtype=torch.bool) if accumulate else mask_not_all_zeros((S,)) + yield SampleInput( + make_arg((S, S)), (mask, ), make_arg((S,)), accumulate=accumulate) def sample_inputs_softmax_variant_nofp64( op_info, @@ -638,6 +659,30 @@ def sample_inputs_softmax_variant_nofp64( SampleInput(make_arg(shape), args=dim, kwargs=kwargs) for shape, dim in cases ) +def sample_inputs_like_fns_nofp64(self, device, dtype, requires_grad, **kwargs): + + inputs = [ + ((), {}), + ((S, S), {}), + ((0, S, 0), {}), + ((S,), {'dtype': dtype, 'device': device}), + # Hard-code some dtypes/devices. 
We want to test cases where the + # (dtype, device) is different from the input's (dtype, device) + # disabled for ARC + # ((S,), {'dtype': torch.double}), + ((S,), {'device': 'cpu'}), + # disabled for ARC + #((S,), {'dtype': torch.double, 'device': 'cpu'}), + ] + if torch.cuda.is_available(): + inputs.append(((S,), {'device': 'cuda'})) + + for shape, kwargs in inputs: + t = make_tensor(shape, dtype=dtype, device=device, + low=None, high=None, + requires_grad=requires_grad) + yield SampleInput(t, **kwargs) + class XPUPatchForImport: def __init__(self, patch_test_case=True) -> None: self.test_package = ( @@ -671,10 +716,6 @@ def __init__(self, patch_test_case=True) -> None: self.cuda_is_available = cuda.is_available self.cuda_is_bf16_supported = cuda.is_bf16_supported - if "has_fp64=0" in str(torch.xpu.get_device_properties(0)): - self.sample_inputs_softmax_variant = common_methods_invocations.sample_inputs_softmax_variant - self.index_variable = common_methods_invocations.index_variable - self.reference_inputs_cat = common_methods_invocations.reference_inputs_cat def align_db_decorators(self, db): def gen_xpu_wrappers(op_name, wrappers): @@ -722,14 +763,13 @@ def gen_xpu_wrappers(op_name, wrappers): def align_supported_dtypes(self, db): for opinfo in db: - if ( - opinfo.name not in _xpu_computation_op_list - or opinfo.name in _ops_without_cuda_support - ): + if ( opinfo.name not in _xpu_computation_op_list and (opinfo.torch_opinfo.name not in _xpu_computation_op_list + if db == common_methods_invocations.python_ref_db else True)) or opinfo.name in _ops_without_cuda_support: opinfo.dtypesIfXPU = opinfo.dtypes else: backward_dtypes = set(opinfo.backward_dtypesIfCUDA) - backward_dtypes.add(bfloat16) + if bfloat16 in opinfo.dtypesIfXPU: + backward_dtypes.add(bfloat16) opinfo.backward_dtypes = tuple(backward_dtypes) if "has_fp64=0" in str(torch.xpu.get_device_properties(0)): @@ -737,16 +777,31 @@ def align_supported_dtypes(self, db): opinfo.dtypesIfXPU = set(filter(lambda x: (x not in fp64_dtypes), list(opinfo.dtypesIfXPU))) opinfo.backward_dtypes = tuple(filter(lambda x: (x not in fp64_dtypes), list(opinfo.backward_dtypes))) + def filter_fp64_sample_input(self, db): + # Only for platform without fp64 support + if "has_fp64=0" in str(torch.xpu.get_device_properties(0)): + for opinfo in db: + if opinfo.name in _xpu_computation_op_list: + if opinfo.variant_test_name == "with_dtype" and \ + opinfo.name in ["log_softmax", "softmax", "nn.functional.softmin", ] and \ + get_wrapped_fn(opinfo.sample_inputs_func) != opinfo.sample_inputs_func and \ + get_wrapped_fn(opinfo.sample_inputs_func).func.__name__ == common_methods_invocations.sample_inputs_softmax_variant.__name__: + opinfo.sample_inputs_func = torch.no_grad()(partial(sample_inputs_softmax_variant_nofp64, with_dtype=True)) + elif opinfo.sample_inputs_func.__name__ == common_methods_invocations.sample_inputs_softmax_variant.__name__: + opinfo.sample_inputs_func = sample_inputs_softmax_variant_nofp64 + elif opinfo.sample_inputs_func.__name__ == common_methods_invocations.sample_inputs_like_fns.__name__: + opinfo.sample_inputs_func = sample_inputs_like_fns_nofp64 + elif opinfo.sample_inputs_func.__name__ == common_methods_invocations.sample_inputs_index_put.__name__: + opinfo.sample_inputs_func = sample_inputs_index_put_nofp64 + + if opinfo.reference_inputs_func != None and opinfo.reference_inputs_func.__name__ == common_methods_invocations.reference_inputs_cat.__name__: + opinfo.reference_inputs_func = reference_inputs_cat_nofp64 + def 
__enter__(self): # Monkey patch until we have a fancy way common_device_type.onlyCUDA = common_device_type.onlyXPU - if "has_fp64=0" in str(torch.xpu.get_device_properties(0)): - common_methods_invocations.sample_inputs_softmax_variant = sample_inputs_softmax_variant_nofp64 - common_methods_invocations.index_variable = index_variable_nofp64 - common_methods_invocations.reference_inputs_cat = reference_inputs_cat_nofp64 - class dtypesIfXPU(common_device_type.dtypes): def __init__(self, *args): super().__init__(*args, device_type="xpu") @@ -773,6 +828,7 @@ def __init__(self, *args): ]: self.align_supported_dtypes(db) self.align_db_decorators(db) + self.filter_fp64_sample_input(db) self.align_db_decorators(module_db) common_methods_invocations.python_ref_db = [ op @@ -869,11 +925,6 @@ def __exit__(self, exc_type, exc_value, traceback): cuda.is_available = self.cuda_is_available cuda.is_bf16_supported = self.cuda_is_bf16_supported - if "has_fp64=0" in str(torch.xpu.get_device_properties(0)): - common_methods_invocations.sample_inputs_softmax_variant = self.sample_inputs_softmax_variant - common_methods_invocations.index_variable = self.index_variable - common_methods_invocations.reference_inputs_cat = self.reference_inputs_cat - # Copy the test cases from generic_base_class to generic_test_class. # It serves to reuse test cases. Regarding some newly added hardware, @@ -915,33 +966,33 @@ def copy_tests( def launch_test(test_case, skip_list=None, exe_list=None): + os.environ["PYTORCH_ENABLE_XPU_FALLBACK"]="1" + os.environ["PYTORCH_TEST_WITH_SLOW"]="1" if skip_list != None: - skip_options = " -k 'not " + skip_list[0] + skip_options = " -k \"not " + skip_list[0] for skip_case in skip_list[1:]: skip_option = " and not " + skip_case skip_options += skip_option - skip_options += "'" + skip_options += "\"" test_command = ( - "PYTORCH_ENABLE_XPU_FALLBACK=1 PYTORCH_TEST_WITH_SLOW=1 pytest -v " + "pytest -v " + test_case ) test_command += skip_options - return os.system(test_command) elif exe_list != None: - exe_options = " -k '" + exe_list[0] + exe_options = " -k \"" + exe_list[0] for exe_case in exe_list[1:]: exe_option = " or " + exe_case exe_options += exe_option - exe_options += "'" + exe_options += "\"" test_command = ( - "PYTORCH_ENABLE_XPU_FALLBACK=1 PYTORCH_TEST_WITH_SLOW=1 pytest -v " + "pytest -v " + test_case ) test_command += exe_options - return os.system(test_command) else: test_command = ( - "PYTORCH_ENABLE_XPU_FALLBACK=1 PYTORCH_TEST_WITH_SLOW=1 pytest -v " + "pytest -v " + test_case ) - return os.system(test_command) + return os.system(test_command) diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml index df72a5e4c..75a1b8efa 100644 --- a/yaml/native/native_functions.yaml +++ b/yaml/native/native_functions.yaml @@ -5147,4 +5147,21 @@ structured_inherits: TensorIteratorBase dispatch: XPU: sign_out - tags: pointwise \ No newline at end of file + tags: pointwise + +- func: range.step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: range + +- func: range(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: range + +- func: range.out_(Scalar start, Scalar end, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CompositeExplicitAutograd: range_out_no_step + +- func: range.out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + XPU: range_xpu_out + cpp_no_default_args: ['step']
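Note on the `cpu_xpu_all` change in `test/xpu/extended/test_ops_xpu.py`: it can be read as "take the full CPU/XPU dtype tuple and drop the double-precision dtypes when the device reports `has_fp64=0` in its properties string". A minimal sketch of the same selection, assuming a PyTorch build with the XPU backend available (the helper name `cpu_xpu_dtypes` is not part of the patch):

```python
import torch

ALL_DTYPES = (torch.bfloat16, torch.complex128, torch.complex64, torch.float16,
              torch.float32, torch.float64, torch.int16, torch.int32,
              torch.int64, torch.int8, torch.uint8, torch.bool)
FP64_DTYPES = {torch.float64, torch.complex128}

def cpu_xpu_dtypes(device_index: int = 0):
    # Devices without native double support advertise "has_fp64=0" in the
    # string form of their device properties; skip fp64/complex128 there.
    props = str(torch.xpu.get_device_properties(device_index))
    if "has_fp64=0" in props:
        return tuple(d for d in ALL_DTYPES if d not in FP64_DTYPES)
    return ALL_DTYPES
```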
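Note on the `launch_test` changes in `run_test_with_only.py` and `xpu_test_utils.py`: the pytest `-k` expression is now wrapped in double quotes because `cmd.exe` on Windows does not strip single quotes, and the `PYTORCH_ENABLE_XPU_FALLBACK=1 ... pytest` command-line prefix only works in POSIX shells, so the variables are exported via `os.environ` instead. The sketch below is a hypothetical condensed equivalent of that command construction (the helper name `build_pytest_command` is illustrative, not part of the patch):

```python
import os

def build_pytest_command(test_case, skip_list=None, exe_list=None):
    # Export the toggles through the environment so the child pytest process
    # inherits them on both Windows and POSIX.
    os.environ["PYTORCH_ENABLE_XPU_FALLBACK"] = "1"
    os.environ["PYTORCH_TEST_WITH_SLOW"] = "1"
    command = "pytest -v " + test_case
    # Double quotes survive both cmd.exe and POSIX shells.
    if skip_list:
        command += ' -k "not ' + " and not ".join(skip_list) + '"'
    elif exe_list:
        command += ' -k "' + " or ".join(exe_list) + '"'
    return command

# e.g. build_pytest_command("test_ops_xpu.py", skip_list=["test_a", "test_b"])
# -> 'pytest -v test_ops_xpu.py -k "not test_a and not test_b"'
```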
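Note on the repeated exit-code hunks in the `run_test_with_*.py` scripts: they all encode the same rule. On POSIX, `os.system()` returns a wait status that must be decoded with `os.WEXITSTATUS()`; on Windows that function does not exist and `os.system()` already returns the shell's exit code. A minimal sketch of the shared pattern (the helper name `exit_with` is illustrative only):

```python
import os
import sys

def exit_with(status: int) -> None:
    # os.system() returns a raw wait status on POSIX (decode it with
    # WEXITSTATUS), but the plain shell exit code on Windows, where
    # os.WEXITSTATUS is not available at all.
    if os.name == "nt":
        sys.exit(status)
    sys.exit(os.WEXITSTATUS(status))

# e.g. exit_with(os.system("pytest -v test_ops_xpu.py"))
```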
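Note on the new `range.out` entry in `native_functions.yaml`: the out-variant dispatches to `range_xpu_out` on XPU, and like other range-style factories its output length follows the closed-interval rule `size = int((end - start) / step) + 1`. A hedged usage sketch, assuming an XPU-enabled build with this patch applied; `torch.range` itself is deprecated upstream in favor of `torch.arange`, so this is illustration only:

```python
import torch

# Closed-interval length rule used by range-style factories:
#   size = int((end - start) / step) + 1
out = torch.empty(0, device="xpu")
torch.range(1.0, 4.0, 0.5, out=out)  # 1.0, 1.5, ..., 4.0 on the XPU device
assert out.numel() == int((4.0 - 1.0) / 0.5) + 1  # 7 elements
```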