Skip to content

Commit

Permalink
Merge branch 'main' into zhiwei/codegen
Browse files Browse the repository at this point in the history
  • Loading branch information
ZhiweiYan-96 committed Sep 11, 2024
2 parents 283c6f7 + 1206590 commit 33cebd2
Show file tree
Hide file tree
Showing 273 changed files with 1,575 additions and 920 deletions.
15 changes: 9 additions & 6 deletions cmake/BuildFlags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -120,14 +120,17 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "${SYCL_OFFLINE_COMPILER_CG_OPTIONS} -cl-fp32-correctly-rounded-divide-sqrt")
set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "-options '${SYCL_OFFLINE_COMPILER_CG_OPTIONS}'")

# Default AOT (ahead-of-time) device targets per platform. NOTE(review):
# reconstructed from a marker-less diff render — the original span interleaved
# deleted and added hunk lines (duplicate env check, AOT options logged before
# being set); this is the coherent merged result.
if(WIN32)
  set(AOT_TARGETS "ats-m150,lnl-m,mtl-u,mtl-h")
else()
  set(AOT_TARGETS "pvc,xe-lpg,ats-m150")
endif()
# Users may override the default target list through the TORCH_XPU_ARCH_LIST
# environment variable (read once, at configure time).
if((DEFINED ENV{TORCH_XPU_ARCH_LIST}) AND NOT ("$ENV{TORCH_XPU_ARCH_LIST}" STREQUAL ""))
  set(AOT_TARGETS "$ENV{TORCH_XPU_ARCH_LIST}")
endif()

set(SYCL_OFFLINE_COMPILER_AOT_OPTIONS "-device ${AOT_TARGETS}")
message(STATUS "Compile Intel GPU AOT Targets for ${AOT_TARGETS}")

set(SYCL_OFFLINE_COMPILER_FLAGS "${SYCL_OFFLINE_COMPILER_AOT_OPTIONS} ${SYCL_OFFLINE_COMPILER_CG_OPTIONS}")
else()
Expand Down
6 changes: 6 additions & 0 deletions cmake/Modules/FindSYCL/run_sycl.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,12 @@ endforeach()
# Choose host flags in FindSYCL.cmake
@SYCL_host_flags@

# Adding permissive flag for MSVC build to overcome ambiguous symbol error.
if(WIN32)
string(APPEND SYCL_host_compiler_flags "/permissive- ")
endif()


list(REMOVE_DUPLICATES CMAKE_HOST_FLAGS)
foreach(flag ${CMAKE_HOST_FLAGS})
# Extra quotes are added around each flag to help SYCL parse out flags with spaces.
Expand Down
5 changes: 4 additions & 1 deletion src/ATen/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
# ATen XPU sources

# Source groups are globbed separately: plain XPU C++, native (incl. sparse)
# C++, and SYCL kernel sources, so the parent build can compile each group
# with its own flow. (Stale duplicate glob of xpu_cpp removed — it also
# wrongly folded native/sparse sources into the non-native group.)
file(GLOB xpu_cpp "xpu/*.cpp")
file(GLOB xpu_native_cpp "native/xpu/*.cpp" "native/sparse/*.cpp")
file(GLOB xpu_sycl "native/xpu/sycl/*.cpp")

list(APPEND ATen_XPU_CPP_SRCS ${xpu_cpp})
list(APPEND ATen_XPU_NATIVE_CPP_SRCS ${xpu_native_cpp})
list(APPEND ATen_XPU_SYCL_SRCS ${xpu_sycl})

# Export the accumulated lists to the enclosing directory scope; this file is
# expected to be pulled in via add_subdirectory().
set(ATen_XPU_CPP_SRCS ${ATen_XPU_CPP_SRCS} PARENT_SCOPE)
set(ATen_XPU_NATIVE_CPP_SRCS ${ATen_XPU_NATIVE_CPP_SRCS} PARENT_SCOPE)
set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE)
30 changes: 29 additions & 1 deletion src/ATen/native/xpu/RangeFactories.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Tensor& arange_out(

TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
TORCH_CHECK(
std::isfinite(xstart) && std::isfinite(xend),
std::isfinite(static_cast<double>(xstart)) && std::isfinite(static_cast<double>(xend)),
"unsupported range: ",
xstart,
" -> ",
Expand Down Expand Up @@ -87,5 +87,33 @@ Tensor& arange_out(

return xpu::arange_kernel(start, end, step, out);
}

// Fills `out` with the inclusive range [start, end] advancing by `step`,
// dispatching to the XPU range kernel. Resizes `out` when its element count
// differs from the computed length.
//
// Raises (via TORCH_CHECK) when step is zero/NaN, when either bound is
// non-finite, or when the bounds are inconsistent with the step's sign.
Tensor& range_xpu_out(
    const Scalar& start,
    const Scalar& end,
    const Scalar& step,
    Tensor& out) {
  // Bound/step validation is done in double precision regardless of the
  // output dtype.
  auto xstart = start.to<double>();
  auto xend = end.to<double>();
  auto xstep = step.to<double>();

  // Written as (>0 || <0) rather than (!= 0) so a NaN step is rejected too.
  TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
  // xstart/xend are already double here, so the static_cast used by the
  // arange_out variant (where the accumulate type may differ) is unnecessary.
  TORCH_CHECK(
      std::isfinite(xstart) && std::isfinite(xend),
      "unsupported range: ",
      xstart,
      " -> ",
      xend);
  TORCH_CHECK(
      ((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)),
      "upper bound and larger bound inconsistent with step sign");
  // Inclusive range: truncate((end - start) / step) + 1 elements.
  int64_t size = static_cast<int64_t>(((xend - xstart) / xstep) + 1);
  if (out.numel() != size) {
    out.resize_({size});
  }

  return at::native::xpu::range_kernel(start, end, step, out);
}

} // namespace native
} // namespace at
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/AbsKernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/AbsKernel.h>

namespace at::native::xpu {

template <typename scalar_t>
Expand Down
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/AbsKernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@

namespace at::native::xpu {

void abs_kernel(TensorIteratorBase& iter);
TORCH_XPU_API void abs_kernel(TensorIteratorBase& iter);

} // namespace at::native::xpu
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationEluKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/ActivationEluKernels.h>

namespace at::native::xpu {

template <typename scalar_t, typename opmath_t>
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationEluKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@

namespace at::native::xpu {

void elu_kernel(
TORCH_XPU_API void elu_kernel(
TensorIteratorBase& iter,
const Scalar& alpha,
const Scalar& scale,
const Scalar& input_scale);

void elu_backward_kernel(
TORCH_XPU_API void elu_backward_kernel(
TensorIteratorBase& iter,
const Scalar& alpha,
const Scalar& scale,
Expand Down
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include <comm/XPUMathCompat.h>
#include <comm/xpu_aten.h>

#include <ATen/native/xpu/sycl/ActivationGeluKernel.h>

namespace at {
namespace native {
namespace xpu {
Expand Down
6 changes: 4 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationGeluKernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ namespace at {
namespace native {
namespace xpu {

void gelu_kernel(TensorIteratorBase& iter, c10::string_view approximate);
TORCH_XPU_API void gelu_kernel(
TensorIteratorBase& iter,
c10::string_view approximate);

void gelu_backward_kernel(
TORCH_XPU_API void gelu_backward_kernel(
TensorIteratorBase& iter,
c10::string_view approximate);

Expand Down
3 changes: 3 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationGluKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@
#include <ATen/OpMathType.h>
#include <ATen/TensorIterator.h>

// Deduplicated: <ATen/native/xpu/sycl/Loops.h> was included twice (merge
// artifact).
#include <ATen/native/xpu/sycl/Loops.h>
#include <comm/SYCLContext.h>

#include <ATen/native/xpu/sycl/ActivationGluKernels.h>

namespace at::native::xpu {

template <typename scalar_t>
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationGluKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

namespace at::native::xpu {

void glu_kernel(TensorIteratorBase& iter);
TORCH_XPU_API void glu_kernel(TensorIteratorBase& iter);

void glu_backward_kernel(
TORCH_XPU_API void glu_backward_kernel(
const TensorIteratorBase& iter,
int64_t gI_stride,
int64_t I_stride);
Expand Down
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h>

namespace at::native::xpu {

template <typename scalar_t, typename opmath_t>
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

namespace at::native::xpu {

void hardsigmoid_kernel(TensorIteratorBase& iter);
TORCH_XPU_API void hardsigmoid_kernel(TensorIteratorBase& iter);

void hardsigmoid_backward_kernel(TensorIteratorBase& iter);
TORCH_XPU_API void hardsigmoid_backward_kernel(TensorIteratorBase& iter);

} // namespace at::native::xpu
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include <comm/XPUMathCompat.h>
#include <comm/xpu_aten.h>

#include <ATen/native/xpu/sycl/ActivationHardswishKernels.h>

namespace at {
namespace native {
namespace xpu {
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationHardswishKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ namespace at {
namespace native {
namespace xpu {

void hardswish_kernel(TensorIterator& iter);
TORCH_XPU_API void hardswish_kernel(TensorIterator& iter);

void hardswish_backward_kernel(TensorIterator& iter);
TORCH_XPU_API void hardswish_backward_kernel(TensorIterator& iter);

} // namespace xpu
} // namespace native
Expand Down
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include <ATen/native/xpu/sycl/Loops.h>
#include <comm/XPUMathCompat.h>

#include <ATen/native/xpu/sycl/ActivationHardtanhKernels.h>

namespace at {
namespace native {
namespace xpu {
Expand Down
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ namespace at {
namespace native {
namespace xpu {

void hardtanh_backward_kernel(
TORCH_XPU_API void hardtanh_backward_kernel(
TensorIterator& iter,
const Scalar& min,
const Scalar& max);
Expand Down
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/ActivationLeakyReluKernels.h>

namespace at::native::xpu {

template <typename scalar_t>
Expand Down
6 changes: 4 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@

namespace at::native::xpu {

void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_);
TORCH_XPU_API void leaky_relu_kernel(
TensorIteratorBase& iter,
const Scalar& negval_);

void leaky_relu_backward_kernel(
TORCH_XPU_API void leaky_relu_backward_kernel(
TensorIteratorBase& iter,
const Scalar& negval_);

Expand Down
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h>

namespace at::native::xpu {

template <typename scalar_t>
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

namespace at::native::xpu {

void log_sigmoid_forward_kernel(TensorIteratorBase& iter);
TORCH_XPU_API void log_sigmoid_forward_kernel(TensorIteratorBase& iter);

void log_sigmoid_backward_kernel(TensorIterator& iter);
TORCH_XPU_API void log_sigmoid_backward_kernel(TensorIterator& iter);

} // namespace at::native::xpu
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationMishKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include <ATen/native/xpu/sycl/Loops.h>
#include <comm/XPUMathCompat.h>

#include <ATen/native/xpu/sycl/ActivationMishKernels.h>

namespace at::native::xpu {

template <typename scalar_t>
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationMishKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

namespace at::native::xpu {

void mish_kernel(TensorIteratorBase& iter);
TORCH_XPU_API void mish_kernel(TensorIteratorBase& iter);

void mish_backward_kernel(TensorIterator& iter);
TORCH_XPU_API void mish_backward_kernel(TensorIterator& iter);

} // namespace at::native::xpu
4 changes: 3 additions & 1 deletion src/ATen/native/xpu/sycl/ActivationPreluKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/ActivationPreluKernels.h>

namespace at::native::xpu {

template <typename scalar_t>
Expand Down Expand Up @@ -40,4 +42,4 @@ void prelu_backward_kernel(TensorIterator& iter) {
});
}

} // namespace at::native::xpu
} // namespace at::native::xpu
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationPreluKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

namespace at::native::xpu {

void prelu_kernel(TensorIterator& iter);
TORCH_XPU_API void prelu_kernel(TensorIterator& iter);

void prelu_backward_kernel(TensorIterator& iter);
TORCH_XPU_API void prelu_backward_kernel(TensorIterator& iter);

} // namespace at::native::xpu
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include <ATen/native/xpu/sycl/Loops.h>
#include <comm/XPUMathCompat.h>

#include <ATen/native/xpu/sycl/ActivationSiluKernels.h>

namespace at::native::xpu {

template <typename scalar_t>
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationSiluKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

namespace at::native::xpu {

void silu_kernel(TensorIteratorBase& iter);
TORCH_XPU_API void silu_kernel(TensorIteratorBase& iter);

void silu_backward_kernel(TensorIteratorBase& iter);
TORCH_XPU_API void silu_backward_kernel(TensorIteratorBase& iter);

} // namespace at::native::xpu
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationSoftplusKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/ActivationSoftplusKernels.h>

namespace at::native::xpu {

template <typename scalar_t>
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@

namespace at::native::xpu {

void softplus_kernel(
TORCH_XPU_API void softplus_kernel(
TensorIteratorBase& iter,
const Scalar& beta_,
const Scalar& threshold_);

void softplus_backward_kernel(
TORCH_XPU_API void softplus_backward_kernel(
TensorIteratorBase& iter,
const Scalar& beta_,
const Scalar& threshold_);
Expand Down
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h>

namespace at::native::xpu {

template <typename scalar_t>
Expand Down
Loading

0 comments on commit 33cebd2

Please sign in to comment.