diff --git a/.lintrunner.toml b/.lintrunner.toml
index bd49e3721..9be45a043 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -56,8 +56,8 @@ code = 'CLANGFORMAT'
 include_patterns = [
     'src/aten/*.h',
     'src/aten/*.cpp',
-    'src/aten/sycl/*.h',
-    'src/aten/sycl/*.cpp',
+    'src/ATen/native/xpu/sycl/*.h',
+    'src/ATen/native/xpu/sycl/*.cpp',
     'aten/src/ATen/*.h',
     'aten/src/ATen/mps/**/*.mm',
     'aten/src/ATen/xpu/**/*.h',
diff --git a/cmake/BuildFlags.cmake b/cmake/BuildFlags.cmake
index f85598b07..0ee38df7f 100644
--- a/cmake/BuildFlags.cmake
+++ b/cmake/BuildFlags.cmake
@@ -47,6 +47,10 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
     list(APPEND SYCL_HOST_FLAGS -O0)
   endif(CMAKE_BUILD_TYPE MATCHES Debug)
 
+  if(USE_PER_OPERATOR_HEADERS)
+    list(APPEND SYCL_HOST_FLAGS -DAT_PER_OPERATOR_HEADERS)
+  endif()
+
   # -- Kernel flags (SYCL_KERNEL_OPTIONS)
   # The fast-math will be enabled by default in SYCL compiler.
   # Refer to [https://clang.llvm.org/docs/UsersManual.html#cmdoption-fno-fast-math]
diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake
index e579576ff..090f924ab 100644
--- a/cmake/Codegen.cmake
+++ b/cmake/Codegen.cmake
@@ -3,7 +3,7 @@ if(Codegen_GPU_cmake_included)
 endif()
 set(Codegen_GPU_cmake_included true)
 
-set(BUILD_TORCH_XPU_ATEN_GENERATED "${CMAKE_BINARY_DIR}/aten/src/ATen/xpu")
+set(BUILD_TORCH_XPU_ATEN_GENERATED "${CMAKE_BINARY_DIR}/xpu/ATen/")
 file(MAKE_DIRECTORY ${BUILD_TORCH_XPU_ATEN_GENERATED})
 
 set(RegisterXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU.cpp)
@@ -43,10 +43,64 @@ function(GEN_BACKEND file_yaml)
   )
 endfunction(GEN_BACKEND)
 
-GEN_BACKEND(
-  xpu_functions.yaml
-  XPUNativeFunctions.h
-  RegisterXPU.cpp)
+
+set(RegisterXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU.cpp)
+set(XPUFallback_PATH ${TORCH_XPU_OPS_ROOT}/src/ATen/native/xpu/XPUFallback.template)
+function(GEN_XPU file_yaml)
+  set(generated_files "")
+  foreach(f ${ARGN})
+    list(APPEND generated_files "${BUILD_TORCH_XPU_ATEN_GENERATED}/${f}")
+  endforeach()
+  file(GLOB_RECURSE depend_files ${TORCH_XPU_OPS_ROOT}/yaml/${file_yaml})
+  set(CODEGEN_TEMPLATE ${TORCH_XPU_OPS_ROOT}/yaml/)
+
+  # Codegen prepare process
+  if(WIN32)
+    string(REPLACE "/" "\\" LinkPATH "${CODEGEN_TEMPLATE}templates")
+    string(REPLACE "/" "\\" TargetPATH "${CMAKE_SOURCE_DIR}/aten/src/ATen/templates")
+    execute_process(COMMAND cmd /c mklink /D ${LinkPATH} ${TargetPATH})
+    string(REPLACE "/" "\\" RegisterXPU_PATH_BACKSLASH "${RegisterXPU_PATH}")
+    string(REPLACE "/" "\\" XPUFallback_PATH_BACKSLASH "${XPUFallback_PATH}")
+    set(REGISTER_FALLBACK_CMD ${FILE_DISPLAY_CMD} ${XPUFallback_PATH_BACKSLASH} ">>" ${RegisterXPU_PATH_BACKSLASH})
+  else()
+    execute_process(COMMAND ln -s ${CMAKE_SOURCE_DIR}/aten/src/ATen/templates ${CODEGEN_TEMPLATE}) # soft link to pytorch templates
+    set(REGISTER_FALLBACK_CMD ${FILE_DISPLAY_CMD} ${XPUFallback_PATH} ">>" ${RegisterXPU_PATH})
+  endif()
+
+  add_custom_command(
+    OUTPUT ${generated_files}
+    COMMAND
+    "${PYTHON_EXECUTABLE}" -m torchgen.gen
+    --source-path ${TORCH_XPU_OPS_ROOT}/yaml/
+    --install-dir ${BUILD_TORCH_XPU_ATEN_GENERATED}
+    --per-operator-headers
+    --static-dispatch-backend
+    --backend-whitelist=XPU
+    COMMAND
+    ${REGISTER_FALLBACK_CMD}
+    # Codegen post-process
+    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterXPU_PATH}
+    ${SIMPLE_TRACE}
+    WORKING_DIRECTORY ${TORCH_ROOT}
+    DEPENDS
+    ${depend_files}
+    ${TORCH_XPU_OPS_ROOT}/yaml/native/${file_yaml}
+    ${XPUFallback_PATH}
+  )
+endfunction(GEN_XPU)
+
+# GEN_BACKEND(
+#   xpu_functions.yaml
+#   XPUNativeFunctions.h
+#   RegisterXPU.cpp)
+
+GEN_XPU(
+  native_functions.yaml
+  XPUFunctions.h
+  RegisterXPU.cpp
+)
+
 list(APPEND xpu_generated_src ${RegisterXPU_PATH})
diff --git a/src/ATen/native/sparse/SparseTensor.cpp b/src/ATen/native/sparse/SparseTensor.cpp
index b842b9839..cd9a755fe 100644
--- a/src/ATen/native/sparse/SparseTensor.cpp
+++ b/src/ATen/native/sparse/SparseTensor.cpp
@@ -5,13 +5,9 @@
 #include
 #include
-#ifndef AT_PER_OPERATOR_HEADERS
-#include
-#include
-#else
 #include
 #include
-#endif
+#include
 
 namespace at::native::xpu {
diff --git a/src/ATen/native/xpu/Activation.cpp b/src/ATen/native/xpu/Activation.cpp
index 38aa44dc6..ce000752f 100644
--- a/src/ATen/native/xpu/Activation.cpp
+++ b/src/ATen/native/xpu/Activation.cpp
@@ -1,7 +1,15 @@
 #include
 #include
+#include
+#include
 #include
-#include
+
+#include
+
+#include
+#include
+#include
+#include
 #include
 #include
@@ -13,659 +21,57 @@
 #include
 #include
 #include
+
 #include
 #include
 #include
 
 namespace at {
-Tensor XPUNativeFunctions::relu(const Tensor& self) {
-  TORCH_CHECK(
-      self.scalar_type() != at::kBool, "Boolean inputs not supported for relu");
-  return at::clamp_min(self, 0);
-}
-
-Tensor& XPUNativeFunctions::relu_(Tensor& self) {
-  TORCH_CHECK(
-      self.scalar_type() != at::kBool, "Boolean inputs not supported for relu");
-  return at::clamp_min_(self, 0);
-}
-
-Tensor& XPUNativeFunctions::relu_out(const Tensor& self, Tensor& out) {
-  TORCH_CHECK(
-      self.scalar_type() != at::kBool, "Boolean inputs not supported for relu");
-  return at::clamp_min_out(out, self, 0);
-}
-
-TensorIterator threshold_meta(
-    const Tensor& self,
-    const Scalar& threshold,
-    const Scalar& value,
-    Tensor& out) {
-  TensorIterator iter;
-  iter.build(TensorIteratorConfig()
-                 .set_check_mem_overlap(
-                     false) // threshold is idempotent, so overlap is okay
-                 .add_output(out)
-                 .add_const_input(self)
-                 .add_const_input(self) // other
-                 .allow_cpu_scalars(true)
-                 .promote_inputs_to_common_dtype(true)
-                 .cast_common_dtype_to_outputs(true)
-                 .enforce_safe_casting_to_output(true));
-  return iter;
-}
-
-Tensor XPUNativeFunctions::threshold(
-    const Tensor& self,
-    const Scalar& threshold,
-    const Scalar& value) {
-  Tensor out;
-  auto iter = threshold_meta(self, threshold, value, out);
-  native::xpu::threshold_kernel(iter, threshold, value);
-  return iter.output();
-}
-
-Tensor& XPUNativeFunctions::threshold_(
-    Tensor& self,
-    const Scalar& threshold,
-    const Scalar& value) {
-  auto iter = threshold_meta(self, threshold, value, self);
-  native::xpu::threshold_kernel(iter, threshold, value);
-  return self;
-}
-
-Tensor& XPUNativeFunctions::threshold_out(
-    const Tensor& self,
-    const Scalar& threshold,
-    const Scalar& value,
-    Tensor& out) {
-  auto iter = threshold_meta(self, threshold, value, out);
-  native::xpu::threshold_kernel(iter, threshold, value);
-  return out;
-}
-
-TensorIterator threshold_backward_meta(
-    const Tensor& grad,
-    const Tensor& self,
-    const Scalar& threshold,
-    Tensor& gradInput) {
-  TensorIterator iter;
-  iter.build(TensorIteratorConfig()
-                 .set_check_mem_overlap(
-                     false) // threshold is idempotent, so overlap is okay
-                 .add_output(gradInput)
-                 .add_input(self)
-                 .add_input(grad) // other
-                 .allow_cpu_scalars(true)
-                 .promote_inputs_to_common_dtype(true)
-                 .cast_common_dtype_to_outputs(true)
-                 .enforce_safe_casting_to_output(true));
-  return iter;
-}
-
-Tensor XPUNativeFunctions::threshold_backward(
-    const Tensor& grad_output,
-    const Tensor& self,
-    const Scalar& threshold) {
-
Tensor grad_input; - auto iter = threshold_backward_meta(grad_output, self, threshold, grad_input); - native::xpu::threshold_kernel(iter, threshold, 0); - return iter.output(); -} - -Tensor& XPUNativeFunctions::threshold_backward_out( - const Tensor& grad_output, - const Tensor& self, - const Scalar& threshold, - Tensor& grad_input) { - auto iter = threshold_backward_meta(grad_output, self, threshold, grad_input); - native::xpu::threshold_kernel(iter, threshold, 0); - return grad_input; -} - -Tensor XPUNativeFunctions::gelu( - const Tensor& self, - c10::string_view approximate) { - Tensor out; - auto iter = TensorIterator::unary_op(out, self); - native::xpu::gelu_kernel(iter, approximate); - return iter.output(); -} - -Tensor& XPUNativeFunctions::gelu_(Tensor& self, c10::string_view approximate) { - auto iter = TensorIterator::unary_op(self, self); - native::xpu::gelu_kernel(iter, approximate); - return self; -} - -Tensor& XPUNativeFunctions::gelu_out( - const Tensor& self, - c10::string_view approximate, - Tensor& out) { - auto iter = TensorIterator::unary_op(out, self); - native::xpu::gelu_kernel(iter, approximate); - return out; -} - -Tensor XPUNativeFunctions::gelu_backward( - const Tensor& grad_output, - const Tensor& self, - c10::string_view approximate) { - Tensor grad_input; - auto iter = - TensorIterator::borrowing_binary_op(grad_input, grad_output, self); - native::xpu::gelu_backward_kernel(iter, approximate); - return iter.output(); -} - -Tensor& XPUNativeFunctions::gelu_backward_out( - const Tensor& grad_output, - const Tensor& self, - c10::string_view approximate, - Tensor& grad_input) { - auto iter = - TensorIterator::borrowing_binary_op(grad_input, grad_output, self); - native::xpu::gelu_backward_kernel(iter, approximate); - return grad_input; -} - -TensorIterator elu_meta( - const Tensor& self, - const Scalar& alpha, - const Scalar& scale, - const Scalar& input_scale, - Tensor& out) { - TensorIterator iter; - iter = TensorIterator::unary_op(out, self); - return iter; -} - -Tensor& XPUNativeFunctions::elu_out( - const Tensor& self, - const Scalar& alpha, - const Scalar& scale, - const Scalar& input_scale, - Tensor& out) { - auto iter = elu_meta(self, alpha, scale, input_scale, out); - native::xpu::elu_kernel(iter, alpha, scale, input_scale); - return out; -} - -Tensor XPUNativeFunctions::elu( - const Tensor& self, - const Scalar& alpha, - const Scalar& scale, - const Scalar& input_scale) { - Tensor out; - auto iter = elu_meta(self, alpha, scale, input_scale, out); - native::xpu::elu_kernel(iter, alpha, scale, input_scale); - return iter.output(); -} - -Tensor& XPUNativeFunctions::elu_( - Tensor& self, - const Scalar& alpha, - const Scalar& scale, - const Scalar& input_scale) { - auto iter = elu_meta(self, alpha, scale, input_scale, self); - native::xpu::elu_kernel(iter, alpha, scale, input_scale); - return self; -} - -TensorIterator elu_backward_meta( - const Tensor& grad_output, - const Scalar& alpha, - const Scalar& scale, - const Scalar& input_scale, - bool is_result, - const Tensor& self_or_result, - Tensor& grad_input) { - TORCH_CHECK( - !is_result || alpha.to() >= 0.0, - "In-place elu backward calculation is triggered with a negative slope which is not supported. 
" - "This is caused by calling in-place forward function with a negative slope, " - "please call out-of-place version instead."); - - TensorIterator iter; - iter = TensorIterator::borrowing_binary_op( - grad_input, grad_output, self_or_result); - return iter; -} - -Tensor& XPUNativeFunctions::elu_backward_out( - const Tensor& grad_output, - const Scalar& alpha, - const Scalar& scale, - const Scalar& input_scale, - bool is_result, - const Tensor& self_or_result, - Tensor& grad_input) { - auto iter = elu_backward_meta( - grad_output, - alpha, - scale, - input_scale, - is_result, - self_or_result, - grad_input); - native::xpu::elu_backward_kernel(iter, alpha, scale, input_scale, is_result); - return grad_input; -} - -Tensor XPUNativeFunctions::elu_backward( - const Tensor& grad_output, - const Scalar& alpha, - const Scalar& scale, - const Scalar& input_scale, - bool is_result, - const Tensor& self_or_result) { - Tensor grad_input; - auto iter = elu_backward_meta( - grad_output, - alpha, - scale, - input_scale, - is_result, - self_or_result, - grad_input); - native::xpu::elu_backward_kernel(iter, alpha, scale, input_scale, is_result); - return iter.output(); -} - -Tensor XPUNativeFunctions::silu(const Tensor& self) { - Tensor out; - auto iter = TensorIterator::unary_op(out, self); - native::xpu::silu_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::silu_(Tensor& self) { - auto iter = TensorIterator::unary_op(self, self); - native::xpu::silu_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::silu_out(const Tensor& self, Tensor& out) { - auto iter = TensorIterator::unary_op(out, self); - native::xpu::silu_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::silu_backward( - const Tensor& grad_output, - const Tensor& self) { - Tensor grad_input; - auto iter = - TensorIterator::borrowing_binary_op(grad_input, grad_output, self); - native::xpu::silu_backward_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::silu_backward_out( - const Tensor& grad_output, - const Tensor& self, - Tensor& grad_input) { - auto iter = - TensorIterator::borrowing_binary_op(grad_input, grad_output, self); - native::xpu::silu_backward_kernel(iter); - return grad_input; -} - -Tensor XPUNativeFunctions::hardtanh( - const Tensor& self, - const Scalar& min, - const Scalar& max) { - Tensor result = at::empty_like(self); - return at::hardtanh_out(result, self, min, max); -} - -Tensor& XPUNativeFunctions::hardtanh_out( - const Tensor& self, - const Scalar& min, - const Scalar& max, - Tensor& result) { - TORCH_CHECK( - self.scalar_type() != at::kBool, - "Boolean inputs not supported for hardtanh"); - Scalar min_, max_; - if (at::isIntegralType(self.scalar_type(), /*include_bool*/ false)) { - int64_t minval = min.toLong(); - int64_t maxval = max.toLong(); - TORCH_CHECK( - self.dtype() != at::kByte || (minval >= 0 && maxval >= 0), - "cannot do hardtanh on an unsigned type with negative limits"); - min_ = minval; - max_ = maxval; - } else { - min_ = min; - max_ = max; - } - return at::clamp_out(result, self, min_, max_); -} - -Tensor& XPUNativeFunctions::hardtanh_( - Tensor& self, - const Scalar& min, - const Scalar& max) { - return at::hardtanh_out(self, self, min, max); -} - -Tensor& XPUNativeFunctions::hardtanh_backward_out( - const Tensor& grad_output, - const Tensor& self, - const Scalar& min, - const Scalar& max, - Tensor& grad_input) { - auto iter = - TensorIterator::borrowing_binary_op(grad_input, grad_output, self); - native::xpu::hardtanh_backward_kernel(iter, 
min, max); - return grad_input; -} - -Tensor XPUNativeFunctions::hardtanh_backward( - const Tensor& grad_output, - const Tensor& self, - const Scalar& min, - const Scalar& max) { - Tensor result; - auto iter = TensorIterator::borrowing_binary_op(result, grad_output, self); - native::xpu::hardtanh_backward_kernel(iter, min, max); - return iter.output(); -} - -Tensor XPUNativeFunctions::hardswish(const Tensor& self) { - Tensor result; - auto iter = TensorIterator::unary_op(result, self); - native::xpu::hardswish_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::hardswish_out(const Tensor& self, Tensor& result) { - auto iter = TensorIterator::unary_op(result, self); - native::xpu::hardswish_kernel(iter); - return result; -} - -Tensor& XPUNativeFunctions::hardswish_(Tensor& self) { - auto iter = TensorIterator::unary_op(self, self); - native::xpu::hardswish_kernel(iter); - return self; -} - -Tensor XPUNativeFunctions::hardswish_backward( - const Tensor& grad_output, - const Tensor& self) { - Tensor grad_input; - auto iter = - TensorIterator::borrowing_binary_op(grad_input, grad_output, self); - native::xpu::hardswish_backward_kernel(iter); - return iter.output(); -} - -Tensor XPUNativeFunctions::hardsigmoid(const Tensor& self) { - Tensor out; - auto iter = TensorIterator::unary_op(out, self); - native::xpu::hardsigmoid_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::hardsigmoid_(Tensor& self) { - auto iter = TensorIterator::unary_op(self, self); - native::xpu::hardsigmoid_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::hardsigmoid_out(const Tensor& self, Tensor& out) { - auto iter = TensorIterator::unary_op(out, self); - native::xpu::hardsigmoid_kernel(iter); - return out; -} - -TensorIterator hardsigmoid_backward_meta( - const Tensor& grad_output, - const Tensor& self, - Tensor& grad_input) { - auto iter = - TensorIterator::borrowing_binary_op(grad_input, grad_output, self); - return iter; -} - -Tensor XPUNativeFunctions::hardsigmoid_backward( - const Tensor& grad_output, - const Tensor& self) { - Tensor grad_input; - auto iter = hardsigmoid_backward_meta(grad_output, self, grad_input); - native::xpu::hardsigmoid_backward_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::hardsigmoid_backward_out( - const Tensor& grad_output, - const Tensor& self, - Tensor& grad_input) { - auto iter = hardsigmoid_backward_meta(grad_output, self, grad_input); - native::xpu::hardsigmoid_backward_kernel(iter); - return grad_input; -} - -Tensor XPUNativeFunctions::leaky_relu( - const Tensor& self, - const Scalar& negval) { - Tensor out; - auto iter = TensorIterator::unary_op(out, self); - native::xpu::leaky_relu_kernel(iter, negval); - return iter.output(); -} - -Tensor& XPUNativeFunctions::leaky_relu_(Tensor& self, const Scalar& negval) { - auto iter = TensorIterator::unary_op(self, self); - native::xpu::leaky_relu_kernel(iter, negval); - return self; -} - -Tensor& XPUNativeFunctions::leaky_relu_out( - const Tensor& self, - const Scalar& negval, - Tensor& out) { - auto iter = TensorIterator::unary_op(out, self); - native::xpu::leaky_relu_kernel(iter, negval); - return out; -} - -TensorIterator leaky_relu_backward_meta( - const Tensor& grad_output, - const Tensor& self, - const Scalar& negval, - bool is_result, - const Tensor& grad_input) { - TORCH_CHECK( - !is_result || negval.to() >= 0.0, - "In-place leakyReLu backward calculation is triggered with a negative slope which is not supported. 
" - "This is caused by calling in-place forward function with a negative slope, " - "please call out-of-place version instead. File an issue at https://github.com/pytorch/pytorch if you do " - "require supporting in-place leakRelu backward calculation with negative slope"); - - return TensorIterator::borrowing_binary_op(grad_input, self, grad_output); -} - -Tensor XPUNativeFunctions::leaky_relu_backward( - const Tensor& grad_output, - const Tensor& self, - const Scalar& negval, - bool is_result) { - Tensor grad_input; - auto iter = leaky_relu_backward_meta( - grad_output, self, negval, is_result, grad_input); - native::xpu::leaky_relu_backward_kernel(iter, negval); - return iter.output(); -} - -Tensor& XPUNativeFunctions::leaky_relu_backward_out( - const Tensor& grad_output, - const Tensor& self, - const Scalar& negval, - bool is_result, - Tensor& grad_input) { - auto iter = leaky_relu_backward_meta( - grad_output, self, negval, is_result, grad_input); - native::xpu::leaky_relu_backward_kernel(iter, negval); - return grad_input; -} - -TensorIterator softplus_meta( - const Tensor& self, - const Scalar& beta, - const Scalar& threshold, - Tensor& out) { - return TensorIterator::unary_op(out, self); -} - -Tensor XPUNativeFunctions::softplus( - const Tensor& self, - const Scalar& beta, - const Scalar& threshold) { - Tensor out; - auto iter = softplus_meta(self, beta, threshold, out); - native::xpu::softplus_kernel(iter, beta, threshold); - return iter.output(); -} - -Tensor& XPUNativeFunctions::softplus_out( - const Tensor& self, - const Scalar& beta, - const Scalar& threshold, - Tensor& out) { - auto iter = softplus_meta(self, beta, threshold, out); - native::xpu::softplus_kernel(iter, beta, threshold); - return out; -} - -TensorIterator softplus_backward_meta( - const Tensor& grad_output, - const Tensor& self, - const Scalar& beta, - const Scalar& threshold, - Tensor& grad_input) { - return TensorIterator::borrowing_binary_op(grad_input, grad_output, self); -} - -Tensor XPUNativeFunctions::softplus_backward( - const Tensor& grad_output, - const Tensor& self, - const Scalar& beta, - const Scalar& threshold) { - Tensor grad_input; - auto iter = - softplus_backward_meta(grad_output, self, beta, threshold, grad_input); - native::xpu::softplus_backward_kernel(iter, beta, threshold); - return iter.output(); -} - -Tensor& XPUNativeFunctions::softplus_backward_out( - const Tensor& grad_output, - const Tensor& self, - const Scalar& beta, - const Scalar& threshold, - Tensor& grad_input) { - auto iter = - softplus_backward_meta(grad_output, self, beta, threshold, grad_input); - native::xpu::softplus_backward_kernel(iter, beta, threshold); - return grad_input; -} - -static inline void softshrink_check(const Scalar& lambd) { - double lamb = lambd.to(); - TORCH_CHECK( - lamb >= 0, - "lambda must be greater or equal to 0, but found to be ", - lamb, - "."); -} - -TensorIterator softshrink_meta( - const Tensor& self, - const Scalar& lambd, - Tensor& out) { - softshrink_check(lambd); - return TensorIterator::unary_op(out, self); -} - -Tensor XPUNativeFunctions::softshrink(const Tensor& self, const Scalar& lambd) { - Tensor out; - auto iter = softshrink_meta(self, lambd, out); - native::xpu::softshrink_kernel(iter, lambd); - return iter.output(); -} - -Tensor& XPUNativeFunctions::softshrink_out( - const Tensor& self, - const Scalar& lambd, - Tensor& out) { - auto iter = softshrink_meta(self, lambd, out); - native::xpu::softshrink_kernel(iter, lambd); - return out; -} - -TensorIterator softshrink_backward_meta( 
- const Tensor& grad_output, - const Tensor& self, - const Scalar& lambd, - Tensor& grad_input) { - return TensorIterator::borrowing_binary_op(grad_input, grad_output, self); -} - -Tensor XPUNativeFunctions::softshrink_backward( - const Tensor& grad_output, - const Tensor& self, - const Scalar& lambd) { - Tensor grad_input; - auto iter = softshrink_backward_meta(grad_output, self, lambd, grad_input); - native::xpu::softshrink_backward_kernel(iter, lambd); - return iter.output(); -} - -Tensor& XPUNativeFunctions::softshrink_backward_out( - const Tensor& grad_output, - const Tensor& self, - const Scalar& lambd, - Tensor& grad_input) { - auto iter = softshrink_backward_meta(grad_output, self, lambd, grad_input); - native::xpu::softshrink_backward_kernel(iter, lambd); - return grad_input; -} - -Tensor XPUNativeFunctions::_prelu_kernel( - const Tensor& self, - const Tensor& weight) { - // Weight broadcasts over self and they have the same dtype - auto result = at::empty_like(self); - auto iter = TensorIteratorConfig() - .add_output(result) - .add_const_input(self) - .add_const_input(weight) - .build(); - native::xpu::prelu_kernel(iter); - return result; -} - -std::tuple XPUNativeFunctions::_prelu_kernel_backward( - const Tensor& grad_out, - const Tensor& self, - const Tensor& weight) { - Tensor grad_self = at::empty({0}, self.options()); - Tensor grad_weight = at::empty({0}, weight.options()); - auto iter = TensorIteratorConfig() - .add_output(grad_self) - .add_output(grad_weight) - .add_const_input(self) - .add_const_input(weight) - .add_const_input(grad_out) - .build(); - native::xpu::prelu_backward_kernel(iter); - return {grad_self, grad_weight}; -} - -std::tuple XPUNativeFunctions::log_sigmoid_forward_out( +namespace native { +REGISTER_XPU_DISPATCH(threshold_stub, &xpu::threshold_kernel); +REGISTER_XPU_DISPATCH(elu_stub, &xpu::elu_kernel); +REGISTER_XPU_DISPATCH(elu_backward_stub, &xpu::elu_backward_kernel); +REGISTER_XPU_DISPATCH(silu_stub, &xpu::silu_kernel); +REGISTER_XPU_DISPATCH(silu_backward_stub, &xpu::silu_backward_kernel); +REGISTER_XPU_DISPATCH(hardswish_stub, &xpu::hardswish_kernel); +REGISTER_XPU_DISPATCH(hardswish_backward_stub, &xpu::hardswish_backward_kernel); +REGISTER_XPU_DISPATCH(hardtanh_backward_stub, &xpu::hardtanh_backward_kernel); +REGISTER_XPU_DISPATCH(hardsigmoid_stub, &xpu::hardsigmoid_kernel); +REGISTER_XPU_DISPATCH( + hardsigmoid_backward_stub, + &xpu::hardsigmoid_backward_kernel); +REGISTER_XPU_DISPATCH(leaky_relu_stub, &xpu::leaky_relu_kernel); +REGISTER_XPU_DISPATCH( + leaky_relu_backward_stub, + &xpu::leaky_relu_backward_kernel); +REGISTER_XPU_DISPATCH(softplus_stub, &xpu::softplus_kernel); +REGISTER_XPU_DISPATCH(softplus_backward_stub, &xpu::softplus_backward_kernel); +REGISTER_XPU_DISPATCH(softshrink_stub, &xpu::softshrink_kernel); +REGISTER_XPU_DISPATCH(shrink_backward_stub, &xpu::softshrink_backward_kernel); +REGISTER_XPU_DISPATCH(mish_stub, &xpu::mish_kernel); +REGISTER_XPU_DISPATCH(mish_backward_stub, &xpu::mish_backward_kernel); +REGISTER_XPU_DISPATCH( + log_sigmoid_backward_stub, + &xpu::log_sigmoid_backward_kernel); +REGISTER_XPU_DISPATCH(prelu_stub, &xpu::prelu_kernel); +REGISTER_XPU_DISPATCH(prelu_backward_stub, &xpu::prelu_backward_kernel); + +TORCH_IMPL_FUNC(gelu_backward_out_xpu) +(const Tensor& /*grad*/, + const Tensor& /*self*/, + c10::string_view approximate, + const Tensor& /*grad_input*/ +) { + xpu::gelu_backward_kernel(*this, approximate); +} + +TORCH_IMPL_FUNC(gelu_out_xpu) +(const Tensor& /*self*/, c10::string_view approximate, const 
Tensor& /*result*/ +) { + xpu::gelu_kernel(*this, approximate); +} + +std::tuple log_sigmoid_forward_out_xpu( const Tensor& input, Tensor& result, Tensor& buffer) { @@ -675,72 +81,41 @@ std::tuple XPUNativeFunctions::log_sigmoid_forward_out( return std::forward_as_tuple(result, buffer); } -std::tuple XPUNativeFunctions::log_sigmoid_forward( - const Tensor& input) { +std::tuple log_sigmoid_forward_xpu(const Tensor& input) { auto result = at::empty_like(input); auto buffer = at::empty({0}, input.options()); - log_sigmoid_forward_out(input, result, buffer); + log_sigmoid_forward_out_xpu(input, result, buffer); return std::forward_as_tuple(result, buffer); } -TensorIterator log_sigmoid_backward_meta( +Tensor& log_sigmoid_backward_xpu_out( const Tensor& grad_output, const Tensor& input, - const Tensor& grad_input) { - TensorIterator iter; - iter.build(TensorIteratorConfig() - .add_output(grad_input) - .add_const_input(input) - .add_const_input(grad_output)); - return iter; + const Tensor& buffer, + Tensor& grad_input) { + auto iter = TensorIteratorConfig() + .add_output(grad_input) + .add_const_input(input) + .add_const_input(grad_output) + .build(); + log_sigmoid_backward_stub(kXPU, iter); + return grad_input; } -Tensor XPUNativeFunctions::log_sigmoid_backward( +Tensor log_sigmoid_backward_xpu( const Tensor& grad_output, const Tensor& input, const Tensor& buffer) { auto grad_input = at::empty_like(grad_output); - auto iter = log_sigmoid_backward_meta(grad_output, input, grad_input); - native::xpu::log_sigmoid_backward_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::log_sigmoid_backward_out( - const Tensor& grad_output, - const Tensor& input, - const Tensor& buffer, - Tensor& grad_input) { - auto iter = log_sigmoid_backward_meta(grad_output, input, grad_input); - native::xpu::log_sigmoid_backward_kernel(iter); - return grad_input; -} - -Tensor XPUNativeFunctions::mish(const Tensor& self) { - Tensor out; - auto iter = TensorIterator::unary_op(out, self); - native::xpu::mish_kernel(iter); + // NOTE: buffer is only used by CPU dispatch, we just ignore it here + auto iter = at::TensorIteratorConfig() + .add_output(grad_input) + .add_const_input(input) + .add_const_input(grad_output) + .build(); + log_sigmoid_backward_stub(kXPU, iter); return iter.output(); } -Tensor& XPUNativeFunctions::mish_out(const Tensor& self, Tensor& out) { - auto iter = TensorIterator::unary_op(out, self); - native::xpu::mish_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::mish_(Tensor& self) { - auto iter = TensorIterator::unary_op(self, self); - native::xpu::mish_kernel(iter); - return self; -} - -Tensor XPUNativeFunctions::mish_backward( - const Tensor& grad_output, - const Tensor& input) { - Tensor grad_input = at::empty({0}, input.options()); - auto iter = TensorIterator::binary_op(grad_input, grad_output, input); - native::xpu::mish_backward_kernel(iter); - return grad_input; -} - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp index 44ca61805..00aba1011 100644 --- a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp +++ b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp @@ -1,15 +1,14 @@ -#include + #include #include #include -#include -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else -#include -#endif +#include + +#include +#include +#include +#include #include @@ -89,7 +88,8 @@ Tensor mean_backward( } } // namespace -Tensor 
XPUNativeFunctions::_adaptive_avg_pool2d_backward( +namespace native { +Tensor adaptive_avg_pool2d_backward_xpu( const Tensor& grad_output, const Tensor& input) { TensorArg grad_output_arg{grad_output, "grad_output", 1}, @@ -127,7 +127,7 @@ Tensor XPUNativeFunctions::_adaptive_avg_pool2d_backward( return grad_input; } -Tensor& XPUNativeFunctions::adaptive_avg_pool2d_out( +Tensor& adaptive_avg_pool2d_out_xpu( const Tensor& input, IntArrayRef output_size, Tensor& output) { @@ -166,17 +166,18 @@ Tensor& XPUNativeFunctions::adaptive_avg_pool2d_out( output.as_strided__symint({n, c, 1, 1}, {c, 1, c, c}); } } else { - native::xpu::adaptive_avg_pool2d_kernel(output, input, output_size); + xpu::adaptive_avg_pool2d_kernel(output, input, output_size); } return output; } -Tensor XPUNativeFunctions::_adaptive_avg_pool2d( +Tensor adaptive_avg_pool2d_xpu( at::Tensor const& input, IntArrayRef output_size) { auto output = at::empty({0}, input.options()); - adaptive_avg_pool2d_out(input, output_size, output); + adaptive_avg_pool2d_out_xpu(input, output_size, output); return output; } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp b/src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp index 55c6a2964..6098072ac 100644 --- a/src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp +++ b/src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp @@ -1,214 +1,50 @@ #include #include -#include #include #include -namespace at { - -void adaptive_max_pool2d_meta( - const Tensor& input, - IntArrayRef output_size, - Tensor& output, - Tensor& indices) { - int ndim = input.ndimension(); - TORCH_CHECK( - ndim == 3 || ndim == 4, - "adaptive_max_pool2d(): Expected 3D or 4D tensor, but got: ", - input.sizes()); - for (const auto i : c10::irange(1, ndim)) { - TORCH_CHECK( - input.size(i) > 0, - "adaptive_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, " - "but input has sizes ", - input.sizes(), - " with dimension ", - i, - " being empty"); - } - - TORCH_CHECK( - output_size.size() == 2, - "adaptive_max_pool2d(): internal error: output_size.size() must be 2"); - - int dimH = 1; - int64_t sizeB = 1; - int64_t sizeD = 0; - - if (input.ndimension() == 4) { - sizeB = input.size(0); - dimH++; - } - - sizeD = input.size(dimH - 1); - - int64_t osizeH = output_size[0]; - int64_t osizeW = output_size[1]; +#include +#include - /* resize output */ - if (input.ndimension() == 3) { - if (output.defined()) { - at::xpu::resize_out(output, {sizeD, osizeH, osizeW}, {}, input.options()); - } else { - output = - at::xpu::create_out({sizeD, osizeH, osizeW}, {}, input.options()); - } - if (indices.defined()) { - at::xpu::resize_out( - indices, {sizeD, osizeH, osizeW}, {}, input.options()); - } else { - indices = at::xpu::create_out( - {sizeD, osizeH, osizeW}, {}, input.options().dtype(kLong)); - } - } else { - if (output.defined()) { - at::xpu::resize_out( - output, - {sizeB, sizeD, osizeH, osizeW}, - {}, - input.options().memory_format(input.suggest_memory_format())); - } else { - output = at::xpu::create_out( - {sizeB, sizeD, osizeH, osizeW}, - {}, - input.options().memory_format(input.suggest_memory_format())); - } - if (indices.defined()) { - at::xpu::resize_out( - indices, - {sizeB, sizeD, osizeH, osizeW}, - {}, - input.options() - .memory_format(input.suggest_memory_format()) - .dtype(kLong)); - } else { - indices = at::xpu::create_out( - {sizeB, sizeD, osizeH, osizeW}, - {}, - input.options() - .memory_format(input.suggest_memory_format()) - .dtype(kLong)); - } - } -} - -std::tuple 
XPUNativeFunctions::adaptive_max_pool2d( - const Tensor& input, - IntArrayRef output_size) { - TensorArg input_arg{input, "input", 1}; - checkAllSameGPU(__func__, {input_arg}); - - Tensor output, indices; - adaptive_max_pool2d_meta(input, output_size, output, indices); - - if (input.numel() == 0) { - return {output, indices}; - } - - native::xpu::adaptive_max_pool2d_kernel(input, output_size, output, indices); - return {output, indices}; -} +namespace at { +namespace native { -std::tuple XPUNativeFunctions::adaptive_max_pool2d_out( - const Tensor& input, - IntArrayRef output_size, - Tensor& output, - Tensor& indices) { +TORCH_IMPL_FUNC(adaptive_max_pool2d_out_xpu) +(const Tensor& input, + IntArrayRef output_size, + const Tensor& output, + const Tensor& indices) { TensorArg output_arg{output, "output", 1}; TensorArg indices_arg{indices, "indices", 2}; TensorArg input_arg{input, "input", 3}; checkAllSameGPU(__func__, {output_arg, indices_arg, input_arg}); - - adaptive_max_pool2d_meta(input, output_size, output, indices); - if (input.numel() == 0) { - return {output, indices}; - } - - native::xpu::adaptive_max_pool2d_kernel(input, output_size, output, indices); - return {output, indices}; -} - -void adaptive_max_pool2d_backward_meta( - const Tensor& grad_output, - const Tensor& input, - const Tensor& indices, - Tensor& grad_input) { - int64_t ndim = grad_output.ndimension(); - TORCH_CHECK( - ndim == 3 || ndim == 4, - "adaptive_max_pooling2d_backward(): Expected 3D or 4D grad_output, but got: ", - grad_output.sizes()); - - at::native::adaptive_pool_empty_output_check( - grad_output, "adaptive_max_pool2d_backward"); - - TORCH_CHECK( - input.dtype() == grad_output.dtype(), - "expected dtype ", - input.dtype(), - " for `grad_output` but got dtype ", - grad_output.dtype()); - - if (grad_input.defined()) { - at::xpu::resize_out( - grad_input, - input.sizes(), - {}, - input.options().memory_format(input.suggest_memory_format())); - } else { - grad_input = at::xpu::create_out( - input.sizes(), - {}, - input.options().memory_format(input.suggest_memory_format())); + return; } -} - -Tensor XPUNativeFunctions::adaptive_max_pool2d_backward( - const Tensor& grad_output, - const Tensor& input, - const Tensor& indices) { - TensorArg grad_output_arg{grad_output, "grad_output", 1}; - TensorArg input_arg{input, "input", 2}; - TensorArg indices_arg{indices, "indices", 3}; - - checkAllSameGPU(__func__, {grad_output_arg, input_arg, indices_arg}); - - Tensor grad_input; - adaptive_max_pool2d_backward_meta(grad_output, input, indices, grad_input); - if (grad_output.numel() == 0) { - return grad_input; - } - - native::xpu::adaptive_max_pool2d_backward_kernel( - grad_output, input, indices, grad_input); - return grad_input; + xpu::adaptive_max_pool2d_kernel(input, output_size, output, indices); } -Tensor& XPUNativeFunctions::adaptive_max_pool2d_backward_out( - const Tensor& grad_output, - const Tensor& input, - const Tensor& indices, - Tensor& grad_input) { - TensorArg grad_input_arg{grad_input, "grad_input", 1}; - TensorArg grad_output_arg{grad_output, "grad_output", 2}; +TORCH_IMPL_FUNC(adaptive_max_pool2d_backward_out_xpu) +(const Tensor& gradOutput, + const Tensor& input, + const Tensor& indices, + const Tensor& gradInput) { + TensorArg grad_input_arg{gradInput, "grad_input", 1}; + TensorArg grad_output_arg{gradOutput, "grad_output", 2}; TensorArg input_arg{input, "input", 3}; TensorArg indices_arg{indices, "indices", 4}; checkAllSameGPU( __func__, {grad_input_arg, grad_output_arg, input_arg, indices_arg}); - 
adaptive_max_pool2d_backward_meta(grad_output, input, indices, grad_input); - - if (grad_output.numel() == 0) { - return grad_input; + if (gradOutput.numel() == 0) { + return; } - - native::xpu::adaptive_max_pool2d_backward_kernel( - grad_output, input, indices, grad_input); - return grad_input; + xpu::adaptive_max_pool2d_backward_kernel( + gradOutput, input, indices, gradInput); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/AmpKernels.cpp b/src/ATen/native/xpu/AmpKernels.cpp index 5ff9705d0..32216d354 100644 --- a/src/ATen/native/xpu/AmpKernels.cpp +++ b/src/ATen/native/xpu/AmpKernels.cpp @@ -1,12 +1,11 @@ -#include #include -#include +#include #include namespace at { - -void XPUNativeFunctions::_amp_foreach_non_finite_check_and_unscale_( +namespace native { +void _amp_foreach_non_finite_check_and_unscale_xpu_( TensorList scaled_grads, Tensor& found_inf, const Tensor& inv_scale) { @@ -79,7 +78,7 @@ void XPUNativeFunctions::_amp_foreach_non_finite_check_and_unscale_( tensor_lists, found_inf, inv_scale); } -Tensor& XPUNativeFunctions::_amp_update_scale_( +Tensor& _amp_update_scale_xpu_( Tensor& current_scale, Tensor& growth_tracker, const Tensor& found_inf, @@ -115,5 +114,5 @@ Tensor& XPUNativeFunctions::_amp_update_scale_( return current_scale; } - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/AveragePool2d.cpp b/src/ATen/native/xpu/AveragePool2d.cpp index 4d3cc1c0e..326ad8a51 100644 --- a/src/ATen/native/xpu/AveragePool2d.cpp +++ b/src/ATen/native/xpu/AveragePool2d.cpp @@ -1,314 +1,63 @@ -#include #include #include -#include +#include #include #include -namespace at { -using namespace at::native; -using namespace at::native::xpu; - -Tensor& avg_pool2d_meta( - const Tensor& input, - Tensor& output, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - bool ceil_mode, - bool count_include_pad, - std::optional divisor_override) { - TORCH_CHECK( - kernel_size.size() == 1 || kernel_size.size() == 2, - "avg_pool2d: kernel_size must either be a single int, or a tuple " - "of two ints"); - const int64_t kH = kernel_size[0]; - const int64_t kW = kernel_size.size() == 1 ? kH : kernel_size[1]; - - TORCH_CHECK( - stride.empty() || stride.size() == 1 || stride.size() == 2, - "avg_pool2d: stride must either be omitted, a single int, or a " - "tuple of two ints"); - const int64_t dH = stride.empty() ? kH : stride[0]; - const int64_t dW = stride.empty() ? kW : stride.size() == 1 ? dH : stride[1]; - - TORCH_CHECK( - padding.size() == 1 || padding.size() == 2, - "avg_pool2d: padding must either be a single int, or a tuple of " - "two ints"); - const int64_t padH = padding[0]; - const int64_t padW = padding.size() == 1 ? padH : padding[1]; - - TORCH_CHECK( - !divisor_override.has_value() || divisor_override.value() != 0, - "divisor must be not zero"); - - const int64_t nbatch = input.ndimension() == 4 ? 
input.size(-4) : 1; - const int64_t nInputPlane = input.size(-3); - const int64_t inputHeight = input.size(-2); - const int64_t inputWidth = input.size(-1); - - const int64_t outputHeight = - pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode); - const int64_t outputWidth = - pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode); - - auto memory_format = input.suggest_memory_format(); - pool2d_shape_check( - input, - kH, - kW, - dH, - dW, - padH, - padW, - 1, - 1, - nInputPlane, - inputHeight, - inputWidth, - outputHeight, - outputWidth, - memory_format); - - /* resize output */ - if (input.ndimension() == 3) { - if (output.defined()) { - at::xpu::resize_out( - output, - {nInputPlane, outputHeight, outputWidth}, - {}, - input.options()); - } else { - output = at::xpu::create_out( - {nInputPlane, outputHeight, outputWidth}, {}, input.options()); - } - } else { - if (output.defined()) { - at::xpu::resize_out( - output, - {nbatch, nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format)); - } else { - output = at::xpu::create_out( - {nbatch, nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format)); - } - } - - return output; -} - -Tensor& avg_pool2d_backward_meta( - const Tensor& gradOutput_, - Tensor& grad_input, - const Tensor& input, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - bool ceil_mode, - bool count_include_pad, - std::optional divisor_override) { - TORCH_CHECK( - kernel_size.size() == 1 || kernel_size.size() == 2, - "avg_pool2d: kernel_size must either be a single int, or a tuple " - "of two ints"); - const int kH = safe_downcast(kernel_size[0]); - const int kW = kernel_size.size() == 1 - ? kH - : safe_downcast(kernel_size[1]); - - TORCH_CHECK( - stride.empty() || stride.size() == 1 || stride.size() == 2, - "avg_pool2d: stride must either be omitted, a single int, or a " - "tuple of two ints"); - const int dH = stride.empty() ? kH : safe_downcast(stride[0]); - const int dW = stride.empty() ? kW - : stride.size() == 1 ? dH - : safe_downcast(stride[1]); - - TORCH_CHECK( - padding.size() == 1 || padding.size() == 2, - "avg_pool2d: padding must either be a single int, or a tuple of " - "two ints"); - const int padH = safe_downcast(padding[0]); - const int padW = - padding.size() == 1 ? padH : safe_downcast(padding[1]); - - TORCH_CHECK( - !divisor_override.has_value() || divisor_override.value() != 0, - "divisor must be not zero"); - - /* sizes */ - const int64_t nbatch = input.ndimension() == 4 ? 
input.size(-4) : 1; - const int64_t nInputPlane = input.size(-3); // number of channels (or colors) - const int64_t inputHeight = input.size(-2); - const int64_t inputWidth = input.size(-1); - const int64_t outputWidth = - pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode); - const int64_t outputHeight = - pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode); - - auto memory_format = input.suggest_memory_format(); - avg_pool2d_backward_shape_check( - input, - gradOutput_, - nbatch, - kH, - kW, - dH, - dW, - padH, - padW, - nInputPlane, - inputHeight, - inputWidth, - outputHeight, - outputWidth, - memory_format); - - if (grad_input.defined()) { - at::xpu::resize_out( - grad_input, - input.sizes(), - {}, - input.options().memory_format(memory_format)); - } else { - grad_input = at::xpu::create_out( - input.sizes(), {}, input.options().memory_format(memory_format)); - } - return grad_input; -} - -Tensor XPUNativeFunctions::avg_pool2d( - const Tensor& input, - at::IntArrayRef kernel_size, - at::IntArrayRef stride, - at::IntArrayRef padding, - bool ceil_mode, - bool count_include_pad, - c10::optional divisor_override) { - Tensor output; - output = avg_pool2d_meta( - input, - output, - kernel_size, - stride, - padding, - ceil_mode, - count_include_pad, - divisor_override); - - at::native::xpu::avg_pool2d_kernel( - input, - kernel_size, - stride, - padding, - ceil_mode, - count_include_pad, - divisor_override, - output); - return output; -} - -Tensor& XPUNativeFunctions::avg_pool2d_out( - const Tensor& input, - at::IntArrayRef kernel_size, - at::IntArrayRef stride, - at::IntArrayRef padding, - bool ceil_mode, - bool count_include_pad, - c10::optional divisor_override, - Tensor& output) { - avg_pool2d_meta( - input, - output, - kernel_size, - stride, - padding, - ceil_mode, - count_include_pad, - divisor_override); +#include +#include - at::native::xpu::avg_pool2d_kernel( - input, - kernel_size, - stride, - padding, +namespace at { +namespace native { + +TORCH_IMPL_FUNC(avg_pool2d_out_xpu) +(const Tensor& input_, + int64_t kH_, + int64_t kW_, + int64_t dH_, + int64_t dW_, + int64_t padH_, + int64_t padW_, + bool ceil_mode, + bool count_include_pad, + std::optional divisor_override, + const Tensor& output) { + xpu::avg_pool2d_kernel( + input_, + kH_, + kW_, + dH_, + dW_, + padH_, + padW_, ceil_mode, count_include_pad, divisor_override, output); - return output; -} - -Tensor XPUNativeFunctions::avg_pool2d_backward( - const Tensor& grad_output, - const Tensor& input, - at::IntArrayRef kernel_size, - at::IntArrayRef stride, - at::IntArrayRef padding, - bool ceil_mode, - bool count_include_pad, - c10::optional divisor_override) { - Tensor grad_input; - grad_input = avg_pool2d_backward_meta( - grad_output, - grad_input, - input, - kernel_size, - stride, - padding, - ceil_mode, - count_include_pad, - divisor_override); - at::native::xpu::avg_pool2d_backward_kernel( - grad_output, - input, - kernel_size, - stride, - padding, - ceil_mode, - count_include_pad, - divisor_override, - grad_input); - return grad_input; } -Tensor& XPUNativeFunctions::avg_pool2d_backward_out( - const Tensor& grad_output, - const Tensor& input, - at::IntArrayRef kernel_size, - at::IntArrayRef stride, - at::IntArrayRef padding, - bool ceil_mode, - bool count_include_pad, - c10::optional divisor_override, - Tensor& grad_input) { - avg_pool2d_backward_meta( - grad_output, - grad_input, - input, - kernel_size, - stride, - padding, - ceil_mode, - count_include_pad, - divisor_override); - 
at::native::xpu::avg_pool2d_backward_kernel( - grad_output, - input, +TORCH_IMPL_FUNC(avg_pool2d_backward_out_xpu) +(const Tensor& gradOutput_, + const Tensor& input_, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + bool ceil_mode, + bool count_include_pad, + std::optional divisor_override, + const Tensor& gradInput) { + xpu::avg_pool2d_backward_kernel( + gradOutput_, + input_, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override, - grad_input); - return grad_input; + gradInput); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/BatchNorm.cpp b/src/ATen/native/xpu/BatchNorm.cpp index 93018263d..63e04365a 100644 --- a/src/ATen/native/xpu/BatchNorm.cpp +++ b/src/ATen/native/xpu/BatchNorm.cpp @@ -1,19 +1,19 @@ -#include #include #include #include #include -#include +#include namespace at { +namespace native { -std::tuple XPUNativeFunctions::batch_norm_stats( +std::tuple batch_norm_stats_xpu( const Tensor& input, double eps) { - return native::xpu::batch_norm_stats_kernel(input, eps); + return xpu::batch_norm_stats_kernel(input, eps); } -Tensor XPUNativeFunctions::batch_norm_elemt( +Tensor batch_norm_elemt_xpu( const Tensor& input, const std::optional& weight, const std::optional& bias, @@ -21,12 +21,11 @@ Tensor XPUNativeFunctions::batch_norm_elemt( const Tensor& invstd, double eps) { auto output = at::empty_like(input); - native::xpu::batch_norm_elemt_kernel( - output, input, weight, bias, mean, invstd); + xpu::batch_norm_elemt_kernel(output, input, weight, bias, mean, invstd); return output; } -Tensor& XPUNativeFunctions::batch_norm_elemt_out( +Tensor& batch_norm_elemt_xpu_out( const Tensor& input, const std::optional& weight, const std::optional& bias, @@ -34,25 +33,24 @@ Tensor& XPUNativeFunctions::batch_norm_elemt_out( const Tensor& invstd, double eps, Tensor& out) { - native::xpu::batch_norm_elemt_kernel(out, input, weight, bias, mean, invstd); + xpu::batch_norm_elemt_kernel(out, input, weight, bias, mean, invstd); return out; } -std::tuple XPUNativeFunctions:: - batch_norm_backward_reduce( - const Tensor& grad_out, - const Tensor& input, - const Tensor& mean, - const Tensor& invstd, - const std::optional& weight, - bool input_g, - bool weight_g, - bool bias_g) { - return native::xpu::batch_norm_backward_reduce_kernel( +std::tuple batch_norm_backward_reduce_xpu( + const Tensor& grad_out, + const Tensor& input, + const Tensor& mean, + const Tensor& invstd, + const std::optional& weight, + bool input_g, + bool weight_g, + bool bias_g) { + return xpu::batch_norm_backward_reduce_kernel( grad_out, input, mean, invstd, weight, input_g, weight_g, bias_g); } -Tensor XPUNativeFunctions::batch_norm_backward_elemt( +Tensor batch_norm_backward_elemt_xpu( const Tensor& grad_out, const Tensor& input, const Tensor& mean, @@ -61,20 +59,20 @@ Tensor XPUNativeFunctions::batch_norm_backward_elemt( const Tensor& sum_dy, const Tensor& sum_dy_xmu, const Tensor& count) { - return native::xpu::batch_norm_backward_elemt_kernel( + return xpu::batch_norm_backward_elemt_kernel( grad_out, input, mean, invstd, weight, sum_dy, sum_dy_xmu, count); } -std::tuple XPUNativeFunctions::batch_norm_update_stats( +std::tuple batch_norm_update_stats_xpu( const Tensor& input, const std::optional& running_mean, const std::optional& running_var, double momentum) { - return native::xpu::batch_norm_update_stats_kernel( + return xpu::batch_norm_update_stats_kernel( input, running_mean, running_var, momentum); } -std::tuple 
XPUNativeFunctions::native_batch_norm( +std::tuple batch_norm_xpu( const Tensor& input, const std::optional& weight, const std::optional& bias, @@ -90,7 +88,7 @@ std::tuple XPUNativeFunctions::native_batch_norm( auto save_mean = at::empty({n_input}, options); auto save_invstd = at::empty({n_input}, options); - native::xpu::batch_norm_kernel( + xpu::batch_norm_kernel( input, weight, bias, @@ -106,7 +104,7 @@ std::tuple XPUNativeFunctions::native_batch_norm( return std::make_tuple(output, save_mean, save_invstd); } -std::tuple XPUNativeFunctions::native_batch_norm_out( +std::tuple batch_norm_xpu_out( const Tensor& input, const std::optional& weight, const std::optional& bias, @@ -118,7 +116,7 @@ std::tuple XPUNativeFunctions::native_batch_norm_out( Tensor& out, Tensor& save_mean, Tensor& save_invstd) { - return native::xpu::batch_norm_kernel( + return xpu::batch_norm_kernel( input, weight, bias, @@ -132,19 +130,18 @@ std::tuple XPUNativeFunctions::native_batch_norm_out( save_invstd); } -std::tuple XPUNativeFunctions:: - native_batch_norm_backward( - const Tensor& grad_out, - const Tensor& input, - const std::optional& weight, - const std::optional& running_mean, - const std::optional& running_var, - const std::optional& save_mean, - const std::optional& save_invstd, - bool train, - double eps, - std::array output_mask) { - return native::xpu::batch_norm_backward_kernel( +std::tuple batch_norm_backward_xpu( + const Tensor& grad_out, + const Tensor& input, + const std::optional& weight, + const std::optional& running_mean, + const std::optional& running_var, + const std::optional& save_mean, + const std::optional& save_invstd, + bool train, + double eps, + std::array output_mask) { + return xpu::batch_norm_backward_kernel( grad_out, input, weight, @@ -157,7 +154,7 @@ std::tuple XPUNativeFunctions:: output_mask); } -std::tuple XPUNativeFunctions::_native_batch_norm_legit( +std::tuple _batch_norm_legit_xpu( const Tensor& input, const std::optional& weight, const std::optional& bias, @@ -166,24 +163,23 @@ std::tuple XPUNativeFunctions::_native_batch_norm_legit( bool training, double momentum, double eps) { - return XPUNativeFunctions::native_batch_norm( + return batch_norm_xpu( input, weight, bias, running_mean, running_var, training, momentum, eps); } -std::tuple XPUNativeFunctions:: - _native_batch_norm_legit_out( - const Tensor& input, - const std::optional& weight, - const std::optional& bias, - Tensor& running_mean, - Tensor& running_var, - bool training, - double momentum, - double eps, - Tensor& out, - Tensor& save_mean, - Tensor& save_invstd) { - return XPUNativeFunctions::native_batch_norm_out( +std::tuple _batch_norm_legit_xpu_out( + const Tensor& input, + const std::optional& weight, + const std::optional& bias, + Tensor& running_mean, + Tensor& running_var, + bool training, + double momentum, + double eps, + Tensor& out, + Tensor& save_mean, + Tensor& save_invstd) { + return batch_norm_xpu_out( input, weight, bias, @@ -197,29 +193,29 @@ std::tuple XPUNativeFunctions:: save_invstd); } -std::tuple XPUNativeFunctions::_native_batch_norm_legit( +std::tuple _batch_norm_legit_no_stats_xpu( const Tensor& input, const std::optional& weight, const std::optional& bias, bool training, double momentum, double eps) { - return XPUNativeFunctions::native_batch_norm( + return batch_norm_xpu( input, weight, bias, Tensor(), Tensor(), training, momentum, eps); } -std::tuple XPUNativeFunctions:: - _native_batch_norm_legit_out( - const at::Tensor& input, - const std::optional& weight, - const 
std::optional& bias, - bool training, - double momentum, - double eps, - at::Tensor& out, - at::Tensor& save_mean, - at::Tensor& save_invstd) { - return XPUNativeFunctions::native_batch_norm_out( +std::tuple +_batch_norm_legit_no_stats_xpu_out( + const at::Tensor& input, + const std::optional& weight, + const std::optional& bias, + bool training, + double momentum, + double eps, + at::Tensor& out, + at::Tensor& save_mean, + at::Tensor& save_invstd) { + return batch_norm_xpu_out( input, weight, bias, @@ -233,7 +229,7 @@ std::tuple XPUNativeFunctions:: save_invstd); } -inline std::tuple batch_norm_with_update( +std::tuple _batch_norm_with_update_xpu( const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, @@ -256,7 +252,7 @@ inline std::tuple batch_norm_with_update( auto save_mean = at::empty({n_input}, options); auto save_invstd = at::empty({n_input}, options); - native::xpu::batch_norm_kernel( + xpu::batch_norm_kernel( input, weight, bias, @@ -273,7 +269,7 @@ inline std::tuple batch_norm_with_update( output, save_mean, save_invstd, reserve); } -inline std::tuple batch_norm_with_update_out( +std::tuple _batch_norm_with_update_xpu_out( const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, @@ -290,7 +286,7 @@ inline std::tuple batch_norm_with_update_out const Tensor& weight = *weight_maybe_owned; const Tensor& bias = c10::value_or_else(bias_opt, [] { return Tensor(); }); - std::tie(out, save_mean, save_var) = native::xpu::batch_norm_kernel( + std::tie(out, save_mean, save_var) = xpu::batch_norm_kernel( input, weight, bias, @@ -307,47 +303,7 @@ inline std::tuple batch_norm_with_update_out out, save_mean, save_var, reserve); } -std::tuple XPUNativeFunctions:: - _batch_norm_with_update( - const Tensor& input, - const std::optional& weight, - const std::optional& bias, - Tensor& running_mean, - Tensor& running_var, - double momentum, - double eps) { - return batch_norm_with_update( - input, weight, bias, running_mean, running_var, momentum, eps); -} - -std::tuple XPUNativeFunctions:: - _batch_norm_with_update_out( - const Tensor& input, - const std::optional& weight, - const std::optional& bias, - Tensor& running_mean, - Tensor& running_var, - double momentum, - double eps, - Tensor& out, - Tensor& save_mean, - Tensor& save_invstd, - Tensor& reserve) { - return batch_norm_with_update_out( - input, - weight, - bias, - running_mean, - running_var, - momentum, - eps, - out, - save_mean, - save_invstd, - reserve); -} - -std::tuple XPUNativeFunctions::batch_norm_backward( +std::tuple _new_batch_norm_backward_xpu( const Tensor& grad_output, const Tensor& input, const Tensor& weight, @@ -367,7 +323,7 @@ std::tuple XPUNativeFunctions::batch_norm_backward( c10::value_or_else(save_mean_opt, [] { return Tensor(); }); const Tensor& save_var = c10::value_or_else(save_var_opt, [] { return Tensor(); }); - return native::xpu::batch_norm_backward_kernel( + return xpu::batch_norm_backward_kernel( grad_output, input, weight, @@ -380,4 +336,5 @@ std::tuple XPUNativeFunctions::batch_norm_backward( grad_input_mask); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/BinaryOps.cpp b/src/ATen/native/xpu/BinaryOps.cpp index e17309841..31c6dd984 100644 --- a/src/ATen/native/xpu/BinaryOps.cpp +++ b/src/ATen/native/xpu/BinaryOps.cpp @@ -1,8 +1,10 @@ #include #include #include +#include #include -#include + +#include #include #include @@ -18,770 +20,46 @@ #include namespace at { -Tensor XPUNativeFunctions::add( - const Tensor& self, - const 
Tensor& other, - const Scalar& alpha) { - Tensor out; - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::alpha_check(iter.dtype(), alpha); - native::xpu::add_kernel(iter, alpha); - return iter.output(); -} - -Tensor& XPUNativeFunctions::add_( - Tensor& self, - const Tensor& other, - const Scalar& alpha) { - auto iter = TensorIterator::borrowing_binary_op(self, self, other); - native::alpha_check(iter.dtype(), alpha); - native::xpu::add_kernel(iter, alpha); - return self; -} - -Tensor& XPUNativeFunctions::add_out( - const Tensor& self, - const Tensor& other, - const Scalar& alpha, - Tensor& out) { - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::alpha_check(iter.dtype(), alpha); - native::xpu::add_kernel(iter, alpha); - return out; -} - -Tensor XPUNativeFunctions::sub( - const Tensor& self, - const Tensor& other, - const Scalar& alpha) { - Tensor out; - native::sub_check(self, other); - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::alpha_check(iter.dtype(), alpha); - native::xpu::sub_kernel(iter, alpha); - return iter.output(); -} - -Tensor& XPUNativeFunctions::sub_( - Tensor& self, - const Tensor& other, - const Scalar& alpha) { - native::sub_check(self, other); - auto iter = TensorIterator::borrowing_binary_op(self, self, other); - native::alpha_check(iter.dtype(), alpha); - native::xpu::sub_kernel(iter, alpha); - return self; -} - -Tensor& XPUNativeFunctions::sub_out( - const Tensor& self, - const Tensor& other, - const Scalar& alpha, - Tensor& out) { - native::sub_check(self, other); - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::alpha_check(iter.dtype(), alpha); - native::xpu::sub_kernel(iter, alpha); - return out; -} - -Tensor XPUNativeFunctions::mul(const Tensor& self, const Tensor& other) { - Tensor out; - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::mul_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::mul_(Tensor& self, const Tensor& other) { - auto iter = TensorIterator::borrowing_binary_op(self, self, other); - native::xpu::mul_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::mul_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::mul_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::div(const Tensor& self, const Tensor& other) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_binary_float_op(out, self, other); - native::xpu::div_true_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::div_(Tensor& self, const Tensor& other) { - TensorIterator iter; - iter.build_borrowing_binary_float_op(self, self, other); - native::xpu::div_true_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::div_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_binary_float_op(out, self, other); - native::xpu::div_true_kernel(iter); - return out; -} - -static inline TensorIterator meta_func_div_Tensor_mode( - const Tensor& self, - const Tensor& other, - const Tensor& output, - c10::optional rounding_mode) { - TensorIterator iter; - if (!rounding_mode.has_value()) { - iter.build_borrowing_binary_float_op(output, self, other); - // NOLINTNEXTLINE(bugprone-branch-clone) - } else if (*rounding_mode == "trunc") { - iter.build_borrowing_binary_op(output, self, other); - } else if (*rounding_mode == "floor") { 
- iter.build_borrowing_binary_op(output, self, other); - } else { - TORCH_CHECK( - false, - "div expected rounding_mode to be one of None, 'trunc', or 'floor' " - "but found '", - *rounding_mode, - "'"); - } - return iter; -} - -static inline void impl_func_div_Tensor_mode( - TensorIterator& iter, - ::std::optional rounding_mode) { - if (!rounding_mode.has_value()) { - native::xpu::div_true_kernel(iter); - } else if (*rounding_mode == "trunc") { - native::xpu::div_trunc_kernel(iter); - } else if (*rounding_mode == "floor") { - native::xpu::div_floor_kernel(iter); - } -} - -Tensor XPUNativeFunctions::div( - const at::Tensor& self, - const at::Tensor& other, - ::std::optional rounding_mode) { - Tensor output; - TensorIterator iter = - meta_func_div_Tensor_mode(self, other, output, rounding_mode); - impl_func_div_Tensor_mode(iter, rounding_mode); - return iter.output(); -} - -Tensor& XPUNativeFunctions::div_( - at::Tensor& self, - const at::Tensor& other, - ::std::optional rounding_mode) { - TensorIterator iter = - meta_func_div_Tensor_mode(self, other, self, rounding_mode); - impl_func_div_Tensor_mode(iter, rounding_mode); - return self; -} - -Tensor& XPUNativeFunctions::div_out( - const at::Tensor& self, - const at::Tensor& other, - ::std::optional rounding_mode, - at::Tensor& output) { - TensorIterator iter = - meta_func_div_Tensor_mode(self, other, output, rounding_mode); - impl_func_div_Tensor_mode(iter, rounding_mode); - return output; -} - -Tensor XPUNativeFunctions::rsub( - const Tensor& self, - const Tensor& other, - const Scalar& alpha) { - return XPUNativeFunctions::sub(other, self, alpha); -} - -Tensor XPUNativeFunctions::remainder(const Tensor& self, const Tensor& other) { - Tensor out; - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::remainder_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::remainder_(Tensor& self, const Tensor& other) { - auto iter = TensorIterator::borrowing_binary_op(self, self, other); - native::xpu::remainder_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::remainder_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::remainder_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::remainder(const Scalar& self, const Tensor& other) { - auto wrapper = native::wrapped_scalar_tensor(self); - return XPUNativeFunctions::remainder(wrapper, other); -} - -Tensor XPUNativeFunctions::fmod(const Tensor& self, const Tensor& other) { - Tensor out; - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::fmod_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::fmod_(Tensor& self, const Tensor& other) { - auto iter = TensorIterator::borrowing_binary_op(self, self, other); - native::xpu::fmod_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::fmod_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::fmod_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::tanh_backward( - const Tensor& grad_output, - const Tensor& output) { - Tensor out; - auto iter = TensorIterator::borrowing_binary_op(out, grad_output, output); - native::xpu::tanh_backward_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::tanh_backward_out( - const Tensor& grad_output, - const Tensor& output, - Tensor& grad_input) { - auto iter = - 
TensorIterator::borrowing_binary_op(grad_input, grad_output, output); - native::xpu::tanh_backward_kernel(iter); - return grad_input; -} - -Tensor& XPUNativeFunctions::bitwise_and_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::bitwise_and_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::bitwise_or_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::bitwise_or_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::bitwise_xor_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::bitwise_xor_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::__lshift__(const Tensor& self, const Tensor& other) { - Tensor result; - auto iter = TensorIterator::binary_op(result, self, other); - native::xpu::lshift_kernel(iter); - return iter.output(); -} - -Tensor XPUNativeFunctions::__lshift__(const Tensor& self, const Scalar& other) { - Tensor result; - auto wrapper = native::wrapped_scalar_tensor(other); - auto iter = TensorIterator::binary_op(result, self, wrapper); - native::xpu::lshift_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::__ilshift__(Tensor& self, const Tensor& other) { - auto iter = TensorIterator::binary_op(self, self, other); - native::xpu::lshift_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::__ilshift__(Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - auto iter = TensorIterator::binary_op(self, self, wrapper); - native::xpu::lshift_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::bitwise_left_shift_out( - const Tensor& self, - const Tensor& other, - Tensor& result) { - auto iter = TensorIterator::borrowing_binary_op(result, self, other); - native::xpu::lshift_kernel(iter); - return result; -} - -Tensor XPUNativeFunctions::__rshift__(const Tensor& self, const Tensor& other) { - Tensor result; - auto iter = TensorIterator::binary_op(result, self, other); - native::xpu::rshift_kernel(iter); - return iter.output(); -} - -Tensor XPUNativeFunctions::__rshift__(const Tensor& self, const Scalar& other) { - Tensor result; - auto wrapper = native::wrapped_scalar_tensor(other); - auto iter = TensorIterator::binary_op(result, self, wrapper); - native::xpu::rshift_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::__irshift__(Tensor& self, const Tensor& other) { - auto iter = TensorIterator::binary_op(self, self, other); - native::xpu::rshift_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::__irshift__(Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - auto iter = TensorIterator::binary_op(self, self, wrapper); - native::xpu::rshift_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::bitwise_right_shift_out( - const Tensor& self, - const Tensor& other, - Tensor& result) { - auto iter = TensorIterator::borrowing_binary_op(result, self, other); - native::xpu::rshift_kernel(iter); - return result; -} - -Tensor XPUNativeFunctions::gcd(const Tensor& self, const Tensor& other) { - Tensor out; - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::gcd_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::gcd_(Tensor& self, const Tensor& other) { - auto 
iter = TensorIterator::borrowing_binary_op(self, self, other);
-  native::xpu::gcd_kernel(iter);
-  return self;
-}
-
-Tensor& XPUNativeFunctions::gcd_out(
-    const Tensor& self,
-    const Tensor& other,
-    Tensor& out) {
-  auto iter = TensorIterator::borrowing_binary_op(out, self, other);
-  native::xpu::gcd_kernel(iter);
-  return out;
-}
-
-Tensor XPUNativeFunctions::nextafter(const Tensor& self, const Tensor& other) {
-  Tensor out;
-  auto iter = TensorIterator::borrowing_binary_op(out, self, other);
-  native::xpu::nextafter_kernel(iter);
-  return iter.output();
-}
-
-Tensor& XPUNativeFunctions::nextafter_(Tensor& self, const Tensor& other) {
-  auto iter = TensorIterator::borrowing_binary_op(self, self, other);
-  native::xpu::nextafter_kernel(iter);
-  return self;
-}
-
-Tensor& XPUNativeFunctions::nextafter_out(
-    const Tensor& self,
-    const Tensor& other,
-    Tensor& out) {
-  auto iter = TensorIterator::borrowing_binary_op(out, self, other);
-  native::xpu::nextafter_kernel(iter);
-  return out;
-}
-
-Tensor XPUNativeFunctions::hypot(const Tensor& self, const Tensor& other) {
-  Tensor out;
-  auto iter = TensorIterator::borrowing_binary_op(out, self, other);
-  native::xpu::hypot_kernel(iter);
-  return iter.output();
-}
-
-Tensor& XPUNativeFunctions::hypot_(Tensor& self, const Tensor& other) {
-  auto iter = TensorIterator::borrowing_binary_op(self, self, other);
-  native::xpu::hypot_kernel(iter);
-  return self;
-}
-
-Tensor& XPUNativeFunctions::hypot_out(
-    const Tensor& self,
-    const Tensor& other,
-    Tensor& out) {
-  auto iter = TensorIterator::borrowing_binary_op(out, self, other);
-  native::xpu::hypot_kernel(iter);
-  return out;
-}
-
-static inline TensorIterator meta_func_maximum(
-    const Tensor& self,
-    const Tensor& other,
-    Tensor& output) {
-  TORCH_CHECK(
-      !self.is_complex() && !other.is_complex(),
-      "maximum not implemented for complex tensors.");
+namespace native {
+REGISTER_XPU_DISPATCH(add_stub, &xpu::add_kernel)
+REGISTER_XPU_DISPATCH(sub_stub, &xpu::sub_kernel);
+REGISTER_XPU_DISPATCH(mul_stub, &xpu::mul_kernel);
+REGISTER_XPU_DISPATCH(div_true_stub, &xpu::div_true_kernel);
+REGISTER_XPU_DISPATCH(div_trunc_stub, &xpu::div_trunc_kernel);
+REGISTER_XPU_DISPATCH(div_floor_stub, &xpu::div_floor_kernel);
+REGISTER_XPU_DISPATCH(remainder_stub, &xpu::remainder_kernel);
+REGISTER_XPU_DISPATCH(fmod_stub, &xpu::fmod_kernel);
+REGISTER_XPU_DISPATCH(tanh_backward_stub, &xpu::tanh_backward_kernel);
+REGISTER_XPU_DISPATCH(bitwise_and_stub, &xpu::bitwise_and_kernel);
+REGISTER_XPU_DISPATCH(bitwise_or_stub, &xpu::bitwise_or_kernel);
+REGISTER_XPU_DISPATCH(bitwise_xor_stub, &xpu::bitwise_xor_kernel);
+REGISTER_XPU_DISPATCH(gcd_stub, &xpu::gcd_kernel);
+REGISTER_XPU_DISPATCH(maximum_stub, &xpu::maximum_kernel);
+REGISTER_XPU_DISPATCH(minimum_stub, &xpu::minimum_kernel);
+REGISTER_XPU_DISPATCH(sigmoid_backward_stub, &xpu::sigmoid_backward_kernel);
+REGISTER_XPU_DISPATCH(nextafter_stub, &xpu::nextafter_kernel);
+REGISTER_XPU_DISPATCH(hypot_stub, &xpu::hypot_kernel);
+REGISTER_XPU_DISPATCH(atan2_stub, &xpu::atan2_kernel);
+REGISTER_XPU_DISPATCH(copysign_stub, &xpu::copysign_kernel);
+REGISTER_XPU_DISPATCH(logical_and_stub, &xpu::logical_and_kernel);
+REGISTER_XPU_DISPATCH(logical_or_stub, &xpu::logical_or_kernel);
+REGISTER_XPU_DISPATCH(logical_xor_stub, &xpu::logical_xor_kernel);
+REGISTER_XPU_DISPATCH(logit_backward_stub, &xpu::logit_backward_kernel);
+REGISTER_XPU_DISPATCH(logaddexp_stub, &xpu::logaddexp_kernel);
+REGISTER_XPU_DISPATCH(logaddexp2_stub, &xpu::logaddexp2_kernel);
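(Editor's aside, placed between the registration lines above and below; it is not part of the patch. The REGISTER_XPU_DISPATCH block and the TORCH_IMPL_FUNC(add_out_xpu) that follows replace the hand-written XPUNativeFunctions wrappers this file deletes. The sketch below is a minimal, self-contained illustration of the dispatch-stub idea only; every name in it (FakeIter, AddStub, fake_xpu_add_kernel, RegisterAdd) is invented for the example, and the real ATen DispatchStub and REGISTER_XPU_DISPATCH macros differ in detail.)

#include <cstdio>

struct FakeIter {};  // stand-in for at::TensorIterator

using binary_fn = void (*)(FakeIter&, double alpha);

struct AddStub {
  binary_fn xpu_fn = nullptr;       // the real stub keeps one slot per backend
  void operator()(FakeIter& it, double alpha) const {
    if (xpu_fn) xpu_fn(it, alpha);  // shared code only ever calls through the stub
  }
};

AddStub add_stub;                   // declared once, shared by all backends

// what a backend provides (here, what the sycl/ kernels play the role of)
void fake_xpu_add_kernel(FakeIter&, double alpha) {
  std::printf("xpu add kernel, alpha = %g\n", alpha);
}

// a registrar of this general shape is what a REGISTER_*_DISPATCH-style macro
// boils down to: a static object whose constructor plugs the kernel into the
// stub at load time
struct RegisterAdd {
  RegisterAdd() { add_stub.xpu_fn = &fake_xpu_add_kernel; }
};
static RegisterAdd register_add;

int main() {
  FakeIter it;
  add_stub(it, 1.0);  // shared wrapper -> stub -> backend kernel
  return 0;
}

With this pattern the shared at::native wrapper builds the TensorIterator and calls the stub, so registering a backend kernel is the only per-backend glue needed; that is why most of the per-operator wrapper code in this file is deleted by the patch.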
+REGISTER_XPU_DISPATCH(fmax_stub, &xpu::fmax_kernel);
+REGISTER_XPU_DISPATCH(fmin_stub, &xpu::fmin_kernel);
+REGISTER_XPU_DISPATCH(lshift_stub, &xpu::lshift_kernel);
+REGISTER_XPU_DISPATCH(rshift_stub, &xpu::rshift_kernel);
+
+TORCH_IMPL_FUNC(add_out_xpu)
+(const Tensor& self,
+ const Tensor& other,
+ const Scalar& alpha,
+ const Tensor& output) {
   auto iter = TensorIterator::borrowing_binary_op(output, self, other);
-  return iter;
-}
-
-Tensor XPUNativeFunctions::maximum(const Tensor& self, const Tensor& other) {
-  Tensor output;
-  auto iter = meta_func_maximum(self, other, output);
-  native::xpu::maximum_kernel(iter);
-  return iter.output();
-}
-
-Tensor& XPUNativeFunctions::maximum_out(
-    const Tensor& self,
-    const Tensor& other,
-    Tensor& output) {
-  auto iter = meta_func_maximum(self, other, output);
-  native::xpu::maximum_kernel(iter);
-  return output;
-}
-
-static inline TensorIterator meta_func_minimum(
-    const Tensor& self,
-    const Tensor& other,
-    Tensor& output) {
-  TORCH_CHECK(
-      !self.is_complex() && !other.is_complex(),
-      "minimum not implemented for complex tensors.");
-  auto iter = TensorIterator::borrowing_binary_op(output, self, other);
-  return iter;
-}
-
-Tensor XPUNativeFunctions::minimum(const Tensor& self, const Tensor& other) {
-  Tensor output;
-  auto iter = meta_func_minimum(self, other, output);
-  native::xpu::minimum_kernel(iter);
-  return iter.output();
-}
-
-Tensor& XPUNativeFunctions::minimum_out(
-    const Tensor& self,
-    const Tensor& other,
-    Tensor& output) {
-  auto iter = meta_func_minimum(self, other, output);
-  native::xpu::minimum_kernel(iter);
-  return output;
-}
-
-Tensor& XPUNativeFunctions::logit_backward_out(
-    const Tensor& grad_output,
-    const Tensor& input,
-    std::optional eps,
-    Tensor& grad_input) {
-  TensorIterator iter;
-  iter.build_borrowing_binary_op(grad_input, grad_output, input);
-  native::xpu::logit_backward_kernel(iter, Scalar(eps ? eps.value() : -1.0));
-  return grad_input;
-}
-
-Tensor XPUNativeFunctions::logit_backward(
-    const Tensor& grad_output,
-    const Tensor& input,
-    std::optional eps) {
-  Tensor grad_input;
-  TensorIterator iter;
-  iter.build_borrowing_binary_op(grad_input, grad_output, input);
-  native::xpu::logit_backward_kernel(iter, Scalar(eps ?
eps.value() : -1.0)); - return iter.output(); -} - -Tensor& XPUNativeFunctions::sigmoid_backward_out( - const Tensor& grad_output, - const Tensor& output, - Tensor& grad_input) { - TensorIterator iter; - iter.build_borrowing_binary_op(grad_input, grad_output, output); - native::xpu::sigmoid_backward_kernel(iter); - return grad_input; -} - -Tensor XPUNativeFunctions::sigmoid_backward( - const Tensor& grad_output, - const Tensor& output) { - Tensor grad_input; - TensorIterator iter; - iter.build_borrowing_binary_op(grad_input, grad_output, output); - native::xpu::sigmoid_backward_kernel(iter); - return iter.output(); -} - -Tensor XPUNativeFunctions::logaddexp(const Tensor& self, const Tensor& other) { - Tensor out; - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::logaddexp_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::logaddexp_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::logaddexp_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::logaddexp2(const Tensor& self, const Tensor& other) { - Tensor out; - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::logaddexp2_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::logaddexp2_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::logaddexp2_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::floor_divide_out( - const Tensor& self, - const Tensor& other, - Tensor& output) { - auto iter = TensorIterator::binary_op(output, self, other); - native::xpu::div_floor_kernel(iter); - if (!output.defined()) { - output = iter.output(); - } - return output; -} - -Tensor XPUNativeFunctions::floor_divide( - const Tensor& self, - const Tensor& other) { - Tensor output; - auto iter = TensorIterator::binary_op(output, self, other); - native::xpu::div_floor_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::floor_divide_(Tensor& self, const Tensor& other) { - return XPUNativeFunctions::floor_divide_out(self, other, self); -} - -TensorIterator meta_fmin_fmax( - const char* const name, - const Tensor& self, - const Tensor& other, - Tensor& output) { - TORCH_CHECK( - !self.is_complex() && !other.is_complex(), - name, - " not implemented for complex tensors."); - TensorIterator iter; - iter.build_binary_op(output, self, other); - return iter; -} - -Tensor& XPUNativeFunctions::fmax_out( - const Tensor& self, - const Tensor& other, - Tensor& output) { - auto iter = meta_fmin_fmax("fmax", self, other, output); - native::xpu::fmax_kernel(iter); - return output; -} - -Tensor XPUNativeFunctions::fmax(const Tensor& self, const Tensor& other) { - Tensor output; - auto iter = meta_fmin_fmax("fmax", self, other, output); - native::xpu::fmax_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::fmin_out( - const Tensor& self, - const Tensor& other, - Tensor& output) { - auto iter = meta_fmin_fmax("fmin", self, other, output); - native::xpu::fmin_kernel(iter); - return output; -} - -Tensor XPUNativeFunctions::fmin(const Tensor& self, const Tensor& other) { - Tensor output; - auto iter = meta_fmin_fmax("fmin", self, other, output); - native::xpu::fmin_kernel(iter); - return iter.output(); -} - -Tensor XPUNativeFunctions::atan2(const Tensor& self, const Tensor& other) { - Tensor out; - TensorIterator iter; - 
iter.build_borrowing_binary_float_op(out, self, other); - native::xpu::atan2_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::atan2_(Tensor& self, const Tensor& other) { - TensorIterator iter; - iter.build_borrowing_binary_float_op(self, self, other); - native::xpu::atan2_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::atan2_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_binary_float_op(out, self, other); - native::xpu::atan2_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::copysign_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_binary_float_op(out, self, other); - native::xpu::copysign_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::copysign_(Tensor& self, const Tensor& other) { - return XPUNativeFunctions::copysign_out(self, other, self); -} - -Tensor XPUNativeFunctions::copysign(const Tensor& self, const Tensor& other) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_binary_float_op(out, self, other); - native::xpu::copysign_kernel(iter); - return iter.output(); -} - -// We need explicit cast to OutFunc because each *_out func is overloaded twice. -// Without An explicit cast, merely referring to *_out function is ambiguous. -using OutFunc = - std::add_const::type; - -template -Tensor comparison_op( - const Tensor& self, - const Tensor& other, - OutImpl& out_impl) { - Tensor result = at::empty({0}, self.options().dtype(kBool)); - return out_impl(result, self, other); -} - -template -Tensor& comparison_op_(Tensor& self, const Tensor& other, OutImpl& out_impl) { - return out_impl(self, self, other); -} - -template -Tensor& comparison_op_out( - Tensor& result, - const Tensor& self, - const Scalar& other, - OutImpl& out_impl) { - return out_impl(result, self, native::wrapped_scalar_tensor(other)); -} - -template -Tensor comparison_op( - const Tensor& self, - const Scalar& other, - OutImpl& out_impl) { - return comparison_op(self, native::wrapped_scalar_tensor(other), out_impl); -} - -template -Tensor& comparison_op_(Tensor& self, const Scalar& other, OutImpl& out_impl) { - return out_impl(self, self, native::wrapped_scalar_tensor(other)); -} - -Tensor& XPUNativeFunctions::logical_and_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::comparison_op(out, self, other); - native::xpu::logical_and_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::logical_and( - const Tensor& self, - const Tensor& other) { - return comparison_op(self, other, static_cast(at::logical_and_out)); -} - -Tensor& XPUNativeFunctions::logical_and_(Tensor& self, const Tensor& other) { - return comparison_op_(self, other, static_cast(at::logical_and_out)); -} - -Tensor& XPUNativeFunctions::logical_or_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::comparison_op(out, self, other); - native::xpu::logical_or_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::logical_or(const Tensor& self, const Tensor& other) { - return comparison_op(self, other, static_cast(at::logical_or_out)); -} - -Tensor& XPUNativeFunctions::logical_or_(Tensor& self, const Tensor& other) { - return comparison_op_(self, other, static_cast(at::logical_or_out)); -} - -Tensor& XPUNativeFunctions::logical_xor_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::comparison_op(out, self, other); - 
native::xpu::logical_xor_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::logical_xor( - const Tensor& self, - const Tensor& other) { - return comparison_op(self, other, static_cast(at::logical_xor_out)); -} - -Tensor& XPUNativeFunctions::logical_xor_(Tensor& self, const Tensor& other) { - return comparison_op_(self, other, static_cast(at::logical_xor_out)); + xpu::add_kernel(iter, alpha); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Bucketization.cpp b/src/ATen/native/xpu/Bucketization.cpp index 0d6c2a9f5..3394f87bd 100644 --- a/src/ATen/native/xpu/Bucketization.cpp +++ b/src/ATen/native/xpu/Bucketization.cpp @@ -1,11 +1,11 @@ #include #include #include -#include namespace at { +namespace native { -Tensor& XPUNativeFunctions::searchsorted_out( +Tensor& searchsorted_out_xpu( const Tensor& sorted_sequence, const Tensor& self, bool out_int32, @@ -17,9 +17,9 @@ Tensor& XPUNativeFunctions::searchsorted_out( c10::MaybeOwned sorter_maybe_owned = at::borrow_from_optional_tensor(sorter_opt); const Tensor& sorter = *sorter_maybe_owned; - at::native::searchsorted_pre_check( + searchsorted_pre_check( sorted_sequence, self, result, out_int32, right, side_opt, sorter); - at::native::resize_output(result, self.sizes()); + resize_output(result, self.sizes()); if (self.numel() == 0) { return result; @@ -28,12 +28,12 @@ Tensor& XPUNativeFunctions::searchsorted_out( // we have two inputs to set right, pre_check checks that they aren't set to // opposites bool is_right = (side_opt && *side_opt == "right") || right; - at::native::xpu::searchsorted_kernel( + xpu::searchsorted_kernel( result, self, sorted_sequence, out_int32, is_right, sorter); return result; } -Tensor& XPUNativeFunctions::searchsorted_out( +Tensor& searchsorted_out_xpu( const Tensor& sorted_sequence, const Scalar& self, bool out_int32, @@ -42,8 +42,8 @@ Tensor& XPUNativeFunctions::searchsorted_out( const std::optional& sorter_opt, Tensor& result) { const Tensor& scalar_tensor = - at::native::searchsorted_scalar_tensor(self, sorted_sequence.device()); - return searchsorted_out( + searchsorted_scalar_tensor(self, sorted_sequence.device()); + return searchsorted_out_xpu( sorted_sequence, scalar_tensor, out_int32, @@ -53,7 +53,7 @@ Tensor& XPUNativeFunctions::searchsorted_out( result); } -Tensor XPUNativeFunctions::searchsorted( +Tensor searchsorted_xpu( const Tensor& sorted_sequence, const Tensor& self, bool out_int32, @@ -64,12 +64,12 @@ Tensor XPUNativeFunctions::searchsorted( c10::TensorOptions options = TensorOptions().device(self.options().device()).dtype(scalar_type); Tensor result = at::empty({0}, options, MemoryFormat::Contiguous); - searchsorted_out( + searchsorted_out_xpu( sorted_sequence, self, out_int32, right, side_opt, sorter, result); return result; } -Tensor XPUNativeFunctions::searchsorted( +Tensor searchsorted_xpu( const Tensor& sorted_sequence, const Scalar& self, bool out_int32, @@ -77,12 +77,12 @@ Tensor XPUNativeFunctions::searchsorted( const std::optional side_opt, const std::optional& sorter) { const Tensor& scalar_tensor = - at::native::searchsorted_scalar_tensor(self, sorted_sequence.device()); - return searchsorted( + searchsorted_scalar_tensor(self, sorted_sequence.device()); + return searchsorted_xpu( sorted_sequence, scalar_tensor, out_int32, right, side_opt, sorter); } -Tensor& XPUNativeFunctions::bucketize_out( +Tensor& bucketize_out_xpu( const Tensor& self, const Tensor& boundaries, bool out_int32, @@ -93,12 +93,12 @@ Tensor& XPUNativeFunctions::bucketize_out( "boundaries 
tensor must be 1 dimension, but got dim(", boundaries.dim(), ")"); - searchsorted_out( + searchsorted_out_xpu( boundaries, self, out_int32, right, nullopt, nullopt, result); return result; } -Tensor XPUNativeFunctions::bucketize( +Tensor bucketize_xpu( const Tensor& self, const Tensor& boundaries, bool out_int32, @@ -107,19 +107,20 @@ Tensor XPUNativeFunctions::bucketize( c10::TensorOptions options = TensorOptions().device(self.options().device()).dtype(scalar_type); Tensor result = at::empty({0}, options, MemoryFormat::Contiguous); - bucketize_out(self, boundaries, out_int32, right, result); + bucketize_out_xpu(self, boundaries, out_int32, right, result); return result; } -Tensor XPUNativeFunctions::bucketize( +Tensor bucketize_xpu( const Scalar& self, const Tensor& boundaries, bool out_int32, bool right) { - return bucketize( - at::native::searchsorted_scalar_tensor(self, boundaries.device()), + return bucketize_xpu( + searchsorted_scalar_tensor(self, boundaries.device()), boundaries, out_int32, right); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Col2Im.cpp b/src/ATen/native/xpu/Col2Im.cpp index 3a46fd8ad..2a6742e5e 100644 --- a/src/ATen/native/xpu/Col2Im.cpp +++ b/src/ATen/native/xpu/Col2Im.cpp @@ -1,15 +1,17 @@ -#include + #include #include #include -#include #include #include -namespace at { +#include +#include + +namespace at::native { -Tensor& XPUNativeFunctions::col2im_out( +Tensor& col2im_out_xpu( const Tensor& self, IntArrayRef output_size, IntArrayRef kernel_size, @@ -27,7 +29,7 @@ Tensor& XPUNativeFunctions::col2im_out( return out; } -Tensor XPUNativeFunctions::col2im( +Tensor col2im_xpu( const Tensor& self, IntArrayRef output_size, IntArrayRef kernel_size, @@ -43,4 +45,4 @@ Tensor XPUNativeFunctions::col2im( return output; } -} // namespace at +} // namespace at::native diff --git a/src/ATen/native/xpu/CompareOps.cpp b/src/ATen/native/xpu/CompareOps.cpp index 6f84e68fc..ee0798ad0 100644 --- a/src/ATen/native/xpu/CompareOps.cpp +++ b/src/ATen/native/xpu/CompareOps.cpp @@ -2,336 +2,16 @@ #include #include #include -#include - #include namespace at { -Tensor XPUNativeFunctions::eq(const Tensor& self, const Tensor& other) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::eq_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::eq_(Tensor& self, const Tensor& other) { - TensorIterator iter; - iter.build_borrowing_comparison_op(self, self, other); - native::xpu::eq_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::eq_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::eq_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::eq(const Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - Tensor out; - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::eq_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::eq_(Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(self, self, wrapper); - native::xpu::eq_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::eq_out( - const Tensor& self, - const Scalar& other, - Tensor& out) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - 
iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::eq_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::ne(const Tensor& self, const Tensor& other) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::ne_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::ne_(Tensor& self, const Tensor& other) { - TensorIterator iter; - iter.build_borrowing_comparison_op(self, self, other); - native::xpu::ne_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::ne_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::ne_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::ne(const Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - Tensor out; - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::ne_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::ne_(Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(self, self, wrapper); - native::xpu::ne_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::ne_out( - const Tensor& self, - const Scalar& other, - Tensor& out) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::ne_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::lt(const Tensor& self, const Tensor& other) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::lt_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::lt_(Tensor& self, const Tensor& other) { - TensorIterator iter; - iter.build_borrowing_comparison_op(self, self, other); - native::xpu::lt_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::lt_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::lt_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::lt(const Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - Tensor out; - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::lt_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::lt_(Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(self, self, wrapper); - native::xpu::lt_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::lt_out( - const Tensor& self, - const Scalar& other, - Tensor& out) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::lt_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::le(const Tensor& self, const Tensor& other) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::le_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::le_(Tensor& self, const Tensor& other) { - TensorIterator iter; - 
iter.build_borrowing_comparison_op(self, self, other); - native::xpu::le_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::le_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::le_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::le(const Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - Tensor out; - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::le_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::le_(Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(self, self, wrapper); - native::xpu::le_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::le_out( - const Tensor& self, - const Scalar& other, - Tensor& out) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::le_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::gt(const Tensor& self, const Tensor& other) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::gt_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::gt_(Tensor& self, const Tensor& other) { - TensorIterator iter; - iter.build_borrowing_comparison_op(self, self, other); - native::xpu::gt_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::gt_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::gt_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::gt(const Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - Tensor out; - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::gt_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::gt_(Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(self, self, wrapper); - native::xpu::gt_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::gt_out( - const Tensor& self, - const Scalar& other, - Tensor& out) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::gt_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::ge(const Tensor& self, const Tensor& other) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::ge_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::ge_(Tensor& self, const Tensor& other) { - TensorIterator iter; - iter.build_borrowing_comparison_op(self, self, other); - native::xpu::ge_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::ge_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::ge_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::ge(const Tensor& self, const Scalar& other) { - auto wrapper = 
native::wrapped_scalar_tensor(other); - Tensor out; - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::ge_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::ge_(Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(self, self, wrapper); - native::xpu::ge_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::ge_out( - const Tensor& self, - const Scalar& other, - Tensor& out) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::ge_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::isnan(const Tensor& self) { - return XPUNativeFunctions::ne(self, self); -} - -Tensor& XPUNativeFunctions::isnan_out(const Tensor& self, Tensor& out) { - return XPUNativeFunctions::ne_out(self, self, out); -} - +namespace native { +REGISTER_XPU_DISPATCH(eq_stub, &xpu::eq_kernel); +REGISTER_XPU_DISPATCH(ne_stub, &xpu::ne_kernel); +REGISTER_XPU_DISPATCH(le_stub, &xpu::le_kernel); +REGISTER_XPU_DISPATCH(lt_stub, &xpu::lt_kernel); +REGISTER_XPU_DISPATCH(ge_stub, &xpu::ge_kernel); +REGISTER_XPU_DISPATCH(gt_stub, &xpu::gt_kernel); +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Copy.cpp b/src/ATen/native/xpu/Copy.cpp index c95aa9cee..fea67164c 100644 --- a/src/ATen/native/xpu/Copy.cpp +++ b/src/ATen/native/xpu/Copy.cpp @@ -1,20 +1,22 @@ -#include #include +#include #include #include #include #include #include -#include #include #include #include +#include #include #include #include #include +#include + using namespace at; using namespace at::xpu; @@ -295,72 +297,7 @@ void _copy_xpu(TensorIterator& iter, bool non_blocking) { } // namespace native::xpu -Tensor& XPUNativeFunctions::copy_( - Tensor& self, - const Tensor& src, - bool non_blocking) { - if (self._is_zerotensor()) { - TORCH_CHECK( - false, - "ZeroTensors are immutable. 
Please materialize the tensor using `.clone()`, if you want a mutable zero tensor."); - } - if (src._is_zerotensor()) { - return self.zero_(); - } - - TORCH_CHECK(self.defined(), "self is undefined"); - TORCH_CHECK(src.defined(), "src is undefined"); - - if (self.is_same(src)) { - return self; - } - - // TODO: Support quantization - - // Exit early if self and src are views of the same data - const bool is_same_data = - (self.is_alias_of(src) && self.storage_offset() == src.storage_offset() && - self.strides().equals(src.strides()) && - self.sizes().equals(src.sizes()) && - self.scalar_type() == src.scalar_type() && - self.is_conj() == src.is_conj() && self.is_neg() == src.is_neg()); - if (is_same_data) { - return self; - } - - auto iter = TensorIteratorConfig() - .set_check_mem_overlap(true) - .add_output(self) - .add_input(src) - .resize_outputs(false) - .check_all_same_dtype(false) - .check_all_same_device(false) - .build(); - - if (iter.numel() == 0) { - return self; - } - - native::xpu::_copy_xpu(iter, non_blocking); - - return self; -} - -Tensor XPUNativeFunctions::_to_copy( - const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - bool non_blocking, - c10::optional optional_memory_format) { - return at::native::_to_copy( - self, - dtype, - layout, - device, - pin_memory, - non_blocking, - optional_memory_format); +namespace native { +REGISTER_XPU_DISPATCH(copy_stub, &native::xpu::_copy_xpu); } } // namespace at diff --git a/src/ATen/native/xpu/Cross.cpp b/src/ATen/native/xpu/Cross.cpp index 757f088f9..a155cf034 100644 --- a/src/ATen/native/xpu/Cross.cpp +++ b/src/ATen/native/xpu/Cross.cpp @@ -1,64 +1,12 @@ #include #include +#include +#include #include -#include #include namespace at { -void linalg_cross_meta( - const Tensor& input, - const Tensor& other, - int64_t dim, - Tensor& output) { - auto x_d = input.dim(); - auto y_d = other.dim(); - // This is to avoid things like - // linalg.cross(torch.randn(2, 3), torch.randn(5, 2, 3), dim=2) - TORCH_CHECK( - x_d == y_d, - "linalg.cross: inputs must have the same number of dimensions."); - TORCH_CHECK( - input.size(dim) == 3 && other.size(dim) == 3, - "linalg.cross: inputs dimension ", - dim, - " must have length 3. Got ", - input.size(dim), - " and ", - other.size(dim)); - - // Broadcast the batch dimension of input and other. 
- // Since the non-batch dimensions agree, this is the same as broadcast all the - // inputs - auto out_size = infer_size(input.sizes(), other.sizes()); - - if (output.defined()) { - at::xpu::resize_out(output, out_size, {}, input.options()); - } else { - output = at::xpu::create_out(out_size, {}, input.options()); - } -} - -Tensor& XPUNativeFunctions::linalg_cross_out( - const Tensor& self, - const Tensor& other, - int64_t dim, - Tensor& out) { - linalg_cross_meta(self, other, dim, out); - - dim = maybe_wrap_dim(dim, self.dim()); - auto out_size = out.sizes(); - Tensor input_broadcasted = self.expand(out_size); - Tensor other_broadcasted = other.expand(out_size); - native::xpu::linalg_cross_kernel( - out, input_broadcasted, other_broadcasted, dim); - return out; -} - -Tensor XPUNativeFunctions::linalg_cross( - const Tensor& self, - const Tensor& other, - int64_t dim) { - Tensor out; - return linalg_cross_out(self, other, dim, out); -} -} // namespace at \ No newline at end of file +namespace native { +REGISTER_XPU_DISPATCH(cross_stub, &xpu::linalg_cross_kernel); +} // namespace native +} // namespace at diff --git a/src/ATen/native/xpu/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/DilatedMaxPool2d.cpp index af0c0cfd6..600d29e85 100644 --- a/src/ATen/native/xpu/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/DilatedMaxPool2d.cpp @@ -2,305 +2,45 @@ #include #include #include -#include #include -namespace at { - -using namespace at::native; - -void max_pool2d_with_indices_meta( - const Tensor& input, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - bool ceil_mode, - Tensor& output, - Tensor& indices) { - TORCH_CHECK( - kernel_size.size() == 1 || kernel_size.size() == 2, - "max_pool2d: kernel_size must either be a single int, or a tuple of two ints") - const int kH = safe_downcast(kernel_size[0]); - const int kW = kernel_size.size() == 1 - ? kH - : safe_downcast(kernel_size[1]); - - // NB: stride default is not expressible as an integer constant, so we - // accept empty stride for this case - TORCH_CHECK( - stride.empty() || stride.size() == 1 || stride.size() == 2, - "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints") - const int dH = stride.empty() ? kH : safe_downcast(stride[0]); - const int dW = stride.empty() ? kW - : stride.size() == 1 ? dH - : safe_downcast(stride[1]); - - TORCH_CHECK( - padding.size() == 1 || padding.size() == 2, - "max_pool2d: padding must either be a single int, or a tuple of two ints"); - const int padH = safe_downcast(padding[0]); - const int padW = - padding.size() == 1 ? padH : safe_downcast(padding[1]); - - TORCH_CHECK( - dilation.size() == 1 || dilation.size() == 2, - "max_pool2d: dilation must be either a single int, or a tuple of two ints"); - const int dilationH = safe_downcast(dilation[0]); - const int dilationW = dilation.size() == 1 - ? dilationH - : safe_downcast(dilation[1]); - - const auto memory_format = input.suggest_memory_format(); - if (memory_format == at::MemoryFormat::ChannelsLast) { - TORCH_CHECK( - input.ndimension() == 4, - "non-empty 4D (batch mode) tensor expected for input with channels_last layout"); - } else if (memory_format == at::MemoryFormat::Contiguous) { - TORCH_CHECK( - (input.ndimension() == 3 || input.ndimension() == 4), - "non-empty 3D or 4D (batch mode) tensor expected for input"); - } else { - TORCH_CHECK( - false, - "Unsupport memory format. 
Supports only ChannelsLast, Contiguous"); - } - - /* sizes */ - const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1; - const int64_t nInputPlane = input.size(-3); - const int64_t inputHeight = input.size(-2); - const int64_t inputWidth = input.size(-1); - - const int64_t outputHeight = pooling_output_shape( - inputHeight, kH, padH, dH, dilationH, ceil_mode); - const int64_t outputWidth = pooling_output_shape( - inputWidth, kW, padW, dW, dilationW, ceil_mode); - - pool2d_shape_check( - input, - kH, - kW, - dH, - dW, - padH, - padW, - dilationH, - dilationW, - nInputPlane, - inputHeight, - inputWidth, - outputHeight, - outputWidth, - memory_format); - - /* resize output and indices */ - if (input.ndimension() == 3) { - if (output.defined()) { - at::xpu::resize_out( - output, - {nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format)); - } else { - output = at::xpu::create_out( - {nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format)); - } - - /* indices will contain the locations for each output point */ - if (indices.defined()) { - at::xpu::resize_out( - indices, - {nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format).dtype(kLong)); - } else { - indices = at::xpu::create_out( - {nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format).dtype(kLong)); - } - - } else { - if (output.defined()) { - at::xpu::resize_out( - output, - {nbatch, nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format)); - } else { - output = at::xpu::create_out( - {nbatch, nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format)); - } - - /* indices will contain the locations for each output point */ - if (indices.defined()) { - at::xpu::resize_out( - indices, - {nbatch, nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format).dtype(kLong)); - } else { - indices = at::xpu::create_out( - {nbatch, nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format).dtype(kLong)); - } - } -} - -Tensor& max_pool2d_with_indices_backward_meta( - const Tensor& gradOutput, - const Tensor& input, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - bool ceil_mode, - const Tensor& indices, - Tensor& gradInput) { - // #20866, #22032: Guarantee this for the official C++ API? - TORCH_CHECK( - kernel_size.size() == 1 || kernel_size.size() == 2, - "max_pool2d: kernel_size must either be a single int, or a tuple of two ints") - const int kH = safe_downcast(kernel_size[0]); - const int kW = kernel_size.size() == 1 - ? kH - : safe_downcast(kernel_size[1]); - - // NB: stride default is not expressible as an integer constant, so we accept - // empty stride for this case - TORCH_CHECK( - stride.empty() || stride.size() == 1 || stride.size() == 2, - "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints") - const int dH = stride.empty() ? kH : safe_downcast(stride[0]); - const int dW = stride.empty() ? kW - : stride.size() == 1 ? dH - : safe_downcast(stride[1]); - - TORCH_CHECK( - padding.size() == 1 || padding.size() == 2, - "max_pool2d: padding must either be a single int, or a tuple of two ints"); - const int padH = safe_downcast(padding[0]); - const int padW = - padding.size() == 1 ? 
padH : safe_downcast(padding[1]); - - TORCH_CHECK( - dilation.size() == 1 || dilation.size() == 2, - "max_pool2d: dilation must be either a single int, or a tuple of two ints"); - const int dilationH = safe_downcast(dilation[0]); - const int dilationW = dilation.size() == 1 - ? dilationH - : safe_downcast(dilation[1]); - - TORCH_CHECK( - input.dtype() == gradOutput.dtype(), - "expected dtype ", - input.dtype(), - " for `gradOutput` but got dtype ", - gradOutput.dtype()); +#include +#include - const auto memory_format = input.suggest_memory_format(); - if (memory_format == at::MemoryFormat::ChannelsLast) { - TORCH_CHECK( - input.ndimension() == 4, - "non-empty 4D (batch mode) tensor expected for input with channels_last layout"); - } else if (memory_format == at::MemoryFormat::Contiguous) { - TORCH_CHECK( - (input.ndimension() == 3 || input.ndimension() == 4), - "non-empty 3D or 4D (batch mode) tensor expected for input"); - } else { - TORCH_CHECK( - false, - "Unsupport memory format. Supports only ChannelsLast, Contiguous"); - } - - /* sizes */ - const int64_t nInputPlane = input.size(-3); - const int64_t inputHeight = input.size(-2); - const int64_t inputWidth = input.size(-1); - - /* XXX preserve the existing shape check behavior */ - const int64_t outputHeight_for_shape_check = pooling_output_shape( - inputHeight, kH, padH, dH, dilationH, ceil_mode); - const int64_t outputWidth_for_shape_check = pooling_output_shape( - inputWidth, kW, padW, dW, dilationW, ceil_mode); - - max_pool2d_backward_shape_check( - input, +namespace at { +namespace native { +TORCH_IMPL_FUNC(max_pool2d_with_indices_backward_out_xpu) +(const Tensor& gradOutput, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode, + const Tensor& indices, + const Tensor& gradInput) { + xpu::max_pool2d_with_indices_backward_kernel( + gradInput, gradOutput, - indices, - kH, - kW, - dH, - dW, - padH, - padW, - dilationH, - dilationW, - nInputPlane, - inputHeight, - inputWidth, - outputHeight_for_shape_check, - outputWidth_for_shape_check, - memory_format); - - auto options = input.options().memory_format(memory_format); - if (gradInput.defined()) { - at::xpu::resize_out(gradInput, input.sizes(), {}, options); - } else { - gradInput = at::xpu::create_out(input.sizes(), {}, options); - } - - return gradInput; -} - -std::tuple XPUNativeFunctions::max_pool2d_with_indices( - const Tensor& input, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - bool ceil_mode) { - Tensor output; - Tensor indices; - max_pool2d_with_indices_meta( - input, - kernel_size, - stride, - padding, - dilation, - ceil_mode, - output, - indices); - - at::native::xpu::max_pool2d_with_indices_kernel( input, + indices, kernel_size, stride, padding, dilation, - ceil_mode, - output, - indices); - - return std::tuple(output, indices); + ceil_mode); } -std::tuple XPUNativeFunctions::max_pool2d_with_indices_out( - const Tensor& input, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - bool ceil_mode, - Tensor& output, - Tensor& indices) { - max_pool2d_with_indices_meta( +TORCH_IMPL_FUNC(max_pool2d_with_indices_out_xpu) +(const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode, + const Tensor& output, + const Tensor& indices) { + xpu::max_pool2d_with_indices_kernel( input, kernel_size, stride, @@ -309,77 +49,6 @@ std::tuple 
XPUNativeFunctions::max_pool2d_with_indices_out( ceil_mode, output, indices); - - at::native::xpu::max_pool2d_with_indices_kernel( - input, - kernel_size, - stride, - padding, - dilation, - ceil_mode, - output, - indices); - - return std::tuple(output, indices); } - -Tensor& XPUNativeFunctions::max_pool2d_with_indices_backward_out( - const Tensor& grad_output, - const Tensor& self, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - bool ceil_mode, - const Tensor& indices, - Tensor& grad_input) { - grad_input = max_pool2d_with_indices_backward_meta( - grad_output, - self, - kernel_size, - stride, - padding, - dilation, - ceil_mode, - indices, - grad_input); - - at::native::xpu::max_pool2d_with_indices_backward_kernel( - grad_input, - grad_output, - self, - indices, - kernel_size, - stride, - padding, - dilation, - ceil_mode); - - return grad_input; -} - -Tensor XPUNativeFunctions::max_pool2d_with_indices_backward( - const Tensor& grad_output, - const Tensor& self, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - bool ceil_mode, - const Tensor& indices) { - Tensor grad_input; - max_pool2d_with_indices_backward_out( - grad_output, - self, - kernel_size, - stride, - padding, - dilation, - ceil_mode, - indices, - grad_input); - - return grad_input; -} - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Distance.cpp b/src/ATen/native/xpu/Distance.cpp index 613eb542c..63ae0cced 100644 --- a/src/ATen/native/xpu/Distance.cpp +++ b/src/ATen/native/xpu/Distance.cpp @@ -1,123 +1,10 @@ +#include +#include #include -#include namespace at { -Tensor cdist_impl( - const Tensor& x1, - const Tensor& x2, - const double p, - c10::optional compute_mode) { - TORCH_CHECK( - at::isFloatingType(x1.scalar_type()), - "cdist only supports floating-point dtypes, X1 got: ", - x1.scalar_type()); - auto device1 = x1.device().type(); - TORCH_CHECK( - at::isFloatingType(x2.scalar_type()), - "cdist only supports floating-point dtypes, X2 got: ", - x2.scalar_type()); - auto device2 = x2.device().type(); - TORCH_CHECK(p >= 0, "cdist only supports non-negative p values"); - TORCH_CHECK( - device1 == device2, - "X1 and X2 must have the same device type. X1: ", - device1, - " X2: ", - device2); - // TODO: This is bad; this test should apply universally - TORCH_CHECK( - !x1.is_xpu() || x1.get_device() == x2.get_device(), - "device of X1 (", - x1.get_device(), - ") must match device of X2 (", - x2.get_device(), - ")"); - SymInt c1 = x1.sym_size(-1); - SymInt c2 = x2.sym_size(-1); - // 0 - default value. 
If p = 2 and r1 > 25 or r2 > 25 (these values are based - // on performance metrics), it will try to compute distance using matrix - // multiplication approach 1 - force to use matrix multiplication for p = 2 2 - // - do not use matrix multiplication for p = 2 - int64_t mode = compute_mode.value_or(0); - TORCH_CHECK( - mode >= 0 && mode <= 2, "possible modes: 0, 1, 2, but was: ", mode); - SymInt r1 = x1.size(-2); - SymInt r2 = x2.size(-2); - if (!(p == 2 && (mode == 1 || (mode == 0 && (r1 > 25 || r2 > 25))))) { - TORCH_CHECK( - device1 == kCPU || device1 == kXPU, - "cdist only supports CPU and XPU devices, X1 got: ", - device1); - TORCH_CHECK( - device2 == kCPU || device2 == kXPU, - "cdist only supports CPU and XPU devices, X2 got: ", - device2); - } - int64_t dim1 = x1.dim(); - int64_t dim2 = x2.dim(); - SymIntArrayRef batch_tensor1(x1.sym_sizes().data(), dim1 - 2); - SymIntArrayRef batch_tensor2(x2.sym_sizes().data(), dim2 - 2); - std::vector expand_batch_portion = - at::infer_size_symint(batch_tensor1, batch_tensor2); - std::vector x1_expand_size(expand_batch_portion); - x1_expand_size.insert(x1_expand_size.end(), {r1, c1}); - std::vector x2_expand_size(expand_batch_portion); - x2_expand_size.insert(x2_expand_size.end(), {r2, c2}); - - const SymInt expand_batch_product = - c10::multiply_integers(expand_batch_portion); - std::vector x1_view{expand_batch_product, r1, c1}; - std::vector x2_view{expand_batch_product, r2, c2}; - - Tensor x1_expanded = - x1.expand_symint(x1_expand_size).contiguous().view_symint(x1_view); - Tensor x2_expanded = - x2.expand_symint(x2_expand_size).contiguous().view_symint(x2_view); - - std::vector output_shape(std::move(expand_batch_portion)); - output_shape.insert(output_shape.end(), {r1, r2}); - - Tensor result; - if (r1 == 0 || r2 == 0 || expand_batch_product == 0) { - result = at::empty_symint(output_shape, x1.options()); - } else if (c1 == 0) { - result = at::zeros_symint(output_shape, x1.options()); - } else if (p == 2 && (mode == 1 || (mode == 0 && (r1 > 25 || r2 > 25)))) { - Tensor dist = (expand_batch_product == 1) - ? at::_euclidean_dist(x1, x2) - : at::_euclidean_dist(x1_expanded, x2_expanded); - result = dist.view_symint(output_shape); - } else { - result = at::empty_symint(output_shape, x1.options()); - native::xpu::cdist_kernel(result, x1_expanded, x2_expanded, p); - } - return result; +namespace native { +REGISTER_XPU_DISPATCH(cdist_stub, &xpu::cdist_kernel); } - -Tensor XPUNativeFunctions::_cdist_forward( - const Tensor& x1, - const Tensor& x2, - const double p, - c10::optional compute_mode) { - TORCH_CHECK( - x1.dim() >= 2, - "cdist only supports at least 2D tensors, X1 got: ", - x1.dim(), - "D"); - TORCH_CHECK( - x2.dim() >= 2, - "cdist only supports at least 2D tensors, X2 got: ", - x2.dim(), - "D"); - TORCH_CHECK( - x1.size(-1) == x2.size(-1), - "X1 and X2 must have the same number of columns. 
X1: ", - x1.size(-1), - " X2: ", - x2.size(-1)); - - return cdist_impl(x1, x2, p, compute_mode); -} - } // namespace at diff --git a/src/ATen/native/xpu/Distributions.cpp b/src/ATen/native/xpu/Distributions.cpp index 51ff727cc..bce51bbf8 100644 --- a/src/ATen/native/xpu/Distributions.cpp +++ b/src/ATen/native/xpu/Distributions.cpp @@ -1,314 +1,32 @@ -#include #include #include #include #include +#include #include +#include #include -#include +#include #include #include #include +#include namespace at { - -template -struct NormalStub { - void operator()( - Tensor& self, - double mean, - double std, - c10::optional gen) { - native::xpu::normal_kernel(self, mean, std, gen); - } -}; - -Tensor& XPUNativeFunctions::normal_( - Tensor& self, - double mean, - double std, - ::std::optional generator) { - return native::templates::normal_impl_( - self, mean, std, std::move(generator)); -} - -// out tensor float -Tensor& XPUNativeFunctions::normal_out( - const Tensor& mean, - double std, - c10::optional gen, - Tensor& output) { - return at::native::templates::normal_out_impl( - output, mean, std, std::move(gen)); -} - -// functional tensor float -Tensor XPUNativeFunctions::normal( - const Tensor& mean, - double std, - c10::optional gen) { - return at::native::templates::normal_impl( - mean, std, std::move(gen)); -} - -// out float tensor -Tensor& XPUNativeFunctions::normal_out( - double mean, - const Tensor& std, - c10::optional gen, - Tensor& output) { - return at::native::templates::normal_out_impl( - output, mean, std, std::move(gen)); -} - -// functional float tensor -Tensor XPUNativeFunctions::normal( - double mean, - const Tensor& std, - c10::optional gen) { - return at::native::templates::normal_impl( - mean, std, std::move(gen)); -} - -// out tensor tensor -Tensor& XPUNativeFunctions::normal_out( - const Tensor& mean, - const Tensor& std, - c10::optional gen, - Tensor& output) { - return at::native::templates::normal_out_impl( - output, mean, std, std::move(gen)); -} - -// functional tensor tensor -Tensor XPUNativeFunctions::normal( - const Tensor& mean, - const Tensor& std, - c10::optional gen) { - return at::native::templates::normal_impl( - mean, std, std::move(gen)); -} - -template -struct UniformStub { - void operator()( - TensorIteratorBase& iter, - double from, - double to, - c10::optional gen) { - native::xpu::uniform_kernel(iter, from, to, gen); - } -}; - -Tensor& XPUNativeFunctions::uniform_( - Tensor& self, - double from, - double to, - ::std::optional generator) { - return native::templates::uniform_impl_( - self, from, to, std::move(generator)); -} - -template -struct BernoulliStub { - void operator()( - Tensor& self, - const Tensor& p_, - c10::optional gen) { - native::xpu::bernoulli_tensor_kernel(self, p_, gen); - } - void operator()(Tensor& self, double p, c10::optional gen) { - native::xpu::bernoulli_scalar_kernel(self, p, gen); - } -}; - -Tensor& XPUNativeFunctions::bernoulli_( - Tensor& self, - const Tensor& p_, - ::std::optional generator) { - return native::templates::bernoulli_impl_( - self, p_, std::move(generator)); -} - -Tensor& XPUNativeFunctions::bernoulli_( - Tensor& self, - double p, - ::std::optional generator) { - return native::templates::bernoulli_impl_( - self, p, std::move(generator)); -} - -Tensor& XPUNativeFunctions::bernoulli_out( - const Tensor& self, - c10::optional gen, - Tensor& result) { - return native::templates::bernoulli_out_impl( - result, self, std::move(gen)); -} - -template -struct RandomStub { - void operator()(TensorIteratorBase& iter, 
c10::optional gen) { - native::xpu::random_kernel(iter, gen); - } -}; - -Tensor& XPUNativeFunctions::random_( - Tensor& self, - ::std::optional generator) { - return native::templates::random_impl( - self, std::move(generator)); -} - -template -struct RandomFromToStub { - void operator()( - TensorIteratorBase& iter, - uint64_t range, - int64_t from, - c10::optional gen) { - native::xpu::random_from_to_kernel(iter, range, from, gen); - } - void operator()(TensorIteratorBase& iter, c10::optional gen) { - native::xpu::random_full_64_bits_range_kernel(iter, gen); - } -}; - -Tensor& XPUNativeFunctions::random_( - Tensor& self, - int64_t from, - c10::optional to_opt, - ::std::optional generator) { - return native::templates::random_from_to_impl( - self, from, to_opt, std::move(generator)); -} - -Tensor& XPUNativeFunctions::random_( - Tensor& self, - int64_t to, - ::std::optional generator) { - return random_(self, 0, to, std::move(generator)); -} - -template -struct ExponentialStub { - void operator()( - TensorIteratorBase& iter, - double lambda, - c10::optional gen) { - native::xpu::exponential_kernel(iter, lambda, gen); - } -}; - -Tensor& XPUNativeFunctions::exponential_( - Tensor& self, - double lambda, - std::optional generator) { - return native::templates::exponential_impl_( - self, lambda, std::move(generator)); -} - -/* The largest consecutive integer representable in float32 (2^24) */ -constexpr int64_t FLOAT32_MAX_CONSECUTIVE_INT = 1 << (24); - -Tensor& XPUNativeFunctions::multinomial_out( - const Tensor& self, - int64_t n_sample, - bool with_replacement, - ::std::optional gen, - at::Tensor& result) { - TORCH_CHECK( - result.device() == self.device(), - "multinomial arguments must have the same device"); - TORCH_CHECK( - self.dim() > 0 && self.dim() <= 2, "prob_dist must be 1 or 2 dim"); - TORCH_CHECK( - at::isFloatingType(self.scalar_type()), - "multinomial only supports floating-point dtypes for input, got: ", - self.scalar_type()); - TORCH_CHECK( - result.scalar_type() == ScalarType::Long, - "multinomial expects Long tensor out, got: ", - result.scalar_type()); - TORCH_CHECK(n_sample > 0, "cannot sample n_sample <= 0 samples"); - int64_t n_categories = self.size(-1); - TORCH_CHECK( - with_replacement || (n_sample <= n_categories), - "cannot sample n_sample > prob_dist.size(-1) samples without replacement"); - // Since the index tensor is float, numCategories cannot exceed max - // float integer precision - TORCH_CHECK( - n_categories <= FLOAT32_MAX_CONSECUTIVE_INT, - "number of categories cannot exceed 2^24"); - - if (self.dim() == 1) { - result.resize_({n_sample}); - } else { - const int64_t n_dist = self.size(0); - result.resize_({n_dist, n_sample}); - } - if (result.numel() == 0) { - return result; - } - - // Fast-path for no replacement or if only one sample is drawn. - // Reference: - // https://github.com/pytorch/pytorch/issues/11931#issuecomment-625882503 - if (!with_replacement || n_sample == 1) { - // Sanity checks on `self`. 
- auto is_valid = ((self.max() < INFINITY) & (self.min() >= 0)).item(); - TORCH_CHECK( - is_valid.to(), - "probability tensor contains either `inf`, `nan` or element < 0"); - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - bool zero_prob_condition; - if (self.dim() == 1) { - zero_prob_condition = (self.sum() == 0).item().to(); - } else { - zero_prob_condition = (self.sum(1) == 0).sum().item().to(); - } - TORCH_CHECK( - !zero_prob_condition, - "invalid multinomial distribution (sum of probabilities <= 0)"); - - // The algorithm is from gumbel softmax. - // s = argmax( logp - log(-log(eps)) ) where eps ~ U(0, 1) - // Here we can apply exp to the formula which will not affect result of - // argmax or topk. Then we have - // s = argmax( p / (-log(eps)) ) where eps ~ U(0, 1). - // We can also simplify the formula above by - // s = argmax( p / q ) where q ~ Exp(1) - Tensor q = at::empty_like(self).exponential_(1, std::move(gen)); - // In theory the probability to generate 0 from exponential distribution is - // 0. However, on CUDA side there is a protection to avoid 0s, but on CPU - // side, there is a very low probability to generate 0 from - // exponential. The probability is about 2^(-DBL_MANT_DIG). We just - // ignore it here, but there may be some risk to get invalid output on CPU. - at::div_out(q, self, q); - if (n_sample == 1) { - at::argmax_out(result, q, /*dim=*/-1, /*keepdim=*/true); - } else { - Tensor vals = at::empty(result.sizes(), self.options()); - at::topk_out(vals, result, q, n_sample); - } - return result; - } - - at::native::xpu::multinomial_kernel(result, self, n_sample, gen); - return result; -} - -Tensor XPUNativeFunctions::multinomial( - const Tensor& self, - int64_t n_sample, - bool with_replacement, - ::std::optional gen) { - Tensor result = at::empty({0}, self.options().dtype(kLong)); - - XPUNativeFunctions::multinomial_out( - self, n_sample, with_replacement, std::move(gen), result); - return result; -} - +namespace native { +REGISTER_XPU_DISPATCH(normal_stub, &xpu::normal_kernel); +REGISTER_XPU_DISPATCH(uniform_stub, &xpu::uniform_kernel); +REGISTER_XPU_DISPATCH(bernoulli_scalar_stub, &xpu::bernoulli_scalar_kernel); +REGISTER_XPU_DISPATCH(bernoulli_tensor_stub, &xpu::bernoulli_tensor_kernel); +REGISTER_XPU_DISPATCH(random_stub, &xpu::random_kernel); +REGISTER_XPU_DISPATCH(random_from_to_stub, &xpu::random_from_to_kernel); +REGISTER_XPU_DISPATCH(exponential_stub, &xpu::exponential_kernel); +REGISTER_XPU_DISPATCH( + random_full_64_bits_range_stub, + &xpu::random_full_64_bits_range_kernel); +REGISTER_XPU_DISPATCH( + multinomial_with_replacement_stub, + &xpu::multinomial_kernel); +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Dropout.cpp b/src/ATen/native/xpu/Dropout.cpp index d3d74dbaf..950afccc0 100644 --- a/src/ATen/native/xpu/Dropout.cpp +++ b/src/ATen/native/xpu/Dropout.cpp @@ -1,20 +1,24 @@ -#include #include #include #include #include -#include + +#include +#include + +#include namespace at { -::std::tuple XPUNativeFunctions::native_dropout( +namespace native { +::std::tuple native_dropout_xpu( const Tensor& input, double p, ::std::optional train) { return at::native::xpu::dropout_kernel(input, p, train); } -Tensor XPUNativeFunctions::native_dropout_backward( +Tensor native_dropout_backward_xpu( const Tensor& grad_output, const Tensor& mask, double scale) { @@ -29,4 +33,5 @@ Tensor XPUNativeFunctions::native_dropout_backward( return at::native::xpu::dropout_backward_kernel(grad_output, mask, scale); } +} // namespace native } // 
namespace at diff --git a/src/ATen/native/xpu/Embedding.cpp b/src/ATen/native/xpu/Embedding.cpp index 1eb073e43..41901fb10 100644 --- a/src/ATen/native/xpu/Embedding.cpp +++ b/src/ATen/native/xpu/Embedding.cpp @@ -1,12 +1,13 @@ -#include #include +#include + #include -#include +#include namespace at { - -Tensor XPUNativeFunctions::embedding_dense_backward( +namespace native { +Tensor embedding_dense_backward_xpu( const Tensor& grad_output, const Tensor& indices, int64_t num_weights, @@ -20,9 +21,9 @@ Tensor XPUNativeFunctions::embedding_dense_backward( "grad_output"); c10::impl::check_and_update_common_device( common_device, indices, "xpu::embedding_dense_backward", "indices"); - return native::xpu::embedding_dense_backward_kernel( + return xpu::embedding_dense_backward_kernel( grad_output, indices, num_weights, padding_idx, scale_grad_by_freq); ; } - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/EmbeddingBag.cpp b/src/ATen/native/xpu/EmbeddingBag.cpp index 7300157d5..0786a9061 100644 --- a/src/ATen/native/xpu/EmbeddingBag.cpp +++ b/src/ATen/native/xpu/EmbeddingBag.cpp @@ -1,11 +1,13 @@ -#include -#include +#include +#include #include +#include namespace at { +namespace native { -std::tuple XPUNativeFunctions::_embedding_bag( +std::tuple _embedding_bag_xpu( const Tensor& weight, const Tensor& indices, const Tensor& offsets, @@ -46,55 +48,27 @@ std::tuple XPUNativeFunctions::_embedding_bag( padding_idx); } -std::tuple XPUNativeFunctions:: - _embedding_bag_forward_only( - const Tensor& weight, - const Tensor& indices, - const Tensor& offsets, - bool scale_grad_by_freq, - int64_t mode, - bool sparse, - const c10::optional& per_sample_weights_opt, - bool include_last_offset, - int64_t padding_idx) { - return _embedding_bag( - weight, - indices, - offsets, - scale_grad_by_freq, - mode, - sparse, - per_sample_weights_opt, - include_last_offset, - padding_idx); -} - -Tensor XPUNativeFunctions::_embedding_bag_backward( - const Tensor& grad, +std::tuple _embedding_bag_forward_only_xpu( + const Tensor& weight, const Tensor& indices, const Tensor& offsets, - const Tensor& offset2bag, - const Tensor& bag_size, - const Tensor& maximum_indices, - int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse, - const c10::optional& per_sample_weights, + const c10::optional& per_sample_weights_opt, + bool include_last_offset, int64_t padding_idx) { - return at::native::_embedding_bag_backward_symint( - grad, + return _embedding_bag_xpu( + weight, indices, offsets, - offset2bag, - bag_size, - maximum_indices, - num_weights, scale_grad_by_freq, mode, sparse, - per_sample_weights, + per_sample_weights_opt, + include_last_offset, padding_idx); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Equal.cpp b/src/ATen/native/xpu/Equal.cpp index 107936c72..dcee9b380 100644 --- a/src/ATen/native/xpu/Equal.cpp +++ b/src/ATen/native/xpu/Equal.cpp @@ -1,15 +1,16 @@ #include -#include -#ifndef AT_PER_OPERATOR_HEADERS -#include -#else -#include -#endif +#include namespace at { - -bool XPUNativeFunctions::equal(const Tensor& self, const Tensor& src) { +namespace xpu { +// Note: +// Seems {op}_xpu_dispatch.h is not generated in codegen via +// backendwhitelist mode. We have to manually add a declaration here. 
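+// (This hand-written prototype mirrors what the generated per-operator
+// dispatch header would normally declare; at::xpu::eq is the XPU-dispatched
+// eq entry point that xpu_equal() below calls. If codegen later emits the
+// header in backend-whitelist mode, this declaration can presumably be
+// replaced by including it; the exact header path is an assumption here.)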
+at::Tensor eq(const at::Tensor& self, const at::Tensor& other); +} // namespace xpu +namespace native { +bool xpu_equal(const Tensor& self, const Tensor& src) { if (!at::namedinference::are_names_equal( self.unsafeGetTensorImpl(), src.unsafeGetTensorImpl())) { return false; @@ -38,7 +39,7 @@ bool XPUNativeFunctions::equal(const Tensor& self, const Tensor& src) { return true; } - return at::XPUNativeFunctions::eq(self, src).all().item().to(); + return at::xpu::eq(self, src).all().item().to(); } - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Fill.cpp b/src/ATen/native/xpu/Fill.cpp index 025b2853f..88b3f765d 100644 --- a/src/ATen/native/xpu/Fill.cpp +++ b/src/ATen/native/xpu/Fill.cpp @@ -1,50 +1,10 @@ #include #include +#include #include #include -#include #include - -namespace at { - -Tensor& fill_out(Tensor& self, const Scalar& value) { - auto iter = TensorIteratorConfig() - .set_check_mem_overlap( - false) // Fill is idempotent, so overlap is okay - .check_all_same_dtype(false) - .add_output(self) - .resize_outputs(false) - .build(); - native::xpu::fill_kernel(iter, value); - return self; -} - -Tensor& XPUNativeFunctions::fill_(Tensor& self, const Scalar& value) { - return fill_out(self, value); -} - -Tensor& XPUNativeFunctions::fill_(Tensor& self, const Tensor& value) { - TORCH_CHECK( - value.dim() == 0, - "fill_ only supports 0-dimension value tensor but got tensor with ", - value.dim(), - " dimensions."); - if (self.device() != value.device()) { - return fill_out(self, value.item()); - } - // Check if value is a view of self and if it is we clone - // it to avoid overwriting self prematurely - if (self.is_alias_of(value)) { - self.copy_(value.clone()); - } else { - self.copy_(value); - } - return self; -} - -Tensor& XPUNativeFunctions::zero_(Tensor& self) { - return self.fill_(0); -} - -} // namespace at +namespace at::native { +REGISTER_XPU_DISPATCH(fill_stub, &native::xpu::fill_kernel); +} // namespace at::native diff --git a/src/ATen/native/xpu/ForeachOpList.cpp b/src/ATen/native/xpu/ForeachOpList.cpp index 9d9e01af8..6813a91ae 100644 --- a/src/ATen/native/xpu/ForeachOpList.cpp +++ b/src/ATen/native/xpu/ForeachOpList.cpp @@ -1,141 +1,186 @@ #include -#include - #include #include #include +#include + namespace at { +namespace native { + +::std::vector foreach_tensor_mul_list_kernel_slow( + at::TensorList self, + at::TensorList other); +void foreach_tensor_mul_list_kernel_slow_( + at::TensorList self, + at::TensorList other); + +::std::vector foreach_tensor_div_list_kernel_slow( + at::TensorList self, + at::TensorList other); +void foreach_tensor_div_list_kernel_slow_( + at::TensorList self, + at::TensorList other); + +::std::vector foreach_tensor_add_list_kernel_slow( + at::TensorList self, + at::TensorList other, + const at::Scalar& alpha); +void foreach_tensor_add_list_kernel_slow_( + at::TensorList self, + at::TensorList other, + const at::Scalar& alpha); #define FOREACH_BINARY_OP_LIST(NAME, DIVISION_OP) \ - void XPUNativeFunctions::_foreach_##NAME##_( \ + void foreach_tensor_##NAME##_list_kernel_xpu_( \ TensorList tensors1, TensorList tensors2) { \ - at::native::check_foreach_api_restrictions(tensors1, tensors2); \ - if (!at::native::can_use_fast_route(tensors1, tensors2, DIVISION_OP)) { \ - return at::native::foreach_tensor_##NAME##_list_kernel_slow_( \ - tensors1, tensors2); \ + check_foreach_api_restrictions(tensors1, tensors2); \ + if (!can_use_fast_route(tensors1, tensors2, DIVISION_OP)) { \ + return 
foreach_tensor_##NAME##_list_kernel_slow_(tensors1, tensors2); \ } \ \ - at::native::xpu::FOREACH_BINARY_LIST_INPLACE_KERNEL_NAME(NAME)( \ - tensors1, tensors2); \ + xpu::FOREACH_BINARY_LIST_INPLACE_KERNEL_NAME(NAME)(tensors1, tensors2); \ } \ \ - std::vector XPUNativeFunctions::_foreach_##NAME( \ + std::vector foreach_tensor_##NAME##_list_kernel_xpu( \ TensorList tensors1, TensorList tensors2) { \ - at::native::check_foreach_api_restrictions(tensors1, tensors2); \ - if (!at::native::can_use_fast_route(tensors1, tensors2, DIVISION_OP)) { \ - return at::native::foreach_tensor_##NAME##_list_kernel_slow( \ - tensors1, tensors2); \ + check_foreach_api_restrictions(tensors1, tensors2); \ + if (!can_use_fast_route(tensors1, tensors2, DIVISION_OP)) { \ + return foreach_tensor_##NAME##_list_kernel_slow(tensors1, tensors2); \ } \ \ - return at::native::xpu::FOREACH_BINARY_LIST_KERNEL_NAME(NAME)( \ - tensors1, tensors2); \ + return xpu::FOREACH_BINARY_LIST_KERNEL_NAME(NAME)(tensors1, tensors2); \ } -#define FOREACH_BINARY_OP_LIST_ALPHA(NAME) \ - void XPUNativeFunctions::_foreach_##NAME##_( \ - TensorList tensors1, TensorList tensors2, const Scalar& alpha) { \ - at::native::check_foreach_api_restrictions(tensors1, tensors2); \ - if (!at::native::can_use_fast_route({tensors1, tensors2}, alpha)) { \ - return at::native::foreach_tensor_##NAME##_list_kernel_slow_( \ - tensors1, tensors2, alpha); \ - } \ - \ - at::native::xpu::FOREACH_BINARY_LIST_ALPHA_INPLACE_KERNEL_NAME(NAME)( \ - tensors1, tensors2, alpha); \ - } \ - \ - std::vector XPUNativeFunctions::_foreach_##NAME( \ - TensorList tensors1, TensorList tensors2, const Scalar& alpha) { \ - at::native::check_foreach_api_restrictions(tensors1, tensors2); \ - if (!at::native::can_use_fast_route({tensors1, tensors2}, alpha)) { \ - return at::native::foreach_tensor_##NAME##_list_kernel_slow( \ - tensors1, tensors2, alpha); \ - } \ - \ - return at::native::xpu::FOREACH_BINARY_LIST_ALPHA_KERNEL_NAME(NAME)( \ - tensors1, tensors2, alpha); \ +#define FOREACH_BINARY_OP_LIST_ALPHA(NAME) \ + void foreach_tensor_##NAME##_list_kernel_xpu_( \ + TensorList tensors1, TensorList tensors2, const Scalar& alpha) { \ + check_foreach_api_restrictions(tensors1, tensors2); \ + if (!can_use_fast_route({tensors1, tensors2}, alpha)) { \ + return foreach_tensor_##NAME##_list_kernel_slow_( \ + tensors1, tensors2, alpha); \ + } \ + \ + xpu::FOREACH_BINARY_LIST_ALPHA_INPLACE_KERNEL_NAME(NAME)( \ + tensors1, tensors2, alpha); \ + } \ + \ + std::vector foreach_tensor_##NAME##_list_kernel_xpu( \ + TensorList tensors1, TensorList tensors2, const Scalar& alpha) { \ + check_foreach_api_restrictions(tensors1, tensors2); \ + if (!can_use_fast_route({tensors1, tensors2}, alpha)) { \ + return foreach_tensor_##NAME##_list_kernel_slow( \ + tensors1, tensors2, alpha); \ + } \ + \ + return xpu::FOREACH_BINARY_LIST_ALPHA_KERNEL_NAME(NAME)( \ + tensors1, tensors2, alpha); \ } FOREACH_BINARY_OP_LIST_ALPHA(add); FOREACH_BINARY_OP_LIST(mul, false); FOREACH_BINARY_OP_LIST(div, true); -#define FOREACH_POINTWISE_OP_TENSOR(NAME) \ - std::vector XPUNativeFunctions::_foreach_##NAME( \ - TensorList input, \ - TensorList tensors1, \ - TensorList tensors2, \ - const Tensor& scalars_) { \ - auto scalars = \ - at::native::convert_tensor_to_scalar_list(scalars_, input.size()); \ - at::native::check_foreach_api_restrictions( \ - input, tensors1, tensors2, scalars); \ - if (!at::native::can_use_fast_route({input, tensors1, tensors2}) || \ - at::native::has_integral_tensor(input, /* includeBool */ true)) { \ - 
return at::native::foreach_tensor_##NAME##_scalarlist_slow( \ - input, tensors1, tensors2, scalars); \ - } \ - \ - return native::xpu::foreach_##NAME##_kernel( \ - input, tensors1, tensors2, scalars); \ - } \ - \ - void XPUNativeFunctions::_foreach_##NAME##_( \ - TensorList input, \ - TensorList tensors1, \ - TensorList tensors2, \ - const Tensor& scalars_) { \ - auto scalars = \ - at::native::convert_tensor_to_scalar_list(scalars_, input.size()); \ - at::native::check_foreach_api_restrictions( \ - input, tensors1, tensors2, scalars); \ - if (!at::native::can_use_fast_route( \ - {input, tensors1, tensors2}, scalars) || \ - at::native::has_integral_tensor(input, /* includeBool */ true)) { \ - return at::native::foreach_tensor_##NAME##_scalarlist_slow_( \ - input, tensors1, tensors2, scalars); \ - } \ - \ - native::xpu::foreach_##NAME##_kernel_(input, tensors1, tensors2, scalars); \ +::std::vector foreach_tensor_addcmul_scalarlist_slow( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + at::ArrayRef scalars); +void foreach_tensor_addcmul_scalarlist_slow_( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + at::ArrayRef scalars); + +::std::vector foreach_tensor_addcdiv_scalarlist_slow( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + at::ArrayRef scalars); +void foreach_tensor_addcdiv_scalarlist_slow_( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + at::ArrayRef scalars); + +#define FOREACH_POINTWISE_OP_TENSOR(NAME) \ + std::vector foreach_tensor_##NAME##_list_kernel_xpu( \ + TensorList input, \ + TensorList tensors1, \ + TensorList tensors2, \ + const Tensor& scalars_) { \ + auto scalars = \ + at::native::convert_tensor_to_scalar_list(scalars_, input.size()); \ + at::native::check_foreach_api_restrictions( \ + input, tensors1, tensors2, scalars); \ + if (!at::native::can_use_fast_route({input, tensors1, tensors2}) || \ + at::native::has_integral_tensor(input, /* includeBool */ true)) { \ + return at::native::foreach_tensor_##NAME##_scalarlist_slow( \ + input, tensors1, tensors2, scalars); \ + } \ + \ + return native::xpu::foreach_##NAME##_kernel( \ + input, tensors1, tensors2, scalars); \ + } \ + \ + void foreach_tensor_##NAME##_list_kernel_xpu_( \ + TensorList input, \ + TensorList tensors1, \ + TensorList tensors2, \ + const Tensor& scalars_) { \ + auto scalars = convert_tensor_to_scalar_list(scalars_, input.size()); \ + check_foreach_api_restrictions(input, tensors1, tensors2, scalars); \ + if (!can_use_fast_route({input, tensors1, tensors2}, scalars) || \ + has_integral_tensor(input, /* includeBool */ true)) { \ + return foreach_tensor_##NAME##_scalarlist_slow_( \ + input, tensors1, tensors2, scalars); \ + } \ + \ + xpu::foreach_##NAME##_kernel_(input, tensors1, tensors2, scalars); \ } FOREACH_POINTWISE_OP_TENSOR(addcmul) FOREACH_POINTWISE_OP_TENSOR(addcdiv) -std::vector XPUNativeFunctions::_foreach_lerp( +::std::vector foreach_tensor_ternary_lerp_slow( + at::TensorList self, + at::TensorList tensors1, + at::TensorList weights); + +std::vector foreach_tensor_lerp_ternary_xpu( TensorList tensors1, TensorList tensors2, TensorList tensors3) { - at::native::check_foreach_api_restrictions(tensors1, tensors2, tensors3); - if (!at::native::can_use_fast_route( - {tensors1, tensors2, tensors3}, {}, true)) { - return at::native::foreach_tensor_ternary_lerp_slow( - tensors1, tensors2, tensors3); + check_foreach_api_restrictions(tensors1, tensors2, tensors3); + if 
(!can_use_fast_route({tensors1, tensors2, tensors3}, {}, true)) { + return foreach_tensor_ternary_lerp_slow(tensors1, tensors2, tensors3); } std::vector vec_res; vec_res.reserve(tensors1.size()); for (const auto& t : tensors1) { - vec_res.emplace_back(at::native::empty_like(t)); + vec_res.emplace_back(at::empty_like(t)); } - native::xpu::foreach_lerp_list_kernel(tensors1, tensors2, tensors3, vec_res); + xpu::foreach_lerp_list_kernel(tensors1, tensors2, tensors3, vec_res); return vec_res; } -void XPUNativeFunctions::_foreach_lerp_( +void foreach_tensor_ternary_lerp_slow_( + at::TensorList self, + at::TensorList tensors1, + at::TensorList weights); + +void foreach_tensor_lerp_ternary_xpu_( TensorList tensors1, TensorList tensors2, TensorList tensors3) { - at::native::check_foreach_api_restrictions(tensors1, tensors2, tensors3); - if (!at::native::can_use_fast_route( - {tensors1, tensors2, tensors3}, {}, true)) { - return at::native::foreach_tensor_ternary_lerp_slow_( - tensors1, tensors2, tensors3); + check_foreach_api_restrictions(tensors1, tensors2, tensors3); + if (!can_use_fast_route({tensors1, tensors2, tensors3}, {}, true)) { + return foreach_tensor_ternary_lerp_slow_(tensors1, tensors2, tensors3); } - native::xpu::foreach_lerp_list_kernel_(tensors1, tensors2, tensors3); + xpu::foreach_lerp_list_kernel_(tensors1, tensors2, tensors3); // TODO: Handle version bump in codegen. // increment_version @@ -144,4 +189,5 @@ void XPUNativeFunctions::_foreach_lerp_( } } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/ForeachOpScalar.cpp b/src/ATen/native/xpu/ForeachOpScalar.cpp index 95238d0dc..46b908ced 100644 --- a/src/ATen/native/xpu/ForeachOpScalar.cpp +++ b/src/ATen/native/xpu/ForeachOpScalar.cpp @@ -3,109 +3,155 @@ #include #include #include -#include namespace at { -#define FOREACH_BINARY_OP_SCALAR(NAME, DIV_OP) \ - void XPUNativeFunctions::_foreach_##NAME##_( \ - TensorList tensors, const Scalar& scalar) { \ - at::native::check_foreach_api_restrictions(tensors); \ - if (!at::native::can_use_fast_route(tensors, scalar, DIV_OP)) { \ - return at::native::foreach_tensor_##NAME##_scalar_kernel_slow_( \ - tensors, scalar); \ - } \ - \ - at::native::xpu::FOREACH_BINARY_SCALAR_INPLACE_KERNEL_NAME(NAME)( \ - tensors, scalar); \ - } \ - \ - std::vector XPUNativeFunctions::_foreach_##NAME( \ - TensorList tensors, const Scalar& scalar) { \ - at::native::check_foreach_api_restrictions(tensors); \ - if (!at::native::can_use_fast_route(tensors, scalar, DIV_OP)) { \ - return at::native::foreach_tensor_##NAME##_scalar_kernel_slow( \ - tensors, scalar); \ - } \ - \ - return at::native::xpu::FOREACH_BINARY_SCALAR_KERNEL_NAME(NAME)( \ - tensors, scalar); \ +namespace native { + +::std::vector foreach_tensor_add_scalar_kernel_slow( + at::TensorList self, + const at::Scalar& scalar); +void foreach_tensor_add_scalar_kernel_slow_( + at::TensorList self, + const at::Scalar& scalar); + +::std::vector foreach_tensor_mul_scalar_kernel_slow( + at::TensorList self, + const at::Scalar& scalar); +void foreach_tensor_mul_scalar_kernel_slow_( + at::TensorList self, + const at::Scalar& scalar); + +::std::vector foreach_tensor_div_scalar_kernel_slow( + at::TensorList self, + const at::Scalar& scalar); +void foreach_tensor_div_scalar_kernel_slow_( + at::TensorList self, + const at::Scalar& scalar); + +#define FOREACH_BINARY_OP_SCALAR(NAME, DIV_OP) \ + void foreach_tensor_##NAME##_scalar_kernel_xpu_( \ + TensorList tensors, const Scalar& scalar) { \ + check_foreach_api_restrictions(tensors); \ + if 
(!can_use_fast_route(tensors, scalar, DIV_OP)) { \ + return foreach_tensor_##NAME##_scalar_kernel_slow_(tensors, scalar); \ + } \ + \ + xpu::FOREACH_BINARY_SCALAR_INPLACE_KERNEL_NAME(NAME)(tensors, scalar); \ + } \ + \ + std::vector foreach_tensor_##NAME##_scalar_kernel_xpu( \ + TensorList tensors, const Scalar& scalar) { \ + check_foreach_api_restrictions(tensors); \ + if (!can_use_fast_route(tensors, scalar, DIV_OP)) { \ + return foreach_tensor_##NAME##_scalar_kernel_slow(tensors, scalar); \ + } \ + \ + return xpu::FOREACH_BINARY_SCALAR_KERNEL_NAME(NAME)(tensors, scalar); \ } FOREACH_BINARY_OP_SCALAR(add, /*div_op*/ false); FOREACH_BINARY_OP_SCALAR(mul, /*div_op*/ false); FOREACH_BINARY_OP_SCALAR(div, /*div_op*/ true); -#define FOREACH_POINTWISE_OP_SCALAR(NAME) \ - std::vector XPUNativeFunctions::_foreach_##NAME( \ - TensorList input, \ - TensorList tensors1, \ - TensorList tensors2, \ - const Scalar& scalar) { \ - at::native::check_foreach_api_restrictions(input, tensors1, tensors2); \ - \ - if (!at::native::can_use_fast_route( \ - {input, tensors1, tensors2}, scalar) || \ - at::native::has_integral_tensor(input, /* includeBool */ true)) { \ - return at::native::foreach_tensor_##NAME##_scalar_slow( \ - input, tensors1, tensors2, scalar); \ - } \ - \ - return native::xpu::foreach_##NAME##_kernel( \ - input, tensors1, tensors2, scalar); \ - } \ - \ - void XPUNativeFunctions::_foreach_##NAME##_( \ - TensorList input, \ - TensorList tensors1, \ - TensorList tensors2, \ - const Scalar& scalar) { \ - at::native::check_foreach_api_restrictions(input, tensors1, tensors2); \ - \ - if (!at::native::can_use_fast_route( \ - {input, tensors1, tensors2}, scalar) || \ - at::native::has_integral_tensor(input, /* includeBool */ true)) { \ - return at::native::foreach_tensor_##NAME##_scalar_slow_( \ - input, tensors1, tensors2, scalar); \ - } \ - \ - native::xpu::foreach_##NAME##_kernel_(input, tensors1, tensors2, scalar); \ +::std::vector foreach_tensor_addcmul_scalar_slow( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + const at::Scalar& value); +void foreach_tensor_addcmul_scalar_slow_( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + const at::Scalar& value); + +::std::vector foreach_tensor_addcdiv_scalar_slow( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + const at::Scalar& value); +void foreach_tensor_addcdiv_scalar_slow_( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + const at::Scalar& value); + +#define FOREACH_POINTWISE_OP_SCALAR(NAME) \ + std::vector foreach_tensor_##NAME##_scalar_xpu( \ + TensorList input, \ + TensorList tensors1, \ + TensorList tensors2, \ + const Scalar& scalar) { \ + check_foreach_api_restrictions(input, tensors1, tensors2); \ + \ + if (!can_use_fast_route({input, tensors1, tensors2}, scalar) || \ + has_integral_tensor(input, /* includeBool */ true)) { \ + return foreach_tensor_##NAME##_scalar_slow( \ + input, tensors1, tensors2, scalar); \ + } \ + \ + return xpu::foreach_##NAME##_kernel(input, tensors1, tensors2, scalar); \ + } \ + \ + void foreach_tensor_##NAME##_scalar_xpu_( \ + TensorList input, \ + TensorList tensors1, \ + TensorList tensors2, \ + const Scalar& scalar) { \ + check_foreach_api_restrictions(input, tensors1, tensors2); \ + \ + if (!can_use_fast_route({input, tensors1, tensors2}, scalar) || \ + has_integral_tensor(input, /* includeBool */ true)) { \ + return foreach_tensor_##NAME##_scalar_slow_( \ + input, tensors1, tensors2, scalar); \ + } 
\ + \ + xpu::foreach_##NAME##_kernel_(input, tensors1, tensors2, scalar); \ } FOREACH_POINTWISE_OP_SCALAR(addcmul) FOREACH_POINTWISE_OP_SCALAR(addcdiv) -std::vector XPUNativeFunctions::_foreach_lerp( +::std::vector foreach_tensor_lerp_list_kernel_slow( + at::TensorList self, + at::TensorList tensors1, + const at::Scalar& weight); +void foreach_tensor_lerp_list_kernel_slow_( + at::TensorList self, + at::TensorList tensors1, + const at::Scalar& weight); + +std::vector foreach_tensor_lerp_list_xpu( TensorList tensors1, TensorList tensors2, const Scalar& weight) { - at::native::check_foreach_api_restrictions(tensors1, tensors2); - if (!at::native::can_use_fast_route({tensors1, tensors2}, {}, true)) { - return at::native::foreach_tensor_lerp_list_kernel_slow( - tensors1, tensors2, weight); + check_foreach_api_restrictions(tensors1, tensors2); + if (!can_use_fast_route({tensors1, tensors2}, {}, true)) { + return foreach_tensor_lerp_list_kernel_slow(tensors1, tensors2, weight); } std::vector vec_res; vec_res.reserve(tensors1.size()); for (const auto& t : tensors1) { - vec_res.emplace_back(at::native::empty_like(t)); + vec_res.emplace_back(at::empty_like(t)); } - native::xpu::foreach_lerp_scalar_kernel(tensors1, tensors2, weight, vec_res); + xpu::foreach_lerp_scalar_kernel(tensors1, tensors2, weight, vec_res); return vec_res; } -void XPUNativeFunctions::_foreach_lerp_( +void foreach_tensor_lerp_list_xpu_( TensorList tensors1, TensorList tensors2, const Scalar& weight) { - at::native::check_foreach_api_restrictions(tensors1, tensors2); - if (!at::native::can_use_fast_route({tensors1, tensors2}, {}, true)) { - return at::native::foreach_tensor_lerp_list_kernel_slow_( - tensors1, tensors2, weight); + check_foreach_api_restrictions(tensors1, tensors2); + if (!can_use_fast_route({tensors1, tensors2}, {}, true)) { + return foreach_tensor_lerp_list_kernel_slow_(tensors1, tensors2, weight); } - native::xpu::foreach_lerp_scalar_kernel_(tensors1, tensors2, weight); + xpu::foreach_lerp_scalar_kernel_(tensors1, tensors2, weight); } + +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/ForeachOpScalarList.cpp b/src/ATen/native/xpu/ForeachOpScalarList.cpp index 7752395db..6ac047476 100644 --- a/src/ATen/native/xpu/ForeachOpScalarList.cpp +++ b/src/ATen/native/xpu/ForeachOpScalarList.cpp @@ -2,78 +2,154 @@ #include #include -#include + +#include +#include namespace at { +namespace native { +::std::vector foreach_tensor_add_scalar_kernel_slow( + at::TensorList self, + const at::Scalar& scalar); +void foreach_tensor_add_scalar_kernel_slow_( + at::TensorList self, + const at::Scalar& scalar); +::std::vector foreach_tensor_mul_scalar_kernel_slow( + at::TensorList self, + const at::Scalar& scalar); +void foreach_tensor_mul_scalar_kernel_slow_( + at::TensorList self, + const at::Scalar& scalar); -#define FOREACH_BINARY_OP_SCALARLIST(NAME, DIV_OP) \ - void XPUNativeFunctions::_foreach_##NAME##_( \ - TensorList tensors, at::ArrayRef scalars) { \ - at::native::check_foreach_api_restrictions(tensors, scalars); \ - if (!at::native::can_use_fast_route(tensors, scalars, DIV_OP)) { \ - return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow_( \ - tensors, scalars); \ - } \ - \ - at::native::xpu::FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL_NAME(NAME)( \ - tensors, scalars); \ - } \ - \ - std::vector XPUNativeFunctions::_foreach_##NAME( \ - TensorList tensors, at::ArrayRef scalars) { \ - at::native::check_foreach_api_restrictions(tensors, scalars); \ - if (!at::native::can_use_fast_route(tensors, 
scalars, DIV_OP)) { \ - return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow( \ - tensors, scalars); \ - } \ - \ - return at::native::xpu::FOREACH_BINARY_SCALARLIST_KERNEL_NAME(NAME)( \ - tensors, scalars); \ - } +::std::vector foreach_tensor_add_scalarlist_kernel_slow( + at::TensorList self, + at::ArrayRef scalars); +void foreach_tensor_add_scalarlist_kernel_slow_( + at::TensorList self, + at::ArrayRef scalars); +::std::vector foreach_tensor_mul_scalarlist_kernel_slow( + at::TensorList self, + at::ArrayRef scalars); +void foreach_tensor_mul_scalarlist_kernel_slow_( + at::TensorList self, + at::ArrayRef scalars); -FOREACH_BINARY_OP_SCALARLIST(add, /*div_op*/ false); -FOREACH_BINARY_OP_SCALARLIST(mul, /*div_op*/ false); -FOREACH_BINARY_OP_SCALARLIST(div, /*div_op*/ true); +::std::vector foreach_tensor_div_scalar_kernel_slow( + at::TensorList self, + const at::Scalar& scalar); +void foreach_tensor_div_scalar_kernel_slow_( + at::TensorList self, + const at::Scalar& scalar); +::std::vector foreach_tensor_div_scalarlist_kernel_slow( + at::TensorList self, + at::ArrayRef scalars); +void foreach_tensor_div_scalarlist_kernel_slow_( + at::TensorList self, + at::ArrayRef scalars); -#define FOREACH_POINTWISE_OP_SCALARLIST(NAME) \ - std::vector XPUNativeFunctions::_foreach_##NAME( \ - TensorList input, \ - TensorList tensors1, \ - TensorList tensors2, \ - at::ArrayRef scalars) { \ - at::native::check_foreach_api_restrictions( \ - input, tensors1, tensors2, scalars); \ - \ - if (!at::native::can_use_fast_route( \ - {input, tensors1, tensors2}, scalars) || \ - at::native::has_integral_tensor(input, /* includeBool */ true)) { \ - return at::native::foreach_tensor_##NAME##_scalarlist_slow( \ - input, tensors1, tensors2, scalars); \ +#define FOREACH_BINARY_OP_SCALARLIST(NAME, DIV_OP) \ + void foreach_tensor_##NAME##_scalar_kernel_xpu_( \ + TensorList tensors, at::ArrayRef scalars) { \ + check_foreach_api_restrictions(tensors, scalars); \ + if (!can_use_fast_route(tensors, scalars, DIV_OP)) { \ + return foreach_tensor_##NAME##_scalarlist_kernel_slow_( \ + tensors, scalars); \ } \ \ - return native::xpu::foreach_##NAME##_kernel( \ - input, tensors1, tensors2, scalars); \ + xpu::FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL_NAME(NAME)( \ + tensors, scalars); \ } \ \ - void XPUNativeFunctions::_foreach_##NAME##_( \ - TensorList input, \ - TensorList tensors1, \ - TensorList tensors2, \ - at::ArrayRef scalars) { \ - at::native::check_foreach_api_restrictions( \ - input, tensors1, tensors2, scalars); \ - \ - if (!at::native::can_use_fast_route( \ - {input, tensors1, tensors2}, scalars) || \ - at::native::has_integral_tensor(input, /* includeBool */ true)) { \ - return at::native::foreach_tensor_##NAME##_scalarlist_slow_( \ - input, tensors1, tensors2, scalars); \ + std::vector foreach_tensor_##NAME##_scalar_kernel_xpu( \ + TensorList tensors, at::ArrayRef scalars) { \ + check_foreach_api_restrictions(tensors, scalars); \ + if (!can_use_fast_route(tensors, scalars, DIV_OP)) { \ + return foreach_tensor_##NAME##_scalarlist_kernel_slow(tensors, scalars); \ } \ \ - native::xpu::foreach_##NAME##_kernel_(input, tensors1, tensors2, scalars); \ + return xpu::FOREACH_BINARY_SCALARLIST_KERNEL_NAME(NAME)(tensors, scalars); \ + } + +FOREACH_BINARY_OP_SCALARLIST(add, /*div_op*/ false); +FOREACH_BINARY_OP_SCALARLIST(mul, /*div_op*/ false); +FOREACH_BINARY_OP_SCALARLIST(div, /*div_op*/ true); + +void foreach_tensor_addcmul_scalar_slow_( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + const 
at::Scalar& value = 1); +::std::vector foreach_tensor_addcmul_scalar_slow( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + const at::Scalar& value = 1); +::std::vector foreach_tensor_addcmul_scalarlist_slow( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + at::ArrayRef scalars); +void foreach_tensor_addcmul_scalarlist_slow_( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + at::ArrayRef scalars); +void foreach_tensor_addcdiv_scalar_slow_( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + const at::Scalar& value = 1); +::std::vector foreach_tensor_addcdiv_scalar_slow( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + const at::Scalar& value = 1); +::std::vector foreach_tensor_addcdiv_scalarlist_slow( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + at::ArrayRef scalars); +void foreach_tensor_addcdiv_scalarlist_slow_( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + at::ArrayRef scalars); + +#define FOREACH_POINTWISE_OP_SCALARLIST(NAME) \ + std::vector foreach_tensor_##NAME##_scalarlist_xpu( \ + TensorList input, \ + TensorList tensors1, \ + TensorList tensors2, \ + at::ArrayRef scalars) { \ + check_foreach_api_restrictions(input, tensors1, tensors2, scalars); \ + \ + if (!can_use_fast_route({input, tensors1, tensors2}, scalars) || \ + has_integral_tensor(input, /* includeBool */ true)) { \ + return foreach_tensor_##NAME##_scalarlist_slow( \ + input, tensors1, tensors2, scalars); \ + } \ + \ + return xpu::foreach_##NAME##_kernel(input, tensors1, tensors2, scalars); \ + } \ + \ + void foreach_tensor_##NAME##_scalarlist_xpu_( \ + TensorList input, \ + TensorList tensors1, \ + TensorList tensors2, \ + at::ArrayRef scalars) { \ + check_foreach_api_restrictions(input, tensors1, tensors2, scalars); \ + \ + if (!can_use_fast_route({input, tensors1, tensors2}, scalars) || \ + has_integral_tensor(input, /* includeBool */ true)) { \ + return foreach_tensor_##NAME##_scalarlist_slow_( \ + input, tensors1, tensors2, scalars); \ + } \ + \ + xpu::foreach_##NAME##_kernel_(input, tensors1, tensors2, scalars); \ } FOREACH_POINTWISE_OP_SCALARLIST(addcmul) FOREACH_POINTWISE_OP_SCALARLIST(addcdiv) +}; // namespace native } // namespace at diff --git a/src/ATen/native/xpu/ForeachReduceOp.cpp b/src/ATen/native/xpu/ForeachReduceOp.cpp index b67314db3..003f6ae14 100644 --- a/src/ATen/native/xpu/ForeachReduceOp.cpp +++ b/src/ATen/native/xpu/ForeachReduceOp.cpp @@ -1,9 +1,10 @@ #include -#include #include +#include namespace at { +namespace native { static inline void check_foreach_norm_dtype( optional opt_dtype, @@ -39,7 +40,7 @@ static inline void check_foreach_norm_dtype( } } -std::vector XPUNativeFunctions::_foreach_norm( +std::vector foreach_tensor_norm_xpu( TensorList tensors, const Scalar& ord, c10::optional dtype) { @@ -68,5 +69,5 @@ std::vector XPUNativeFunctions::_foreach_norm( return native::xpu::foreach_norm_kernel(tensors, ord, p, dtype); } - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/ForeachUnaryOp.cpp b/src/ATen/native/xpu/ForeachUnaryOp.cpp index 4287488dd..89cd0ab4e 100644 --- a/src/ATen/native/xpu/ForeachUnaryOp.cpp +++ b/src/ATen/native/xpu/ForeachUnaryOp.cpp @@ -1,31 +1,33 @@ #include #include -#include namespace at { - +namespace native { // given a functor and a "dispatch function", creates the outplace and inplace // operations -#define FOREACH_UNARY_OP(op_name) \ - 
std::vector XPUNativeFunctions::_foreach_##op_name( \ - TensorList tensors) { \ - native::check_foreach_api_restrictions(tensors); \ - if (!native::can_use_fast_route(tensors) || \ - native::has_integral_tensor(tensors, /* includeBool */ true)) { \ - return at::native::foreach_tensor_##op_name##_slow(tensors); \ - } \ - return native::xpu::foreach_##op_name##_kernel(tensors); \ - } \ - void XPUNativeFunctions::_foreach_##op_name##_(TensorList tensors) { \ - native::check_foreach_api_restrictions(tensors); \ - if (!native::can_use_fast_route(tensors) || \ - native::has_integral_tensor(tensors, /* includeBool */ true)) { \ - return at::native::foreach_tensor_##op_name##_slow_(tensors); \ - } \ - \ - native::xpu::foreach_##op_name##_kernel_(tensors); \ + +::std::vector foreach_tensor_sqrt_slow(at::TensorList self); +void foreach_tensor_sqrt_slow_(at::TensorList self); + +#define FOREACH_UNARY_OP(op_name) \ + std::vector foreach_tensor_##op_name##_xpu(TensorList tensors) { \ + check_foreach_api_restrictions(tensors); \ + if (!can_use_fast_route(tensors) || \ + has_integral_tensor(tensors, /* includeBool */ true)) { \ + return foreach_tensor_##op_name##_slow(tensors); \ + } \ + return xpu::foreach_##op_name##_kernel(tensors); \ + } \ + void foreach_tensor_##op_name##_xpu_(TensorList tensors) { \ + check_foreach_api_restrictions(tensors); \ + if (!can_use_fast_route(tensors) || \ + has_integral_tensor(tensors, /* includeBool */ true)) { \ + return foreach_tensor_##op_name##_slow_(tensors); \ + } \ + \ + xpu::foreach_##op_name##_kernel_(tensors); \ } FOREACH_UNARY_OP(sqrt); - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/GatedLinearUnit.cpp b/src/ATen/native/xpu/GatedLinearUnit.cpp index ef45a3bd7..5872ecbb2 100644 --- a/src/ATen/native/xpu/GatedLinearUnit.cpp +++ b/src/ATen/native/xpu/GatedLinearUnit.cpp @@ -1,49 +1,15 @@ #include #include +#include +#include #include -#include - #include namespace at { +namespace native { +REGISTER_XPU_DISPATCH(glu_stub, &xpu::glu_kernel); -TensorIterator glu_meta(const Tensor& self, int64_t dim, Tensor& out) { - // this can't pass anyway because a 0-dimensional tensor has "size" 1, which - // can't be evenly halved, but give a nicer error message here. 
- TORCH_CHECK(self.dim() > 0, "glu does not support 0-dimensional tensors"); - auto wrap_dim = maybe_wrap_dim(dim, self.dim()); - const int64_t nIn = self.size(wrap_dim); - TORCH_CHECK( - nIn % 2 == 0, - "Halving dimension must be even, but dimension ", - wrap_dim, - " is size ", - nIn); - - // size output to half of input - const int64_t selfSize = nIn / 2; - Tensor firstHalf = self.narrow(wrap_dim, 0, selfSize); - Tensor secondHalf = self.narrow(wrap_dim, selfSize, selfSize); - return TensorIterator::borrowing_binary_op(out, firstHalf, secondHalf); -} - -Tensor& XPUNativeFunctions::glu_out( - const Tensor& self, - int64_t dim, - Tensor& out) { - auto iter = glu_meta(self, dim, out); - native::xpu::glu_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::glu(const Tensor& self, int64_t dim) { - Tensor out; - auto iter = glu_meta(self, dim, out); - native::xpu::glu_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::glu_backward_out( +Tensor& glu_backward_xpu_out( const Tensor& grad_output, const Tensor& input, int64_t dim, @@ -91,12 +57,13 @@ Tensor& XPUNativeFunctions::glu_backward_out( return grad_input; } -Tensor XPUNativeFunctions::glu_backward( +Tensor glu_backward_xpu( const Tensor& grad_output, const Tensor& input, int64_t dim) { auto grad_input = at::empty({0}, input.options()); - return glu_backward_out(grad_output, input, dim, grad_input); + return glu_backward_xpu_out(grad_output, input, dim, grad_input); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/GridSampler.cpp b/src/ATen/native/xpu/GridSampler.cpp index 17e8baf21..fa9a5d17e 100644 --- a/src/ATen/native/xpu/GridSampler.cpp +++ b/src/ATen/native/xpu/GridSampler.cpp @@ -1,22 +1,24 @@ -#include #include -#include #include +#include +#include +#include namespace at { +namespace native { -Tensor XPUNativeFunctions::grid_sampler_2d( +Tensor grid_sampler_2d_xpu( const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { - return native::xpu::grid_sampler_2d_kernel( + return xpu::grid_sampler_2d_kernel( input, grid, interpolation_mode, padding_mode, align_corners); } -std::tuple XPUNativeFunctions::grid_sampler_2d_backward( +std::tuple grid_sampler_2d_backward_xpu( const Tensor& grad_output, const Tensor& input, const Tensor& grid, @@ -33,7 +35,7 @@ std::tuple XPUNativeFunctions::grid_sampler_2d_backward( } })(); auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - native::xpu::grid_sampler_2d_backward_kernel( + xpu::grid_sampler_2d_backward_kernel( grad_input, grad_grid, grad_output, @@ -45,5 +47,5 @@ std::tuple XPUNativeFunctions::grid_sampler_2d_backward( output_mask); return std::make_tuple(grad_input, grad_grid); } - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/GroupNorm.cpp b/src/ATen/native/xpu/GroupNorm.cpp index 0e0a2e558..77d788059 100644 --- a/src/ATen/native/xpu/GroupNorm.cpp +++ b/src/ATen/native/xpu/GroupNorm.cpp @@ -1,145 +1,15 @@ -#include #include +#include #include +#include #include -#include +#include namespace at { - -template -void check_group_norm_inputs( - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - T C, - int64_t num_groups) { - TORCH_CHECK( - num_groups > 0, - "Expected num groups to be greater than 0, got ", - num_groups); - TORCH_CHECK( - C % num_groups == 0, - "Expected number of channels in input to be divisible by ", - "num_groups, but got input of shape ", - input.sizes(), - " and " - "num_groups=", - num_groups); - 
TORCH_CHECK( - !weight.defined() || - (weight.dim() == 1 && at::symint::numel(weight) == C), - "Expected weight to be a vector of size equal to the number of ", - "channels in input, but got weight of shape ", - weight.sizes(), - " and input of shape ", - input.sizes()); - TORCH_CHECK( - !bias.defined() || (bias.dim() == 1 && at::symint::numel(bias) == C), - "Expected bias to be a vector of size equal to the number of ", - "channels in input, but got bias of shape ", - weight.sizes(), - " and input of shape ", - input.sizes()); -} - -std::tuple XPUNativeFunctions::native_group_norm( - const Tensor& X, - const std::optional& gamma_opt, - const std::optional& beta_opt, - int64_t N, - int64_t C, - int64_t HxW, - int64_t group, - double eps) { - // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned gamma_maybe_owned = - at::borrow_from_optional_tensor(gamma_opt); - const Tensor& gamma = *gamma_maybe_owned; - const Tensor& beta = c10::value_or_else(beta_opt, [] { return Tensor(); }); - - // repeated check so expanded weights can call native_group_norm directly but - // save mean and variance from forward - check_group_norm_inputs(X, gamma, beta, C, group); - - bool mixed_type = at::native::is_mixed_type(X, gamma, beta); - if (mixed_type) { - at::native::check_mixed_data_type(X, gamma, beta); - } - - Tensor Y = at::native::empty_like( - X, - c10::nullopt /* dtype */, - c10::nullopt /* layout */, - c10::nullopt /* device */, - c10::nullopt /* pin_memory */, - MemoryFormat::Contiguous); - const auto dtype = at::native::param_scalar_type(X, mixed_type); - Tensor mean = at::empty({N, group}, X.options().dtype(dtype)); - Tensor rstd = at::empty({N, group}, X.options().dtype(dtype)); - native::xpu::group_norm_kernel( - X, gamma, beta, N, C, HxW, group, eps, Y, mean, rstd, dtype); - return std::make_tuple(Y, mean, rstd); -} - -std::tuple XPUNativeFunctions:: - native_group_norm_backward( - const Tensor& dY, - const Tensor& X, - const Tensor& mean, - const Tensor& rstd, - const c10::optional& gamma_opt, - int64_t N, - int64_t C, - int64_t HxW, - int64_t group, - std::array grad_input_mask) { - // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned gamma_maybe_owned = - at::borrow_from_optional_tensor(gamma_opt); - const Tensor& gamma = *gamma_maybe_owned; - TORCH_CHECK( - X.scalar_type() == dY.scalar_type(), - "Expected scalar types of X and dY are same."); - bool mixed_type = at::native::is_mixed_type(X, mean, rstd); - if (mixed_type) { - at::native::check_mixed_data_type(X, mean, rstd); - } - auto memory_format = X.device().is_cpu() ? 
X.suggest_memory_format() - : at::MemoryFormat::Contiguous; - - Tensor dX; - Tensor dgamma; - Tensor dbeta; - if (grad_input_mask[0]) { - dX = at::native::empty_like( - X, - c10::nullopt /* dtype */, - c10::nullopt /* layout */, - c10::nullopt /* device */, - c10::nullopt /* pin_memory */, - memory_format); - } - if (grad_input_mask[1]) { - dgamma = at::native::empty_like( - gamma, - c10::nullopt /* dtype */, - c10::nullopt /* layout */, - c10::nullopt /* device */, - c10::nullopt /* pin_memory */, - at::MemoryFormat::Contiguous); - } - if (grad_input_mask[2]) { - dbeta = at::native::empty_like( - gamma, - c10::nullopt /* dtype */, - c10::nullopt /* layout */, - c10::nullopt /* device */, - c10::nullopt /* pin_memory */, - at::MemoryFormat::Contiguous); - } - native::xpu::group_norm_backward_kernel( - dY, X, mean, rstd, gamma, N, C, HxW, group, dX, dgamma, dbeta); - return std::make_tuple(dX, dgamma, dbeta); -} - +namespace native { +REGISTER_XPU_DISPATCH(GroupNormKernel, &xpu::group_norm_kernel); +REGISTER_XPU_DISPATCH( + GroupNormBackwardKernel, + &xpu::group_norm_backward_kernel); +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Histogram.cpp b/src/ATen/native/xpu/Histogram.cpp index f1d675286..3b0dcf5c1 100644 --- a/src/ATen/native/xpu/Histogram.cpp +++ b/src/ATen/native/xpu/Histogram.cpp @@ -1,330 +1,17 @@ #include #include +#include #include #include -#include namespace at { -/* Checks properties of input tensors input, bins, and weight. - */ -void histogramdd_check_inputs( - const Tensor& input, - const TensorList& bins, - const std::optional& weight) { - TORCH_CHECK( - input.dim() >= 2, - "torch.histogramdd: input tensor should have at least 2 dimensions, but got ", - input.dim()); - - const int64_t N = input.size(-1); - - TORCH_CHECK( - static_cast(bins.size()) == N, - "torch.histogramdd: expected ", - N, - " sequences of bin edges for a ", - N, - "-dimensional histogram but got ", - bins.size()); - - auto input_dtype = input.dtype(); - for (const auto dim : c10::irange(N)) { - const Tensor& dim_bins = bins[dim]; - - auto bins_dtype = dim_bins.dtype(); - TORCH_CHECK( - input_dtype == bins_dtype, - "torch.histogramdd: input tensor and bins tensors should", - " have the same dtype, but got input with dtype ", - input_dtype, - " and bins for dimension ", - dim, - " with dtype ", - bins_dtype); - - const int64_t dim_bins_dim = dim_bins.dim(); - TORCH_CHECK( - dim_bins_dim == 1, - "torch.histogramdd: bins tensor should have one dimension,", - " but got ", - dim_bins_dim, - " dimensions in the bins tensor for dimension ", - dim); - - const int64_t numel = dim_bins.numel(); - TORCH_CHECK( - numel > 0, - "torch.histogramdd: bins tensor should have at least 1 element,", - " but got ", - numel, - " elements in the bins tensor for dimension ", - dim); - } - - if (weight.has_value()) { - TORCH_CHECK( - input.dtype() == weight.value().dtype(), - "torch.histogramdd: if weight tensor is provided," - " input tensor and weight tensor should have the same dtype, but got input(", - input.dtype(), - ")", - ", and weight(", - weight.value().dtype(), - ")"); - - /* If a weight tensor is provided, we expect its shape to match that of - * the input tensor excluding its innermost dimension N. 
- */ - auto input_sizes = input.sizes().vec(); - input_sizes.pop_back(); - - auto weight_sizes = weight.value().sizes().vec(); - if (weight_sizes.empty()) { - // correctly handle scalars - weight_sizes = {1}; - } - - TORCH_CHECK( - input_sizes == weight_sizes, - "torch.histogramdd: if weight tensor is provided it should have" - " the same shape as the input tensor excluding its innermost dimension, but got input with shape ", - input.sizes(), - " and weight with shape ", - weight.value().sizes()); - } -} - -/* Checks properties of output tensors hist and bin_edges, then resizes them. - */ -void histogramdd_prepare_out( - const Tensor& input, - const std::vector& bin_ct, - const Tensor& hist, - const TensorList& bin_edges) { - const int64_t N = input.size(-1); - - TORCH_INTERNAL_ASSERT((int64_t)bin_ct.size() == N); - TORCH_INTERNAL_ASSERT((int64_t)bin_edges.size() == N); - - TORCH_CHECK( - input.dtype() == hist.dtype(), - "torch.histogram: input tensor and hist tensor should", - " have the same dtype, but got input ", - input.dtype(), - " and hist ", - hist.dtype()); - - for (const auto dim : c10::irange(N)) { - TORCH_CHECK( - input.dtype() == bin_edges[dim].dtype(), - "torch.histogram: input tensor and bin_edges tensor should", - " have the same dtype, but got input ", - input.dtype(), - " and bin_edges ", - bin_edges[dim].dtype(), - " for dimension ", - dim); - - TORCH_CHECK( - bin_ct[dim] > 0, - "torch.histogram(): bins must be > 0, but got ", - bin_ct[dim], - " for dimension ", - dim); - - at::native::resize_output(bin_edges[dim], bin_ct[dim] + 1); - } - - at::native::resize_output(hist, bin_ct); -} - -void histogramdd_prepare_out( - const Tensor& input, - TensorList bins, - const Tensor& hist, - const TensorList& bin_edges) { - std::vector bin_ct(bins.size()); - std::transform(bins.begin(), bins.end(), bin_ct.begin(), [](Tensor t) { - return t.numel() - 1; - }); - histogramdd_prepare_out(input, bin_ct, hist, bin_edges); -} - -void histogram_select_outer_bin_edges_kernel( - const Tensor& input, - const int64_t N, - std::vector& leftmost_edges, - std::vector& rightmost_edges) { - auto [min, max] = at::aminmax(input, 0); - - for (const auto i : c10::irange(N)) { - leftmost_edges[i] = min[i].item().to(); - rightmost_edges[i] = max[i].item().to(); - } -} - -/* Determines the outermost bin edges. For simplicity when calling into aminmax, - * assumes that input has already been reshaped to (M, N). 
- */ -std::pair, std::vector> select_outer_bin_edges( - const Tensor& input, - std::optional> range) { - TORCH_INTERNAL_ASSERT( - input.dim() == 2, "expected input to have shape (M, N)"); - const int64_t N = input.size(-1); - - // Default ranges for empty input matching numpy.histogram's default - std::vector leftmost_edges(N, 0.); - std::vector rightmost_edges(N, 1.); - - if (range.has_value()) { - // range is specified - TORCH_CHECK( - (int64_t)range.value().size() == 2 * N, - "torch.histogramdd: for a ", - N, - "-dimensional histogram", - " range should have ", - 2 * N, - " elements, but got ", - range.value().size()); - - for (const auto dim : c10::irange(N)) { - leftmost_edges[dim] = range.value()[2 * dim]; - rightmost_edges[dim] = range.value()[2 * dim + 1]; - } - } else if (input.numel() > 0) { - // non-empty input - - histogram_select_outer_bin_edges_kernel( - input, N, leftmost_edges, rightmost_edges); - } - - for (const auto dim : c10::irange(N)) { - double leftmost_edge = leftmost_edges[dim]; - double rightmost_edge = rightmost_edges[dim]; - - TORCH_CHECK( - std::isfinite(leftmost_edge) && std::isfinite(rightmost_edge), - "torch.histogramdd: dimension ", - dim, - "'s range [", - leftmost_edge, - ", ", - rightmost_edge, - "] is not finite"); - - TORCH_CHECK( - leftmost_edge <= rightmost_edge, - "torch.histogramdd: min should not exceed max, but got", - " min ", - leftmost_edge, - " max ", - rightmost_edge, - " for dimension ", - dim); - - // Expand empty range to match numpy behavior and avoid division by 0 in - // normalization - if (leftmost_edge == rightmost_edge) { - leftmost_edges[dim] -= 0.5; - rightmost_edges[dim] += 0.5; - } - } - - return std::make_pair(leftmost_edges, rightmost_edges); -} - -static Tensor& histogramdd_out( - const Tensor& self, - TensorList bins, - const std::optional& weight, - bool density, - Tensor& hist, - TensorList& bin_edges) { - histogramdd_check_inputs(self, bins, weight); - histogramdd_prepare_out(self, bins, hist, bin_edges); - - for (const auto dim : c10::irange(bins.size())) { - bin_edges[dim].copy_(bins[dim]); - } - - at::native::xpu::histogramdd_kernel(self, weight, density, hist, bin_edges); - return hist; -} - -std::tuple XPUNativeFunctions::histogram_out( - const Tensor& self, - const Tensor& bins, - const std::optional& weight, - bool density, - Tensor& hist, - Tensor& bin_edges) { - Tensor reshaped_self = self.reshape({self.numel(), 1}); - std::optional reshaped_weight = weight.has_value() - ? weight.value().reshape({weight.value().numel()}) - : weight; - TensorList bins_in = bins; - TensorList bins_out = bin_edges; - - histogramdd_out( - reshaped_self, bins_in, reshaped_weight, density, hist, bins_out); - - return std::forward_as_tuple(hist, bin_edges); -} - -std::tuple XPUNativeFunctions::histogram( - const Tensor& self, - const Tensor& bins, - const std::optional& weight, - bool density) { - Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); - Tensor bin_edges = at::empty({0}, bins.options(), MemoryFormat::Contiguous); - return histogram_out(self, bins, weight, density, hist, bin_edges); -} - -std::tuple XPUNativeFunctions::histogram_out( - const Tensor& self, - int64_t bin_ct, - std::optional> range, - const std::optional& weight, - bool density, - Tensor& hist, - Tensor& bin_edges) { - Tensor reshaped_self = self.reshape({self.numel(), 1}); - std::optional reshaped_weight = weight.has_value() - ? 
weight.value().reshape({weight.value().numel()}) - : weight; - TensorList bins_in = bin_edges; - TensorList bins_out = bin_edges; - - histogramdd_prepare_out( - reshaped_self, std::vector{bin_ct}, hist, bins_out); - auto outer_bin_edges = select_outer_bin_edges(reshaped_self, range); - at::linspace_out( - bin_edges, - outer_bin_edges.first[0], - outer_bin_edges.second[0], - bin_ct + 1); - - histogramdd_check_inputs(reshaped_self, bins_in, reshaped_weight); - - at::native::xpu::histogramdd_linear_kernel( - reshaped_self, reshaped_weight, density, hist, bin_edges, true); - return std::forward_as_tuple(hist, bin_edges); -} - -std::tuple XPUNativeFunctions::histogram( - const Tensor& self, - int64_t bin_ct, - std::optional> range, - const std::optional& weight, - bool density) { - Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); - Tensor bin_edges_out = at::empty({0}, self.options()); - return histogram_out( - self, bin_ct, range, weight, density, hist, bin_edges_out); -} +namespace native { +REGISTER_XPU_DISPATCH(histogramdd_stub, &xpu::histogramdd_kernel); +REGISTER_XPU_DISPATCH(histogramdd_linear_stub, &xpu::histogramdd_linear_kernel); +REGISTER_XPU_DISPATCH( + histogram_select_outer_bin_edges_stub, + &xpu::histogram_select_outer_bin_edges_kernel); +} // namespace native } // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/Im2Col.cpp b/src/ATen/native/xpu/Im2Col.cpp index 1bfc498da..eb9f4077a 100644 --- a/src/ATen/native/xpu/Im2Col.cpp +++ b/src/ATen/native/xpu/Im2Col.cpp @@ -1,15 +1,16 @@ -#include + #include #include #include -#include #include -#include +#include -namespace at { +#include +#include -Tensor& XPUNativeFunctions::im2col_out( +namespace at::native { +Tensor& im2col_out_xpu( const Tensor& self, IntArrayRef kernel_size, IntArrayRef dilation, @@ -26,7 +27,7 @@ Tensor& XPUNativeFunctions::im2col_out( return out; } -Tensor XPUNativeFunctions::im2col( +Tensor im2col_xpu( const Tensor& self, IntArrayRef kernel_size, IntArrayRef dilation, @@ -40,5 +41,4 @@ Tensor XPUNativeFunctions::im2col( output, self, kernel_size, dilation, padding, stride); return output; } - -} // namespace at +} // namespace at::native diff --git a/src/ATen/native/xpu/Indexing.cpp b/src/ATen/native/xpu/Indexing.cpp index d4d5598e6..6ba148607 100644 --- a/src/ATen/native/xpu/Indexing.cpp +++ b/src/ATen/native/xpu/Indexing.cpp @@ -1,16 +1,21 @@ -#include + #include #include #include #include #include + #include -#include #include +#include + +#include +#include namespace at { +namespace native { -Tensor& XPUNativeFunctions::index_select_out( +Tensor& index_select_out_xpu( const Tensor& self, int64_t dim, const Tensor& index, @@ -32,20 +37,17 @@ Tensor& XPUNativeFunctions::index_select_out( dim = at::maybe_wrap_dim(dim, self); TORCH_CHECK(self.dim() <= XPU_MAX_TENSORINFO_DIMS, DIM_WARNING); TORCH_CHECK(index.dim() <= XPU_MAX_TENSORINFO_DIMS, DIM_WARNING); - native::xpu::index_select_kernel(self, dim, index, out); + xpu::index_select_kernel(self, dim, index, out); return out; } -Tensor XPUNativeFunctions::index_select( - const Tensor& self, - int64_t dim, - const Tensor& index) { - auto out = at::empty({0}, self.options()); - return index_select_out(self, dim, index, out); +Tensor index_select_xpu_(const Tensor& self, int64_t dim, const Tensor& index) { + Tensor result = at::empty({0}, self.options()); + return at::native::index_select_out_xpu(self, dim, index, result); } -Tensor& XPUNativeFunctions::masked_scatter_( +Tensor& masked_scatter__xpu( Tensor& self, const 
Tensor& mask, const Tensor& source) { @@ -99,29 +101,27 @@ static Tensor& masked_select_out_impl( // owning and expand_outplace returns a borrow, the returned borrow // would dangle. auto mask_self_expanded = expand_outplace(*mask_temp, *self_temp); - XPUNativeFunctions::index_out( + at::index_out( + result, *std::get<1>(mask_self_expanded), c10::List>( - {*std::move(std::get<0>(mask_self_expanded))}), - result); + {*std::move(std::get<0>(mask_self_expanded))})); return result; } -Tensor XPUNativeFunctions::masked_select( - const Tensor& self, - const Tensor& mask) { +Tensor masked_select_xpu(const Tensor& self, const Tensor& mask) { namedinference::compute_broadcast_outnames(self, mask); Tensor result = at::empty({0}, self.options()); return masked_select_out_impl(result, self, mask); } -Tensor& XPUNativeFunctions::masked_select_out( +Tensor& masked_select_out_xpu( const Tensor& self, const Tensor& mask, Tensor& result) { namedinference::compute_broadcast_outnames(self, mask); return masked_select_out_impl(result, self, mask); } - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/LayerNorm.cpp b/src/ATen/native/xpu/LayerNorm.cpp index 8a467a9f4..0addcd718 100644 --- a/src/ATen/native/xpu/LayerNorm.cpp +++ b/src/ATen/native/xpu/LayerNorm.cpp @@ -1,29 +1,29 @@ -#include #include #include #include #include #include +#include -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else -#include -#endif +// #ifndef AT_PER_OPERATOR_HEADERS +// #include +// #include +// #else +// #include +// #endif #include -#include +#include +#include namespace at { - -::std::tuple XPUNativeFunctions:: - native_layer_norm( - const at::Tensor& input, - at::IntArrayRef normalized_shape, - const ::std::optional& weight_opt, - const ::std::optional& bias_opt, - double epsilon) { +namespace native { +::std::tuple layer_norm_xpu( + const at::Tensor& input, + at::IntArrayRef normalized_shape, + const ::std::optional& weight_opt, + const ::std::optional& bias_opt, + double epsilon) { std::optional common_device = std::nullopt; c10::impl::check_and_update_common_device( common_device, input, "xpu::native_layer_norm", "input"); @@ -79,16 +79,15 @@ ::std::tuple XPUNativeFunctions:: return std::make_tuple(std::move(Y), std::move(mean), std::move(rstd)); } -::std::tuple XPUNativeFunctions:: - native_layer_norm_backward( - const at::Tensor& grad_output, - const at::Tensor& input, - at::IntArrayRef normalized_shape, - const at::Tensor& mean, - const at::Tensor& rstd, - const ::std::optional& weight_opt, - const ::std::optional& bias_opt, - ::std::array grad_input_mask) { +::std::tuple layer_norm_backward_xpu( + const at::Tensor& grad_output, + const at::Tensor& input, + at::IntArrayRef normalized_shape, + const at::Tensor& mean, + const at::Tensor& rstd, + const ::std::optional& weight_opt, + const ::std::optional& bias_opt, + ::std::array grad_input_mask) { std::optional common_device = std::nullopt; c10::impl::check_and_update_common_device( common_device, grad_output, "xpu::native_layer_norm_backward", "goutput"); @@ -177,5 +176,6 @@ ::std::tuple XPUNativeFunctions:: grad_bias, grad_input_mask); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Lerp.cpp b/src/ATen/native/xpu/Lerp.cpp index 272417b39..d64fc1acb 100644 --- a/src/ATen/native/xpu/Lerp.cpp +++ b/src/ATen/native/xpu/Lerp.cpp @@ -1,110 +1,16 @@ #include #include +#include +#include #include -#include - #include namespace at { +namespace native { -TensorIterator lerp_tensor_meta( - const Tensor& self, - const 
Tensor& end, - const Tensor& weight, - Tensor& out) { - TORCH_CHECK( - self.dtype() == end.dtype(), - "expected dtype ", - self.dtype(), - " for `end` but got dtype ", - end.dtype()); - TORCH_CHECK( - self.dtype() == weight.dtype(), - "expected dtype ", - self.dtype(), - " for `weight` but got dtype ", - weight.dtype()); - TensorIterator iter; - iter.build(TensorIteratorConfig() - .add_output(out) - .add_const_input(self) - .add_const_input(end) - .add_const_input(weight)); - return iter; -} - -Tensor XPUNativeFunctions::lerp( - const Tensor& self, - const Tensor& end, - const Tensor& weight) { - Tensor out; - auto iter = lerp_tensor_meta(self, end, weight, out); - native::xpu::lerp_tensor_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::lerp_( - Tensor& self, - const Tensor& end, - const Tensor& weight) { - auto iter = lerp_tensor_meta(self, end, weight, self); - native::xpu::lerp_tensor_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::lerp_out( - const Tensor& self, - const Tensor& end, - const Tensor& weight, - Tensor& out) { - auto iter = lerp_tensor_meta(self, end, weight, out); - native::xpu::lerp_tensor_kernel(iter); - return out; -} - -TensorIterator lerp_scalar_meta( - const Tensor& self, - const Tensor& end, - const Scalar& /*weight*/, - Tensor& out) { - TORCH_CHECK( - self.dtype() == end.dtype(), - "expected dtype ", - self.dtype(), - " for `end` but got dtype ", - end.dtype()); - TensorIterator iter; - iter.build_binary_op(out, self, end); - return iter; -} - -Tensor XPUNativeFunctions::lerp( - const Tensor& self, - const Tensor& end, - const Scalar& weight) { - Tensor out; - auto iter = lerp_scalar_meta(self, end, weight, out); - native::xpu::lerp_scalar_kernel(iter, weight); - return iter.output(); -} - -Tensor& XPUNativeFunctions::lerp_( - Tensor& self, - const Tensor& end, - const Scalar& weight) { - auto iter = lerp_scalar_meta(self, end, weight, self); - native::xpu::lerp_scalar_kernel(iter, weight); - return self; -} +REGISTER_XPU_DISPATCH(lerp_kernel_tensor_weight, &xpu::lerp_tensor_kernel); +REGISTER_XPU_DISPATCH(lerp_kernel_scalar_weight, &xpu::lerp_scalar_kernel); -Tensor& XPUNativeFunctions::lerp_out( - const Tensor& self, - const Tensor& end, - const Scalar& weight, - Tensor& out) { - auto iter = lerp_scalar_meta(self, end, weight, out); - native::xpu::lerp_scalar_kernel(iter, weight); - return out; -} +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/LinearAlgebra.cpp b/src/ATen/native/xpu/LinearAlgebra.cpp index 2f857f18b..719f23f6f 100644 --- a/src/ATen/native/xpu/LinearAlgebra.cpp +++ b/src/ATen/native/xpu/LinearAlgebra.cpp @@ -1,273 +1,22 @@ -#include +#include +#include +#include +#include +#include +#include #include +#include #include +#include +#include + #include #include -#include #include namespace at { -namespace detail { - -static void check_linalg_norm_dtype( - optional opt_dtype, - ScalarType self_dtype, - const char* const name) { - if (opt_dtype.has_value()) { - auto dtype = opt_dtype.value(); - TORCH_CHECK( - isFloatingType(dtype) || isComplexType(dtype), - name, - ": dtype should" - " be floating point or complex, but got ", - dtype); - TORCH_CHECK( - isComplexType(self_dtype) == isComplexType(dtype), - name, - ": dtype should be ", - isComplexType(self_dtype) ? "complex" : "real", - " for ", - isComplexType(self_dtype) ? 
"complex" : "real", - " inputs, but got ", - dtype); - TORCH_CHECK( - promoteTypes(self_dtype, dtype) == dtype, - name, - ": the dtype of the input ", - "(", - self_dtype, - ") should be convertible ", - "without narrowing to the specified dtype (", - dtype, - ")"); - } -} - -} // namespace detail - -Tensor& linalg_vector_norm_meta( - const Tensor& self, - const Scalar& scalar_ord, - OptionalIntArrayRef opt_dim, - bool keepdim, - optional opt_dtype, - Tensor& output) { - at::native::checkFloatingOrComplex(self, "linalg.vector_norm"); - - auto dim = opt_dim.value_or(IntArrayRef{}); - // Casting a large integer to a double will just introduce an error for - // values larger than 10^53 (same for negative numbers), so that's fine. - auto ord = scalar_ord.toDouble(); - - // For more context, see issue 52783 - // If the tensor is empty and norm < 0 || norm == infty - // - We cannot reduce the whole tensor - // - We cannot reduce over an empty dimension - if (self.numel() == 0 && (ord < 0. || ord == INFINITY)) { - // dim=None or dim=() reduces the whole tensor - TORCH_CHECK( - opt_dim.has_value() && !opt_dim->empty(), - "linalg.vector_norm cannot compute the ", - scalar_ord, - " norm on an empty ", - "tensor because the operation does not have an identity"); - for (auto dim_num : dim) { - TORCH_CHECK( - self.size(dim_num) != 0, - "linalg.vector_norm cannot compute the ", - scalar_ord, - " norm on the dimension ", - dim_num, - "because this dimension is empty and the operation does not have an identity"); - } - } - - at::detail::check_linalg_norm_dtype( - opt_dtype, self.scalar_type(), "linalg.vector_norm"); - - auto mask = at::native::make_dim_mask(dim, self.dim()); - auto shape = at::native::shape_from_dim_mask(self, std::move(mask), keepdim); - auto options = self.options().dtype( - toRealValueType(opt_dtype.value_or(self.scalar_type()))); - if (output.defined()) { - at::xpu::resize_out(output, shape, {}, options); - } else { - output = at::xpu::create_out(shape, {}, options); - } - return output; -} - -static void check_1d(const Tensor& t, const char* arg, const char* fn) { - TORCH_CHECK( - t.dim() == 1, - fn, - ": Expected 1-D argument ", - arg, - ", but got ", - t.dim(), - "-D"); -} - -static void check_addr_scalar( - const ScalarType dtype, - const Scalar& scalar, - const std::string& scalar_name) { - TORCH_CHECK( - !scalar.isBoolean() || dtype == ScalarType::Bool, - "Boolean ", - scalar_name, - " only supported for Boolean results."); - TORCH_CHECK( - isFloatingType(dtype) || isComplexType(dtype) || scalar.isIntegral(true), - "For integral input tensors, " - "argument ", - scalar_name, - " must not be a floating point number."); -} - -static TensorIterator build_addr_iter( - Tensor& result, - const Tensor& self, - const Tensor& vec1, - const Tensor& vec2) { - check_1d(vec1, "vec1", "addr"); - check_1d(vec2, "vec2", "addr"); - - const auto vec1_size0 = vec1.sizes()[0]; - const auto vec2_size0 = vec2.sizes()[0]; - auto self_ = &result == &self - ? 
c10::MaybeOwned::borrowed(self) - : expand_size(self, {vec1_size0, vec2_size0}, "addr"); - TORCH_CHECK( - self_->dim() == 2, - "2D tensor expected, got ", - self_->dim(), - "D tensor for input"); - TORCH_CHECK( - self_->sizes()[0] == vec1_size0 && self_->sizes()[1] == vec2_size0, - "size mismatch, input: ", - self_->sizes(), - ", v1: ", - vec1.sizes(), - ", v2: ", - vec2.sizes()); - - auto iter = TensorIteratorConfig() - .set_check_mem_overlap(true) - .add_output(result) - .add_owned_const_input(*self_) - .add_owned_const_input(vec1.reshape({vec1_size0, 1})) - .add_const_input(vec2) - .allow_cpu_scalars(true) - .promote_inputs_to_common_dtype(true) - .cast_common_dtype_to_outputs(true) - .enforce_safe_casting_to_output(true) - .build(); - return iter; -} - -Tensor XPUNativeFunctions::addr( - const Tensor& self, - const Tensor& vec1, - const Tensor& vec2, - const Scalar& beta, - const Scalar& alpha) { - Tensor result; - auto iter = build_addr_iter(result, self, vec1, vec2); - - check_addr_scalar(iter.dtype(), beta, "beta"); - check_addr_scalar(iter.dtype(), alpha, "alpha"); - - native::xpu::addr_kernel(iter, beta, alpha); - return iter.output(); -} - -Tensor& XPUNativeFunctions::addr_out( - const Tensor& self, - const Tensor& vec1, - const Tensor& vec2, - const Scalar& beta, - const Scalar& alpha, - Tensor& out) { - auto iter = build_addr_iter(out, self, vec1, vec2); - check_addr_scalar(iter.dtype(), beta, "beta"); - check_addr_scalar(iter.dtype(), alpha, "alpha"); - - native::xpu::addr_kernel(iter, beta, alpha); - return out; -} - -Tensor XPUNativeFunctions::linalg_vector_norm( - const Tensor& self, - const Scalar& scalar_ord, - OptionalIntArrayRef opt_dim, - bool keepdim, - optional opt_dtype) { - Tensor result; - linalg_vector_norm_out(self, scalar_ord, opt_dim, keepdim, opt_dtype, result); - return result; -} - -Tensor& XPUNativeFunctions::linalg_vector_norm_out( - const Tensor& self, - const Scalar& scalar_ord, - OptionalIntArrayRef opt_dim, - bool keepdim, - optional opt_dtype, - Tensor& result) { - result = linalg_vector_norm_meta( - self, scalar_ord, opt_dim, keepdim, opt_dtype, result); - auto ord = scalar_ord.toDouble(); - auto dim = opt_dim.value_or(IntArrayRef{}); - auto size = self.sizes(); - auto ndim = self.dim(); - - auto opt_dim_ = dim.vec(); - maybe_wrap_dims(opt_dim_, ndim); - - using Int = IntArrayRef::value_type; - std::vector all_dim(ndim); - std::iota(all_dim.begin(), all_dim.end(), 0); - - bool is_all_reduce = !opt_dim.has_value() || opt_dim.value().empty(); - auto reduce_dim = is_all_reduce ? all_dim : opt_dim_; - - bool is_reduce_over_1D_vector = true; - for (auto i : reduce_dim) { - if (size[i] != 1) { - is_reduce_over_1D_vector = false; - break; - } - } - - if (is_reduce_over_1D_vector) { - Tensor self_; - if (opt_dtype.has_value()) { - self_ = self.to(*opt_dtype); - } else { - self_ = self; - } - if (ord != 0.0) { - keepdim ? at::abs_outf(self_, const_cast(result)) - : at::abs_outf( - self_.squeeze(reduce_dim), const_cast(result)); - } else { - keepdim ? 
at::ne_outf(self_, 0, const_cast(result)) - : at::ne_outf( - self_.squeeze(reduce_dim), 0, const_cast(result)); - } - return result; - } - - auto iter = at::native::make_reduction( - "vector_norm", - const_cast(result), - self, - dim, - keepdim, - result.scalar_type()); - native::xpu::norm_kernel(iter, ord); - return result; -} - +namespace native { +REGISTER_XPU_DISPATCH(addr_stub, &xpu::addr_kernel); +REGISTER_XPU_DISPATCH(norm_stub, &xpu::norm_kernel); +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Loss.cpp b/src/ATen/native/xpu/Loss.cpp index f2ca7d9c0..c3ca675dc 100644 --- a/src/ATen/native/xpu/Loss.cpp +++ b/src/ATen/native/xpu/Loss.cpp @@ -1,143 +1,25 @@ -#include #include #include +#include +#include +#include +#include + #include #include #include -#include #include namespace at { - -static inline at::Tensor apply_loss_reduction( - const at::Tensor& unreduced, - int64_t reduction) { - if (reduction == at::Reduction::Mean) { - return unreduced.mean(); - } else if (reduction == at::Reduction::Sum) { - return unreduced.sum(); - } - return unreduced; -} - -Tensor& XPUNativeFunctions::mse_loss_out( - const Tensor& input, - const Tensor& target, - int64_t reduction, - Tensor& result) { - if (reduction != Reduction::None) { - TORCH_INTERNAL_ASSERT( - reduction == Reduction::Mean || reduction == Reduction::Sum); - result.resize_({}); - Tensor loss; - auto iter = TensorIterator::borrowing_binary_op(loss, input, target); - native::xpu::mse_kernel(iter); - if (reduction == Reduction::Mean) { - at::mean_out(const_cast(result), iter.output(), IntArrayRef{}); - } else { - at::sum_out(const_cast(result), iter.output(), IntArrayRef{}); - } - } else { - auto iter = TensorIterator::borrowing_binary_op(result, input, target); - native::xpu::mse_kernel(iter); - } - return result; -} - -Tensor XPUNativeFunctions::mse_loss( - const Tensor& input, - const Tensor& target, - int64_t reduction) { - Tensor result = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - result = XPUNativeFunctions::mse_loss_out(input, target, reduction, result); - return result; -} - -Tensor XPUNativeFunctions::mse_loss_backward( - const Tensor& grad_output, - const Tensor& input, - const Tensor& target, - int64_t reduction) { - Tensor grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - return at::mse_loss_backward_out( - grad_input, grad_output, input, target, reduction); -} - -Tensor& XPUNativeFunctions::mse_loss_backward_out( - const Tensor& grad_output, - const Tensor& input, - const Tensor& target, - int64_t reduction, - Tensor& grad_input) { - auto norm = reduction == Reduction::Mean ? 2. 
/ input.numel() : 2.; - auto iter = at::TensorIteratorConfig() - .add_output(grad_input) - .add_const_input(input) - .add_const_input(target) - .add_const_input(grad_output) - .build(); - native::xpu::mse_backward_kernel(iter, norm); - return grad_input; -} - -Tensor& XPUNativeFunctions::smooth_l1_loss_out( - const Tensor& input, - const Tensor& target, - int64_t reduction, - double beta, - Tensor& result) { - if (reduction != Reduction::None) { - TORCH_INTERNAL_ASSERT( - reduction == Reduction::Mean || reduction == Reduction::Sum); - result.resize_({}); - Tensor loss; - auto iter = TensorIterator::borrowing_binary_op(loss, input, target); - native::xpu::smooth_l1_kernel(iter, beta); - if (reduction == Reduction::Mean) { - at::mean_out(const_cast(result), iter.output(), IntArrayRef{}); - } else { - at::sum_out(const_cast(result), iter.output(), IntArrayRef{}); - } - } else { - auto iter = TensorIterator::borrowing_binary_op(result, input, target); - native::xpu::smooth_l1_kernel(iter, beta); - } - return result; -} - -Tensor XPUNativeFunctions::smooth_l1_loss( - const Tensor& input, - const Tensor& target, - int64_t reduction, - double beta) { - Tensor result = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - result = XPUNativeFunctions::smooth_l1_loss_out( - input, target, reduction, beta, result); - return result; -} - -Tensor& XPUNativeFunctions::smooth_l1_loss_backward_out( - const Tensor& grad_output, - const Tensor& input, - const Tensor& target, - int64_t reduction, - double beta, - Tensor& grad_input) { - auto norm = reduction == Reduction::Mean ? 1. / input.numel() : 1.; - auto iter = at::TensorIteratorConfig() - .add_output(grad_input) - .add_const_input(input) - .add_const_input(target) - .add_const_input(grad_output) - .promote_inputs_to_common_dtype(true) - .cast_common_dtype_to_outputs(true) - .enforce_safe_casting_to_output(true) - .build(); - native::xpu::smooth_l1_backward_kernel(iter, norm, beta); - return grad_input; -} - -Tensor XPUNativeFunctions::binary_cross_entropy( +namespace native { +REGISTER_XPU_DISPATCH(mse_stub, &xpu::mse_kernel); +REGISTER_XPU_DISPATCH(mse_backward_stub, &xpu::mse_backward_kernel); +REGISTER_XPU_DISPATCH(huber_stub, &xpu::huber_kernel); +REGISTER_XPU_DISPATCH(huber_backward_stub, &xpu::huber_backward_kernel); +REGISTER_XPU_DISPATCH(smooth_l1_stub, &xpu::smooth_l1_kernel); +REGISTER_XPU_DISPATCH(smooth_l1_backward_stub, &xpu::smooth_l1_backward_kernel); + +Tensor binary_cross_entropy_xpu( const Tensor& self, const Tensor& target, const std::optional& weight_opt, @@ -150,7 +32,7 @@ Tensor XPUNativeFunctions::binary_cross_entropy( self, target, weight, reduction, loss); } -Tensor& XPUNativeFunctions::binary_cross_entropy_out( +Tensor& binary_cross_entropy_out_xpu( const Tensor& self, const Tensor& target, const std::optional& weight_opt, @@ -163,7 +45,7 @@ Tensor& XPUNativeFunctions::binary_cross_entropy_out( self, target, weight, reduction, loss); } -Tensor XPUNativeFunctions::binary_cross_entropy_backward( +Tensor binary_cross_entropy_backward_xpu( const Tensor& grad_output, const Tensor& self, const Tensor& target, @@ -177,7 +59,7 @@ Tensor XPUNativeFunctions::binary_cross_entropy_backward( grad_output, self, target, weight, reduction, grad_input); } -Tensor& XPUNativeFunctions::binary_cross_entropy_backward_out( +Tensor& binary_cross_entropy_backward_out_xpu( const Tensor& grad_output, const Tensor& self, const Tensor& target, @@ -191,53 +73,5 @@ Tensor& XPUNativeFunctions::binary_cross_entropy_backward_out( grad_output, self, 
target, weight, reduction, grad_input); } -Tensor XPUNativeFunctions::huber_loss( - const Tensor& input, - const Tensor& target, - int64_t reduction, - double delta) { - TORCH_CHECK( - delta > 0, "huber_loss does not support non-positive values for delta.") - Tensor loss = at::empty_like(input); - auto iter = TensorIterator::borrowing_binary_op(loss, input, target); - native::xpu::huber_kernel(iter, delta); - return apply_loss_reduction(loss, reduction); -} - -Tensor& XPUNativeFunctions::huber_loss_out( - const Tensor& input, - const Tensor& target, - int64_t reduction, - double delta, - Tensor& result) { - TORCH_CHECK( - delta > 0, "huber_loss does not support non-positive values for delta.") - auto iter = TensorIterator::borrowing_binary_op(result, input, target); - native::xpu::huber_kernel(iter, delta); - if (reduction != Reduction::None) { - auto reduced = apply_loss_reduction(result, reduction); - result.resize_({}); - result.copy_(reduced); - } - return result; -} - -Tensor& XPUNativeFunctions::huber_loss_backward_out( - const Tensor& grad_output, - const Tensor& input, - const Tensor& target, - int64_t reduction, - double delta, - Tensor& grad_input) { - auto norm = (reduction == Reduction::Mean) ? (1. / input.numel()) : 1.; - auto iter = at::TensorIteratorConfig() - .add_output(grad_input) - .add_const_input(input) - .add_const_input(target) - .add_const_input(grad_output) - .build(); - native::xpu::huber_backward_kernel(iter, norm, delta); - return grad_input; -} - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/LossNLL.cpp b/src/ATen/native/xpu/LossNLL.cpp index d30eb7258..d80fef746 100644 --- a/src/ATen/native/xpu/LossNLL.cpp +++ b/src/ATen/native/xpu/LossNLL.cpp @@ -1,105 +1,28 @@ -#include #include #include #include #include -#include #include +#include -namespace at { -void nll_loss_forward_meta( - const Tensor& self, - const Tensor& target, - const OptionalTensorRef weight_opt, - int64_t reduction, - int64_t ignore_index, - Tensor& output, - Tensor& total_weight) { - const Tensor& weight = weight_opt.getTensorRef(); - - TORCH_CHECK( - self.dim() > 0 && self.dim() <= 2, "input tensor should be 1D or 2D"); - TORCH_CHECK( - target.dim() <= 1, - "0D or 1D target tensor expected, multi-target not supported"); - - auto no_batch_dim = self.dim() == 1 && target.dim() == 0; - TORCH_CHECK( - no_batch_dim || (self.size(0) == target.size(0)), - "size mismatch (got input: ", - self.sizes(), - ", target: ", - target.sizes(), - ")") - - const auto n_classes = self.size(-1); - - TORCH_CHECK( - !weight.defined() || (weight.dim() == 1 && weight.numel() == n_classes), - "weight tensor should be defined either for all ", - n_classes, - " classes or no classes" - " but got weight tensor of shape: ", - weight.sizes()); - - const auto n_dims = self.dim(); - const auto batch_size = self.size(0); +#include +#include - if (reduction == Reduction::None && n_dims == 2) { - if (output.defined()) { - at::xpu::resize_out(output, {batch_size}, {}, self.options()); - } else { - output = at::xpu::create_out({batch_size}, {}, self.options()); - } - } else { - // produce scalar output when reducing or input is 1d - if (output.defined()) { - at::xpu::resize_out(output, {}, {}, self.options()); - } else { - output = at::xpu::create_out({}, {}, self.options()); - } - } - if (total_weight.defined()) { - at::xpu::resize_out(total_weight, {}, {}, self.options()); - } else { - total_weight = at::xpu::create_out({}, {}, self.options()); - } -} - -std::tuple 
XPUNativeFunctions::nll_loss_forward_out( - const Tensor& self, - const Tensor& target, - const c10::optional& weight, - int64_t reduction, - int64_t ignore_index, - Tensor& output, - Tensor& total_weight) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, output, "xpu::nll_loss_forward_out", "output"); - c10::impl::check_and_update_common_device( - common_device, total_weight, "xpu::nll_loss_forward_out", "total_weight"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::nll_loss_forward_out", "self"); - c10::impl::check_and_update_common_device( - common_device, target, "xpu::nll_loss_forward_out", "target"); - c10::impl::check_and_update_common_device( - common_device, weight, "xpu::nll_loss_forward_out", "weight"); - nll_loss_forward_meta( - self, - target, - ((weight.has_value() && (*weight).defined()) - ? at::OptionalTensorRef(*weight) - : at::OptionalTensorRef()), - reduction, - ignore_index, - output, - total_weight); - return native::xpu::nll_loss_forward_kernel( +namespace at { +namespace native { +TORCH_IMPL_FUNC(nll_loss_forward_out_xpu) +(const Tensor& self, + const Tensor& target, + const OptionalTensorRef weight_opt, + int64_t reduction, + int64_t ignore_index, + const Tensor& output, + const Tensor& total_weight) { + xpu::nll_loss_forward_kernel( self, target, - ((weight.has_value() && (*weight).defined()) - ? at::OptionalTensorRef(*weight) + ((weight_opt.has_value() && (*weight_opt).defined()) + ? at::OptionalTensorRef(*weight_opt) : at::OptionalTensorRef()), reduction, ignore_index, @@ -107,122 +30,22 @@ std::tuple XPUNativeFunctions::nll_loss_forward_out( total_weight); } -std::tuple XPUNativeFunctions::nll_loss_forward( - const Tensor& self, - const Tensor& target, - const c10::optional& weight, - int64_t reduction, - int64_t ignore_index) { - Tensor output; - Tensor total_weight; - return nll_loss_forward_out( - self, target, weight, reduction, ignore_index, output, total_weight); -} - -void nll_loss_backward_meta( - const Tensor& grad_output, - const Tensor& self, - const Tensor& target, - OptionalTensorRef weight_opt, - int64_t reduction, - int64_t ignore_index, - const Tensor& total_weight, - Tensor& grad_input) { - TORCH_CHECK( - self.dim() > 0 && self.dim() <= 2, "input tensor should be 1D or 2D"); - TORCH_CHECK( - target.dim() <= 1, - "0D or 1D target tensor expected, multi-target not supported"); - - auto no_batch_dim = self.dim() == 1 && target.dim() == 0; - TORCH_CHECK( - no_batch_dim || (self.size(0) == target.size(0)), - "size mismatch (got input: ", - self.sizes(), - ", target: ", - target.sizes(), - ")") - TORCH_CHECK( - total_weight.numel() == 1, - "expected total_weight to be a single element tensor, got: ", - total_weight.sizes(), - " (", - total_weight.numel(), - " elements)"); - - const auto& weight = weight_opt.getTensorRef(); - - TORCH_CHECK( - !weight.defined() || weight.numel() == self.size(-1), - "weight tensor should be defined either for all or no classes"); - - const auto n_dims = self.dim(); - - if (reduction == Reduction::None && n_dims == 2) { - const auto batch_size = self.size(0); - check_dim_size(grad_output, 1, 0, batch_size); - } else { - TORCH_CHECK( - grad_output.dim() <= 1 && grad_output.numel() == 1, - "Expected a single element grad_output tensor, but got: ", - grad_output.sizes()); - } - if (grad_input.defined()) { - at::xpu::resize_out( - grad_input, - self.sizes(), - {}, - self.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT)); - } else { 
- grad_input = at::xpu::create_out( - self.sizes(), - {}, - self.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT)); - } -} - -Tensor& XPUNativeFunctions::nll_loss_backward_out( - const Tensor& grad_output, - const Tensor& self, - const Tensor& target, - const c10::optional& weight, - int64_t reduction, - int64_t ignore_index, - const Tensor& total_weight, - Tensor& grad_input) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, grad_input, "xpu::nll_loss_backward_out", "grad_input"); - c10::impl::check_and_update_common_device( - common_device, grad_output, "xpu::nll_loss_backward_out", "grad_output"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::nll_loss_backward_out", "self"); - c10::impl::check_and_update_common_device( - common_device, target, "xpu::nll_loss_backward_out", "target"); - c10::impl::check_and_update_common_device( - common_device, weight, "xpu::nll_loss_backward_out", "weight"); - c10::impl::check_and_update_common_device( - common_device, - total_weight, - "xpu::nll_loss_backward_out", - "total_weight"); - nll_loss_backward_meta( +TORCH_IMPL_FUNC(nll_loss_backward_out_xpu) +(const Tensor& grad_output, + const Tensor& self, + const Tensor& target, + OptionalTensorRef weight_opt, + int64_t reduction, + int64_t ignore_index, + const Tensor& total_weight, + const Tensor& grad_input) { + grad_input.zero_(); + xpu::nll_loss_backward_kernel( grad_output, self, target, - ((weight.has_value() && (*weight).defined()) - ? at::OptionalTensorRef(*weight) - : at::OptionalTensorRef()), - reduction, - ignore_index, - total_weight, - grad_input); - return native::xpu::nll_loss_backward_kernel( - grad_output, - self, - target, - ((weight.has_value() && (*weight).defined()) - ? at::OptionalTensorRef(*weight) + ((weight_opt.has_value() && (*weight_opt).defined()) + ? 
at::OptionalTensorRef(*weight_opt) : at::OptionalTensorRef()), reduction, ignore_index, @@ -230,23 +53,5 @@ Tensor& XPUNativeFunctions::nll_loss_backward_out( grad_input); } -Tensor XPUNativeFunctions::nll_loss_backward( - const Tensor& grad_output, - const Tensor& self, - const Tensor& target, - const c10::optional& weight, - int64_t reduction, - int64_t ignore_index, - const Tensor& total_weight) { - Tensor grad_input; - return nll_loss_backward_out( - grad_output, - self, - target, - weight, - reduction, - ignore_index, - total_weight, - grad_input); -} +} // namespace native } // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/LossNLL2d.cpp b/src/ATen/native/xpu/LossNLL2d.cpp index 08f9d464e..7aaca4911 100644 --- a/src/ATen/native/xpu/LossNLL2d.cpp +++ b/src/ATen/native/xpu/LossNLL2d.cpp @@ -1,9 +1,10 @@ #include #include -#include namespace at { +namespace native { +namespace { void check_inputs_nll_loss2d( const Tensor& input, const Tensor& target, @@ -30,8 +31,9 @@ void check_inputs_nll_loss2d( ", input ", input.sizes()); } +} // namespace -std::tuple XPUNativeFunctions::nll_loss2d_forward( +std::tuple nll_loss2d_forward_xpu( const Tensor& self, const Tensor& target, const ::std::optional& weight_opt, @@ -51,7 +53,7 @@ std::tuple XPUNativeFunctions::nll_loss2d_forward( return std::make_tuple(output, total_weight); } -std::tuple XPUNativeFunctions::nll_loss2d_forward_out( +std::tuple nll_loss2d_forward_out_xpu( const Tensor& self, const Tensor& target, const std::optional& weight_opt, @@ -70,7 +72,7 @@ std::tuple XPUNativeFunctions::nll_loss2d_forward_out( return std::tuple(output, total_weight); } -Tensor XPUNativeFunctions::nll_loss2d_backward( +Tensor nll_loss2d_backward_xpu( const Tensor& grad_output, const Tensor& self, const Tensor& target, @@ -97,7 +99,7 @@ Tensor XPUNativeFunctions::nll_loss2d_backward( return grad_input; } -Tensor& XPUNativeFunctions::nll_loss2d_backward_out( +Tensor& nll_loss2d_backward_out_xpu( const Tensor& grad_output, const Tensor& self, const Tensor& target, @@ -123,4 +125,5 @@ Tensor& XPUNativeFunctions::nll_loss2d_backward_out( return grad_input; } +} // namespace native } // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/NMS.cpp b/src/ATen/native/xpu/NMS.cpp index ea1ac4e9c..dc4fa666b 100644 --- a/src/ATen/native/xpu/NMS.cpp +++ b/src/ATen/native/xpu/NMS.cpp @@ -1,7 +1,7 @@ -#include #include #include #include +#include #include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/Nonzero.cpp b/src/ATen/native/xpu/Nonzero.cpp index 1aa45021f..deb646f6c 100644 --- a/src/ATen/native/xpu/Nonzero.cpp +++ b/src/ATen/native/xpu/Nonzero.cpp @@ -1,13 +1,12 @@ #include #include -#include #include #include namespace at { - -Tensor& XPUNativeFunctions::nonzero_out(const Tensor& self, Tensor& out) { +namespace native{ +Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) { TORCH_CHECK( self.numel() < std::numeric_limits::max(), "nonzero is not supported for tensors with more than INT_MAX elements, \ @@ -30,14 +29,14 @@ Tensor& XPUNativeFunctions::nonzero_out(const Tensor& self, Tensor& out) { MAX_DIMS, " dimensions"); - at::native::xpu::nonzero_kernel(self, out); + xpu::nonzero_kernel(self, out); return out; } -Tensor XPUNativeFunctions::nonzero(const Tensor& self) { +Tensor nonzero_xpu(const Tensor& self) { Tensor out = at::detail::empty_xpu({0}, self.options().dtype(kLong)); - XPUNativeFunctions::nonzero_out(self, out); + nonzero_out_xpu(self, out); return out; } - -} // namespace at +} +} // 
namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/Normalization.cpp b/src/ATen/native/xpu/Normalization.cpp index 3bc170da6..34422df11 100644 --- a/src/ATen/native/xpu/Normalization.cpp +++ b/src/ATen/native/xpu/Normalization.cpp @@ -1,113 +1,15 @@ -#include #include #include +#include +#include #include -#include + #include namespace at { - -void renorm_meta( - const Tensor& self, - const Scalar& p, - int64_t dim, - const Scalar& maxnorm, - Tensor& output) { - TORCH_CHECK(!p.isComplex(), "renorm: p must be real-valued"); - TORCH_CHECK(p.toDouble() > 0.0, "renorm: non-positive-norm not supported"); - TORCH_CHECK(!maxnorm.isComplex(), "renorm: maxnorm must be real-valued"); - TORCH_CHECK( - maxnorm.toDouble() >= 0.0, - "renorm: expected maxnorm to be >= 0 but got ", - maxnorm.toDouble()); - const auto ndim = self.dim(); - TORCH_CHECK( - ndim > 1, - "renorm: input needs at least 2 dimensions, got ", - ndim, - " dimensions"); - if (output.defined()) { - xpu::resize_out(output, self.sizes(), {}, self.options()); - } else { - output = xpu::create_out(self.sizes(), {}, self.options()); - } -} - -Tensor& renorm_impl( - const Tensor& self, - const Scalar& p, - int64_t dim, - const Scalar& maxnorm, - Tensor& out) { - auto self_sizes = self.sizes(); - dim = c10::maybe_wrap_dim(dim, self_sizes.size()); - - DimVector reduce_dims(self_sizes.size()); - std::iota(reduce_dims.begin(), reduce_dims.end(), 0); - reduce_dims.erase(reduce_dims.begin() + dim); - - auto dtype = self.scalar_type(); - - // This is a device-independent accumulate type, and we follow PyTorch's design. - auto acc_type = at::toAccumulateType(dtype, true); - - Tensor norm; - if (acc_type != dtype) { - norm = at::linalg_vector_norm( - self, - p.toDouble(), - reduce_dims, - /*keepdim=*/true, - /*dtype=*/acc_type); - } else { - norm = at::linalg_vector_norm( - self, - p.toDouble(), - reduce_dims, - /*keepdim=*/true); - } - - auto factor = (acc_type == c10::toRealValueType(dtype)) - ? 
norm - : at::empty(norm.sizes(), self.options()); - auto iter = TensorIteratorConfig() - .add_output(factor) - .add_input(norm) - .set_check_mem_overlap(false) - .cast_common_dtype_to_outputs(true) - .build(); - - at::native::xpu::renorm_scale_factor_kernel(iter, maxnorm.toDouble()); - return at::mul_outf(self, factor, const_cast(out)); -} - -Tensor& XPUNativeFunctions::renorm_( - Tensor& self, - const Scalar& p, - int64_t dim, - const Scalar& maxnorm) { - renorm_meta(self, p, dim, maxnorm, self); - renorm_impl(self, p, dim, maxnorm, self); - return self; -} -Tensor& XPUNativeFunctions::renorm_out( - const Tensor& self, - const Scalar& p, - int64_t dim, - const Scalar& maxnorm, - Tensor& out) { - renorm_meta(self, p, dim, maxnorm, out); - renorm_impl(self, p, dim, maxnorm, out); - return out; -} -Tensor XPUNativeFunctions::renorm( - const Tensor& self, - const Scalar& p, - int64_t dim, - const Scalar& maxnorm) { - Tensor out; - renorm_meta(self, p, dim, maxnorm, out); - renorm_impl(self, p, dim, maxnorm, out); - return out; +namespace native { +REGISTER_XPU_DISPATCH( + renorm_scale_factor_stub, + &xpu::renorm_scale_factor_kernel); } } // namespace at diff --git a/src/ATen/native/xpu/PinnedMemoryAllocator.cpp b/src/ATen/native/xpu/PinnedMemoryAllocator.cpp new file mode 100644 index 000000000..a12b686b2 --- /dev/null +++ b/src/ATen/native/xpu/PinnedMemoryAllocator.cpp @@ -0,0 +1,38 @@ +#include + +#include +#include + +#include + +namespace at { +namespace native { +// Note: The user must call is_pinned(device='xpu') to explicitly call here. +bool is_pinned_xpu(const Tensor& self, c10::optional device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + !device.has_value() || device->type() == c10::DeviceType::XPU); + + return at::detail::getXPUHooks().isPinnedPtr(self.storage().data()); +} + +// Note: The user must call tensor.pin_memory(device='xpu') to explicitly call +// here. +Tensor _pin_memory_xpu(const Tensor& self, c10::optional device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + !device.has_value() || device->type() == c10::DeviceType::XPU); + + auto* allocator = at::xpu::getPinnedMemoryAllocator(); + auto storage = c10::Storage( + c10::Storage::use_byte_size_t(), + at::detail::computeStorageNbytes( + self.sizes(), self.strides(), self.dtype().itemsize()), + allocator, + /*resizable=*/false); + auto tensor = at::cpu::empty({0}, self.options()) + .set_(storage, 0, self.sizes(), self.strides()); + tensor.copy_(self); + return tensor; +} + +} // namespace native +} // namespace at diff --git a/src/ATen/native/xpu/PointwiseOps.cpp b/src/ATen/native/xpu/PointwiseOps.cpp index a01bdc391..f95a90a93 100644 --- a/src/ATen/native/xpu/PointwiseOps.cpp +++ b/src/ATen/native/xpu/PointwiseOps.cpp @@ -1,109 +1,11 @@ #include +#include #include -#include - #include namespace at { - -TensorIterator addcdiv_meta( - const Tensor& self, - const Tensor& tensor1, - const Tensor& tensor2, - const Scalar& value, - Tensor& out) { - if (isIntegralType(tensor1.scalar_type(), /*includeBool=*/true) && - isIntegralType(tensor2.scalar_type(), /*includeBool=*/true)) { - TORCH_CHECK( - false, - "Integer division with addcdiv is no longer supported, and in a future ", - "release addcdiv will perform a true division of tensor1 and tensor2. ", - "The historic addcdiv behavior can be implemented as ", - "(input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) ", - "for integer inputs and as ", - "(input + value * tensor1 / tensor2) for float inputs. 
", - "The future addcdiv behavior is just the latter implementation: ", - "(input + value * tensor1 / tensor2), for all dtypes."); - } - - TensorIterator iter; - iter.build_ternary_op(out, self, tensor1, tensor2); - return iter; -} - -Tensor& XPUNativeFunctions::addcdiv_out( - const Tensor& self, - const Tensor& tensor1, - const Tensor& tensor2, - const Scalar& value, - Tensor& out) { - auto iter = addcdiv_meta(self, tensor1, tensor2, value, out); - native::xpu::addcdiv_kernel(iter, value); - return out; -} - -Tensor XPUNativeFunctions::addcdiv( - const Tensor& self, - const Tensor& tensor1, - const Tensor& tensor2, - const Scalar& value) { - Tensor out; - auto iter = addcdiv_meta(self, tensor1, tensor2, value, out); - native::xpu::addcdiv_kernel(iter, value); - return iter.output(); -} - -Tensor& XPUNativeFunctions::addcdiv_( - Tensor& self, - const Tensor& tensor1, - const Tensor& tensor2, - const Scalar& value) { - auto iter = addcdiv_meta(self, tensor1, tensor2, value, self); - native::xpu::addcdiv_kernel(iter, value); - return self; -} - -TensorIterator addcmul_meta( - const Tensor& self, - const Tensor& tensor1, - const Tensor& tensor2, - const Scalar& value, - Tensor& out) { - TensorIterator iter; - iter.build_ternary_op(out, self, tensor1, tensor2); - return iter; -} - -Tensor& XPUNativeFunctions::addcmul_out( - const Tensor& self, - const Tensor& tensor1, - const Tensor& tensor2, - const Scalar& value, - Tensor& out) { - auto iter = addcmul_meta(self, tensor1, tensor2, value, out); - native::xpu::addcmul_kernel(iter, value); - return out; -} - -Tensor XPUNativeFunctions::addcmul( - const Tensor& self, - const Tensor& tensor1, - const Tensor& tensor2, - const Scalar& value) { - Tensor out; - auto iter = addcmul_meta(self, tensor1, tensor2, value, out); - native::xpu::addcmul_kernel(iter, value); - return iter.output(); -} - -Tensor& XPUNativeFunctions::addcmul_( - Tensor& self, - const Tensor& tensor1, - const Tensor& tensor2, - const Scalar& value) { - auto iter = addcmul_meta(self, tensor1, tensor2, value, self); - native::xpu::addcmul_kernel(iter, value); - return self; -} - +namespace native { +REGISTER_XPU_DISPATCH(addcmul_stub, &xpu::addcmul_kernel); +REGISTER_XPU_DISPATCH(addcdiv_stub, &xpu::addcdiv_kernel); +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Pow.cpp b/src/ATen/native/xpu/Pow.cpp index 97dc5a0c2..4b88036db 100644 --- a/src/ATen/native/xpu/Pow.cpp +++ b/src/ATen/native/xpu/Pow.cpp @@ -1,118 +1,13 @@ #include #include #include +#include #include #include -#include namespace at { - -TensorIterator pow_tensor_tensor_meta( - const Tensor& base, - const Tensor& exp, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_binary_op(out, base, exp); - return iter; -} - -TensorIterator pow_tensor_scalar_meta( - const Tensor& base, - const Scalar& exp, - Tensor& out) { - // Numpy compatibility check: - TORCH_CHECK( - !(isIntegralType(base.scalar_type(), true) && exp.isIntegral(true) && - exp.toLong() < 0), - "Integers to negative integer powers are not allowed."); - - auto common_dtype = at::result_type(base, exp); - TensorIterator iter; - iter.build_output_borrowing_argument_owning_unary_op( - out, base.to(common_dtype)); - return iter; -} - -Tensor XPUNativeFunctions::pow(const Tensor& self, const Tensor& exponent) { - Tensor out; - auto iter = pow_tensor_tensor_meta(self, exponent, out); - native::xpu::pow_tensor_tensor_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::pow_(Tensor& self, const Tensor& exponent) { - 
auto iter = pow_tensor_tensor_meta(self, exponent, self); - native::xpu::pow_tensor_tensor_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::pow_out( - const Tensor& base, - const Tensor& exp, - Tensor& out) { - auto iter = pow_tensor_tensor_meta(base, exp, out); - native::xpu::pow_tensor_tensor_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::pow(const Tensor& self, const Scalar& exponent) { - Tensor out; - auto iter = pow_tensor_scalar_meta(self, exponent, out); - if (exponent.equal(0.0) || exponent.equal(false)) { - iter.output().fill_(1); - } else if (exponent.equal(1.0) || exponent.equal(true)) { - iter.output().copy_(self); - } else { - native::xpu::pow_tensor_scalar_kernel(iter, exponent); - } - return iter.output(); -} - -Tensor& XPUNativeFunctions::pow_(Tensor& self, const Scalar& exponent) { - auto iter = pow_tensor_scalar_meta(self, exponent, self); - if (exponent.equal(0.0) || exponent.equal(false)) { - self.fill_(1); - } else if (exponent.equal(1.0) || exponent.equal(true)) { - } else { - native::xpu::pow_tensor_scalar_kernel(iter, exponent); - } - return self; -} - -Tensor& XPUNativeFunctions::pow_out( - const Tensor& self, - const Scalar& exponent, - Tensor& out) { - auto iter = pow_tensor_scalar_meta(self, exponent, out); - if (exponent.equal(0.0) || exponent.equal(false)) { - out.fill_(1); - } else if (exponent.equal(1.0) || exponent.equal(true)) { - out.copy_(self); - } else { - native::xpu::pow_tensor_scalar_kernel(iter, exponent); - } - return out; -} - -Tensor XPUNativeFunctions::pow(const Scalar& self, const Tensor& exponent) { - Tensor out; - auto iter = TensorIterator::binary_op( - out, native::wrapped_scalar_tensor(self), exponent); - native::xpu::pow_tensor_tensor_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::pow_out( - const Scalar& self, - const Tensor& exponent, - Tensor& out) { - if (self.equal(1.0)) { - out.fill_(1); - } else { - return XPUNativeFunctions::pow_out( - native::wrapped_scalar_tensor(self), exponent, out); - } - return out; -} - +namespace native { +REGISTER_XPU_DISPATCH(pow_tensor_tensor_stub, &xpu::pow_tensor_tensor_kernel); +REGISTER_XPU_DISPATCH(pow_tensor_scalar_stub, &xpu::pow_tensor_scalar_kernel); +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp index 3aca0d5c7..ad0a6ffc6 100644 --- a/src/ATen/native/xpu/RangeFactories.cpp +++ b/src/ATen/native/xpu/RangeFactories.cpp @@ -1,17 +1,19 @@ -#include #include #include #include #include #include +#include +#include #include #include -#include +#include #include namespace at { -Tensor& XPUNativeFunctions::arange_out( +namespace native { +Tensor& arange_out_xpu( const Scalar& start, const Scalar& end, const Scalar& step, @@ -83,10 +85,10 @@ Tensor& XPUNativeFunctions::arange_out( } }); - return at::native::xpu::arange_kernel(start, end, step, out); + return xpu::arange_kernel(start, end, step, out); } -Tensor& XPUNativeFunctions::range_out( +Tensor& range_xpu_out( const Scalar& start, const Scalar& end, const Scalar& step, @@ -113,4 +115,5 @@ Tensor& XPUNativeFunctions::range_out( return at::native::xpu::range_kernel(start, end, step, out); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/ReduceAllOps.cpp b/src/ATen/native/xpu/ReduceAllOps.cpp index 9d5e6a1d3..9719d51fa 100644 --- a/src/ATen/native/xpu/ReduceAllOps.cpp +++ b/src/ATen/native/xpu/ReduceAllOps.cpp @@ -1,11 +1,12 @@ -#include #include +#include +#include #include #include 
-#include - #include #include +#include +#include namespace at { @@ -16,28 +17,6 @@ void min_all_kernel_impl(Tensor& result, const Tensor& input) { native::xpu::min_all_kernel(iter); } -Tensor XPUNativeFunctions::min(const Tensor& self) { - TORCH_CHECK( - self.numel() > 0, - "min(): Expected reduction dim to be specified for input.numel() == 0. Specify the reduction dim with the 'dim' argument."); - Tensor result = at::empty({}, self.options()); - min_all_kernel_impl(result, self.contiguous()); - return result; -} - -Tensor& XPUNativeFunctions::min_out(const Tensor& self, Tensor& out) { - // First check if the devices match (CPU vs GPU) - TORCH_CHECK(self.device() == out.device()); - - TORCH_CHECK(canCast( - typeMetaToScalarType(self.dtype()), typeMetaToScalarType(out.dtype()))); - - at::native::resize_output(out, {}); - - min_all_kernel_impl(out, self.contiguous()); - return out; -} - void max_all_kernel_impl(Tensor& result, const Tensor& input) { auto dtype = input.scalar_type(); auto iter = native::make_reduction( @@ -45,33 +24,9 @@ void max_all_kernel_impl(Tensor& result, const Tensor& input) { native::xpu::max_all_kernel(iter); } -Tensor XPUNativeFunctions::max(const Tensor& self) { - TORCH_CHECK( - self.numel() > 0, - "max(): Expected reduction dim to be specified for input.numel() == 0. Specify the reduction dim with the 'dim' argument."); - Tensor result = at::empty({}, self.options()); - max_all_kernel_impl(result, self.contiguous()); - return result; -} - -Tensor& XPUNativeFunctions::max_out(const Tensor& self, Tensor& out) { - // First check if the devices match (CPU vs GPU) - TORCH_CHECK(self.device() == out.device()); - - TORCH_CHECK(canCast( - typeMetaToScalarType(self.dtype()), typeMetaToScalarType(out.dtype()))); - - at::native::resize_output(out, {}); - - max_all_kernel_impl(out, self.contiguous()); - return out; -} - -std::tuple XPUNativeFunctions::_aminmax(const Tensor& self) { - TORCH_WARN_ONCE( - "_aminmax is deprecated as of PyTorch 1.11 and will be removed in a future release. Use aminmax instead." - " This warning will only appear once per process."); - return XPUNativeFunctions::aminmax(self, {}, false); -} +namespace native { +REGISTER_XPU_DISPATCH(min_all_stub, &min_all_kernel_impl); +REGISTER_XPU_DISPATCH(max_all_stub, &max_all_kernel_impl); +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/ReduceOps.cpp b/src/ATen/native/xpu/ReduceOps.cpp index 834bb3a04..db72e5fbb 100644 --- a/src/ATen/native/xpu/ReduceOps.cpp +++ b/src/ATen/native/xpu/ReduceOps.cpp @@ -1,608 +1,48 @@ -#include + #include #include #include #include #include + +#include #include +#include +#include #include #include #include +#include #include #include #include #include #include -#include #include +#include -namespace at { - -using namespace at::xpu; - -template -void impl_func_cum_ops( - const Tensor& self, - int64_t dim, - const Tensor& result, - Stub& stub) { - NoNamesGuard guard; - if (self.dim() == 0) { - result.fill_(self); - } else if (self.numel() == 0) { - result.zero_(); - } else { - dim = maybe_wrap_dim(dim, self.dim()); - stub(result, self.to(result.scalar_type()), dim); - } -} - -static void cum_ops_meta( - const char* name, - const Tensor& self, - int64_t dim, - std::optional dtype, - Tensor& result) { - // Checking whether 'dim' is valid. 
- maybe_wrap_dim(dim, self.dim()); - - ScalarType out_dtype; - if (result.defined()) { - out_dtype = dtype.value_or(result.scalar_type()); - at::xpu::resize_out( - result, self.sizes(), {}, self.options().dtype(out_dtype)); - } else { - auto is_integral = - at::isIntegralType(self.scalar_type(), /*includeBool=*/true); - out_dtype = - dtype.value_or(is_integral ? ScalarType::Long : self.scalar_type()); - result = - at::xpu::create_out(self.sizes(), {}, self.options().dtype(out_dtype)); - } - - namedinference::propagate_names(result, self); -} - -Tensor& XPUNativeFunctions::cumsum_out( - const Tensor& self, - int64_t dim, - c10::optional dtype, - Tensor& result) { - cum_ops_meta("cumsum", self, dim, dtype, result); - - impl_func_cum_ops(self, dim, result, at::native::xpu::cumsum_kernel); - return result; -} - -Tensor XPUNativeFunctions::cumsum( - const Tensor& self, - int64_t dim, - c10::optional dtype) { - Tensor result; - return XPUNativeFunctions::cumsum_out(self, dim, dtype, result); -} - -Tensor& XPUNativeFunctions::cumsum_( - Tensor& self, - int64_t dim, - c10::optional dtype) { - return XPUNativeFunctions::cumsum_out(self, dim, dtype, self); -} - -Tensor& XPUNativeFunctions::cumprod_out( - const Tensor& self, - int64_t dim, - c10::optional dtype, - Tensor& result) { - cum_ops_meta("cumprod", self, dim, dtype, result); - - impl_func_cum_ops(self, dim, result, at::native::xpu::cumprod_kernel); - return result; -} - -Tensor XPUNativeFunctions::cumprod( - const Tensor& self, - int64_t dim, - c10::optional dtype) { - Tensor result; - return XPUNativeFunctions::cumprod_out(self, dim, dtype, result); -} - -Tensor& XPUNativeFunctions::cumprod_( - Tensor& self, - int64_t dim, - c10::optional dtype) { - return XPUNativeFunctions::cumprod_out(self, dim, dtype, self); -} - -static ScalarType infer_dtype_from_optional( - const Tensor& self, - const optional& opt_dtype, - const Tensor& result) { - // 'opt_dtype' has the priority for both cases. - if (result.defined()) { - // Otherwise, get the result type, if defined. - return opt_dtype.value_or(result.scalar_type()); - } else { - // Last case is to get the self type. - // If the self type is an integer, we promote it to kLong. - return at::native::get_dtype_from_self(self, opt_dtype, true); - } -} - -inline bool should_use_acc_buffer(at::TensorIterator& iter) { - const auto ndim = iter.ndim(); - if (!iter.device().is_cpu() || iter.noutputs() != 1) { - return false; - } - if (!at::isReducedFloatingType(iter.common_dtype())) { - return false; - } - if (ndim < 2) { - return false; - } - auto out_strides = iter.strides(0); - for (const auto dim : c10::irange(0, 2)) { - if (out_strides[dim] != 0) { - return false; - } - } - return true; -} - -Tensor& XPUNativeFunctions::sum_out( - const Tensor& self, - OptionalIntArrayRef opt_dim, - bool keepdim, - c10::optional opt_dtype, - Tensor& result) { - auto out_dtype = infer_dtype_from_optional(self, opt_dtype, result); - result = resize_reduction(result, self, opt_dim, keepdim, out_dtype); - auto iter = meta::make_reduction_from_out_ty( - self, result, opt_dim, keepdim, result.scalar_type()); - if (iter.numel() == 0) { - result.zero_(); - } else { - // Here is a limitation of TensorIterator reductions for permuted input with - // lower precision on CPU. Consider the case: TensorIterator coalesces such - // input and output to >= 2 dims tensors, and the output stride is [0, 0, x, - // x, ...] with x >= 0 (two reduced dimensions and non-reduced dims). 
Since - // the reduction loop only operates on two dimensions at a time, the - // intermediate sums is forced to do accumulation in the second reduced dim - // with lower precision. See https://github.com/pytorch/pytorch/issues/83149 - if (should_use_acc_buffer(iter)) { - auto tmp_output = - at::empty(result.sizes(), result.options().dtype(kFloat)); - at::sum_outf( - self.to(ScalarType::Float), - opt_dim, - keepdim, - /*dtype=*/c10::nullopt, - tmp_output); - result.copy_(tmp_output); - } else { - native::xpu::sum_kernel(iter); - } - } - return result; -} - -Tensor XPUNativeFunctions::sum( - const Tensor& self, - OptionalIntArrayRef dim, - bool keepdim, - c10::optional opt_dtype) { - Tensor out; - return XPUNativeFunctions::sum_out(self, dim, keepdim, opt_dtype, out); -} - -Tensor& prod_meta( - const Tensor& self, - int64_t dim, - bool keepdim, - std::optional dtype, - Tensor& result) { - auto out_dtype = infer_dtype_from_optional(self, dtype, result); - result = resize_reduction(result, self, dim, keepdim, out_dtype); - return result; -} - -static void impl_func_prod( - const Tensor& self, - IntArrayRef dims, - bool keepdim, - std::optional dtype, - Tensor& result) { - auto iter = meta::make_reduction_from_out_ty( - self, result, dims, keepdim, result.scalar_type()); - if (iter.numel() == 0) { - result.fill_(1); - } else { - native::xpu::prod_kernel(iter); - } -} - -Tensor& XPUNativeFunctions::prod_out( - const Tensor& self, - int64_t dim, - bool keepdim, - std::optional dtype, - Tensor& result) { - result = prod_meta(self, dim, keepdim, dtype, result); - impl_func_prod(self, dim, keepdim, dtype, result); - return result; -} - -Tensor XPUNativeFunctions::prod( - const Tensor& self, - std::optional opt_dtype) { - auto dtype = at::native::get_dtype_from_self(self, opt_dtype, true); - auto shape = meta::get_reduction_shape(self, {}, false); - Tensor result = at::empty(shape, self.options().dtype(dtype)); - impl_func_prod(self, {}, false, dtype, result); - return result; -} - -Tensor XPUNativeFunctions::prod( - const Tensor& self, - int64_t dim, - bool keepdim, - std::optional dtype) { - Tensor result; - result = prod_meta(self, dim, keepdim, dtype, result); - impl_func_prod(self, dim, keepdim, dtype, result); - return result; -} - -Tensor& mean_meta( - const Tensor& self, - OptionalIntArrayRef opt_dim, - bool keepdim, - optional opt_dtype, - Tensor& out) { - auto in_dtype = at::native::get_dtype_from_self(self, opt_dtype, true); - if (!at::isFloatingType(in_dtype) && !at::isComplexType(in_dtype)) { - std::string what = "Input"; - std::string dtype = toString(self.scalar_type()); - - if (opt_dtype.has_value()) { - what = "Optional"; - dtype = toString(opt_dtype.value()); - } - - TORCH_CHECK( - false, - "mean(): could not infer output dtype. ", - what, - " dtype must be either a floating point or complex dtype. 
", - "Got: ", - dtype); - } - - auto out_dtype = infer_dtype_from_optional(self, opt_dtype, out); - out = resize_reduction(out, self, opt_dim, keepdim, out_dtype); - return out; -} - -Tensor& XPUNativeFunctions::mean_out( - const Tensor& self, - OptionalIntArrayRef opt_dim, - bool keepdim, - c10::optional opt_dtype, - Tensor& result) { - result = mean_meta(self, opt_dim, keepdim, opt_dtype, result); - ScalarType dtype = result.scalar_type(); - // device is not CPU - auto iter = at::meta::make_reduction_from_out_ty( - self, result, opt_dim, keepdim, dtype); - if (iter.numel() == 0) { - result.fill_(std::numeric_limits::quiet_NaN()); - } else { - native::xpu::mean_kernel(iter); - } - return result; -} - -Tensor XPUNativeFunctions::mean( - const Tensor& self, - OptionalIntArrayRef dim, - bool keepdim, - ::std::optional dtype) { - Tensor out; - out = mean_meta(self, dim, keepdim, dtype, out); - out = XPUNativeFunctions::mean_out(self, dim, keepdim, dtype, out); - return out; -} - -inline TensorIterator get_allany_iter( - const Tensor& self, - const Tensor& result, - OptionalIntArrayRef dims, - bool keepdim) { - return meta::make_reduction_from_out_ty( - self, result, dims, keepdim, result.scalar_type()); -} - -template -inline void allany_impl( - const Tensor& self, - const Tensor& result, - OptionalIntArrayRef dims, - bool keepdim, - Stub& stub) { - if (self.numel() == 0) { - result.fill_(identity); - } else if (self.numel() == 1) { - result.copy_(self.view_as(result).to(at::kBool)); - } else { - auto iter = get_allany_iter(self, result, dims, keepdim); - stub(iter); - } -} - -static ScalarType get_result_or_bytebool_dtype( - const Tensor& self, - const Tensor& result) { - // Refer [all, any : uint8 compatibility] - if (result.defined()) { - return result.scalar_type(); - } else { - return (self.scalar_type() == kByte) ? 
kByte : kBool; - } -} - -static void check_result_is_bytebool( - const char* name, - const Tensor& self, - const Tensor& result) { - if (result.defined()) { - // Refer [all, any : uint8 compatibility] - TORCH_CHECK( - result.scalar_type() == ScalarType::Bool || - result.scalar_type() == ScalarType::Byte, - name, - " only supports bool tensor for result, got: ", - result.scalar_type()); - } -} - -Tensor& allany_meta( - Tensor& result, - const char* name, - const Tensor& self, - OptionalIntArrayRef dims, - bool keepdim) { - check_result_is_bytebool(name, self, result); - auto out_dtype = get_result_or_bytebool_dtype(self, result); - result = resize_reduction( - result, self, dims, keepdim, out_dtype, /*allow_empty_dims=*/true); - return result; -} - -// aten::all.dim -Tensor XPUNativeFunctions::all(const Tensor& self, int64_t dim, bool keepdim) { - Tensor out; - out = allany_meta(out, "all", self, dim, keepdim); - allany_impl<1>(self, out, dim, keepdim, native::xpu::and_kernel); - return out; -} - -// aten::all.out -Tensor& XPUNativeFunctions::all_out( - const Tensor& self, - int64_t dim, - bool keepdim, - Tensor& out) { - out = allany_meta(out, "all", self, dim, keepdim); - allany_impl<1>(self, out, dim, keepdim, native::xpu::and_kernel); - return out; -} - -// aten::all.dims -Tensor XPUNativeFunctions::all( - const Tensor& self, - OptionalIntArrayRef dim, - bool keepdim) { - Tensor out; - out = allany_meta(out, "all", self, dim, keepdim); - allany_impl<1>(self, out, dim, keepdim, native::xpu::and_kernel); - return out; -} - -// aten::all.dims_out -Tensor& XPUNativeFunctions::all_out( - const Tensor& self, - OptionalIntArrayRef dim, - bool keepdim, - Tensor& out) { - out = allany_meta(out, "all", self, dim, keepdim); - allany_impl<1>(self, out, dim, keepdim, native::xpu::and_kernel); - return out; -} - -// aten::all -Tensor XPUNativeFunctions::all(const Tensor& self) { - Tensor out; - out = allany_meta(out, "all", self, {}, false); - allany_impl<1>(self, out, {}, false, native::xpu::and_kernel); - return out; -} - -// aten::all.all_out -Tensor& XPUNativeFunctions::all_out(const Tensor& self, Tensor& out) { - out = allany_meta(out, "all", self, {}, false); - allany_impl<1>(self, out, {}, false, native::xpu::and_kernel); - return out; -} - -// aten::any.dim -Tensor XPUNativeFunctions::any(const Tensor& self, int64_t dim, bool keepdim) { - Tensor out; - out = allany_meta(out, "any", self, dim, keepdim); - allany_impl<0>(self, out, dim, keepdim, native::xpu::or_kernel); - return out; -} - -// aten::any.out -Tensor& XPUNativeFunctions::any_out( - const Tensor& self, - int64_t dim, - bool keepdim, - Tensor& out) { - out = allany_meta(out, "any", self, dim, keepdim); - allany_impl<0>(self, out, dim, keepdim, native::xpu::or_kernel); - return out; -} - -// aten::any.dims -Tensor XPUNativeFunctions::any( - const Tensor& self, - OptionalIntArrayRef dim, - bool keepdim) { - Tensor out; - out = allany_meta(out, "any", self, dim, keepdim); - allany_impl<0>(self, out, dim, keepdim, native::xpu::or_kernel); - return out; -} - -// aten::any.dims_out -Tensor& XPUNativeFunctions::any_out( - const Tensor& self, - OptionalIntArrayRef dim, - bool keepdim, - Tensor& out) { - out = allany_meta(out, "any", self, dim, keepdim); - allany_impl<0>(self, out, dim, keepdim, native::xpu::or_kernel); - return out; -} - -// aten::any -Tensor XPUNativeFunctions::any(const Tensor& self) { - Tensor out; - out = allany_meta(out, "any", self, {}, false); - allany_impl<0>(self, out, {}, false, native::xpu::or_kernel); - return 
out; -} +#include +#include +#include +#include +#include -// aten::any.any_out -Tensor& XPUNativeFunctions::any_out(const Tensor& self, Tensor& out) { - out = allany_meta(out, "any", self, {}, false); - allany_impl<0>(self, out, {}, false, native::xpu::or_kernel); - return out; -} - -template -void argmax_argmin_impl( - const Tensor& self, - c10::optional dim, - bool keepdim, - const Tensor& result, - Stub& stub) { - c10::MaybeOwned in; - DimVector dims; - int64_t _dim = 0; - - if (dim.has_value()) { - _dim = maybe_wrap_dim(dim.value(), self.dim()); - auto sizes = self.sizes(); - - if (sizes[_dim] == 1) { - result.fill_(0); - return; - } - - dims = IntArrayRef(_dim); - in = c10::MaybeOwned::borrowed(self); - } else { - in = c10::MaybeOwned::owned(self.reshape({-1})); - keepdim = false; - } - - auto iter = - meta::make_reduction(*in, result, dims, keepdim, self.scalar_type()); - - if (iter.numel() != 0) { - stub(iter); - } -} - -static void check_argmax_argmin( - const char* name, - const Tensor& self, - const c10::optional& dim) { - if (dim.has_value()) { - auto dim_ = maybe_wrap_dim(dim.value(), self.dim()); - native::zero_numel_check_dims(self, dim_, name); - } else { - TORCH_CHECK_INDEX( - self.numel() != 0, - name, - ": Expected reduction dim to be specified for input.numel() == 0."); - } -} - -static IntArrayRef optional_to_arrayref(const c10::optional& opt) { - return opt.has_value() ? opt.value() : IntArrayRef{}; -} - -Tensor& argmax_meta( - const Tensor& self, - c10::optional dim, - bool keepdim, - Tensor& out) { - check_argmax_argmin("argmax()", self, dim); - return resize_reduction(out, self, optional_to_arrayref(dim), keepdim, kLong); -} - -Tensor& XPUNativeFunctions::argmax_out( - const Tensor& self, - c10::optional dim, - bool keepdim, - Tensor& out) { - out = argmax_meta(self, dim, keepdim, out); - argmax_argmin_impl(self, dim, keepdim, out, native::xpu::argmax_kernel); - return out; -} - -Tensor XPUNativeFunctions::argmax( - const Tensor& self, - c10::optional dim, - bool keepdim) { - Tensor out; - out = argmax_meta(self, dim, keepdim, out); - argmax_argmin_impl(self, dim, keepdim, out, native::xpu::argmax_kernel); - return out; -} - -Tensor& argmin_meta( - const Tensor& self, - c10::optional dim, - bool keepdim, - Tensor& out) { - check_argmax_argmin("argmin()", self, dim); - return resize_reduction(out, self, optional_to_arrayref(dim), keepdim, kLong); -} - -Tensor& XPUNativeFunctions::argmin_out( - const Tensor& self, - c10::optional dim, - bool keepdim, - Tensor& out) { - out = argmin_meta(self, dim, keepdim, out); - argmax_argmin_impl(self, dim, keepdim, out, native::xpu::argmin_kernel); - return out; -} - -Tensor XPUNativeFunctions::argmin( - const Tensor& self, - c10::optional dim, - bool keepdim) { - Tensor out; - out = argmin_meta(self, dim, keepdim, out); - argmax_argmin_impl(self, dim, keepdim, out, native::xpu::argmin_kernel); - return out; -} +namespace at { +namespace native { +REGISTER_XPU_DISPATCH(sum_stub, &xpu::sum_kernel); +REGISTER_XPU_DISPATCH(mean_stub, &xpu::mean_kernel); +REGISTER_XPU_DISPATCH(prod_stub, &xpu::prod_kernel); +REGISTER_XPU_DISPATCH(argmax_stub, &xpu::argmax_kernel); +REGISTER_XPU_DISPATCH(argmin_stub, &xpu::argmin_kernel); +REGISTER_XPU_DISPATCH(and_stub, &xpu::and_kernel); +REGISTER_XPU_DISPATCH(or_stub, &xpu::or_kernel); +REGISTER_XPU_DISPATCH(max_values_stub, &xpu::max_values_kernel); +REGISTER_XPU_DISPATCH(min_values_stub, &xpu::min_values_kernel); +REGISTER_XPU_DISPATCH(std_var_stub, &xpu::std_var_kernel); 
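[Editor's note: the REGISTER_XPU_DISPATCH registrations continue in the hunk below. As a rough illustration of the pattern these added lines rely on, here is a minimal, self-contained sketch of a dispatch-stub mechanism. It is not ATen's actual DispatchStub implementation; FakeIter, DispatchStub, reduce_fn and RegisterSumXpu are simplified stand-ins, and only the overall shape (a shared stub, a backend slot filled at static-initialization time, generic code calling through the stub) mirrors what REGISTER_XPU_DISPATCH does here. The practical effect in this patch is that the shared at::native operator implementations pick up the XPU kernels without per-backend wrapper functions like the removed XPUNativeFunctions ones.]

// Minimal sketch of the dispatch-stub idea (NOT ATen's real DispatchStub).
#include <cstdio>
#include <stdexcept>

struct FakeIter {};                     // stand-in for at::TensorIterator
using reduce_fn = void (*)(FakeIter&);  // hypothetical kernel signature

struct DispatchStub {
  reduce_fn xpu_impl = nullptr;         // slot for the XPU backend
  void operator()(FakeIter& it) {
    if (!xpu_impl) throw std::runtime_error("no kernel registered");
    xpu_impl(it);                       // forward to whichever backend registered
  }
};

DispatchStub sum_stub;                  // shared stub used by generic operator code

namespace xpu {
void sum_kernel(FakeIter&) { std::puts("xpu sum kernel"); }
} // namespace xpu

// Roughly what a REGISTER_*_DISPATCH macro boils down to: a static object whose
// constructor plugs the backend kernel into the shared stub.
static struct RegisterSumXpu {
  RegisterSumXpu() { sum_stub.xpu_impl = &xpu::sum_kernel; }
} register_sum_xpu;

int main() {
  FakeIter it;
  sum_stub(it);                         // dispatches to xpu::sum_kernel
}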
+REGISTER_XPU_DISPATCH(cumsum_stub, &xpu::cumsum_kernel); +REGISTER_XPU_DISPATCH(cumprod_stub, &xpu::cumprod_kernel); +REGISTER_XPU_DISPATCH(nansum_stub, &xpu::nansum_kernel); static inline void warn_invalid_degrees_of_freedom( const char* fname, @@ -793,7 +233,7 @@ static inline TensorOptions options_to_value_type(TensorOptions opts) { return opts.dtype(c10::toRealValueType(scalar_type)); } -Tensor XPUNativeFunctions::std( +Tensor std_xpu( const Tensor& self, at::OptionalIntArrayRef dim, const std::optional& correction, @@ -802,7 +242,7 @@ Tensor XPUNativeFunctions::std( return std_var_out("std", result, self, dim, correction, keepdim, true); } -Tensor& XPUNativeFunctions::std_out( +Tensor& std_xpu_out( const Tensor& self, at::OptionalIntArrayRef dim, const std::optional& correction, @@ -811,7 +251,7 @@ Tensor& XPUNativeFunctions::std_out( return std_var_out("std", result, self, dim, correction, keepdim, true); } -Tensor& XPUNativeFunctions::var_out( +Tensor& var_xpu_out( const Tensor& self, at::OptionalIntArrayRef dim, const std::optional& correction, @@ -820,7 +260,7 @@ Tensor& XPUNativeFunctions::var_out( return std_var_out("var", result, self, dim, correction, keepdim, false); } -Tensor XPUNativeFunctions::var( +Tensor var_xpu( const Tensor& self, at::OptionalIntArrayRef dim, const std::optional& correction, @@ -829,7 +269,7 @@ Tensor XPUNativeFunctions::var( return std_var_out("var", result, self, dim, correction, keepdim, false); } -std::tuple XPUNativeFunctions::var_mean( +std::tuple var_mean_xpu( const Tensor& self, at::OptionalIntArrayRef dim, const std::optional& correction, @@ -840,7 +280,7 @@ std::tuple XPUNativeFunctions::var_mean( "var_mean", result1, result2, self, dim, correction, keepdim, false); } -std::tuple XPUNativeFunctions::std_mean( +std::tuple std_mean_xpu( const Tensor& self, at::OptionalIntArrayRef dim, const std::optional& correction, @@ -851,354 +291,31 @@ std::tuple XPUNativeFunctions::std_mean( "std_mean", result1, result2, self, dim, correction, keepdim, true); } -static Tensor& amax_amin_meta( - Tensor& result, - const char* name, - const Tensor& self, - IntArrayRef dim, - bool keepdim) { - if (result.defined()) { - TORCH_CHECK( - self.scalar_type() == result.scalar_type(), - "Expected the dtype for input and out to match, but got ", - self.scalar_type(), - " for input's dtype and ", - result.scalar_type(), - " for out's dtype."); - } - if (self.numel() == 0) { - at::native::zero_numel_check_dims(self, dim, "amax()"); - } - const ScalarType& out_dtype = - result.defined() ? 
result.scalar_type() : self.scalar_type(); - return resize_reduction(result, self, dim, keepdim, out_dtype); -} - -template -void amax_amin_impl( - const Tensor& self, - IntArrayRef dim, - bool keepdim, - const Tensor& result, - Stub& stub) { - auto iter = - meta::make_reduction(self, result, dim, keepdim, self.scalar_type()); - - if (iter.numel() != 0) { - stub(iter); - } -} - -Tensor& XPUNativeFunctions::amax_out( - const Tensor& self, - IntArrayRef dim, - bool keepdim, - Tensor& out) { - out = amax_amin_meta(out, "amax()", self, dim, keepdim); - amax_amin_impl(self, dim, keepdim, out, native::xpu::max_all_kernel); - return out; -} - -Tensor XPUNativeFunctions::amax( - const Tensor& self, - IntArrayRef dim, - bool keepdim) { - Tensor out; - out = amax_amin_meta(out, "amax()", self, dim, keepdim); - amax_amin_impl(self, dim, keepdim, out, native::xpu::max_all_kernel); - return out; -} - -Tensor& XPUNativeFunctions::amin_out( - const Tensor& self, - IntArrayRef dim, - bool keepdim, - Tensor& out) { - out = amax_amin_meta(out, "amin()", self, dim, keepdim); - amax_amin_impl(self, dim, keepdim, out, native::xpu::min_all_kernel); - return out; -} - -Tensor XPUNativeFunctions::amin( - const Tensor& self, - IntArrayRef dim, - bool keepdim) { - Tensor out; - out = amax_amin_meta(out, "amin()", self, dim, keepdim); - amax_amin_impl(self, dim, keepdim, out, native::xpu::min_all_kernel); - return out; -} - -Tensor& XPUNativeFunctions::nansum_out( - const Tensor& self, - at::OptionalIntArrayRef dim, - bool keepdim, - optional opt_dtype, - Tensor& result) { - // For integral types, use existing sum as - // integral types don't have `Nan`. - if (c10::isIntegralType(self.scalar_type(), true)) { - return at::sum_out(result, self, dim, keepdim, opt_dtype); - } - - auto out_dtype = infer_dtype_from_optional(self, opt_dtype, result); - result = resize_reduction(result, self, dim, keepdim, out_dtype); - auto iter = meta::make_reduction_from_out_ty( - self, result, dim, keepdim, result.scalar_type()); - - if (iter.numel() == 0) { - result = result.zero_(); - } else { - native::xpu::nansum_kernel(iter); - } - return result; -} - -Tensor XPUNativeFunctions::nansum( - const Tensor& self, - at::OptionalIntArrayRef dim, - bool keepdim, - std::optional opt_dtype) { - Tensor result; - return XPUNativeFunctions::nansum_out(self, dim, keepdim, opt_dtype, result); -} - -static ScalarType get_result_or_self_value_dtype( - const Tensor& self, - const Tensor& result, - const std::optional& dtype) { - if (result.defined()) { - return result.scalar_type(); - } else { - return dtype.value_or(toRealValueType(self.scalar_type())); - } -} - -Tensor& norm_scalaropt_dim_dtype_meta( - const Tensor& self, - const OptionalScalarRef p, - IntArrayRef dim, - bool keepdim, - ScalarType dtype, - Tensor& result) { - TORCH_CHECK( - at::isFloatingType(dtype) || at::isComplexType(dtype), - "norm(): the desired output dtype should be either floating point or complex. " - "Got ", - dtype, - " instead."); - auto out_dtype = get_result_or_self_value_dtype(self, result, dtype); - return resize_reduction(result, self, dim, keepdim, out_dtype); -} - -static void impl_func_norm( - const Tensor& self, - const OptionalScalarRef& opt_p, - IntArrayRef dim, - bool keepdim, - optional opt_dtype, - const Tensor& result) { - // Left this implementation without deprecating it as it is called in a number - // of places in the codebase. We should swap those by linalg_vector_norm - auto p = opt_p.has_value() ? 
opt_p.get() : Scalar(2.0).to(); - at::linalg_vector_norm_out( - const_cast(result), self, p, dim, keepdim, opt_dtype); -} - -Tensor XPUNativeFunctions::norm( - const Tensor& self, - const std::optional& p, - IntArrayRef dim, - bool keepdim, - ScalarType dtype) { - Tensor result; - auto p_ = - (p.has_value() ? at::OptionalScalarRef(&(p.value())) - : at::OptionalScalarRef()); - result = norm_scalaropt_dim_dtype_meta(self, p_, dim, keepdim, dtype, result); - impl_func_norm(self, p_, dim, keepdim, dtype, result); - return result; -} - -Tensor& XPUNativeFunctions::norm_out( - const Tensor& self, - const std::optional& p, - IntArrayRef dim, - bool keepdim, - ScalarType dtype, - Tensor& result) { - auto p_ = - (p.has_value() ? at::OptionalScalarRef(&(p.value())) - : at::OptionalScalarRef()); - result = norm_scalaropt_dim_dtype_meta(self, p_, dim, keepdim, dtype, result); - impl_func_norm(self, p_, dim, keepdim, dtype, result); - return result; -} - -Tensor& norm_scalaropt_dim_meta( - const Tensor& self, - const OptionalScalarRef p, - IntArrayRef dim, - bool keepdim, - Tensor& result) { - TORCH_CHECK( - at::isFloatingType(self.scalar_type()) || - at::isComplexType(self.scalar_type()), - "norm(): input dtype should be either floating point or complex. " - "Got ", - self.scalar_type(), - " instead."); - - auto out_dtype = get_result_or_self_value_dtype(self, result, c10::nullopt); - return resize_reduction(result, self, dim, keepdim, out_dtype); -} - -Tensor XPUNativeFunctions::norm( - const Tensor& self, - const std::optional& p, - IntArrayRef dim, - bool keepdim) { - auto p_ = - (p.has_value() ? at::OptionalScalarRef(&(p.value())) - : at::OptionalScalarRef()); - Tensor result; - result = norm_scalaropt_dim_meta(self, p_, dim, keepdim, result); - impl_func_norm(self, p_, dim, keepdim, c10::nullopt, result); - return result; -} - -Tensor& XPUNativeFunctions::norm_out( - const Tensor& self, - const std::optional& p, - IntArrayRef dim, - bool keepdim, - Tensor& result) { - auto p_ = - (p.has_value() ? at::OptionalScalarRef(&(p.value())) - : at::OptionalScalarRef()); - result = norm_scalaropt_dim_meta(self, p_, dim, keepdim, result); - impl_func_norm(self, p_, dim, keepdim, c10::nullopt, result); - return result; -} - -TensorIterator meta_aminmax( - const Tensor& self, - std::optional dim_opt, - bool keepdim, - Tensor& min, - Tensor& max) { - TensorIterator iter; - auto dtype = self.scalar_type(); - DimVector shape; - if (dim_opt.has_value()) { - auto dim = maybe_wrap_dim(dim_opt.value(), self.ndimension()); - native::zero_numel_check_dims(self, dim, "aminmax"); - shape = meta::get_reduction_shape(self, dim, keepdim); - iter = at::native::make_reduction( - "aminmax_xpu", min, max, self, dim, keepdim, dtype); - } else { - TORCH_CHECK( - self.numel() > 0, - "aminmax(): cannot compute aminmax over an empty dimension as the " - "operation has no identity."); - if (keepdim) { - shape = DimVector(self.ndimension(), 1); - } - iter = at::native::make_reduction( - "aminmax_xpu", - min, - max, - self.contiguous(), - IntArrayRef{}, - false, - dtype); - } - const auto options = self.options(); - iter.set_output_raw_strided( - 0, shape, {}, options, min.has_names() ? min.names() : DimnameList{}); - iter.set_output_raw_strided( - 1, shape, {}, options, max.has_names() ? 
max.names() : DimnameList{}); - return iter; -} - void aminmax_impl( const Tensor& self, - std::optional dim_opt, + int64_t dim_opt, bool keepdim, Tensor& min, Tensor& max) { - TensorIterator iter; - iter = meta_aminmax(self, dim_opt, keepdim, min, max); + auto dtype = self.scalar_type(); + TensorIterator iter = make_reduction( + "aminmax_xpu", min, max, self, dim_opt, keepdim, dtype); if (iter.numel() != 0) { native::xpu::aminmax_kernel(iter); } } void aminmax_allreduce_impl(const Tensor& self, Tensor& min, Tensor& max) { - TensorIterator iter; - iter = meta_aminmax(self, {}, false, min, max); + auto dtype = self.scalar_type(); + auto iter = make_reduction( + "aminmax_xpu", min, max, self, IntArrayRef{}, false, dtype); TORCH_CHECK( iter.numel() > 0, "min_max on a tensor with no elements is not defined."); native::xpu::aminmax_allreduce_kernel(iter); } -std::tuple XPUNativeFunctions::aminmax( - const Tensor& self, - std::optional dim_opt, - bool keepdim) { - Tensor min; - Tensor max; - return XPUNativeFunctions::aminmax_out(self, dim_opt, keepdim, min, max); -} - -std::tuple XPUNativeFunctions::aminmax_out( - const Tensor& self, - std::optional dim_opt, - bool keepdim, - Tensor& min, - Tensor& max) { - if (!min.defined()) { - min = native::create_reduction_result( - self, - dim_opt.has_value() ? dim_opt.value() : IntArrayRef{}, - false, - self.scalar_type()); - } - if (!max.defined()) { - max = native::create_reduction_result( - self, - dim_opt.has_value() ? dim_opt.value() : IntArrayRef{}, - false, - self.scalar_type()); - } - - TORCH_CHECK( - self.dtype() == min.dtype(), - "Expected out tensor to have dtype ", - self.dtype(), - ", but got ", - min.dtype(), - " instead"); - - TORCH_CHECK( - self.dtype() == max.dtype(), - "Expected out tensor to have dtype ", - self.dtype(), - ", but got ", - max.dtype(), - " instead"); - - if (dim_opt.has_value()) { - aminmax_impl( - self, - maybe_wrap_dim(dim_opt.value(), self.ndimension()), - keepdim, - min, - max); - } else { - aminmax_allreduce_impl(self.contiguous(), min, max); - } - return std::tuple(min, max); -} +REGISTER_XPU_DISPATCH(aminmax_stub, &aminmax_impl); +REGISTER_XPU_DISPATCH(aminmax_allreduce_stub, &aminmax_allreduce_impl) +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/ReflectionPad.cpp b/src/ATen/native/xpu/ReflectionPad.cpp index 2488ed229..a88151914 100644 --- a/src/ATen/native/xpu/ReflectionPad.cpp +++ b/src/ATen/native/xpu/ReflectionPad.cpp @@ -3,321 +3,35 @@ #include #include #include -#include -#include -namespace at { - -void reflection_pad1d_meta( - Tensor& output, - const Tensor& input, - IntArrayRef padding) { - int64_t dim_plane = 0; - int64_t dim_w = 1; - int64_t nbatch = 1; - - if (input.ndimension() == 3) { - nbatch = input.size(0); - dim_w++; - dim_plane++; - } - - at::native::padding::check_valid_input<1>(input, padding); - - /* sizes */ - auto pad_l = padding[0]; - auto pad_r = padding[1]; - - int64_t nplane = input.size(dim_plane); - int64_t input_w = input.size(dim_w); - int64_t output_w = input_w + pad_l + pad_r; - - TORCH_CHECK( - pad_l < input_w && pad_r < input_w, - "Argument #4: Padding size " - "should be less than the corresponding input dimension, but got: padding (", - pad_l, - ", ", - pad_r, - ") at dimension ", - dim_w, - " of input ", - input.sizes()); - - TORCH_CHECK( - output_w >= 1, - "input (W: ", - input_w, - ") is too small. 
Calculated output W: ", - output_w); - - if (output.defined()) { - if (input.ndimension() == 2) { - xpu::resize_out(output, {nplane, output_w}, {}, input.options()); - } else { - xpu::resize_out(output, {nbatch, nplane, output_w}, {}, input.options()); - } - } else { - if (input.ndimension() == 2) { - output = xpu::create_out({nplane, output_w}, {}, input.options()); - } else { - output = xpu::create_out({nbatch, nplane, output_w}, {}, input.options()); - } - } -} - -void reflection_pad1d_backward_meta( - Tensor& grad_input, - const Tensor& grad_output, - const Tensor& input, - IntArrayRef padding) { - int64_t dim_w = 1; - if (input.ndimension() == 3) { - dim_w++; - } - - /* sizes */ - auto pad_l = padding[0]; - auto pad_r = padding[1]; - int64_t input_w = input.size(dim_w); - int64_t output_w = input_w + pad_l + pad_r; - - TORCH_CHECK( - pad_l < input_w && pad_r < input_w, - "Argument #4: Padding size " - "should be less than the corresponding input dimension, but got: padding (", - pad_l, - ", ", - pad_r, - ") at dimension ", - dim_w, - " of input ", - input.sizes()); - - TORCH_CHECK( - output_w == grad_output.size(dim_w), - "grad_output width unexpected." - " Expected: ", - output_w, - ", Got: ", - grad_output.size(dim_w)); - - if (grad_input.defined()) { - xpu::resize_out(grad_input, input.sizes(), {}, input.options()); - } else { - grad_input = xpu::create_out(input.sizes(), {}, input.options()); - } -} - -void reflection_pad3d_meta( - Tensor& output, - const Tensor& input, - IntArrayRef padding) { - int64_t pad_left = padding[0]; - int64_t pad_right = padding[1]; - int64_t pad_top = padding[2]; - int64_t pad_bottom = padding[3]; - int64_t pad_front = padding[4]; - int64_t pad_back = padding[5]; - int64_t dim_w = 3; - int64_t dim_h = 2; - int64_t dim_d = 1; - int64_t dim_plane = 0; - - at::native::padding::check_valid_input<3>(input, padding); - - bool batch_mode = (input.dim() == 5); - if (batch_mode) { - dim_w++; - dim_h++; - dim_d++; - dim_plane++; - } - - int64_t nplane = input.size(dim_plane); - int64_t input_d = input.size(dim_d); - int64_t input_h = input.size(dim_h); - int64_t input_w = input.size(dim_w); - int64_t output_d = input_d + pad_front + pad_back; - int64_t output_h = input_h + pad_top + pad_bottom; - int64_t output_w = input_w + pad_left + pad_right; - - TORCH_CHECK( - pad_left < input_w && pad_right < input_w, - "Argument #4: Padding size " - "should be less than the corresponding input dimension, but got: padding (", - pad_left, - ", ", - pad_right, - ") at dimension ", - dim_w, - " of input ", - input.sizes()); - TORCH_CHECK( - pad_top < input_h && pad_bottom < input_h, - "Argument #6: Padding size " - "should be less than the corresponding input dimension, but got: padding (", - pad_top, - ", ", - pad_bottom, - ") at dimension ", - dim_h, - " of input ", - input.sizes()); - TORCH_CHECK( - pad_front < input_d && pad_back < input_d, - "Argument #8: Padding size " - "should be less than the corresponding input dimension, but got: padding (", - pad_front, - ", ", - pad_back, - ") at dimension ", - dim_d, - " of input ", - input.sizes()); - - TORCH_CHECK( - output_w >= 1 || output_h >= 1 || output_d >= 1, - "input (D: ", - input_d, - " H: ", - input_h, - ", W: ", - input_w, - ") is too small." 
- " Calculated output D: ", - output_d, - " H: ", - output_h, - " W: ", - output_w); - - if (output.defined()) { - if (batch_mode) { - xpu::resize_out( - output, - {input.size(0), nplane, output_d, output_h, output_w}, - {}, - input.options()); - } else { - xpu::resize_out( - output, {nplane, output_d, output_h, output_w}, {}, input.options()); - } - } else { - if (batch_mode) { - output = xpu::create_out( - {input.size(0), nplane, output_d, output_h, output_w}, - {}, - input.options()); - } else { - output = xpu::create_out( - {nplane, output_d, output_h, output_w}, {}, input.options()); - } - } -} - -void reflection_pad3d_backward_meta( - Tensor& grad_input, - const Tensor& grad_output, - const Tensor& input, - IntArrayRef padding) { - TORCH_CHECK(padding.size() == 6, "padding size is expected to be 6"); - TORCH_CHECK(input.dim() > 3); - TORCH_CHECK(grad_output.dim() == input.dim()); - - int64_t pad_left = padding[0]; - int64_t pad_right = padding[1]; - int64_t pad_top = padding[2]; - int64_t pad_bottom = padding[3]; - int64_t pad_front = padding[4]; - int64_t pad_back = padding[5]; - int64_t dim_w = 3; - int64_t dim_h = 2; - int64_t dim_d = 1; - - if (input.dim() == 5) { - // batch mode - dim_w++; - dim_h++; - dim_d++; - } - - int64_t input_d = input.size(dim_d); - int64_t input_h = input.size(dim_h); - int64_t input_w = input.size(dim_w); - int64_t output_d = input_d + pad_front + pad_back; - int64_t output_h = input_h + pad_top + pad_bottom; - int64_t output_w = input_w + pad_left + pad_right; - - TORCH_CHECK( - output_w == grad_output.size(dim_w), - "grad_output width unexpected." - " Expected: ", - output_w, - ", Got: ", - grad_output.size(dim_w)); - TORCH_CHECK( - output_h == grad_output.size(dim_h), - "grad_output height unexpected." - " Expected: ", - output_h, - ", Got: ", - grad_output.size(dim_h)); - TORCH_CHECK( - output_d == grad_output.size(dim_d), - "grad_output depth unexpected." 
- " Expected: ", - output_d, - ", Got: ", - grad_output.size(dim_d)); - - if (grad_input.defined()) { - xpu::resize_out(grad_input, input.sizes(), {}, input.options()); - } else { - grad_input = xpu::create_out(input.sizes(), {}, input.options()); - } -} - -Tensor XPUNativeFunctions::reflection_pad1d( - const Tensor& input, - IntArrayRef padding) { - Tensor output; - reflection_pad1d_meta(output, input, padding); - native::xpu::reflection_pad1d_kernel(output, input, padding); - return output; -} +#include +#include +#include +#include +#include +#include +#include +#include +#include "ATen/TensorMeta.h" -Tensor& XPUNativeFunctions::reflection_pad1d_out( - const Tensor& input, - IntArrayRef padding, - Tensor& output) { - reflection_pad1d_meta(output, input, padding); - native::xpu::reflection_pad1d_kernel(output, input, padding); - return output; -} +namespace at { +namespace native { -Tensor XPUNativeFunctions::reflection_pad1d_backward( - const Tensor& grad_output, - const Tensor& input, - IntArrayRef padding) { - Tensor grad_input; - reflection_pad1d_backward_meta(grad_input, grad_output, input, padding); - native::xpu::reflection_pad1d_backward_kernel( - grad_input, grad_output, input, padding); - return grad_input; +TORCH_IMPL_FUNC(reflection_pad1d_out_xpu) +(const Tensor& input_, IntArrayRef padding, const Tensor& output) { + xpu::reflection_pad1d_kernel(output, input_, padding); } -Tensor& XPUNativeFunctions::reflection_pad1d_backward_out( - const Tensor& grad_output, - const Tensor& input, - IntArrayRef padding, - Tensor& grad_input) { - native::xpu::reflection_pad1d_backward_kernel( +TORCH_IMPL_FUNC(reflection_pad1d_backward_out_xpu) +(const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + const Tensor& grad_input) { + xpu::reflection_pad1d_backward_kernel( grad_input, grad_output, input, padding); - return grad_input; } -Tensor& XPUNativeFunctions::reflection_pad2d_out( +Tensor& reflection_pad2d_out_xpu( const Tensor& input, IntArrayRef padding, Tensor& output) { @@ -325,15 +39,13 @@ Tensor& XPUNativeFunctions::reflection_pad2d_out( return output; } -Tensor XPUNativeFunctions::reflection_pad2d( - const Tensor& input, - IntArrayRef padding) { +Tensor reflection_pad2d_xpu(const Tensor& input, IntArrayRef padding) { auto output = at::empty({0}, input.options()); native::xpu::reflection_pad2d_kernel(output, input, padding); return output; } -Tensor& XPUNativeFunctions::reflection_pad2d_backward_out( +Tensor& reflection_pad2d_backward_out_xpu( const Tensor& grad_output, const Tensor& input, IntArrayRef padding, @@ -348,7 +60,7 @@ Tensor& XPUNativeFunctions::reflection_pad2d_backward_out( return grad_input; } -Tensor XPUNativeFunctions::reflection_pad2d_backward( +Tensor reflection_pad2d_backward_xpu( const Tensor& grad_output, const Tensor& input, IntArrayRef padding) { @@ -361,44 +73,19 @@ Tensor XPUNativeFunctions::reflection_pad2d_backward( return grad_input; } -Tensor XPUNativeFunctions::reflection_pad3d( - const Tensor& input, - IntArrayRef padding) { - Tensor output; - reflection_pad3d_meta(output, input, padding); - native::xpu::reflection_pad3d_kernel(output, input, padding); - return output; +TORCH_IMPL_FUNC(reflection_pad3d_out_xpu) +(const Tensor& input_, IntArrayRef padding, const Tensor& output) { + xpu::reflection_pad3d_kernel(output, input_, padding); } -Tensor& XPUNativeFunctions::reflection_pad3d_out( - const Tensor& input, - IntArrayRef padding, - Tensor& output) { - reflection_pad3d_meta(output, input, padding); - 
native::xpu::reflection_pad3d_kernel(output, input, padding); - return output; -} - -Tensor XPUNativeFunctions::reflection_pad3d_backward( - const Tensor& grad_output, - const Tensor& input, - at::IntArrayRef padding) { - Tensor grad_input; - reflection_pad3d_backward_meta(grad_input, grad_output, input, padding); - native::xpu::reflection_pad3d_backward_kernel( - grad_input, grad_output, input, padding); - return grad_input; -} - -Tensor& XPUNativeFunctions::reflection_pad3d_backward_out( - const Tensor& grad_output, - const Tensor& input, - IntArrayRef padding, - Tensor& grad_input) { - reflection_pad3d_backward_meta(grad_input, grad_output, input, padding); - native::xpu::reflection_pad3d_backward_kernel( +TORCH_IMPL_FUNC(reflection_pad3d_backward_out_xpu) +(const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + const Tensor& grad_input) { + xpu::reflection_pad3d_backward_kernel( grad_input, grad_output, input, padding); - return grad_input; } +} // namespace native } // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/Repeat.cpp b/src/ATen/native/xpu/Repeat.cpp index 38e5ae8da..c62348dd2 100644 --- a/src/ATen/native/xpu/Repeat.cpp +++ b/src/ATen/native/xpu/Repeat.cpp @@ -1,10 +1,13 @@ #include #include -#include + namespace at { -Tensor XPUNativeFunctions::repeat_interleave( +namespace native { +Tensor repeat_interleave_xpu( const Tensor& repeats, c10::optional output_size) { return at::native::xpu::repeat_interleave_kernel(repeats, output_size); } + +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/ReplicationPadding.cpp b/src/ATen/native/xpu/ReplicationPadding.cpp index 062d5bc1c..3f0093845 100644 --- a/src/ATen/native/xpu/ReplicationPadding.cpp +++ b/src/ATen/native/xpu/ReplicationPadding.cpp @@ -3,339 +3,82 @@ #include #include #include -#include -#include - -namespace at { - -void replication_pad1d_meta( - Tensor& output, - const Tensor& input, - IntArrayRef paddingSize) { - TORCH_CHECK(paddingSize.size() == 2, "padding size is expected to be 2"); - - int64_t dimw = 1; - int64_t dimslices = 0; - int64_t nbatch = 1; - - int64_t pad_l = paddingSize[0]; - int64_t pad_r = paddingSize[1]; - - at::native::padding::check_valid_input<1>(input, paddingSize); - - if (input.ndimension() == 3) { - nbatch = input.size(0); - dimw++; - dimslices++; - } - - /* sizes */ - int64_t nslices = input.size(dimslices); - int64_t iwidth = input.size(dimw); - int64_t owidth = iwidth + pad_l + pad_r; - - TORCH_CHECK( - owidth >= 1, - "input (W: ", - iwidth, - ") is too small." 
- " Calculated output W: ", - owidth); - - if (output.defined()) { - if (input.ndimension() == 2) { - xpu::resize_out(output, {nslices, owidth}, {}, input.options()); - } else { - xpu::resize_out(output, {nbatch, nslices, owidth}, {}, input.options()); - } - } else { - if (input.ndimension() == 2) { - output = xpu::create_out({nslices, owidth}, {}, input.options()); - } else { - output = xpu::create_out({nbatch, nslices, owidth}, {}, input.options()); - } - } -} - -void replication_pad1d_backward_meta( - Tensor& grad_input, - const Tensor& grad_output, - const Tensor& input, - IntArrayRef paddingSize) { - int64_t dimw = 1; - TORCH_CHECK(paddingSize.size() == 2, "padding size is expected to be 2"); - int64_t pad_l = paddingSize[0]; - int64_t pad_r = paddingSize[1]; - - if (input.ndimension() == 3) { - dimw++; - } - - /* sizes */ - int64_t iwidth = input.size(dimw); - int64_t owidth = iwidth + pad_l + pad_r; - - TORCH_CHECK( - owidth == grad_output.size(dimw), - "grad_output width unexpected. Expected: ", - owidth, - " Got: ", - grad_output.size(dimw)); - - if (grad_input.defined()) { - xpu::resize_out(grad_input, input.sizes(), {}, input.options()); - } else { - grad_input = xpu::create_out(input.sizes(), {}, input.options()); - } -} - -void replication_pad2d_meta( - Tensor& output, - const Tensor& input, - IntArrayRef paddingSize) { - TORCH_CHECK(paddingSize.size() == 4, "padding size is expected to be 4"); - int64_t pad_l = paddingSize[0]; - int64_t pad_r = paddingSize[1]; - int64_t pad_t = paddingSize[2]; - int64_t pad_b = paddingSize[3]; - int64_t dimw = 2; - int64_t dimh = 1; - int64_t dimslices = 0; - int64_t nbatch = 1; - - at::native::padding::check_valid_input<2>(input, paddingSize); - - if (input.dim() == 4) { - nbatch = input.size(0); - dimw++; - dimh++; - dimslices++; - } - - /* sizes */ - int64_t nslices = input.size(dimslices); - int64_t iheight = input.size(dimh); - int64_t iwidth = input.size(dimw); - int64_t oheight = iheight + pad_t + pad_b; - int64_t owidth = iwidth + pad_l + pad_r; - - TORCH_CHECK( - owidth >= 1 || oheight >= 1, - "input (H: ", - iheight, - ", W: ", - iwidth, - " ) is too small." 
- " Calculated output H: ", - oheight, - " W: ", - owidth); - - if (output.defined()) { - if (input.dim() == 3) { - xpu::resize_out(output, {nslices, oheight, owidth}, {}, input.options()); - } else { - xpu::resize_out( - output, {nbatch, nslices, oheight, owidth}, {}, input.options()); - } - } else { - if (input.dim() == 3) { - output = xpu::create_out({nslices, oheight, owidth}, {}, input.options()); - } else { - output = xpu::create_out( - {nbatch, nslices, oheight, owidth}, {}, input.options()); - } - } -} - -void replication_pad3d_meta( - Tensor& output, - const Tensor& input, - IntArrayRef paddingSize) { - TORCH_CHECK(paddingSize.size() == 6, "padding size is expected to be 6"); - int64_t pleft = paddingSize[0]; - int64_t pright = paddingSize[1]; - int64_t ptop = paddingSize[2]; - int64_t pbottom = paddingSize[3]; - int64_t pfront = paddingSize[4]; - int64_t pback = paddingSize[5]; - int64_t dimw = 3; - int64_t dimh = 2; - int64_t dimd = 1; - int64_t dimslices = 0; - int64_t nbatch = 1; - - at::native::padding::check_valid_input<3>(input, paddingSize); - - if (input.dim() == 5) { - nbatch = input.size(0); - dimw++; - dimh++; - dimd++; - dimslices++; - } - - /* sizes */ - int64_t nslices = input.size(dimslices); - int64_t idepth = input.size(dimd); - int64_t iheight = input.size(dimh); - int64_t iwidth = input.size(dimw); - int64_t odepth = idepth + pfront + pback; - int64_t oheight = iheight + ptop + pbottom; - int64_t owidth = iwidth + pleft + pright; - TORCH_CHECK( - owidth >= 1 || oheight >= 1 || odepth >= 1, - "input (D: ", - idepth, - " H: ", - iheight, - ", W: ", - iwidth, - ") is too small." - " Calculated output D: ", - odepth, - " H: ", - oheight, - " W: ", - owidth); +#include - if (output.defined()) { - if (input.dim() == 4) { - xpu::resize_out( - output, {nslices, odepth, oheight, owidth}, {}, input.options()); - } else { - xpu::resize_out( - output, - {nbatch, nslices, odepth, oheight, owidth}, - {}, - input.options()); - } - } else { - if (input.dim() == 4) { - output = xpu::create_out( - {nslices, odepth, oheight, owidth}, {}, input.options()); - } else { - output = xpu::create_out( - {nbatch, nslices, odepth, oheight, owidth}, {}, input.options()); - } - } -} +#include +#include +#include +#include +#include +#include -Tensor XPUNativeFunctions::replication_pad1d( - const Tensor& input, - IntArrayRef padding) { - Tensor output; - replication_pad1d_meta(output, input, padding); - native::xpu::replication_pad1d_kernel(output, input, padding); - return output; -} +namespace at { +namespace native { -Tensor& XPUNativeFunctions::replication_pad1d_out( - const Tensor& input, - IntArrayRef padding, - Tensor& output) { - replication_pad1d_meta(output, input, padding); - native::xpu::replication_pad1d_kernel(output, input, padding); - return output; +TORCH_IMPL_FUNC(replication_pad1d_out_xpu) +(const Tensor& input, IntArrayRef paddingSize, const Tensor& output) { + xpu::replication_pad1d_kernel(output, input, paddingSize); } -Tensor XPUNativeFunctions::replication_pad1d_backward( - const Tensor& grad_output, - const Tensor& input, - IntArrayRef padding) { - Tensor grad_input; - replication_pad1d_backward_meta(grad_input, grad_output, input, padding); - native::xpu::replication_pad1d_backward_kernel( - grad_input, grad_output, input, padding); - return grad_input; +TORCH_IMPL_FUNC(replication_pad1d_backward_out_xpu) +(const Tensor& gradOutput, + const Tensor& input, + IntArrayRef paddingSize, + const Tensor& gradInput) { + xpu::replication_pad1d_backward_kernel( + gradInput, 
gradOutput, input, paddingSize); } -Tensor& XPUNativeFunctions::replication_pad1d_backward_out( - const Tensor& grad_output, - const Tensor& input, - IntArrayRef padding, - Tensor& grad_input) { - replication_pad1d_backward_meta(grad_input, grad_output, input, padding); - native::xpu::replication_pad1d_backward_kernel( - grad_input, grad_output, input, padding); - return grad_input; -} - -Tensor& XPUNativeFunctions::replication_pad2d_out( - const Tensor& input, - IntArrayRef padding, - Tensor& output) { - replication_pad2d_meta(output, input, padding); - native::xpu::replication_pad2d_kernel(output, input, padding); - return output; -} - -Tensor XPUNativeFunctions::replication_pad2d( - const Tensor& input, - IntArrayRef padding) { - Tensor output; - replication_pad2d_meta(output, input, padding); - native::xpu::replication_pad2d_kernel(output, input, padding); - return output; +TORCH_IMPL_FUNC(replication_pad2d_out_xpu) +(const Tensor& input, IntArrayRef paddingSize, const Tensor& output) { + xpu::replication_pad2d_kernel(output, input, paddingSize); } -Tensor& XPUNativeFunctions::replication_pad2d_backward_out( +Tensor& replication_pad2d_backward_out_xpu( const Tensor& grad_output, const Tensor& input, IntArrayRef padding, Tensor& grad_input) { - native::xpu::replication_pad2d_backward_kernel( + xpu::replication_pad2d_backward_kernel( grad_input, grad_output, input, padding); return grad_input; } -Tensor XPUNativeFunctions::replication_pad2d_backward( +Tensor replication_pad2d_backward_xpu( const Tensor& grad_output, const Tensor& input, IntArrayRef padding) { auto grad_input = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - native::xpu::replication_pad2d_backward_kernel( + xpu::replication_pad2d_backward_kernel( grad_input, grad_output, input, padding); return grad_input; } -Tensor XPUNativeFunctions::replication_pad3d( - const Tensor& input, - IntArrayRef padding) { - Tensor output; - replication_pad3d_meta(output, input, padding); - native::xpu::replication_pad3d_kernel(output, input, padding); - return output; -} - -Tensor& XPUNativeFunctions::replication_pad3d_out( - const Tensor& input, - IntArrayRef padding, - Tensor& output) { - replication_pad3d_meta(output, input, padding); - native::xpu::replication_pad3d_kernel(output, input, padding); - return output; +TORCH_IMPL_FUNC(replication_pad3d_out_xpu) +(const Tensor& input, IntArrayRef paddingSize, const Tensor& output) { + xpu::replication_pad3d_kernel(output, input, paddingSize); } -Tensor XPUNativeFunctions::replication_pad3d_backward( +Tensor replication_pad3d_backward_xpu( const Tensor& grad_output, const Tensor& input, at::IntArrayRef padding) { auto grad_input = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - native::xpu::replication_pad3d_backward_kernel( + xpu::replication_pad3d_backward_kernel( grad_input, grad_output, input, padding); return grad_input; } -Tensor& XPUNativeFunctions::replication_pad3d_backward_out( +Tensor& replication_pad3d_backward_out_xpu( const Tensor& grad_output, const Tensor& input, IntArrayRef padding, Tensor& grad_input) { - native::xpu::replication_pad3d_backward_kernel( + xpu::replication_pad3d_backward_kernel( grad_input, grad_output, input, padding); return grad_input; } +} // namespace native } // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/Resize.cpp b/src/ATen/native/xpu/Resize.cpp index 719b7ea84..66c95302b 100644 --- a/src/ATen/native/xpu/Resize.cpp +++ b/src/ATen/native/xpu/Resize.cpp @@ -1,21 +1,25 @@ -#include #include #include 
#include -#include #include +#include #include -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else -#include -#endif +#include +#include +#include +#include #include namespace at { + +namespace native { +const at::Tensor& resize_( + const at::Tensor& self, + at::IntArrayRef size, + ::std::optional memory_format = ::std::nullopt); +} namespace native::xpu { const Tensor& resize_xpu_( @@ -50,7 +54,7 @@ const Tensor& resize_as_( const Tensor& self, const Tensor& the_template, c10::optional optional_memory_format = c10::nullopt) { - return resize_(self, the_template.sizes(), optional_memory_format); + return resize_xpu_(self, the_template.sizes(), optional_memory_format); } Tensor _copy_from_and_resize(const at::Tensor& self, const at::Tensor& dst) { @@ -60,17 +64,15 @@ Tensor _copy_from_and_resize(const at::Tensor& self, const at::Tensor& dst) { } else { at::native::resize_(dst, self.sizes()); } - return at::XPUNativeFunctions::copy_(const_cast(dst), self, false); + return const_cast(dst.copy_(self, false)); } -// For test infrastructure Tensor _copy_from(const Tensor& self, const Tensor& dst, bool non_blocking) { dst.resize_as_(self); - return at::XPUNativeFunctions::copy_( - const_cast(dst), self, non_blocking); + return const_cast(dst.copy_(self, non_blocking)); } -// Should not register the operator. Desc of +// Should not register the operator. Desc of resize_as_ and // _copy_from_and_resize native_function.yaml is simplistic since PyTorch // intends backend should not register it (e.g. CPU/CUDA) or handle // sanity check by backend (e.g. MPS). @@ -80,23 +82,18 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { TORCH_FN(_copy_from_and_resize)); m.impl(TORCH_SELECTIVE_NAME("aten::_copy_from"), TORCH_FN(_copy_from)); } - } // namespace native::xpu -const at::Tensor& XPUNativeFunctions::resize_( +namespace native { + +const at::Tensor& resize_xpu_( const at::Tensor& self, at::IntArrayRef size, c10::optional memory_format) { return native::xpu::resize_xpu_(self, size, memory_format); } -Tensor& XPUNativeFunctions::set_(Tensor& self, Storage source) { - int64_t new_size = - static_cast(source.nbytes() / self.dtype().itemsize()); - return self.set_(source, 0, new_size, {}); -} - -Tensor& XPUNativeFunctions::set_( +Tensor& set_storage_xpu_( Tensor& self, Storage source, int64_t storage_offset, @@ -112,16 +109,12 @@ Tensor& XPUNativeFunctions::set_( return self; } -Tensor& XPUNativeFunctions::set_(Tensor& self, const at::Tensor& source) { - return at::native::set_tensor_(self, source); -} - -Tensor& XPUNativeFunctions::set_(Tensor& result) { +Tensor& set_xpu_(Tensor& result) { caffe2::TypeMeta dtype = result.dtype(); Storage storage(Storage::use_byte_size_t(), 0, c10::GetAllocator(kXPU), true); result.set_(storage, 0, {0}, {}); TORCH_INTERNAL_ASSERT(dtype == result.dtype()); return result; } - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/SoftMax.cpp b/src/ATen/native/xpu/SoftMax.cpp index 3c469b23d..95824577a 100644 --- a/src/ATen/native/xpu/SoftMax.cpp +++ b/src/ATen/native/xpu/SoftMax.cpp @@ -1,148 +1,30 @@ -#include + #include #include #include -#include #include - -namespace at { - -Tensor& _softmax_meta( - const Tensor& input, - const int64_t dim, - const bool half_to_float, - Tensor& out) { - int64_t dim_ = maybe_wrap_dim(dim, input.dim()); - - auto output_options = - input.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT); - - if (half_to_float) { - output_options = output_options.dtype(ScalarType::Float); - } - - int64_t input_dim = input.dim() > 0 ? 
input.dim() : 1; - TORCH_CHECK( - dim_ >= 0 && dim_ < input_dim, - "dim must be non-negative and less than input dimensions"); - - if (out.defined()) { - xpu::resize_out(out, input.sizes(), {}, output_options); - } else { - out = xpu::create_out(input.sizes(), {}, output_options); - } - - return out; -} - -Tensor XPUNativeFunctions::_softmax( - const Tensor& self, - int64_t dim, - bool half_to_float) { - Tensor out; - out = _softmax_meta(self, dim, half_to_float, out); - native::xpu::_softmax_kernel(self, dim, half_to_float, out); - return out; -} - -Tensor& XPUNativeFunctions::_softmax_out( - const Tensor& self, - int64_t dim, - bool half_to_float, - Tensor& out) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, out, "xpu::_softmax_out_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::_softmax_out_out", "self"); - out = _softmax_meta(self, dim, half_to_float, out); - return native::xpu::_softmax_kernel(self, dim, half_to_float, out); +#include + +#include +#include +#include +#include +namespace at::native { + +TORCH_IMPL_FUNC(softmax_xpu_out) +(const Tensor& input, + const int64_t dim, + const bool half_to_float, + const Tensor& output) { + xpu::_softmax_kernel(input, dim, half_to_float, output); } -Tensor XPUNativeFunctions::_log_softmax( - const Tensor& self, - int64_t dim, - bool half_to_float) { - Tensor out; - out = _softmax_meta(self, dim, half_to_float, out); - native::xpu::_log_softmax_kernel(self, dim, half_to_float, out); - return out; -} - -Tensor& XPUNativeFunctions::_log_softmax_out( - const Tensor& self, - int64_t dim, - bool half_to_float, - Tensor& out) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, out, "xpu::_log_softmax_out_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::_log_softmax_out_out", "self"); - out = _softmax_meta(self, dim, half_to_float, out); - return native::xpu::_log_softmax_kernel(self, dim, half_to_float, out); -} - -Tensor& _softmax_backward_data_meta( - const Tensor& grad, - const Tensor& output, - int64_t dim, - ScalarType input_dtype, - Tensor& grad_input) { - TensorArg grad_arg{grad, "grad", 1}, output_arg{output, "output", 2}; - checkSameSize("softmax_backward", grad_arg, output_arg); - - int64_t dim_ = maybe_wrap_dim(dim, grad.dim()); - - auto grad_input_options = - grad.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT); - - bool half_to_float = grad.scalar_type() != input_dtype; - if (half_to_float) { - if (grad.scalar_type() == ScalarType::Float && - input_dtype == ScalarType::Half) { - grad_input_options = grad_input_options.dtype(ScalarType::Half); - } - } - - int64_t grad_dim = grad.dim() > 0 ? 
grad.dim() : 1; - TORCH_CHECK( - dim_ >= 0 && dim_ < grad_dim, - "dim must be non-negative and less than input dimensions"); - - if (grad_input.defined()) { - xpu::resize_out(grad_input, grad.sizes(), {}, grad_input_options); - } else { - grad_input = xpu::create_out(grad.sizes(), {}, grad_input_options); - } - - return grad_input; -} - -Tensor XPUNativeFunctions::_softmax_backward_data( - const at::Tensor& grad_output, - const at::Tensor& output, - int64_t dim, - at::ScalarType input_dtype) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, grad_output, "xpu::_softmax_backward_data", "grad_output"); - c10::impl::check_and_update_common_device( - common_device, output, "xpu::_softmax_backward_data", "output"); - Tensor grad_input; - grad_input = _softmax_backward_data_meta( - grad_output, output, dim, input_dtype, grad_input); - native::xpu::_softmax_backward_kernel( - grad_output, output, dim, false, grad_input); - return grad_input; -} - -Tensor& XPUNativeFunctions::_softmax_backward_data_out( - const at::Tensor& grad_output, - const at::Tensor& output, - int64_t dim, - at::ScalarType input_dtype, - Tensor& grad_input) { +TORCH_IMPL_FUNC(softmax_backward_xpu_out) +(const Tensor& grad, + const Tensor& output, + int64_t dim, + ScalarType input_dtype, + const Tensor& grad_input) { std::optional common_device = std::nullopt; c10::impl::check_and_update_common_device( common_device, @@ -151,44 +33,21 @@ Tensor& XPUNativeFunctions::_softmax_backward_data_out( "grad_input"); c10::impl::check_and_update_common_device( common_device, - grad_output, + output, "xpu::_softmax_backward_data_out_out", "grad_output"); c10::impl::check_and_update_common_device( common_device, output, "xpu::_softmax_backward_data_out_out", "output"); - grad_input = _softmax_backward_data_meta( - grad_output, output, dim, input_dtype, grad_input); - return native::xpu::_softmax_backward_kernel( - grad_output, output, dim, false, grad_input); -} -Tensor XPUNativeFunctions::_log_softmax_backward_data( - const at::Tensor& grad_output, - const at::Tensor& output, - int64_t dim, - at::ScalarType input_dtype) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, - grad_output, - "xpu::_log_softmax_backward_data", - "grad_output"); - c10::impl::check_and_update_common_device( - common_device, output, "xpu::_log_softmax_backward_data", "output"); - Tensor grad_input; - grad_input = _softmax_backward_data_meta( - grad_output, output, dim, input_dtype, grad_input); - native::xpu::_log_softmax_backward_kernel( - grad_output, output, dim, false, grad_input); - return grad_input; + native::xpu::_softmax_backward_kernel(grad, output, dim, false, grad_input); } -Tensor& XPUNativeFunctions::_log_softmax_backward_data_out( - const at::Tensor& grad_output, - const at::Tensor& output, - int64_t dim, - at::ScalarType input_dtype, - Tensor& grad_input) { +TORCH_IMPL_FUNC(log_softmax_backward_xpu_out) +(const Tensor& grad, + const Tensor& output, + int64_t dim, + ScalarType input_dtype, + const Tensor& grad_input) { std::optional common_device = std::nullopt; c10::impl::check_and_update_common_device( common_device, @@ -197,7 +56,7 @@ Tensor& XPUNativeFunctions::_log_softmax_backward_data_out( "grad_input"); c10::impl::check_and_update_common_device( common_device, - grad_output, + output, "xpu::_log_softmax_backward_data_out_out", "grad_output"); c10::impl::check_and_update_common_device( @@ -205,10 +64,16 @@ Tensor& 
XPUNativeFunctions::_log_softmax_backward_data_out( output, "xpu::_log_softmax_backward_data_out_out", "output"); - grad_input = _softmax_backward_data_meta( - grad_output, output, dim, input_dtype, grad_input); - return native::xpu::_log_softmax_backward_kernel( - grad_output, output, dim, false, grad_input); + native::xpu::_log_softmax_backward_kernel( + grad, output, dim, false, grad_input); +} + +TORCH_IMPL_FUNC(log_softmax_xpu_out) +(const Tensor& input, + const int64_t dim, + const bool half_to_float, + const Tensor& output) { + xpu::_log_softmax_kernel(input, dim, half_to_float, output); } -} // namespace at +} // namespace at::native diff --git a/src/ATen/native/xpu/Sorting.cpp b/src/ATen/native/xpu/Sorting.cpp index e934347c2..5e7bbc0cb 100644 --- a/src/ATen/native/xpu/Sorting.cpp +++ b/src/ATen/native/xpu/Sorting.cpp @@ -1,81 +1,21 @@ -#include + #include #include +#include #include +#include #include + #include -#include -#include #include +#include -namespace at { - -void sort_stable_meta( - const Tensor& self, - Tensor& values, - Tensor& indices, - int64_t dim) { - maybe_wrap_dim(dim, self.dim()); - - // See issue: https://github.com/pytorch/pytorch/issues/65863 - // Strides should be dense, so as not to allocate too much memory. - // We either use 'self' strides, or infer dense strides from them. - std::vector strides = (self.is_non_overlapping_and_dense()) - ? self.strides().vec() - : at::infer_dense_strides(self.sizes(), self.strides()); - auto sizes = self.sizes(); - if (values.defined()) { - at::xpu::resize_out(values, sizes, strides, self.options()); - } else { - values = at::xpu::create_out(sizes, strides, self.options()); - } - if (indices.defined()) { - at::xpu::resize_out(indices, sizes, strides, self.options().dtype(kLong)); - } else { - indices = at::xpu::create_out(sizes, strides, self.options().dtype(kLong)); - } -} - -::std::tuple XPUNativeFunctions::sort( - const Tensor& self, - ::std::optional stable, - int64_t dim, - bool descending) { - Tensor values, indices; - sort_stable_meta(self, values, indices, dim); - return native::xpu::sort_stable_kernel( - self, stable, values, indices, dim, descending); -} +#include +#include -::std::tuple XPUNativeFunctions::sort_out( - const Tensor& self, - ::std::optional stable, - int64_t dim, - bool descending, - Tensor& values, - Tensor& indices) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, values, "xpu::sort_out_values_stable", "values"); - c10::impl::check_and_update_common_device( - common_device, indices, "xpu::sort_out_values_stable", "indices"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::sort_out_values_stable", "self"); - sort_stable_meta(self, values, indices, dim); - return native::xpu::sort_stable_kernel( - self, stable, values, indices, dim, descending); -} - -Tensor XPUNativeFunctions::argsort( - const Tensor& self, - bool stable, - int64_t dim, - bool descending) { - Tensor values, indices; - sort_stable_meta(self, values, indices, dim); - return std::get<1>(native::xpu::sort_stable_kernel( - self, stable, values, indices, dim, descending)); -} +namespace at { +namespace native { +REGISTER_XPU_DISPATCH(sort_stub, xpu::sort_stable_kernel); std::tuple median_with_indices_impl( Tensor& values, @@ -161,7 +101,7 @@ Tensor median_impl(const Tensor& self, bool ignore_nan) { } } -std::tuple XPUNativeFunctions::median_out( +std::tuple median_out_xpu( const Tensor& self, int64_t dim, bool keepdim, @@ -171,11 +111,11 @@ 
std::tuple XPUNativeFunctions::median_out( values, indices, self, dim, keepdim, /*ignore_nan=*/false); } -Tensor XPUNativeFunctions::median(const Tensor& self) { +Tensor median_xpu(const Tensor& self) { return median_impl(self, /*ignore_nan=*/false); } -std::tuple XPUNativeFunctions::nanmedian_out( +std::tuple nanmedian_out_xpu( const Tensor& self, int64_t dim, bool keepdim, @@ -185,8 +125,9 @@ std::tuple XPUNativeFunctions::nanmedian_out( values, indices, self, dim, keepdim, /*ignore_nan=*/true); } -Tensor XPUNativeFunctions::nanmedian(const Tensor& self) { +Tensor nanmedian_xpu(const Tensor& self) { return median_impl(self, /*ignore_nan=*/true); } -} // namespace at +} // namespace native +} // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/SummaryOps.cpp b/src/ATen/native/xpu/SummaryOps.cpp index cf4cf3f27..87ee9f74a 100644 --- a/src/ATen/native/xpu/SummaryOps.cpp +++ b/src/ATen/native/xpu/SummaryOps.cpp @@ -1,9 +1,10 @@ #include -#include #include +#include namespace at { -Tensor XPUNativeFunctions::bincount( +namespace native { +Tensor _bincount_xpu( const Tensor& self, const c10::optional& weights_opt, int64_t minlength) { @@ -20,5 +21,6 @@ Tensor XPUNativeFunctions::bincount( return native::xpu::bincount_kernel(self, weights, minlength); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/TensorAdvancedIndexing.cpp b/src/ATen/native/xpu/TensorAdvancedIndexing.cpp index 99d50acbf..69c09804d 100644 --- a/src/ATen/native/xpu/TensorAdvancedIndexing.cpp +++ b/src/ATen/native/xpu/TensorAdvancedIndexing.cpp @@ -1,131 +1,64 @@ -#include #include #include #include #include +#include #include #include -#include -#include -#include -#include +#include +#include #include #include #include #include #include -#include -#include +#include +#include +#include -namespace at { - -using namespace at::native; -using namespace at::native::xpu; - -// TODO: Should reuse source in stock PyTorch when in-tree. - -static bool all_strides_match(TensorList tensors) { - TORCH_CHECK(!tensors.empty()); - auto strides = tensors[0].strides(); - for (auto& tensor : tensors.slice(1)) { - if (!strides.equals(tensor.strides())) { - return false; - } - } - return true; -} - -// Replace indexed dimensions in src with stride 0 and the size of the result -// tensor. The offset in these dimensions is computed by the kernel using the -// index tensor's values and the stride of src. The new shape is not meaningful. -// It's used to make the shape compatible with the result tensor. -static Tensor restride_src( - const Tensor& src, - int64_t dims_before, - int64_t dims_indexed, - IntArrayRef replacement_shape) { - auto shape = DimVector(src.sizes()); - auto strides = DimVector(src.strides()); - int64_t end = dims_before + dims_indexed; - shape.erase(shape.begin() + dims_before, shape.begin() + end); - strides.erase(strides.begin() + dims_before, strides.begin() + end); - shape.insert( - shape.begin() + dims_before, - replacement_shape.begin(), - replacement_shape.end()); - strides.insert(strides.begin() + dims_before, replacement_shape.size(), 0); - return src.as_strided(shape, strides); -} - -// Add dimensions of size 1 to an index tensor so that it can be broadcast to -// the result shape and iterated over element-wise like the result tensor and -// the restrided src. 
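[Editor's note: the comment above describes the job of the reshape_indexer helper removed just below. As a rough stand-alone sketch of that shape arithmetic, assuming plain std::vector shapes rather than tensors and a hypothetical pad_index_shape name (not the removed helper itself): prepend dims_before singleton dimensions and append dims_after singleton dimensions so the index broadcasts against the result.]

#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> pad_index_shape(const std::vector<int64_t>& index_shape,
                                     int64_t dims_before,
                                     int64_t dims_after) {
  std::vector<int64_t> shape(dims_before, 1);                       // leading 1s
  shape.insert(shape.end(), index_shape.begin(), index_shape.end()); // original index dims
  shape.insert(shape.end(), dims_after, 1);                         // trailing 1s
  return shape;
}

int main() {
  // An index of shape [5, 3] placed after 1 un-indexed dim and before 2
  // un-indexed dims becomes [1, 5, 3, 1, 1].
  for (int64_t d : pad_index_shape({5, 3}, /*dims_before=*/1, /*dims_after=*/2))
    std::cout << d << ' ';
  std::cout << '\n';
}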
-static Tensor reshape_indexer( - const Tensor& index, - int64_t dims_before, - int64_t dims_after) { - auto orig_shape = index.sizes(); - auto shape = DimVector(); - shape.append(dims_before, 1); - shape.append(orig_shape.begin(), orig_shape.end()); - shape.append(dims_after, 1); - return index.reshape(shape); -} - -native::AdvancedIndex::AdvancedIndex( - const Tensor& src, - TensorList indices_list) { - int64_t element_size_bytes = src.element_size(); - int64_t dims_before = 0, dims_after = 0, dims_indexed = 0; - IntArrayRef replacement_shape; - for (const auto dim : c10::irange(indices_list.size())) { - if (!indices_list[dim].defined()) { - if (dims_indexed == 0) { - dims_before++; - } else { - dims_after++; - } - } else { - dims_indexed++; - replacement_shape = indices_list[dim].sizes(); - indexed_sizes.push_back(src.size(dim)); - indexed_strides.push_back(src.stride(dim) * element_size_bytes); - } - } - - // Check if the indexed subspace contains a dim of size 0, but the replacement - // shape does not. This implies that an index is out of bounds, because there - // is no number that's a valid index for an empty tensor. Normally, out of - // bounds is handled in the indexing kernel, but this case fails earlier in - // restride_src with an unhelpful error message. - if (std::find(indexed_sizes.begin(), indexed_sizes.end(), 0) != - indexed_sizes.end() && - std::find(replacement_shape.begin(), replacement_shape.end(), 0) == - replacement_shape.end()) { - TORCH_CHECK_INDEX( - false, "index is out of bounds for dimension with size 0"); - } +#include +#include - this->dims_before = dims_before; - this->dims_after = dims_after; - this->src = restride_src(src, dims_before, dims_indexed, replacement_shape); - - for (auto& index : indices_list) { - if (index.defined()) { - indices.push_back(reshape_indexer(index, dims_before, dims_after)); - } - } +namespace at { - if (indices.size() >= 2 && (this->src.device().type() == kXPU)) { - if (!all_strides_match(indices)) { - for (auto& indice : indices) { - indice = indice.contiguous(); - } - } - } +namespace native { +REGISTER_XPU_DISPATCH(index_stub, &xpu::index_kernel); +REGISTER_XPU_DISPATCH(index_put_stub, &xpu::index_put_kernel); +REGISTER_XPU_DISPATCH( + index_put_with_sort_stub, + &xpu::index_put_deterministic_kernel); +// REGISTER_XPU_DISPATCH(index_stub, &xpu::index_kernel); +REGISTER_XPU_DISPATCH(scatter_stub, &xpu::scatter_kernel); +REGISTER_XPU_DISPATCH(scatter_fill_stub, &xpu::scatter_fill_kernel); +REGISTER_XPU_DISPATCH(scatter_add_stub, &xpu::scatter_add_kernel); +REGISTER_XPU_DISPATCH(scatter_reduce_stub, &xpu::scatter_reduce_kernel); +REGISTER_XPU_DISPATCH(scatter_reduce_two_stub, &xpu::scatter_reduce_two_kernel); +REGISTER_XPU_DISPATCH( + scatter_scalar_reduce_stub, + &xpu::scatter_scalar_reduce_kernel); +REGISTER_XPU_DISPATCH(gather_stub, &xpu::gather_kernel); +REGISTER_XPU_DISPATCH(index_fill_stub, &xpu::index_fill_kernel); + +TORCH_IMPL_FUNC(index_add_xpu_out) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& source, + const Scalar& alpha, + const Tensor& result) { + std::optional common_device = std::nullopt; + c10::impl::check_and_update_common_device( + common_device, self, "xpu::index_add_out", "self"); + c10::impl::check_and_update_common_device( + common_device, index, "xpu::index_add_out", "index"); + c10::impl::check_and_update_common_device( + common_device, source, "xpu::index_add_out", "source"); + dim = maybe_wrap_dim(dim, self.dim()); + // index_func_meta_impl(result, self, dim, index, 
source, "index_add"); + native::xpu::index_add_kernel(self, dim, index, source, alpha, result); } -Tensor& XPUNativeFunctions::masked_fill_( +Tensor& masked_fill__xpu( Tensor& self, const Tensor& mask, const Scalar& value) { @@ -160,12 +93,12 @@ Tensor& XPUNativeFunctions::masked_fill_( .add_const_input(*b_mask) .build(); - native::xpu::masked_fill_kernel(iter, value); + xpu::masked_fill_kernel(iter, value); namedinference::propagate_names_if_nonempty(self, maybe_outnames); return self; } -Tensor& XPUNativeFunctions::masked_fill_( +Tensor& masked_fill__xpu( Tensor& self, const Tensor& mask, const Tensor& value) { @@ -182,1278 +115,12 @@ Tensor& XPUNativeFunctions::masked_fill_( TORCH_CHECK( self.device().is_xpu(), "masked_fill_: Expected inputs to be on same device") - return XPUNativeFunctions::masked_fill_(self, mask, value.item()); -} - -void index_func_meta_impl( - Tensor& result, - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& source, - c10::string_view func) { - auto numel = index.numel(); - - TORCH_CHECK_INDEX( - index.dim() <= 1, - func, - "_(): Index is supposed to be a vector, but got dim: ", - index.dim(), - " with type: ", - index.scalar_type(), - " and size: ", - index.sizes()); - TORCH_CHECK( - index.scalar_type() == ScalarType::Long || - index.scalar_type() == ScalarType::Int, - func, - "_(): Expected dtype int32/int64 for index but got: ", - index.scalar_type()); - TORCH_CHECK( - self.scalar_type() == source.scalar_type(), - func, - "_(): self (", - self.scalar_type(), - ") and source (", - source.scalar_type(), - ") must have the same scalar type"); - TORCH_CHECK( - dim == 0 || dim < source.dim(), - func, - "_(): Indexing dim ", - dim, - " is out of bounds of the source tensor with dim ", - source.dim()); - TORCH_CHECK( - numel == (source.dim() == 0 ? 1 : source.size(dim)), - func, - "_(): Number of indices (", - numel, - ") should be equal to source.size(dim): (", - source.size(dim), - "), for dim: ", - dim); - - auto self_sizes = self.sizes().vec(); - auto source_sizes = source.sizes().vec(); - if (source.dim() != 0 && self.dim() != 0) { - self_sizes.erase(self_sizes.begin() + dim); - source_sizes.erase(source_sizes.begin() + dim); - } - TORCH_CHECK( - self_sizes == source_sizes, - "source tensor shape must match self tensor shape, excluding the specified dimension. Got self.shape = ", - self.sizes(), - " source.shape = ", - source.sizes()); - - bool is_defined = result.defined(); - - // set_output_raw_strided - auto options = self.options(); - auto sizes = self.sizes(); - if (is_defined) { - at::xpu::resize_out(result, sizes, {}, options); - } else { - result = at::xpu::create_out(sizes, {}, options); - } - - if (is_defined) { - at::assert_no_internal_overlap(result); - at::assert_no_overlap(result, index); - at::assert_no_overlap(result, source); - } - - // A hack to run TensorIterator checks in the meta function. - // See comment: - // https://github.com/pytorch/pytorch/pull/65993#discussion_r760307417 - // TODO: (@krshrimali) Try inheriting from TensorIteratorBase instead. 
- if (result.device() == kMeta && result.dim() > 0) { - auto selfSlice = result.select(dim, 0); - auto sourceSlice = source.select(dim, 0); - auto iter = - TensorIterator::borrowing_binary_op(selfSlice, selfSlice, sourceSlice); - } -} - -Tensor& XPUNativeFunctions::index_add_out( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& source, - const Scalar& alpha, - Tensor& out) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, self, "xpu::index_add_out", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::index_add_out", "index"); - c10::impl::check_and_update_common_device( - common_device, source, "xpu::index_add_out", "source"); - dim = maybe_wrap_dim(dim, self.dim()); - index_func_meta_impl(out, self, dim, index, source, "index_add"); - native::xpu::index_add_kernel(self, dim, index, source, alpha, out); - return out; -} - -Tensor& XPUNativeFunctions::index_add_( - Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& source, - const Scalar& alpha) { - return index_add_out(self, dim, index, source, alpha, self); -} - -Tensor XPUNativeFunctions::index_add( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& source, - const Scalar& alpha) { - Tensor out; - return index_add_out(self, dim, index, source, alpha, out); -} - -Tensor& XPUNativeFunctions::index_fill_( - Tensor& self, - int64_t dim, - const Tensor& index, - const Scalar& source) { - at::NoNamesGuard guard; - - TORCH_CHECK_INDEX( - index.scalar_type() == ScalarType::Long, - "index_fill_(): Expected dtype int64 for index."); - - at::assert_no_overlap(self, index); - if (at::has_internal_overlap(self) == at::MemOverlap::Yes) { - TORCH_WARN( - "Use of index_fill_ on expanded tensors is deprecated. " - "Please clone() the tensor before performing this operation. " - "This also applies to advanced indexing e.g. tensor[mask] = scalar"); - } - - if (!self.is_complex() && source.isComplex()) { - TORCH_CHECK( - false, - "index_fill_(): Converting complex Scalar to non-complex type is not supported"); - } - - TORCH_CHECK( - self.device() == index.device(), - "index_fill_(): self and index value tensors ", - "should have same device type, but got self tensor device type ", - self.device(), - " and index value ", - "tensor device type ", - index.device()); - - // Handle the case when `self` is 0-dim - Tensor self_nonzero_dim = (self.dim() == 0) ? 
self.unsqueeze(-1) : self; - dim = at::maybe_wrap_dim(dim, self_nonzero_dim); - - native::xpu::index_fill_kernel(self, dim, index, source); - return self; -} - -Tensor& XPUNativeFunctions::index_fill_( - Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& source) { - TORCH_CHECK( - source.dim() == 0, - "index_fill_ only supports a 0-dimensional value tensor, but got tensor " - "with ", - source.dim(), - " dimension(s)."); - return self.index_fill_(dim, index, source.item()); -} - -void check_indices_on_cpu_or_selfdevice( - const Tensor& self, - const c10::List>& indices) { - auto dev = self.device(); - bool indices_on_cpu_or_dev = std::all_of( - indices.begin(), indices.end(), [=](const c10::optional& opt) { - if (opt.has_value()) { - // for optional cases - if (!opt->defined()) { - return true; - } - return (opt->is_cpu() || opt->device() == dev); - } else { - return true; - } - }); - TORCH_CHECK( - indices_on_cpu_or_dev, - "indices should be either on ", - at::kCPU, - " or on the same device as the indexed tensor (", - dev, - ")"); -} - -static void build_index_op( - TensorIteratorBase& iter, - const native::AdvancedIndex& info, - Tensor& result) { - TensorIteratorConfig config; - // info.src is a restrided view of result - config.set_check_mem_overlap(false) - .check_all_same_dtype(false) - .add_output(result) - .add_input(info.src); - for (auto& index : info.indices) { - config.add_owned_const_input(index); - } - if (!result.defined()) { - config.declare_static_dtype_and_device( - info.src.scalar_type(), info.src.device()); - } - iter.build(config); -} - -Tensor& XPUNativeFunctions::index_out( - const Tensor& self, - const c10::List>& indices, - Tensor& result) { - TORCH_CHECK( - indices.size() <= (size_t)self.dim(), - "too many indices for tensor of dimension ", - self.dim(), - " (got ", - indices.size(), - ")"); - - check_indices_on_cpu_or_selfdevice(self, indices); - - if (result.defined()) { - TORCH_CHECK( - self.scalar_type() == result.scalar_type(), - "index_out: self (", - self.scalar_type(), - ") and result (", - result.scalar_type(), - ") must have the same scalar type"); - at::assert_no_internal_overlap(result); - at::assert_no_overlap(result, self); - for (const c10::optional& index : indices) { - if (index.has_value()) { - at::assert_no_overlap(result, *index); - } - } - } - auto info = native::make_info(self, std::move(indices)); - TensorIterator iter; - build_index_op(iter, info, result); - - native::xpu::index_kernel( - iter, - info.indexed_sizes, - info.indexed_strides, - IntArrayRef{}, - IntArrayRef{}); - - return result; -} - -Tensor XPUNativeFunctions::index( - const Tensor& self, - const c10::List>& indices) { - Tensor result; - TORCH_CHECK( - indices.size() <= (size_t)self.dim(), - "too many indices for tensor of dimension ", - self.dim(), - " (got ", - indices.size(), - ")"); - - check_indices_on_cpu_or_selfdevice(self, indices); - - auto info = native::make_info(self, std::move(indices)); - TensorIterator iter; - build_index_op(iter, info, result); - - native::xpu::index_kernel( - iter, - info.indexed_sizes, - info.indexed_strides, - IntArrayRef{}, - IntArrayRef{}); - - return iter.output(); -} - -// PyTorch defines it in cpp source. Copy it. 
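For orientation, the advanced-indexing entry points in this file (index, index_out, and _index_put_impl_ below) are exercised from C++ through the List<optional<Tensor>> indexing API. A standalone usage sketch, assuming only a stock libtorch build; nothing in it is specific to this patch or to the XPU backend.

#include <ATen/ATen.h>
#include <ATen/core/List.h>
#include <cstdint>
#include <vector>

int main() {
  at::Tensor t = at::zeros({4, 3});
  std::vector<int64_t> idx_data = {0, 2, 2};
  at::Tensor idx = at::tensor(idx_data); // int64 index tensor

  c10::List<c10::optional<at::Tensor>> indices;
  indices.push_back(c10::optional<at::Tensor>(idx));

  // Advanced indexing: gathers rows 0, 2, 2 -> shape [3, 3].
  at::Tensor gathered = at::index(t, indices);

  // index_put_ with accumulate=true sums contributions at duplicate indices,
  // which is why the deterministic scatter paths later in this file can be
  // routed through it.
  t.index_put_(indices, at::ones({3, 3}), /*accumulate=*/true);
  return 0;
}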
-static TensorIterator make_index_put_iterator( - const native::AdvancedIndex& info, - const Tensor& value) { - TORCH_CHECK( - is_expandable_to(value.sizes(), info.src.sizes()), - "shape mismatch: value tensor of shape ", - value.sizes(), - " cannot be broadcast to indexing result of shape ", - info.src.sizes()); - TORCH_CHECK( - value.scalar_type() == info.src.scalar_type(), - "Index put requires the source and destination dtypes match, " - "got ", - info.src.scalar_type(), - " for the destination " - "and ", - value.scalar_type(), - " for the source."); - TensorIteratorConfig config; - // info.src is restrided by restride_src with 0 strided dimensions - config.set_check_mem_overlap(false); - config.resize_outputs(false); - config.check_all_same_dtype(false); - config.add_output(info.src); - config.add_input(value); - for (auto& index : info.indices) { - config.add_input(index); - } - return config.build(); -} - -Tensor& XPUNativeFunctions::_index_put_impl_( - Tensor& self, - const torch::List>& indices, - const Tensor& value, - const bool accumulate, - const bool unsafe) { - TORCH_CHECK_INDEX( - indices.size() <= (size_t)self.dim(), - "too many indices for tensor of dimension ", - self.dim(), - " (got ", - indices.size(), - ")"); - if (at::has_internal_overlap(self) == MemOverlap::Yes) { - TORCH_WARN( - "Use of index_put_ on expanded tensors is deprecated. " - "Please clone() the tensor before performing this operation. " - "This also applies to advanced indexing e.g. tensor[indices] = tensor"); - } - if (!accumulate) { - auto masked_fill_dispatch = - native::canDispatchToMaskedFill(self, indices, value); - if (std::get<0>(masked_fill_dispatch)) { - return self.masked_fill_(std::get<1>(masked_fill_dispatch), value.item()); - } - } - auto value_ = value; - if (value.device() != self.device() && value.numel() == 1 && - value.dim() == 0) { - value_ = value.to(self.device()); - } - at::assert_no_overlap(self, value); - // NOLINTNEXTLINE(performance-implicit-conversion-in-loop) - for (const c10::optional& index : indices) { - if (index.has_value()) { - at::assert_no_overlap(self, *index); - } - } - - if (accumulate || globalContext().deterministicAlgorithms()) { - TORCH_CHECK( - value_.device() == self.device(), - "expected device ", - self.device(), - " but got device ", - value_.device(), - " for value tensor"); - native::xpu::index_put_deterministic_kernel( - self, indices, value_, accumulate, unsafe); - return self; - } - - auto info = native::make_info(self, indices); - auto iter = make_index_put_iterator(info, value_); - native::xpu::index_put_kernel( - iter, - info.indexed_sizes, - info.indexed_strides, - IntArrayRef{}, - IntArrayRef{}, - accumulate); - return self; -} - -// ============================= scatter ============================= - -static void scatter_reduce_exclude_self_helper( - const Tensor& self, - int64_t dim, - const Tensor& index, - const ReductionType& op) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - at::ScalarType::Half, - at::ScalarType::BFloat16, - at::ScalarType::Bool, - self.scalar_type(), - "scatter_reduce_exclude_input_init", - [&] { - scalar_t init_val; - switch (op) { - case ReductionType::SUM: - init_val = (scalar_t)0; - break; - case ReductionType::PROD: - init_val = (scalar_t)1; - break; - case ReductionType::MAX: - init_val = std::numeric_limits::has_infinity - ? -std::numeric_limits::infinity() - : std::numeric_limits::lowest(); - break; - case ReductionType::MIN: - init_val = std::numeric_limits::has_infinity - ? 
std::numeric_limits::infinity() - : std::numeric_limits::max(); - break; - case ReductionType::MEAN: - init_val = (scalar_t)0; - break; - } - self.scatter_(dim, index, init_val); - }); -} - -static void _scatter_via_index_put( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - const Tensor& mut_out, - bool accumulate) { - if (self.dim() == 1) { - torch::List> indices; - indices.reserve(1); - indices.push_back(index); - mut_out.index_put_(indices, src, accumulate); - } else { - Tensor mut_out_contig = mut_out.contiguous(); - - auto index_coords_sizes = index.sizes().vec(); - index_coords_sizes.push_back(self.dim()); - auto index_coords = at::empty( - index_coords_sizes, - at::TensorOptions().dtype(at::ScalarType::Long).device(self.device())); - - for (int64_t dim_other = 0; dim_other < self.dim(); dim_other++) { - if (dim_other == dim) { - continue; - } - auto dim_coord_vals = at::arange( - index.size(dim_other), at::TensorOptions().device(self.device())); - - for (int64_t dim_unsqueeze = 0; dim_unsqueeze < self.dim() - 1; - dim_unsqueeze++) { - dim_coord_vals = - dim_coord_vals.unsqueeze((dim_unsqueeze >= dim_other) ? -1 : 0); - } - - auto view_sizes = index.sizes().vec(); - view_sizes.push_back(1); - auto view_strides = index_coords.strides().vec(); - view_strides[self.dim()] = self.dim(); - - at::as_strided(index_coords, view_sizes, view_strides, dim_other) - .copy_(dim_coord_vals.unsqueeze(-1)); - } - - auto view_sizes = index.sizes().vec(); - view_sizes.push_back(1); - auto view_strides = index_coords.strides().vec(); - view_strides[self.dim()] = self.dim(); - - at::as_strided(index_coords, view_sizes, view_strides, dim) - .copy_(index.unsqueeze(-1)); - - Tensor index_coords_flat = index_coords.flatten(0, -2); - - // Copy mut_out_contig's strides into a tensor - // TODO: Is there a utility function that already does this? 
- IntArrayRef mut_out_contig_strides = mut_out_contig.strides(); - Tensor coord_strides = at::empty( - {mut_out_contig.dim()}, - TensorOptions().dtype(at::ScalarType::Long).device(at::kCPU)); - std::memcpy( - coord_strides.mutable_data_ptr(), - mut_out_contig_strides.data(), - coord_strides.nbytes()); - coord_strides = coord_strides.to(mut_out_contig.device()); - - // `index_flat` contains the 1-D indices corresponding with the - // flattened `mut_out` - Tensor index_flat = (index_coords_flat * coord_strides).sum({-1}); - Tensor mut_out_flat = mut_out_contig.flatten(); - Tensor src_flat = - at::as_strided(src, index.sizes(), src.strides()).flatten(); - - torch::List> indices; - indices.reserve(1); - indices.push_back(index_flat); - - mut_out_flat.index_put_(indices, src_flat, accumulate); - - if (!mut_out.is_contiguous()) { - mut_out.copy_(mut_out_flat.reshape(mut_out.sizes())); - } - } -} - -template < - bool use_new_options = false, - typename T, - typename ReduceStub, - typename FillStub> -void scatter_impl( - const Tensor& self, - int64_t dim, - const Tensor& index, - const T& src, - const Tensor& out, - ReduceStub& reduce_stub, - FillStub& fill_stub, - const c10::optional reduce = nullopt, - bool reduce_includes_self = true) { - dim = at::maybe_wrap_dim(dim, self.dim()); - auto mut_out = const_cast(out); - - if (!self.is_same(mut_out)) { - mut_out.copy_(self); - } - - if (index.numel() == 0) - return; - - auto op = ReductionType::SUM; - bool deterministic = globalContext().deterministicAlgorithms() && - self.device().type() == DeviceType::XPU; - - if (reduce.has_value()) { - op = get_operator_enum(reduce.value(), use_new_options); - if (!reduce_includes_self) { - // scatter inits for reduction to appropriate indices (used by - // scatter_reduce.two) - scatter_reduce_exclude_self_helper(mut_out, dim, index, op); - } - // _scatter_via_index_put can only handle sum and mean reduction type - deterministic = deterministic && - (op == ReductionType::SUM || op == ReductionType::MEAN); - } - - // Scalar src should already be deterministic - if (deterministic && std::is_same_v) { - // both runtime and compile check are required - if constexpr (std::is_same_v) { - bool accumulate = reduce.has_value(); - _scatter_via_index_put(self, dim, index, src, mut_out, accumulate); - return; - } - } - - if (reduce.has_value()) { - reduce_stub(mut_out, dim, index, src, op); - } else { - fill_stub(mut_out, dim, index, src); - } -} - -template -Tensor& scatter_meta_impl( - Tensor& output, - const Tensor& self, - int64_t dim, - const Tensor& index, - const c10::optional& src = nullopt, - const c10::optional reduce = nullopt) { - int64_t wrapped_dim = at::maybe_wrap_dim(dim, self.dim()); - at::native::scatter_gather_dtype_check("scatter", self, index, src); - at::native::scatter_shape_check(self, wrapped_dim, index, src); - - if (output.defined()) { - at::assert_no_internal_overlap(output); - at::assert_no_overlap(output, index); - if (src.has_value()) { - at::assert_no_overlap(output, src.value()); - } - } - - if (output.defined()) { - at::xpu::resize_out(output, self.sizes(), {}, self.options()); - } else { - output = at::xpu::create_out(self.sizes(), {}, self.options()); - } - - if (reduce.has_value()) { - // Check if we have a valid reduce operator. 
- at::native::get_operator_enum(reduce.value(), use_new_options); - } - - return output; -} - -Tensor& scatter_src_meta( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - Tensor& out) { - return scatter_meta_impl(out, self, dim, index, src); -} - -Tensor& scatter_value_meta( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Scalar& value, - Tensor& out) { - return scatter_meta_impl(out, self, dim, index); -} - -Tensor& scatter_reduce_meta( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - const c10::string_view reduce, - Tensor& out) { - TORCH_WARN_ONCE( - "The reduce argument of torch.scatter with Tensor src is deprecated and will be removed ", - "in a future PyTorch release. Use torch.scatter_reduce instead for more reduction options."); - return scatter_meta_impl(out, self, dim, index, src, reduce); -} - -Tensor& scatter_value_reduce_meta( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Scalar& src, - const c10::string_view reduce, - Tensor& out) { - return scatter_meta_impl(out, self, dim, index, nullopt, reduce); -} - -Tensor& scatter_add_meta( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - Tensor& out) { - return scatter_meta_impl(out, self, dim, index, src, "add"); -} - -Tensor& scatter_reduce_two_meta( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - const c10::string_view reduce, - bool include_self, - Tensor& out) { - (void)include_self; - return scatter_meta_impl( - out, self, dim, index, src, reduce); -} - -Tensor XPUNativeFunctions::scatter( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_src", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_src", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_src", "src"); - Tensor out; - out = scatter_src_meta(self, dim, index, src, out); - scatter_impl( - self, dim, index, src, out, scatter_reduce_kernel, scatter_kernel); - return out; -} - -Tensor& XPUNativeFunctions::scatter_out( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - Tensor& out) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, out, "xpu::scatter_out_src_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_out_src_out", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_out_src_out", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_out_src_out", "src"); - out = scatter_src_meta(self, dim, index, src, out); - scatter_impl( - self, dim, index, src, out, scatter_reduce_kernel, scatter_kernel); - return out; -} - -Tensor& XPUNativeFunctions::scatter_( - Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter__src", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter__src", "index"); - 
c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter__src", "src"); - self = scatter_src_meta(self, dim, index, src, self); - scatter_impl( - self, dim, index, src, self, scatter_reduce_kernel, scatter_kernel); - return self; -} - -Tensor XPUNativeFunctions::scatter( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Scalar& value) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_value", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_value", "index"); - Tensor out; - out = scatter_value_meta(self, dim, index, value, out); - scatter_impl( - self, - dim, - index, - value, - out, - scatter_scalar_reduce_kernel, - scatter_fill_kernel); - return out; -} - -Tensor& XPUNativeFunctions::scatter_out( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Scalar& value, - Tensor& out) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, out, "xpu::scatter_out_value_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_out_value_out", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_out_value_out", "index"); - out = scatter_value_meta(self, dim, index, value, out); - scatter_impl( - self, - dim, - index, - value, - out, - scatter_scalar_reduce_kernel, - scatter_fill_kernel); - return out; -} - -Tensor& XPUNativeFunctions::scatter_( - Tensor& self, - int64_t dim, - const Tensor& index, - const Scalar& value) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter__value", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter__value", "index"); - self = scatter_value_meta(self, dim, index, value, self); - scatter_impl( - self, - dim, - index, - value, - self, - scatter_scalar_reduce_kernel, - scatter_fill_kernel); - return self; -} - -Tensor XPUNativeFunctions::scatter( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - c10::string_view reduce) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_reduce", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_reduce", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_reduce", "src"); - Tensor out; - out = scatter_reduce_meta(self, dim, index, src, reduce, out); - scatter_impl( - self, - dim, - index, - src, - out, - scatter_reduce_kernel, - scatter_kernel, - reduce); - return out; -} - -Tensor& XPUNativeFunctions::scatter_out( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - c10::string_view reduce, - Tensor& out) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, out, "xpu::scatter_out_reduce_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_out_reduce_out", "self"); - c10::impl::check_and_update_common_device( - 
common_device, index, "xpu::scatter_out_reduce_out", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_out_reduce_out", "src"); - out = scatter_reduce_meta(self, dim, index, src, reduce, out); - scatter_impl( - self, - dim, - index, - src, - out, - scatter_reduce_kernel, - scatter_kernel, - reduce); - return out; -} - -Tensor& XPUNativeFunctions::scatter_( - Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - c10::string_view reduce) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter__reduce", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter__reduce", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter__reduce", "src"); - self = scatter_reduce_meta(self, dim, index, src, reduce, self); - scatter_impl( - self, - dim, - index, - src, - self, - scatter_reduce_kernel, - scatter_kernel, - reduce); - return self; -} - -Tensor XPUNativeFunctions::scatter( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Scalar& value, - c10::string_view reduce) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_value_reduce", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_value_reduce", "index"); - Tensor out; - out = scatter_value_reduce_meta(self, dim, index, value, reduce, out); - scatter_impl( - self, - dim, - index, - value, - out, - scatter_scalar_reduce_kernel, - scatter_fill_kernel, - reduce); - return out; -} - -Tensor& XPUNativeFunctions::scatter_out( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Scalar& value, - c10::string_view reduce, - Tensor& out) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, out, "xpu::scatter_out_value_reduce_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_out_value_reduce_out", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_out_value_reduce_out", "index"); - out = scatter_value_reduce_meta(self, dim, index, value, reduce, out); - scatter_impl( - self, - dim, - index, - value, - out, - scatter_scalar_reduce_kernel, - scatter_fill_kernel, - reduce); - return out; -} - -Tensor& XPUNativeFunctions::scatter_( - Tensor& self, - int64_t dim, - const Tensor& index, - const Scalar& value, - c10::string_view reduce) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter__value_reduce", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter__value_reduce", "index"); - self = scatter_value_reduce_meta(self, dim, index, value, reduce, self); - scatter_impl( - self, - dim, - index, - value, - self, - scatter_scalar_reduce_kernel, - scatter_fill_kernel, - reduce); - return self; -} - -Tensor& scatter_add_impl( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - Tensor& out) { - auto mut_out = const_cast(out); - dim = maybe_wrap_dim(dim, self.dim()); - - if (!self.is_same(mut_out)) { - 
mut_out.copy_(self); - } - - if (index.numel() == 0) - return out; - - // See Note [Enabling Deterministic Operations] - // Avoid gpuAtomicAdd for XPU if deterministic mode is turned on - if (globalContext().deterministicAlgorithms() && - self.device().type() == DeviceType::XPU) { - _scatter_via_index_put(self, dim, index, src, mut_out, /*accumulate*/ true); - } else { - // TODO: enable fast paths for GNN usage (scatter_add_expanded_index_kernel) - scatter_add_kernel(mut_out, dim, index, src); - } - return out; -} - -Tensor XPUNativeFunctions::scatter_add( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_add", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_add", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_add", "src"); - Tensor out; - out = scatter_add_meta(self, dim, index, src, out); - out = scatter_add_impl(self, dim, index, src, out); - return out; -} - -Tensor& XPUNativeFunctions::scatter_add_out( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - Tensor& out) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, out, "xpu::scatter_add_out_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_add_out_out", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_add_out_out", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_add_out_out", "src"); - out = scatter_add_meta(self, dim, index, src, out); - out = scatter_add_impl(self, dim, index, src, out); - return out; -} - -Tensor& XPUNativeFunctions::scatter_add_( - Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_add_", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_add_", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_add_", "src"); - self = scatter_add_meta(self, dim, index, src, self); - self = scatter_add_impl(self, dim, index, src, self); - return self; -} - -Tensor& scatter_reduce_two_impl( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - const c10::string_view reduce, - bool include_self, - Tensor& out) { - dim = at::maybe_wrap_dim(dim, self.dim()); - - if (!self.is_same(out)) { - out.copy_(self); - } - - const auto op = get_operator_enum(reduce, true); - - // TODO: enable scatter_reduce_expanded_index_kernel - - scatter_impl( - self, - dim, - index, - src, - out, - scatter_reduce_two_kernel, - scatter_kernel, - reduce, - include_self); - - if (op == ReductionType::MEAN) { - auto ones = at::ones_like(src); - auto count = include_self ? 
at::ones_like(out) : at::zeros_like(out); - count.scatter_add_(dim, index, ones); - count.masked_fill_(count == 0, 1); - - if (out.is_floating_point() || out.is_complex()) { - out.div_(count); - } else { - out.div_(count, "floor"); - } - } - - return out; -} - -Tensor XPUNativeFunctions::scatter_reduce( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - c10::string_view reduce, - bool include_self) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_reduce_two", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_reduce_two", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_reduce_two", "src"); - Tensor out; - out = - scatter_reduce_two_meta(self, dim, index, src, reduce, include_self, out); - out = - scatter_reduce_two_impl(self, dim, index, src, reduce, include_self, out); - return out; -} - -Tensor& XPUNativeFunctions::scatter_reduce_out( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - c10::string_view reduce, - bool include_self, - Tensor& out) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, out, "xpu::scatter_reduce_out_two_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_reduce_out_two_out", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_reduce_out_two_out", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_reduce_out_two_out", "src"); - out = - scatter_reduce_two_meta(self, dim, index, src, reduce, include_self, out); - out = - scatter_reduce_two_impl(self, dim, index, src, reduce, include_self, out); - return out; -} - -Tensor& XPUNativeFunctions::scatter_reduce_( - Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - c10::string_view reduce, - bool include_self) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_reduce__two", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_reduce__two", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_reduce__two", "src"); - self = scatter_reduce_two_meta( - self, dim, index, src, reduce, include_self, self); - self = scatter_reduce_two_impl( - self, dim, index, src, reduce, include_self, self); - return self; -} - -// ============================= gather ============================= - -Tensor& gather_meta( - const Tensor& self, - int64_t dim, - const Tensor& index, - bool sparse_grad, - Tensor& result) { - int64_t wrapped_dim = at::maybe_wrap_dim(dim, self.dim()); - - // Memory overlap checks need to be done after resizing (if required) is done. - // But it only makes sense to do these checks when result was defined, hence - // the boolean variable `check_result` here. 
- // For more details, see: - // https://github.com/pytorch/pytorch/pull/63312#discussion_r694794832 and - // https://github.com/pytorch/pytorch/issues/63837 - bool check_result = result.defined(); - - if (result.defined()) { - at::xpu::resize_out(result, index.sizes(), {}, self.options()); - } else { - result = at::xpu::create_out(index.sizes(), {}, self.options()); - } - - if (check_result) { - at::assert_no_internal_overlap(result); - at::assert_no_overlap(result, self); - at::assert_no_partial_overlap(result, index); - } - - auto is_index_empty = index.numel() == 0; - if (!is_index_empty) { - TORCH_CHECK( - index.scalar_type() == at::ScalarType::Long, - "gather", - "(): Expected dtype int64 for index"); - } - if (is_index_empty) - return result; - at::native::gather_shape_check(self, wrapped_dim, index); - - return result; -} - -Tensor XPUNativeFunctions::gather( - const Tensor& self, - int64_t dim, - const Tensor& index, - bool sparse_grad) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::gather", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::gather", "index"); - Tensor out; - out = gather_meta(self, dim, index, sparse_grad, out); - - if (index.numel() == 0) - return out; - dim = at::maybe_wrap_dim(dim, self.dim()); - // TODO: enable gather_expanded_index_kernel - gather_kernel(out, self, dim, index); - return out; -} - -Tensor& XPUNativeFunctions::gather_out( - const Tensor& self, - int64_t dim, - const Tensor& index, - bool sparse_grad, - Tensor& out) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, out, "xpu::gather_out_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::gather_out_out", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::gather_out_out", "index"); - out = gather_meta(self, dim, index, sparse_grad, out); - - if (index.numel() == 0) - return out; - dim = at::maybe_wrap_dim(dim, self.dim()); - // TODO: enable gather_expanded_index_kernel - gather_kernel(out, self, dim, index); - return out; + return masked_fill__xpu(self, mask, value.item()); } -Tensor XPUNativeFunctions::count_nonzero(const Tensor& self, IntArrayRef dims) { +Tensor count_nonzero_xpu(const Tensor& self, IntArrayRef dims) { return (self != 0).sum(dims); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/TensorCompare.cpp b/src/ATen/native/xpu/TensorCompare.cpp index 3331c31a2..ef6798c14 100644 --- a/src/ATen/native/xpu/TensorCompare.cpp +++ b/src/ATen/native/xpu/TensorCompare.cpp @@ -5,492 +5,17 @@ #include #include #include -#include - -#include #include #include #include #include -namespace at { - -template -Device out_device(Args&... inps) { - for (const auto& i : {inps...}) { - if (i.device() != at::kCPU) { - return i.device(); - } - } - return at::kCPU; -} - -Tensor& where_self_out( - const Tensor& condition, - const Tensor& self, - const Tensor& other, - Tensor& out) { - const auto result_type = at::native::result_type(self, other); - TORCH_CHECK( - out.scalar_type() == result_type, - "Expected out type to be ", - result_type, - " but got ", - out.scalar_type()); - - auto self_ = self.scalar_type() != result_type ? self.to(result_type) : self; - auto other_ = - other.scalar_type() != result_type ? 
other.to(result_type) : other; - auto condition_ = condition; - auto device = out_device(condition, self_, other_); - if (device != at::kCPU) { // allow CPU scalars on non-cpu device - if (condition.device() != device && condition.ndimension() == 0) { - condition_ = condition.to(device); - } - if (self_.device() != device && self_.ndimension() == 0) { - self_ = self_.to(device); - } - if (other_.device() != device && other_.ndimension() == 0) { - other_ = other_.to(device); - } - } - if (condition_.scalar_type() == ScalarType::Byte) { - TORCH_WARN_ONCE( - "where received a uint8 condition tensor. This behavior is deprecated and will be removed in a future version of PyTorch. Use a boolean condition instead."); - condition_ = condition_.to(kBool); - } - TORCH_CHECK( - condition_.scalar_type() == kBool, - "where expected condition to be a boolean tensor, but got a tensor with dtype ", - condition_.scalar_type()); - // if there's still a device mismatch, let tensoriterator error out with it - auto iter = at::TensorIteratorConfig() - .check_all_same_dtype(false) - .add_output(out) - .add_const_input(condition_) - .add_const_input(self_) - .add_const_input(other_) - .build(); - native::xpu::where_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::where_out( - const Tensor& condition, - const Tensor& self, - const Tensor& other, - Tensor& out) { - return where_self_out(condition, self, other, out); -} - -Tensor XPUNativeFunctions::where( - const Tensor& condition, - const Tensor& self, - const Tensor& other) { - auto device = out_device(condition, self, other); - auto result_type = at::native::result_type(self, other); - Tensor ret = at::empty({0}, self.options().dtype(result_type).device(device)); - where_self_out(condition, self, other, ret); - return ret; -} - -TensorIterator clamp_meta( - const Tensor& self, - const OptionalScalarRef min, - const OptionalScalarRef max, - Tensor& result) { - TensorIterator iter; - if (!min && !max) { - TORCH_CHECK( - false, "torch.clamp: At least one of 'min' or 'max' must not be None"); - } - // Manual type promotion, since scalars have to participate in it - ScalarType result_type = self.scalar_type(); - TORCH_CHECK( - !isComplexType(result_type), "clamp is not supported for complex types"); - // Floating is the highest supported - if (!isFloatingType(result_type)) { - at::native::ResultTypeState state = {}; - state = at::native::update_result_type_state(self, state); - - if (min) { - state = at::native::update_result_type_state(min.get(), state); - } - if (max) { - state = at::native::update_result_type_state(max.get(), state); - } - result_type = at::native::result_type(state); - // disallow type promoting inplace op - TORCH_CHECK( - (result_type == self.scalar_type()) || - (!(result.defined()) || !(result.is_same(self))), - "result type ", - result_type, - " can't be cast to the desired output type ", - self.dtype()); - } - // make sure scalars weren't complex - TORCH_CHECK( - !isComplexType(result_type), "clamp is not supported for complex types"); - iter.build_unary_op(result, self.to(result_type)); - return iter; -} - -Tensor& clamp_out_impl( - const Tensor& self, - TensorIteratorBase& iter, - const OptionalScalarRef min, - const OptionalScalarRef max, - Tensor& result) { - using at::native::detail::ClampLimits; - if (min && max) { - if (min.get().toDouble() != min.get().toDouble() || - max.get().toDouble() != max.get().toDouble()) { - at::fill_( - const_cast(result), - std::numeric_limits::quiet_NaN()); - } else { - 
native::xpu::clamp_scalar_kernel(iter, min.get(), max.get()); - } - } else if (max) { - native::xpu::clamp_max_scalar_kernel(iter, max.get()); - } else if (min) { - native::xpu::clamp_min_scalar_kernel(iter, min.get()); - } - return result; -} - -Tensor XPUNativeFunctions::clamp( - const Tensor& self, - const ::std::optional& min, - const ::std::optional& max) { - auto min_ = - (min.has_value() ? at::OptionalScalarRef(&(min.value())) - : at::OptionalScalarRef()); - auto max_ = - (max.has_value() ? at::OptionalScalarRef(&(max.value())) - : at::OptionalScalarRef()); - Tensor result; - auto iter = clamp_meta(self, min_, max_, result); - result = clamp_out_impl(self, iter, min_, max_, result); - return iter.output(); -} - -Tensor& XPUNativeFunctions::clamp_out( - const Tensor& self, - const ::std::optional& min, - const ::std::optional& max, - Tensor& result) { - auto min_ = - (min.has_value() ? at::OptionalScalarRef(&(min.value())) - : at::OptionalScalarRef()); - auto max_ = - (max.has_value() ? at::OptionalScalarRef(&(max.value())) - : at::OptionalScalarRef()); - auto iter = clamp_meta(self, min_, max_, result); - result = clamp_out_impl(self, iter, min_, max_, result); - return result; -} - -Tensor& XPUNativeFunctions::clamp_( - Tensor& self, - const ::std::optional& min, - const ::std::optional& max) { - auto min_ = - (min.has_value() ? at::OptionalScalarRef(&(min.value())) - : at::OptionalScalarRef()); - auto max_ = - (max.has_value() ? at::OptionalScalarRef(&(max.value())) - : at::OptionalScalarRef()); - auto iter = clamp_meta(self, min_, max_, self); - self = clamp_out_impl(self, iter, min_, max_, self); - return self; -} - -TensorIterator clamp_tensor_meta( - const Tensor& self, - const OptionalTensorRef min, - const OptionalTensorRef max, - Tensor& result) { - TensorIterator iter; - TORCH_CHECK( - min || max, - "torch.clamp: At least one of 'min' or 'max' must not be None"); - TORCH_CHECK( - !isComplexType(self.scalar_type()), - "clamp is not supported for complex types"); -#define CLAMP_CONFIG() \ - TensorIteratorConfig() \ - .set_check_mem_overlap(true) \ - .add_output(result) \ - .add_const_input(self) \ - .promote_inputs_to_common_dtype(true) \ - .cast_common_dtype_to_outputs(true) \ - .enforce_safe_casting_to_output(true) - - if (min && max) { - iter.build(CLAMP_CONFIG().add_const_input(*min).add_const_input(*max)); - } else if (min) { - iter.build(CLAMP_CONFIG().add_const_input(*min)); - } else if (max) { - iter.build(CLAMP_CONFIG().add_const_input(*max)); - } - return iter; -} - -Tensor& clamp_tensor_out_impl( - const Tensor& self, - TensorIteratorBase& iter, - const OptionalTensorRef min, - const OptionalTensorRef max, - Tensor& result) { - if (min && max) { - native::xpu::clamp_kernel(iter); - } else if (min) { - native::xpu::maximum_kernel(iter); - } else if (max) { - native::xpu::minimum_kernel(iter); - } - return result; -} - -Tensor XPUNativeFunctions::clamp( - const Tensor& self, - const ::std::optional& min, - const ::std::optional& max) { - auto min_ = - ((min.has_value() && (*min).defined()) ? at::OptionalTensorRef(*min) - : at::OptionalTensorRef()); - auto max_ = - ((max.has_value() && (*max).defined()) ? 
at::OptionalTensorRef(*max) - : at::OptionalTensorRef()); - Tensor result; - auto iter = clamp_tensor_meta(self, min_, max_, result); - result = clamp_tensor_out_impl(self, iter, min_, max_, result); - return iter.output(); -} - -Tensor& XPUNativeFunctions::clamp_out( - const Tensor& self, - const ::std::optional& min, - const ::std::optional& max, - Tensor& result) { - auto min_ = - ((min.has_value() && (*min).defined()) ? at::OptionalTensorRef(*min) - : at::OptionalTensorRef()); - auto max_ = - ((max.has_value() && (*max).defined()) ? at::OptionalTensorRef(*max) - : at::OptionalTensorRef()); - auto iter = clamp_tensor_meta(self, min_, max_, result); - result = clamp_tensor_out_impl(self, iter, min_, max_, result); - return result; -} - -Tensor& XPUNativeFunctions::clamp_( - Tensor& self, - const ::std::optional& min, - const ::std::optional& max) { - auto min_ = - ((min.has_value() && (*min).defined()) ? at::OptionalTensorRef(*min) - : at::OptionalTensorRef()); - auto max_ = - ((max.has_value() && (*max).defined()) ? at::OptionalTensorRef(*max) - : at::OptionalTensorRef()); - auto iter = clamp_tensor_meta(self, min_, max_, self); - self = clamp_tensor_out_impl(self, iter, min_, max_, self); - return self; -} - -TensorIterator clamp_max_meta( - const Tensor& self, - const Scalar& max, - Tensor& result) { - TensorIterator iter; - // we could wrap max into tensor and send to tensor overload, - // but relu is implemented via clamp_min, so for perf an uniformity reasons - // do a faster but correct thing - ScalarType result_type = self.scalar_type(); - TORCH_CHECK( - !isComplexType(result_type), "clamp is not supported for complex types"); - TORCH_CHECK(!max.isComplex(), "clamp is not supported for complex types"); - // Floating is the highest supported - if (!isFloatingType(result_type)) { - auto result_type = at::native::result_type(self, max); - TORCH_CHECK( - (result_type == self.scalar_type()) || - (!(result.defined()) || !(result.is_same(self))), - "result type ", - result_type, - " can't be cast to the desired output type ", - self.dtype()); - iter.build_unary_op(result, self.to(result_type)); - } else { - iter.build_borrowing_unary_op(result, self); - } - return iter; -} - -Tensor& clamp_max_out_impl( - const Tensor& self, - TensorIteratorBase& iter, - const Scalar& max, - Tensor& result) { - if (max.toDouble() != max.toDouble()) { - // TODO this is not great, building TI again is expensive, but I can't use - // fill_stub because fill is not structured - // this is a corner case anyway - at::fill_(const_cast(result), native::wrapped_scalar_tensor(max)); - } else { - native::xpu::clamp_max_scalar_kernel(iter, max); - } - return result; -} - -Tensor XPUNativeFunctions::clamp_max(const Tensor& self, const Scalar& max) { - Tensor result; - auto iter = clamp_max_meta(self, max, result); - result = clamp_max_out_impl(self, iter, max, result); - return iter.output(); -} - -Tensor& XPUNativeFunctions::clamp_max_out( - const Tensor& self, - const Scalar& max, - Tensor& result) { - auto iter = clamp_max_meta(self, max, result); - result = clamp_max_out_impl(self, iter, max, result); - return result; -} - -Tensor& XPUNativeFunctions::clamp_max_(Tensor& self, const Scalar& max) { - auto iter = clamp_max_meta(self, max, self); - self = clamp_max_out_impl(self, iter, max, self); - return self; -} - -TensorIterator clamp_max_tensor_meta( - const Tensor& self, - const Tensor& max, - Tensor& result) { - TensorIterator iter; - iter.build_borrowing_binary_op(result, self, max); - return iter; -} - 
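A side note on the scalar clamp helpers above: clamp_out_impl and clamp_max_out_impl detect a NaN bound with the self-inequality check (max.toDouble() != max.toDouble()) and fill the output directly instead of invoking the kernel. A standalone usage sketch of that observable behavior follows; it is backend-independent and assumes only a stock libtorch build.

#include <ATen/ATen.h>
#include <cmath>

int main() {
  at::Tensor x = at::arange(5, at::kFloat); // [0, 1, 2, 3, 4]

  // Ordinary scalar clamp: both bounds applied element-wise -> [1, 1, 2, 3, 3].
  at::Tensor a = at::clamp(x, /*min=*/1.0, /*max=*/3.0);

  // A NaN bound short-circuits to a fill, so every element becomes NaN.
  at::Tensor b = at::clamp_max(x, std::nan(""));
  return 0;
}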
-Tensor XPUNativeFunctions::clamp_max(const Tensor& self, const Tensor& max) { - Tensor result; - auto iter = clamp_max_tensor_meta(self, max, result); - native::xpu::minimum_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::clamp_max_out( - const Tensor& self, - const Tensor& max, - Tensor& result) { - auto iter = clamp_max_tensor_meta(self, max, result); - native::xpu::minimum_kernel(iter); - return result; -} - -Tensor& XPUNativeFunctions::clamp_max_(Tensor& self, const Tensor& max) { - auto iter = clamp_max_tensor_meta(self, max, self); - native::xpu::minimum_kernel(iter); - return self; -} - -TensorIterator clamp_min_meta( - const Tensor& self, - const Scalar& min, - Tensor& result) { - TensorIterator iter; - ScalarType result_type = self.scalar_type(); - TORCH_CHECK( - !isComplexType(result_type), "clamp is not supported for complex types"); - TORCH_CHECK(!min.isComplex(), "clamp is not supported for complex types"); - // Floating is the highest supported - if (!isFloatingType(result_type)) { - auto result_type = at::native::result_type(self, min); - TORCH_CHECK( - (result_type == self.scalar_type() || !(result.defined()) || - !(result.is_same(self))), - "result type ", - result_type, - " can't be cast to the desired output type ", - self.dtype()); - iter.build_unary_op(result, self.to(result_type)); - } else { - iter.build_borrowing_unary_op(result, self); - } - return iter; -} - -Tensor& clamp_min_out_impl( - const Tensor& self, - TensorIteratorBase& iter, - const Scalar& min, - Tensor& result) { - if (min.toDouble() != min.toDouble()) { - at::fill_(const_cast(result), min); - } else { - native::xpu::clamp_min_scalar_kernel(iter, min); - } - return result; -} - -Tensor XPUNativeFunctions::clamp_min(const Tensor& self, const Scalar& min) { - Tensor result; - auto iter = clamp_min_meta(self, min, result); - result = clamp_min_out_impl(self, iter, min, result); - return iter.output(); -} - -Tensor& XPUNativeFunctions::clamp_min_out( - const Tensor& self, - const Scalar& min, - Tensor& result) { - auto iter = clamp_min_meta(self, min, result); - result = clamp_min_out_impl(self, iter, min, result); - return result; -} - -Tensor& XPUNativeFunctions::clamp_min_(Tensor& self, const Scalar& min) { - auto iter = clamp_min_meta(self, min, self); - self = clamp_min_out_impl(self, iter, min, self); - return self; -} +#include -TensorIterator clamp_min_tensor_meta( - const Tensor& self, - const Tensor& min, - Tensor& result) { - TensorIterator iter; - iter.build_borrowing_binary_op(result, self, min); - return iter; -} - -Tensor XPUNativeFunctions::clamp_min(const Tensor& self, const Tensor& min) { - Tensor result; - auto iter = clamp_min_tensor_meta(self, min, result); - native::xpu::maximum_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::clamp_min_out( - const Tensor& self, - const Tensor& min, - Tensor& result) { - auto iter = clamp_min_tensor_meta(self, min, result); - native::xpu::maximum_kernel(iter); - return result; -} - -Tensor& XPUNativeFunctions::clamp_min_(Tensor& self, const Tensor& min) { - auto iter = clamp_min_tensor_meta(self, min, self); - native::xpu::maximum_kernel(iter); - return self; -} +namespace at { +namespace native { +namespace xpu { void min_kernel_impl( const Tensor& result, const Tensor& indice, @@ -531,281 +56,15 @@ void minmax_out_impl( } } } +} // namespace xpu -static void check_unsupported_complex(const char* name, const Tensor& self) { - TORCH_CHECK(!self.is_complex(), name, ": does not support complex input"); 
-} - -::std::tuple XPUNativeFunctions::min_out( - const Tensor& self, - int64_t dim, - bool keepdim, - Tensor& values, - Tensor& indices) { - dim = maybe_wrap_dim(dim, self.dim()); - at::native::zero_numel_check_dims(self, dim, "min()"); - check_unsupported_complex("min()", self); - at::xpu::resize_reduction_with_indices( - values, indices, self, dim, keepdim, self.scalar_type()); - - minmax_out_impl(self, dim, keepdim, values, indices, min_kernel_impl); - return {values, indices}; -} - -std::tuple XPUNativeFunctions::max_out( - const Tensor& self, - int64_t dim, - bool keepdim, - Tensor& values, - Tensor& indices) { - dim = maybe_wrap_dim(dim, self.dim()); - at::native::zero_numel_check_dims(self, dim, "max()"); - check_unsupported_complex("max()", self); - at::xpu::resize_reduction_with_indices( - values, indices, self, dim, keepdim, self.scalar_type()); - - minmax_out_impl(self, dim, keepdim, values, indices, max_kernel_impl); - return {values, indices}; -} - -std::tuple XPUNativeFunctions::_aminmax( - const Tensor& self, - int64_t dim, - bool keepdim) { - TORCH_WARN_ONCE( - "_aminmax is deprecated as of PyTorch 1.11 and will be removed in a future release. Use aminmax instead." - " This warning will only appear once per process."); - return XPUNativeFunctions::aminmax(self, dim, keepdim); -} - -static inline void check_for_unsupported_isin_dtype(const ScalarType type) { - // Bail out for dtypes unsupported by the sorting algorithm to keep the - // interface consistent. - TORCH_CHECK( - type != ScalarType::Bool && type != ScalarType::BFloat16 && - type != ScalarType::ComplexFloat && type != ScalarType::ComplexDouble, - "Unsupported input type encountered for isin(): ", - type); -} - -// Sorting-based algorithm for isin(); used when the number of test elements is -// large. -static void isin_sorting( - const Tensor& elements, - const Tensor& test_elements, - bool assume_unique, - bool invert, - const Tensor& out) { - // 1. Concatenate unique elements with unique test elements in 1D form. If - // assume_unique is true, skip calls to unique(). - Tensor elements_flat, test_elements_flat, unique_order; - if (assume_unique) { - elements_flat = elements.ravel(); - test_elements_flat = test_elements.ravel(); - } else { - std::tie(elements_flat, unique_order) = - at::_unique(elements, /*sorted=*/false, /*return_inverse=*/true); - std::tie(test_elements_flat, std::ignore) = - at::_unique(test_elements, /*sorted=*/false); - } - - // 2. Stable sort all elements, maintaining order indices to reverse the - // operation. Stable sort is necessary to keep elements before test - // elements within the sorted list. - Tensor all_elements = - at::cat({std::move(elements_flat), std::move(test_elements_flat)}); - auto [sorted_elements, sorted_order] = all_elements.sort( - /*stable=*/true, /*dim=*/0, /*descending=*/false); - - // 3. Create a mask for locations of adjacent duplicate values within the - // sorted list. Duplicate values are in both elements and test elements. - Tensor duplicate_mask = - at::empty_like(sorted_elements, TensorOptions(ScalarType::Bool)); - Tensor sorted_except_first = sorted_elements.slice(0, 1, at::indexing::None); - Tensor sorted_except_last = sorted_elements.slice(0, 0, -1); - duplicate_mask.slice(0, 0, -1).copy_( - invert ? sorted_except_first.ne(sorted_except_last) - : sorted_except_first.eq(sorted_except_last)); - duplicate_mask.index_put_({-1}, invert); - - // 4. Reorder the mask to match the pre-sorted element order. 
- Tensor mask = at::empty_like(duplicate_mask); - mask.index_copy_(0, sorted_order, duplicate_mask); - - // 5. Index the mask to match the pre-unique element order. If - // assume_unique is true, just take the first N items of the mask, - // where N is the original number of elements. - if (assume_unique) { - out.copy_(mask.slice(0, 0, elements.numel()).view_as(out)); - } else { - out.copy_(at::index(mask, {std::optional(unique_order)})); - } -} - -void isin_Tensor_Tensor_meta( - const Tensor& elements, - Tensor test_elements, - bool assume_unique, - bool invert, - Tensor& out) { - check_for_unsupported_isin_dtype(elements.scalar_type()); - check_for_unsupported_isin_dtype(test_elements.scalar_type()); - auto output_options = - TensorOptions(elements.device()).dtype(ScalarType::Bool); - if (out.defined()) { - xpu::resize_out(out, elements.sizes(), {}, output_options); - } else { - out = xpu::create_out(elements.sizes(), {}, output_options); - } -} - -void isin_Tensor_Tensor_impl( - const Tensor& elements, - Tensor test_elements, - bool assume_unique, - bool invert, - const Tensor& out) { - if (elements.numel() == 0) { - return; - } - - // Heuristic taken from numpy's implementation. - if (test_elements.numel() < - static_cast( - 10.0f * std::pow(static_cast(elements.numel()), 0.145))) { - out.fill_(invert); - native::xpu::isin_kernel(elements, test_elements, invert, out); - } else { - isin_sorting(elements, test_elements, assume_unique, invert, out); - } -} - -Tensor& XPUNativeFunctions::isin_out( - const Tensor& elements, - const Tensor& test_elements, - bool assume_unique, - bool invert, - Tensor& out) { - isin_Tensor_Tensor_meta(elements, test_elements, assume_unique, invert, out); - isin_Tensor_Tensor_impl(elements, test_elements, assume_unique, invert, out); - return out; -} - -Tensor XPUNativeFunctions::isin( - const Tensor& elements, - const Tensor& test_elements, - bool assume_unique, - bool invert) { - Tensor out; - isin_Tensor_Tensor_meta(elements, test_elements, assume_unique, invert, out); - isin_Tensor_Tensor_impl(elements, test_elements, assume_unique, invert, out); - return out; -} - -void isin_Tensor_Scalar_meta( - const Tensor& elements, - const Scalar& test_elements, - bool assume_unique, - bool invert, - Tensor& out) { - check_for_unsupported_isin_dtype(elements.scalar_type()); - check_for_unsupported_isin_dtype(test_elements.type()); - auto output_options = - TensorOptions(elements.device()).dtype(ScalarType::Bool); - if (out.defined()) { - xpu::resize_out(out, elements.sizes(), {}, output_options); - } else { - out = xpu::create_out(elements.sizes(), {}, output_options); - } -} - -void isin_Tensor_Scalar_impl( - const Tensor& elements, - const Scalar& test_elements, - bool assume_unique, - bool invert, - const Tensor& out) { - if (invert) { - at::ne_out(const_cast(out), elements, test_elements); - } else { - at::eq_out(const_cast(out), elements, test_elements); - } -} - -Tensor& XPUNativeFunctions::isin_out( - const Tensor& elements, - const Scalar& test_elements, - bool assume_unique, - bool invert, - Tensor& out) { - isin_Tensor_Scalar_meta(elements, test_elements, assume_unique, invert, out); - isin_Tensor_Scalar_impl(elements, test_elements, assume_unique, invert, out); - return out; -} - -Tensor XPUNativeFunctions::isin( - const Tensor& elements, - const Scalar& test_elements, - bool assume_unique, - bool invert) { - Tensor out; - isin_Tensor_Scalar_meta(elements, test_elements, assume_unique, invert, out); - isin_Tensor_Scalar_impl(elements, test_elements, 
assume_unique, invert, out); - return out; -} - -void isin_Scalar_Tensor_meta( - const Scalar& elements, - const Tensor& test_elements, - bool assume_unique, - bool invert, - Tensor& out) { - check_for_unsupported_isin_dtype(elements.type()); - check_for_unsupported_isin_dtype(test_elements.scalar_type()); - auto output_options = - TensorOptions(test_elements.device()).dtype(ScalarType::Bool); - if (out.defined()) { - xpu::resize_out(out, {0}, {}, output_options); - } else { - out = xpu::create_out({0}, {}, output_options); - } -} - -void isin_Scalar_Tensor_impl( - const Scalar& elements, - const Tensor& test_elements, - bool assume_unique, - bool invert, - const Tensor& out) { - // redispatch - at::isin_out( - const_cast(out), - at::native::wrapped_scalar_tensor(elements, test_elements.device()), - test_elements, - assume_unique, - invert); -} - -Tensor& XPUNativeFunctions::isin_out( - const Scalar& elements, - const Tensor& test_elements, - bool assume_unique, - bool invert, - Tensor& out) { - isin_Scalar_Tensor_meta(elements, test_elements, assume_unique, invert, out); - isin_Scalar_Tensor_impl(elements, test_elements, assume_unique, invert, out); - return out; -} - -Tensor XPUNativeFunctions::isin( - const Scalar& elements, - const Tensor& test_elements, - bool assume_unique, - bool invert) { - Tensor out; - isin_Scalar_Tensor_meta(elements, test_elements, assume_unique, invert, out); - isin_Scalar_Tensor_impl(elements, test_elements, assume_unique, invert, out); - return out; -} - +REGISTER_XPU_DISPATCH(where_kernel, &xpu::where_kernel); +REGISTER_XPU_DISPATCH(clamp_min_scalar_stub, &xpu::clamp_min_scalar_kernel); +REGISTER_XPU_DISPATCH(clamp_max_scalar_stub, &xpu::clamp_max_scalar_kernel); +REGISTER_XPU_DISPATCH(clamp_scalar_stub, &xpu::clamp_scalar_kernel); +REGISTER_XPU_DISPATCH(clamp_stub, &xpu::clamp_kernel); +REGISTER_XPU_DISPATCH(max_stub, &xpu::max_kernel_impl); +REGISTER_XPU_DISPATCH(min_stub, &xpu::min_kernel_impl) +REGISTER_XPU_DISPATCH(isin_default_stub, &xpu::isin_kernel); +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/TensorFactories.cpp b/src/ATen/native/xpu/TensorFactories.cpp index 44da487f7..62e9fdfce 100644 --- a/src/ATen/native/xpu/TensorFactories.cpp +++ b/src/ATen/native/xpu/TensorFactories.cpp @@ -1,16 +1,11 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include -#include #include -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else #include #include -#endif +#include #include #include @@ -18,11 +13,12 @@ namespace at { -Tensor& XPUNativeFunctions::eye_out(int64_t n, Tensor& result) { - return XPUNativeFunctions::eye_out(n, n, result); -} +namespace native { + +REGISTER_XPU_DISPATCH(complex_stub, &xpu::complex_kernel); +REGISTER_XPU_DISPATCH(polar_stub, &xpu::polar_kernel); -Tensor& XPUNativeFunctions::eye_out(int64_t n, int64_t m, Tensor& result) { +Tensor& eye_out_xpu(int64_t n, int64_t m, Tensor& result) { TORCH_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); TORCH_CHECK(m >= 0, "m must be greater or equal to 0, got ", m); @@ -37,7 +33,11 @@ Tensor& XPUNativeFunctions::eye_out(int64_t n, int64_t m, Tensor& result) { return result; } -Tensor XPUNativeFunctions::empty( +Tensor& eye_out_xpu(int64_t n, Tensor& result) { + return eye_out_xpu(n, n, result); +} + +Tensor empty_xpu( IntArrayRef size, c10::optional dtype_opt, c10::optional layout_opt, @@ -60,7 +60,7 @@ Tensor XPUNativeFunctions::empty( return result; } -Tensor XPUNativeFunctions::empty_strided( +Tensor empty_strided_xpu( IntArrayRef size, 
IntArrayRef stride, c10::optional dtype_opt, @@ -78,13 +78,7 @@ Tensor XPUNativeFunctions::empty_strided( return result; } -Tensor XPUNativeFunctions::clone( - const Tensor& self, - c10::optional memory_format) { - return at::native::clone(self, memory_format); -} - -Tensor XPUNativeFunctions::_efficientzerotensor( +Tensor _efficientzerotensor_xpu( IntArrayRef size, std::optional dtype, std::optional layout, @@ -103,70 +97,7 @@ Tensor XPUNativeFunctions::_efficientzerotensor( return out; } -static void complex_check_floating(const Tensor& a, const Tensor& b) { - TORCH_CHECK( - (a.scalar_type() == kFloat || a.scalar_type() == kDouble || - a.scalar_type() == kHalf) && - (b.scalar_type() == kFloat || b.scalar_type() == kDouble || - b.scalar_type() == kHalf), - "Expected both inputs to be Half, Float or Double tensors but got ", - a.scalar_type(), - " and ", - b.scalar_type()); -} - -static void complex_check_dtype( - const Tensor& result, - const Tensor& a, - const Tensor& b) { - complex_check_floating(a, b); - TORCH_CHECK( - a.scalar_type() == b.scalar_type(), - "Expected object of scalar type ", - a.scalar_type(), - " but got scalar type ", - b.scalar_type(), - " for second argument"); - TORCH_CHECK( - result.scalar_type() == toComplexType(a.scalar_type()), - "Expected object of scalar type ", - toComplexType(a.scalar_type()), - " but got scalar type ", - result.scalar_type(), - " for argument 'out'"); -} - -Tensor& XPUNativeFunctions::complex_out( - const Tensor& real, - const Tensor& imag, - Tensor& result) { - complex_check_dtype(result, real, imag); - auto iter = TensorIteratorConfig() - .add_output(result) - .add_const_input(real) - .add_const_input(imag) - .check_all_same_dtype(false) - .build(); - native::xpu::complex_kernel(iter); - return result; -} - -Tensor& XPUNativeFunctions::polar_out( - const Tensor& abs, - const Tensor& angle, - Tensor& result) { - complex_check_dtype(result, abs, angle); - auto iter = TensorIteratorConfig() - .add_output(result) - .add_const_input(abs) - .add_const_input(angle) - .check_all_same_dtype(false) - .build(); - native::xpu::polar_kernel(iter); - return result; -} - -Tensor& XPUNativeFunctions::randperm_out( +Tensor& randperm_out_xpu( int64_t n, c10::optional generator, Tensor& result) { @@ -183,4 +114,5 @@ Tensor& XPUNativeFunctions::randperm_out( return result; } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/TensorProperties.cpp b/src/ATen/native/xpu/TensorProperties.cpp index 428d18fcd..ca35f0a41 100644 --- a/src/ATen/native/xpu/TensorProperties.cpp +++ b/src/ATen/native/xpu/TensorProperties.cpp @@ -1,16 +1,2 @@ -#include -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else -#include -#endif - -namespace at { - -bool XPUNativeFunctions::is_set_to(const Tensor& self, const Tensor& src) { - return at::native::is_set_to(self, src); -} - -} // namespace at +namespace at {} // namespace at diff --git a/src/ATen/native/xpu/TensorShape.cpp b/src/ATen/native/xpu/TensorShape.cpp index 84c005121..0e5ee6e62 100644 --- a/src/ATen/native/xpu/TensorShape.cpp +++ b/src/ATen/native/xpu/TensorShape.cpp @@ -7,31 +7,22 @@ #include #include #include -#include #include - -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else -#include -#endif +#include +#include +#include namespace at { -Tensor XPUNativeFunctions::view(const Tensor& self, IntArrayRef size) { - return at::native::view(self, size); -} +namespace native { -Tensor XPUNativeFunctions::view_as_real(const at::Tensor& self) { - return at::native::view_as_real(self); 
-} - -Tensor XPUNativeFunctions::view_as_complex(const Tensor& self) { - return at::native::view_as_complex(self); -} +TORCH_API at::Tensor as_strided_qtensorimpl( + const at::Tensor& self, + at::IntArrayRef size, + at::IntArrayRef stride, + ::std::optional storage_offset); -Tensor XPUNativeFunctions::as_strided( +Tensor as_strided_xpu( const Tensor& self, IntArrayRef size, IntArrayRef stride, @@ -43,227 +34,20 @@ Tensor XPUNativeFunctions::as_strided( return at::native::as_strided_tensorimpl(self, size, stride, storage_offset); } -Tensor XPUNativeFunctions::_reshape_alias( - const Tensor& self, - IntArrayRef size, - IntArrayRef stride) { - return at::native::_reshape_alias(self, size, stride); -} - -Tensor XPUNativeFunctions::unfold( - const Tensor& self, - int64_t dimension, - int64_t size, - int64_t step) { - return at::native::unfold(self, dimension, size, step); -} - -inline c10::MemoryFormat cat_compute_output_memory_format( - const MaterializedITensorListRef& inputs) { - c10::optional format = c10::nullopt; - for (const Tensor& t : inputs) { - auto f = t.suggest_memory_format(); - if (f == c10::MemoryFormat::Contiguous) { - return f; - } - if (format.has_value() && format.value() != f) { - return c10::MemoryFormat::Contiguous; - } - format = f; - } - return format.value(); -} - -inline void cat_check_no_zero_dim(const MaterializedITensorListRef& tensors) { - size_t i = 0; - for (const Tensor& t : tensors) { - TORCH_CHECK( - t.dim() > 0, - "zero-dimensional tensor (at position ", - i, - ") cannot be concatenated"); - i++; - } -} - -void cat_meta( - const ITensorListRef& tensors, - int64_t& dim, - Tensor& result, - size_t& valid, - bool& all_contiguous, - bool& all_same_dtype, - bool& all_same_sizes_and_stride, - c10::MemoryFormat& memory_format) { - // previously, size [0] tensors were the only possible empty tensors; thus, it - // wasn't possible to cat empty tensors unless all the other tensors were - // 1-dimensional, so we allowed these tensors to be "skipped". We maintain - // this behavior for backwards compatibility, but only for this specific size - // (i.e. other empty sizes are not skipped). - auto materialized = tensors.materialize(); - - cat_check_no_zero_dim(materialized); - dim = at::legacy_cat_wrap_dim(dim, materialized); - - // Checking names before the actual dimensions. - auto maybe_outnames = namedinference::compute_cat_outnames(materialized); - - TORCH_CHECK( - !materialized.empty(), - "torch.cat(): expected a non-empty list of Tensors"); - - // Look for the first valid tensor. - valid = materialized.size(); - for (const auto i : c10::irange(materialized.size())) { - if (!at::native::cat_should_skip_tensor(materialized[i].get())) { - valid = i; - break; - } - } - - all_contiguous = true; - all_same_dtype = true; - all_same_sizes_and_stride = true; - memory_format = cat_compute_output_memory_format(materialized); - - // Compute what the output dtype should be: - auto is_out_defined = result.defined(); - auto out_dtype = at::native::result_type(tensors); - - // If the output tensor is defined, we need to take it into account - // when computing the actual output dtype and the flags. - if (is_out_defined) { - // Check for type promotion, if the output tensor is defined. - TORCH_CHECK( - canCast(out_dtype, result.scalar_type()), - "torch.cat(): input types can't be cast to the desired output type ", - result.scalar_type()); - out_dtype = result.scalar_type(); - all_contiguous = result.is_contiguous(memory_format); - } - - // Fallback 'set_output' parameters. 
- // (in case we don't find a valid tensor) - DimVector sizes{0}; - TensorOptions options = - materialized[0].get().options().dtype(out_dtype).memory_format( - memory_format); - - // If we found a valid tensor, check whether the input tensors - // are compatible, i.e. we can execute `cat` on them. - bool found_valid_tensor = valid < materialized.size(); - if (found_valid_tensor) { - TORCH_CHECK( - dim <= materialized[valid].get().dim(), - "torch.cat(): dimension ", - dim, - "out of range"); - - // Compute the output tensor size. - // It should have the same shape as any other valid tensor, - // except in the dimension 'dim'. - size_t size_at_dim = 0; - for (const auto i : c10::irange(materialized.size())) { - const Tensor& t = materialized[i]; - all_same_dtype = all_same_dtype && out_dtype == t.scalar_type(); - if (!at::native::cat_should_skip_tensor(t)) { - at::native::check_cat_shape_except_dim(materialized[valid], t, dim, i); - size_at_dim += t.size(dim); - all_contiguous = all_contiguous && t.is_contiguous(memory_format); - all_same_sizes_and_stride = all_same_sizes_and_stride && - t.sizes() == materialized[valid].get().sizes() && - t.strides() == materialized[valid].get().strides(); - } else { - all_contiguous = false; - } - } - - // Actually set the output. - sizes = materialized[valid].get().sizes().vec(); - sizes[dim] = size_at_dim; - options = - materialized[valid].get().options().dtype(out_dtype).memory_format( - memory_format); - } - - if (is_out_defined) { - at::xpu::resize_out(result, sizes, {}, options); - } else { - result = at::xpu::create_out(sizes, {}, options); +TORCH_IMPL_FUNC(cat_out_xpu) +(const ITensorListRef& tensors, + int64_t dim, + int64_t valid, + bool all_contiguous, + bool all_same_dtype, + bool all_same_sizes_and_stride, + MemoryFormat memory_format, + const Tensor& result) { + if (result.numel() == 0) { + return; } - if (!maybe_outnames.empty()) { - namedinference::propagate_names(result, maybe_outnames); - } - // Checks for overlaps between the inputs and the output tensor. 
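  // (Note: everything this helper computes -- the promoted dtype, the chosen
  // memory format, the output sizes and the overlap checks right below -- is
  // what the structured meta function for cat already provides to every
  // backend. That is why it can be deleted here: the new
  // TORCH_IMPL_FUNC(cat_out_xpu) further down receives those values
  // precomputed and only has to launch xpu::cat_out_kernel on the
  // already-resized result.)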
- if (is_out_defined && found_valid_tensor) { - at::assert_no_internal_overlap(result); - for (const Tensor& t : materialized) { - at::assert_no_overlap(result, t); - } - } -} - -Tensor& XPUNativeFunctions::cat_out( - const ITensorListRef& tensors, - int64_t dim, - Tensor& result) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, result, "xpu::cat_out", "out"); - c10::impl::check_and_update_common_device( - common_device, tensors, "xpu::cat_out", "tensors"); - - size_t valid; - bool all_contiguous; - bool all_same_dtype; - bool all_same_sizes_and_stride; - c10::MemoryFormat memory_format; - cat_meta( - tensors, - dim, - result, - valid, - all_contiguous, - all_same_dtype, - all_same_sizes_and_stride, - memory_format); - - at::native::xpu::cat_out_kernel( - tensors, - dim, - valid, - all_contiguous, - all_same_dtype, - all_same_sizes_and_stride, - memory_format, - result); - - return result; -} - -Tensor XPUNativeFunctions::cat(const ITensorListRef& tensors, int64_t dim) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, tensors, "xpu::cat", "tensors"); - - Tensor result; - size_t valid; - bool all_contiguous; - bool all_same_dtype; - bool all_same_sizes_and_stride; - c10::MemoryFormat memory_format; - cat_meta( - tensors, - dim, - result, - valid, - all_contiguous, - all_same_dtype, - all_same_sizes_and_stride, - memory_format); - - at::native::xpu::cat_out_kernel( + xpu::cat_out_kernel( tensors, dim, valid, @@ -272,8 +56,7 @@ Tensor XPUNativeFunctions::cat(const ITensorListRef& tensors, int64_t dim) { all_same_sizes_and_stride, memory_format, result); - - return result; } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/TensorTopK.cpp b/src/ATen/native/xpu/TensorTopK.cpp index 3961160bf..ab3fc5250 100644 --- a/src/ATen/native/xpu/TensorTopK.cpp +++ b/src/ATen/native/xpu/TensorTopK.cpp @@ -2,55 +2,22 @@ #include #include #include -#include -#include - -namespace at { - -void topk_meta( - const Tensor& self, - int64_t k, - int64_t dim_, - bool largest, - bool sorted, - Tensor& values, - Tensor& indices) { - int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true); - TORCH_CHECK( - k >= 0 && k <= (self.dim() > 0 ? self.size(dim) : 1), - "selected index k out of range"); - int64_t sliceSize = self.dim() == 0 ? 1 : self.size(dim); - TORCH_CHECK(k >= 0 && k <= sliceSize, "k not in range for dimension"); - // Build the output size, which is the dim being selected set to - // size k - DimVector topKSize(self.sizes().vec()); - if (!topKSize.empty()) { - topKSize[dim] = k; - } +#include - if (values.defined()) { - at::xpu::resize_out(values, topKSize, {}, self.options()); - } else { - values = at::xpu::create_out(topKSize, {}, self.options()); - } +#include - if (indices.defined()) { - at::xpu::resize_out(indices, topKSize, {}, self.options().dtype(at::kLong)); - } else { - indices = - at::xpu::create_out(topKSize, {}, self.options().dtype(at::kLong)); - } -} +namespace at { -void topk_out_impl( - const Tensor& self, - int64_t k, - int64_t dim_, - bool largest, - bool sorted, - Tensor& values, - Tensor& indices) { +namespace native { +TORCH_IMPL_FUNC(topk_out_xpu) +(const Tensor& self, + int64_t k, + int64_t dim_, + bool largest, + bool sorted, + const Tensor& values, + const Tensor& indices) { int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true); TORCH_CHECK( k >= 0 && k <= (self.dim() > 0 ? 
self.size(dim) : 1), @@ -69,30 +36,6 @@ void topk_out_impl( native::xpu::topk_kernel(self, k, dim, largest, sorted, values, indices); } } - -std::tuple XPUNativeFunctions::topk( - const Tensor& self, - int64_t k, - int64_t dim, - bool largest, - bool sorted) { - Tensor values, indices; - topk_meta(self, k, dim, largest, sorted, values, indices); - topk_out_impl(self, k, dim, largest, sorted, values, indices); - return std::tuple(values, indices); -} - -std::tuple XPUNativeFunctions::topk_out( - const Tensor& self, - int64_t k, - int64_t dim, - bool largest, - bool sorted, - Tensor& values, - Tensor& indices) { - topk_meta(self, k, dim, largest, sorted, values, indices); - topk_out_impl(self, k, dim, largest, sorted, values, indices); - return std::forward_as_tuple(values, indices); -} +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/TensorTransformations.cpp b/src/ATen/native/xpu/TensorTransformations.cpp index 2ac3bee4f..eb6950e74 100644 --- a/src/ATen/native/xpu/TensorTransformations.cpp +++ b/src/ATen/native/xpu/TensorTransformations.cpp @@ -1,88 +1,18 @@ #include #include #include +#include +#include #include -#include - #include +#include namespace at { +namespace native { -Tensor XPUNativeFunctions::flip(const Tensor& self, IntArrayRef dims) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, self, "xpu::flip", "self"); - - const int64_t total_dims = self.dim(); - // It wraps the dims and checks that there are no repeated dims - auto flip_dims_b = at::dim_list_to_bitset(dims, total_dims); - - Tensor out_tensor = at::empty_like(self, MemoryFormat::Preserve); - - // Count dimensions in which we need to do work - int n = 0; - auto strides = DimVector(self.strides()); - for (int64_t i = 0; i < total_dims; i++) { - if (flip_dims_b[i] && self.size(i) > 1 && self.stride(i) != 0) { - n++; - strides[i] = 0; - } - } - - // Nothing to do, we return fast - if (n == 0 || self.numel() <= 1) { - out_tensor.copy_(self); - return out_tensor; - } - - // create dummy output with 0 strides at flipped dimension, to prevent - // tensorIterator from coalescing flipped dims - const auto restrided_self = self.as_strided(self.sizes(), strides); - auto iter = - TensorIteratorConfig() - .set_check_mem_overlap(false) - .check_all_same_dtype(false) - .declare_static_dtype_and_device(self.scalar_type(), self.device()) - .add_output(out_tensor) - .add_input(self) - .add_input(restrided_self) - .build(); - - auto* data = reinterpret_cast(iter.data_ptr(0)); - const auto sizes = iter.shape(); - // This is a SmallVector of _signed_ ints - auto strides_bytes = DimVector(iter.strides(0)); - const auto strides_self = iter.strides(1); - const auto strides_dummy = iter.strides(2); - - // To understand this transformation, think of a 3D cube. - // - The data ptr points to the lower-left most vertex of the cube - // - The strides tell us how to move in each dimension, - // that is, data + stride[i] advances one element in the dimension i - // To flip a dimension: - // - We move the pointer to the opposite vertex of the cube - // - We iterate in the opposite direction (invert the strides) - for (int i = 0; i < iter.ndim(); i++) { - // We know that an dimension has a zero stride and self[i] does not, as we - // defined above Note that it may be the case that strides_dummy[i] = 0 - // not because we set it, but because strides_self[i] == 0. 
We do not want - // to do anything there - if (strides_dummy[i] == 0 && strides_self[i] != 0) { - data += strides_bytes[i] * (sizes[i] - 1); - strides_bytes[i] *= -1; - } - } - iter._unsafe_set_arg_strides(0, strides_bytes); - iter._unsafe_set_arg_data(0, reinterpret_cast(data)); - - at::native::xpu::flip_kernel(iter); - return out_tensor; -} +REGISTER_XPU_DISPATCH(flip_stub, &xpu::flip_kernel); -Tensor XPUNativeFunctions::roll( - const Tensor& self, - IntArrayRef shifts, - IntArrayRef dims) { +Tensor roll_xpu(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { if (dims.size() != 1 || shifts.size() != 1) { return at::native::roll_common(self, shifts, dims); } @@ -96,9 +26,10 @@ Tensor XPUNativeFunctions::roll( return out_tensor; } - native::xpu::roll_kernel(in_tensor, out_tensor, shifts, dims); + xpu::roll_kernel(in_tensor, out_tensor, shifts, dims); return out_tensor; } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/TriangluarOps.cpp b/src/ATen/native/xpu/TriangluarOps.cpp index affba5665..3db5e967b 100644 --- a/src/ATen/native/xpu/TriangluarOps.cpp +++ b/src/ATen/native/xpu/TriangluarOps.cpp @@ -1,77 +1,30 @@ -#include + #include #include #include -#include #include +#include -namespace at { - -void tril_meta(const Tensor& self, int64_t k) { - TORCH_CHECK( - self.dim() >= 2, "tril: input tensor must have at least 2 dimensions"); -} - -Tensor& XPUNativeFunctions::tril_out( - const Tensor& self, - int64_t diagonal, - Tensor& out) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, out, "xpu::tril_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::tril_out", "self"); - tril_meta(self, diagonal); - xpu::resize_out(out, self.sizes(), {}, self.options()); - return native::xpu::tril_kernel(out, self, diagonal); -} - -Tensor XPUNativeFunctions::tril(const Tensor& self, int64_t diagonal) { - tril_meta(self, diagonal); - Tensor out = xpu::create_out(self.sizes(), {}, self.options()); - return tril_out(self, diagonal, out); -} - -Tensor& XPUNativeFunctions::tril_(Tensor& self, int64_t diagonal) { - tril_meta(self, diagonal); - xpu::check_inplace(self, self.sizes(), self.options()); - return tril_out(self, diagonal, self); -} +#include +#include -void triu_meta(const Tensor& self, int64_t k) { - TORCH_CHECK( - self.dim() >= 2, "triu: input tensor must have at least 2 dimensions"); -} - -Tensor& XPUNativeFunctions::triu_out( - const Tensor& self, - int64_t diagonal, - Tensor& out) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, out, "xpu::triu_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::triu_out", "self"); - triu_meta(self, diagonal); - xpu::resize_out(out, self.sizes(), {}, self.options()); - return native::xpu::triu_kernel(out, self, diagonal); -} +namespace at::native { -Tensor XPUNativeFunctions::triu(const Tensor& self, int64_t diagonal) { - triu_meta(self, diagonal); - Tensor out = xpu::create_out(self.sizes(), {}, self.options()); - return triu_out(self, diagonal, out); +TORCH_IMPL_FUNC(tril_xpu)(const Tensor& self, int64_t k, const Tensor& result) { + if (self.numel() != 0) { + xpu::tril_kernel(result, self, k); + } } -Tensor& XPUNativeFunctions::triu_(Tensor& self, int64_t diagonal) { - triu_meta(self, diagonal); - xpu::check_inplace(self, self.sizes(), self.options()); - return triu_out(self, diagonal, self); +TORCH_IMPL_FUNC(triu_xpu)(const Tensor& self, 
int64_t k, const Tensor& result) { + if (self.numel() != 0) { + xpu::triu_kernel(result, self, k); + } } -Tensor XPUNativeFunctions::trace(const Tensor& self) { +Tensor trace_xpu(const Tensor& self) { TORCH_CHECK(self.dim() == 2, "expected a matrix"); return self.diagonal().sum(); } -} // namespace at +} // namespace at::native \ No newline at end of file diff --git a/src/ATen/native/xpu/UnaryOps.cpp b/src/ATen/native/xpu/UnaryOps.cpp index f1bb12ba3..119b7bab9 100644 --- a/src/ATen/native/xpu/UnaryOps.cpp +++ b/src/ATen/native/xpu/UnaryOps.cpp @@ -1,8 +1,10 @@ #include #include + +#include #include #include -#include +#include #include #include @@ -20,1153 +22,61 @@ #include #include #include + #include #include #include #include -namespace at { - -template -static inline Tensor& unary_op_impl_out( - Tensor& result, - const Tensor& self, - Stub& stub) { - auto iter = TensorIterator::unary_op(result, self); - stub(iter); - return result; -} - -template -static inline Tensor& unary_op_impl_float_out( - Tensor& result, - const Tensor& self, - Stub& stub, - Args... args) { - auto iter = TensorIterator::unary_float_op(result, self); - stub(iter, args...); - iter.cast_outputs(); - return result; -} - -template -static inline Tensor& unary_op_impl_with_complex_to_float_out( - Tensor& result, - const Tensor& self, - Stub& stub, - bool promotes_integer_to_float) { - if (self.is_complex() && !result.is_complex()) { - // Checks if the corresponding float type can be cast to the desired dtype - const auto float_type = c10::toRealValueType(self.scalar_type()); - TORCH_CHECK( - canCast(float_type, result.scalar_type()), - "result type ", - float_type, - " can't be cast to the desired output type ", - result.scalar_type()); - - // Runs the function complex->complex, as TensorIterator expects - Tensor complex_result = at::empty({0}, self.options()); - auto iter = TensorIterator::unary_op(complex_result, self); - stub(iter); - - // Copies the complex result to the actual result and returns it - at::native::resize_output(result, complex_result.sizes()); - result.copy_(at::real(complex_result)); - return result; - } - - if (promotes_integer_to_float) { - return unary_op_impl_float_out(result, self, stub); - } - - return unary_op_impl_out(result, self, stub); -} - -// out_impl passed into unary_op_impl and unary_op_impl_ must go through at:: -// device dispatch otherwise it won't dispatch to out-of-source devices like -// XLA. For example it must be at::bitwise_not_out instead of -// bitwise_not_out(which is at::native!). -template -static inline Tensor unary_op_impl(const Tensor& self, OutImpl& out_impl) { - Tensor result = at::empty({0}, self.options()); - return out_impl(result, self); -} - -// An alternate version of unary_op_impl that follows the same pattern -// for non-complex inputs, but returns a floating point tensor -// for complex inputs by default. 
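// A rough sketch of the pattern that replaces all of these hand-rolled
// wrappers (simplified from the usual ATen DispatchStub machinery; the exact
// upstream declarations may differ):
//
//   // In ATen's UnaryOps.h (upstream):
//   using unary_fn = void (*)(TensorIteratorBase&);
//   DECLARE_DISPATCH(unary_fn, abs_stub);
//
//   // The shared at::native wrapper builds the TensorIterator and calls
//   //   abs_stub(iter.device_type(), iter);
//   // so once this file registers
//   //   REGISTER_XPU_DISPATCH(abs_stub, &xpu::abs_kernel);
//   // (see the block at the end of this file), XPU tensors are routed to
//   // xpu::abs_kernel without any per-operator glue code here.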
-template -static inline Tensor unary_op_impl_with_complex_to_float( - const Tensor& self, - OutImpl& out_impl) { - if (self.is_complex()) { - const auto float_type = c10::toRealValueType(self.scalar_type()); - Tensor result = at::empty_like(self, self.options().dtype(float_type)); - return out_impl(result, self); - } - - Tensor result = at::empty({0}, self.options()); - return out_impl(result, self); -} - -template -static inline Tensor& unary_op_impl_(Tensor& self, OutImpl& out_impl) { - return out_impl(self, self); -} - -Tensor XPUNativeFunctions::abs(const Tensor& self) { - return unary_op_impl_with_complex_to_float(self, at::abs_out); -} - -Tensor& XPUNativeFunctions::abs_(Tensor& self) { - TORCH_CHECK( - !self.is_complex(), "In-place abs is not supported for complex tensors."); - return unary_op_impl_(self, at::abs_out); -} - -Tensor& XPUNativeFunctions::abs_out(const Tensor& self, Tensor& out) { - return unary_op_impl_with_complex_to_float_out( - out, - self, - native::xpu::abs_kernel, - /*promotes_integer_to_float=*/false); -} - -Tensor XPUNativeFunctions::sin(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::sin_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::sin_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::sin_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::sin_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::sin_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::cos(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::cos_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::cos_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::cos_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::cos_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::cos_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::digamma(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::digamma_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::digamma_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::digamma_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::digamma_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::digamma_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::polygamma(int64_t n, const Tensor& self) { - TORCH_CHECK(n >= 0, "polygamma(n, x) does not support negative n."); - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::polygamma_kernel(iter, n); - return iter.output(); -} - -Tensor& XPUNativeFunctions::polygamma_out( - int64_t n, - const Tensor& self, - Tensor& out) { - TORCH_CHECK(n >= 0, "polygamma(n, x) does not support negative n."); - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::polygamma_kernel(iter, n); - return out; -} - -Tensor& XPUNativeFunctions::polygamma_(Tensor& self, int64_t n) { - return polygamma_out(n, self, self); -} - -Tensor 
XPUNativeFunctions::lgamma(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::lgamma_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::lgamma_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::lgamma_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::lgamma_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::lgamma_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::log(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::log_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::log_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::log_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::log_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::log_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::log10(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::log10_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::log10_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::log10_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::log10_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::log10_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::log1p(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::log1p_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::log1p_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::log1p_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::log1p_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::log1p_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::log2(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::log2_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::log2_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::log2_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::log2_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::log2_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::sqrt(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::sqrt_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::sqrt_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::sqrt_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::sqrt_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::sqrt_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::rsqrt(const 
Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::rsqrt_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::rsqrt_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::rsqrt_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::rsqrt_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::rsqrt_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::tanh(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::tanh_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::tanh_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::tanh_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::tanh_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::tanh_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::neg(const Tensor& self) { - TORCH_CHECK( - self.scalar_type() != kBool, - "Negation, the `-` operator, on a bool tensor is not supported. " - "If you are trying to invert a mask, use the `~` or `logical_not()` operator instead."); - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - native::xpu::neg_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::neg_(Tensor& self) { - TORCH_CHECK( - self.scalar_type() != kBool, - "Negation, the `-` operator, on a bool tensor is not supported. " - "If you are trying to invert a mask, use the `~` or `logical_not()` operator instead."); - TensorIterator iter; - iter.build_borrowing_unary_op(self, self); - native::xpu::neg_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::neg_out(const Tensor& self, Tensor& out) { - TORCH_CHECK( - self.scalar_type() != kBool, - "Negation, the `-` operator, on a bool tensor is not supported. 
" - "If you are trying to invert a mask, use the `~` or `logical_not()` operator instead."); - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - native::xpu::neg_kernel(iter); - return out; -} - -TensorIterator logical_not_meta(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build(TensorIteratorConfig() - .check_all_same_dtype(false) - .add_output(out) - .add_const_input(self)); - return iter; -} - -Tensor XPUNativeFunctions::logical_not(const Tensor& self) { - Tensor out = at::empty({0}, self.options().dtype(kBool)); - return at::logical_not_out(out, self); -} - -Tensor& XPUNativeFunctions::logical_not_(Tensor& self) { - return at::logical_not_out(self, self); -} - -Tensor& XPUNativeFunctions::logical_not_out(const Tensor& self, Tensor& out) { - auto iter = logical_not_meta(self, out); - native::xpu::logical_not_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::reciprocal(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::reciprocal_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::reciprocal_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::reciprocal_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::reciprocal_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::reciprocal_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::bitwise_not_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - native::xpu::bitwise_not_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::exp(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::exp_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::exp_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::exp_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::exp_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::exp_kernel(iter); - return self; -} - -Tensor XPUNativeFunctions::sigmoid(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::sigmoid_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::sigmoid_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::sigmoid_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::sigmoid_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::sigmoid_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::sign(const Tensor& self) { - TORCH_CHECK( - !self.is_complex(), - "Unlike NumPy, torch.sign is not intended to support complex numbers. Please use torch.sgn instead."); - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - native::xpu::sign_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::sign_(Tensor& self) { - TORCH_CHECK( - !self.is_complex(), - "Unlike NumPy, torch.sign is not intended to support complex numbers. 
Please use torch.sgn instead."); - TensorIterator iter; - iter.build_borrowing_unary_op(self, self); - native::xpu::sign_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::sign_out(const Tensor& self, Tensor& out) { - TORCH_CHECK( - !self.is_complex(), - "Unlike NumPy, torch.sign is not intended to support complex numbers. Please use torch.sgn instead."); - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - native::xpu::sign_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::signbit(const Tensor& self) { - TORCH_CHECK( - !self.is_complex(), "signbit is not implemented for complex tensors."); - - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_force_boolean_op(out, self); - - if (self.dtype() == at::kBool) { - iter.output().fill_(false); - } else { - native::xpu::signbit_kernel(iter); - } - return iter.output(); -} - -Tensor& XPUNativeFunctions::signbit_out(const Tensor& self, Tensor& out) { - TORCH_CHECK( - !self.is_complex(), "signbit is not implemented for complex tensors."); - TORCH_CHECK( - out.dtype() == at::kBool, - "signbit does not support non-boolean outputs."); - - TensorIterator iter; - iter.build_borrowing_unary_force_boolean_op(out, self); - - if (self.dtype() == at::kBool) { - out.fill_(false); - } else { - native::xpu::signbit_kernel(iter); - } - return out; -} - -Tensor& XPUNativeFunctions::logit_out( - const Tensor& self, - std::optional eps, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::logit_kernel(iter, Scalar(eps ? eps.value() : -1.0)); - return out; -} - -Tensor XPUNativeFunctions::logit( - const Tensor& self, - std::optional eps) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::logit_kernel(iter, Scalar(eps ? 
eps.value() : -1.0)); - return iter.output(); -} - -Tensor& XPUNativeFunctions::logit_(Tensor& self, std::optional eps) { - return at::logit_out(self, self, eps); -} - -Tensor XPUNativeFunctions::sgn(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - if (self.is_complex()) { - native::xpu::sgn_kernel(iter); - } else { - native::xpu::sign_kernel(iter); - } - return iter.output(); -} - -Tensor& XPUNativeFunctions::sgn_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_op(self, self); - if (self.is_complex()) { - native::xpu::sgn_kernel(iter); - } else { - native::xpu::sign_kernel(iter); - } - return self; -} - -Tensor& XPUNativeFunctions::sgn_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - if (self.is_complex()) { - native::xpu::sgn_kernel(iter); - } else { - native::xpu::sign_kernel(iter); - } - return out; -} - -Tensor XPUNativeFunctions::acos(const Tensor& self) { - Tensor out; - TensorIterator iter; - - iter.build_borrowing_unary_float_op(out, self); - native::xpu::acos_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::acos_(Tensor& self) { - TensorIterator iter; - - iter.build_borrowing_unary_float_op(self, self); - native::xpu::acos_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::acos_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - - iter.build_borrowing_unary_float_op(out, self); - native::xpu::acos_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::acosh(const Tensor& self) { - Tensor out; - TensorIterator iter; - - iter.build_borrowing_unary_float_op(out, self); - native::xpu::acosh_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::acosh_(Tensor& self) { - TensorIterator iter; - - iter.build_borrowing_unary_float_op(self, self); - native::xpu::acosh_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::acosh_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - - iter.build_borrowing_unary_float_op(out, self); - native::xpu::acosh_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::erf(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::erf_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::erf_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::erf_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::erf_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::erf_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::erfc(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::erfc_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::erfc_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::erfc_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::erfc_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::erfc_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::erfinv(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::erfinv_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::erfinv_(Tensor& 
self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::erfinv_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::erfinv_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::erfinv_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::exp2(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::exp2_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::exp2_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::exp2_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::exp2_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::exp2_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::expm1(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::expm1_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::expm1_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::expm1_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::expm1_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::expm1_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::frac(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - native::xpu::frac_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::frac_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_op(self, self); - native::xpu::frac_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::frac_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - native::xpu::frac_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::sinh(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::sinh_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::sinh_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::sinh_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::sinh_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::sinh_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::asinh(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::asinh_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::asinh_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::asinh_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::asinh_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::asinh_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::asin(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::asin_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::asin_(Tensor& self) { - TensorIterator iter; - 
iter.build_borrowing_unary_float_op(self, self); - native::xpu::asin_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::asin_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::asin_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::tan(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::tan_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::tan_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::tan_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::tan_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::tan_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::atan(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::atan_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::atan_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::atan_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::atan_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::atan_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::atanh(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::atanh_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::atanh_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::atanh_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::atanh_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::atanh_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::cosh(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::cosh_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::cosh_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::cosh_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::cosh_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::cosh_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::conj_physical_out(const Tensor& self, Tensor& out) { - auto iter = TensorIterator::unary_op(out, self); - native::xpu::conj_physical_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::conj_physical_(Tensor& self) { - if (!self.is_complex()) - return self; - return XPUNativeFunctions::conj_physical_out(self, self); -} - -TensorIterator ceil_meta(const Tensor& self, Tensor& out) { - TORCH_CHECK(!self.is_complex(), "ceil is not supported for complex inputs"); - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - return iter; -} - -Tensor XPUNativeFunctions::ceil(const Tensor& self) { - if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { - return self.clone(); - } - Tensor out; - auto iter = ceil_meta(self, out); - native::xpu::ceil_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::ceil_(Tensor& self) { - if 
(c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { - return self; - } - auto iter = ceil_meta(self, self); - native::xpu::ceil_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::ceil_out(const Tensor& self, Tensor& out) { - auto iter = ceil_meta(self, out); - - if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { - out.copy_(self); - return out; - } - native::xpu::ceil_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::round(const Tensor& self) { - if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { - return self.clone(); - } - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - native::xpu::round_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::round_(Tensor& self) { - if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { - return self; - } - TensorIterator iter; - iter.build_borrowing_unary_op(self, self); - native::xpu::round_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::round_out(const Tensor& self, Tensor& out) { - if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { - out.copy_(self); - return out; - } - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - native::xpu::round_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::round(const Tensor& self, int64_t decimals) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - if (decimals != 0) { - native::xpu::round_decimals_kernel(iter, decimals); - } else { - native::xpu::round_kernel(iter); - } - return iter.output(); -} - -Tensor& XPUNativeFunctions::round_(Tensor& self, int64_t decimals) { - TensorIterator iter; - iter.build_borrowing_unary_op(self, self); - if (decimals != 0) { - native::xpu::round_decimals_kernel(iter, decimals); - } else { - native::xpu::round_kernel(iter); - } - return self; -} - -Tensor& XPUNativeFunctions::round_out( - const Tensor& self, - int64_t decimals, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - if (decimals != 0) { - native::xpu::round_decimals_kernel(iter, decimals); - } else { - native::xpu::round_kernel(iter); - } - return out; -} - -TensorIterator meta_floor(const Tensor& self, Tensor& out) { - // Note: this is consistent with NumPy - TORCH_CHECK(!self.is_complex(), "floor is not supported for complex inputs"); - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - return iter; -} - -Tensor XPUNativeFunctions::floor(const Tensor& self) { - if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { - return self.clone(); - } - Tensor out; - auto iter = meta_floor(self, out); - native::xpu::floor_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::floor_(Tensor& self) { - if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { - return self; - } - auto iter = meta_floor(self, self); - native::xpu::floor_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::floor_out(const Tensor& self, Tensor& out) { - auto iter = meta_floor(self, out); - if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { - out.copy_(self); - return out; - } - - native::xpu::floor_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::nan_to_num_out( - const Tensor& self, - std::optional nan, - std::optional pos_inf, - std::optional neg_inf, - Tensor& result) { - TORCH_CHECK( - self.scalar_type() == result.scalar_type(), - "nan_to_num: dtype of out: ", - result.scalar_type(), 
- " should be same as input: ", - self.scalar_type()); - - if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/true)) { - at::native::resize_output(result, self.sizes()); - result.copy_(self); - return result; - } - - auto iter = TensorIterator::unary_op(result, self); - native::xpu::nan_to_num_kernel(iter, nan, pos_inf, neg_inf); - return result; -} +#include +#include +#include +namespace at { +namespace native { +REGISTER_XPU_DISPATCH(abs_stub, &xpu::abs_kernel); +REGISTER_XPU_DISPATCH(sin_stub, &xpu::sin_kernel); +REGISTER_XPU_DISPATCH(cos_stub, &xpu::cos_kernel); +REGISTER_XPU_DISPATCH(digamma_stub, &xpu::digamma_kernel); +REGISTER_XPU_DISPATCH(polygamma_stub, &xpu::polygamma_kernel); +REGISTER_XPU_DISPATCH(lgamma_stub, &xpu::lgamma_kernel); +REGISTER_XPU_DISPATCH(log_stub, &xpu::log_kernel); +REGISTER_XPU_DISPATCH(log10_stub, &xpu::log10_kernel); +REGISTER_XPU_DISPATCH(log1p_stub, &xpu::log1p_kernel); +REGISTER_XPU_DISPATCH(log2_stub, &xpu::log2_kernel); +REGISTER_XPU_DISPATCH(sqrt_stub, &xpu::sqrt_kernel); +REGISTER_XPU_DISPATCH(rsqrt_stub, &xpu::rsqrt_kernel); +REGISTER_XPU_DISPATCH(tanh_stub, &xpu::tanh_kernel); +REGISTER_XPU_DISPATCH(neg_stub, &xpu::neg_kernel); +REGISTER_XPU_DISPATCH(logical_not_stub, &xpu::logical_not_kernel); +REGISTER_XPU_DISPATCH(reciprocal_stub, &xpu::reciprocal_kernel); +REGISTER_XPU_DISPATCH(bitwise_not_stub, &xpu::bitwise_not_kernel); +REGISTER_XPU_DISPATCH(exp_stub, &xpu::exp_kernel); +REGISTER_XPU_DISPATCH(sigmoid_stub, &xpu::sigmoid_kernel); +REGISTER_XPU_DISPATCH(logit_stub, &xpu::logit_kernel); +REGISTER_XPU_DISPATCH(sgn_stub, &xpu::sgn_kernel); +REGISTER_XPU_DISPATCH(sign_stub, &xpu::sign_kernel); +REGISTER_XPU_DISPATCH(signbit_stub, &xpu::signbit_kernel); +REGISTER_XPU_DISPATCH(acos_stub, &xpu::acos_kernel); +REGISTER_XPU_DISPATCH(acosh_stub, &xpu::acosh_kernel); +REGISTER_XPU_DISPATCH(erf_stub, &xpu::erf_kernel); +REGISTER_XPU_DISPATCH(erfc_stub, &xpu::erfc_kernel); +REGISTER_XPU_DISPATCH(erfinv_stub, &xpu::erfinv_kernel); +REGISTER_XPU_DISPATCH(exp2_stub, &xpu::exp2_kernel); +REGISTER_XPU_DISPATCH(expm1_stub, &xpu::expm1_kernel); +REGISTER_XPU_DISPATCH(frac_stub, &xpu::frac_kernel); +REGISTER_XPU_DISPATCH(conj_physical_stub, &xpu::conj_physical_kernel); +REGISTER_XPU_DISPATCH(ceil_stub, &xpu::ceil_kernel); +REGISTER_XPU_DISPATCH(sinh_stub, &xpu::sinh_kernel); +REGISTER_XPU_DISPATCH(asinh_stub, &xpu::asinh_kernel); +REGISTER_XPU_DISPATCH(asin_stub, &xpu::asin_kernel); +REGISTER_XPU_DISPATCH(tan_stub, &xpu::tan_kernel); +REGISTER_XPU_DISPATCH(atan_stub, &xpu::atan_kernel); +REGISTER_XPU_DISPATCH(atanh_stub, &xpu::atanh_kernel); +REGISTER_XPU_DISPATCH(cosh_stub, &xpu::cosh_kernel); +REGISTER_XPU_DISPATCH(nan_to_num_stub, &xpu::nan_to_num_kernel); +REGISTER_XPU_DISPATCH(round_stub, &xpu::round_kernel); +REGISTER_XPU_DISPATCH(round_decimals_stub, &xpu::round_decimals_kernel); +REGISTER_XPU_DISPATCH(floor_stub, &xpu::floor_kernel); +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/UnfoldBackward.cpp b/src/ATen/native/xpu/UnfoldBackward.cpp index d97546367..4f54e07ca 100644 --- a/src/ATen/native/xpu/UnfoldBackward.cpp +++ b/src/ATen/native/xpu/UnfoldBackward.cpp @@ -1,26 +1,12 @@ -#include #include -#include - +#include +#include #include +#include namespace at { -Tensor XPUNativeFunctions::unfold_backward( - const Tensor& grad, - IntArrayRef input_sizes, - int64_t dim, - int64_t size, - int64_t step) { - auto grad_input = at::zeros(input_sizes, grad.options()); - if (step >= size) { - auto gI_unfolded = 
grad_input.unfold(dim, size, step); - gI_unfolded.copy_(grad); - return grad_input; - } - native::xpu::unfold_backward_kernel(grad_input, grad, dim, size, step); - - return grad_input; +namespace native { +REGISTER_XPU_DISPATCH(unfold_backward_stub, &xpu::unfold_backward_kernel); } - } // namespace at diff --git a/src/ATen/native/xpu/Unique.cpp b/src/ATen/native/xpu/Unique.cpp index 2442370bd..423bae002 100644 --- a/src/ATen/native/xpu/Unique.cpp +++ b/src/ATen/native/xpu/Unique.cpp @@ -1,53 +1,55 @@ #include -#include namespace at { -std::tuple XPUNativeFunctions::unique_consecutive( +namespace native { + +std::tuple unique_dim_xpu( const Tensor& self, + const int64_t dim, + const bool sorted, const bool return_inverse, - const bool return_counts, - std::optional dim) { - if (!dim.has_value()) { - return native::xpu::unique_consecutive_kernel( - self, return_inverse, return_counts, dim); - } - return native::xpu::unique_dim_consecutive_kernel( - self, dim.value(), return_inverse, return_counts); + const bool return_counts) { + return xpu::unique_dim_kernel(self, dim, return_inverse, return_counts); +} + +std::tuple _unique_xpu( + const Tensor& self, + const bool sorted, + const bool return_inverse) { + return xpu::_unique_kernel(self, return_inverse); } -std::tuple XPUNativeFunctions::unique_dim_consecutive( +std::tuple unique_dim_consecutive_xpu( const at::Tensor& self, int64_t dim, bool return_inverse, bool return_counts) { - return native::xpu::unique_dim_consecutive_kernel( + return xpu::unique_dim_consecutive_kernel( self, dim, return_inverse, return_counts); } -std::tuple XPUNativeFunctions::unique_dim( +std::tuple unique_consecutive_xpu( const Tensor& self, - const int64_t dim, - const bool sorted, const bool return_inverse, - const bool return_counts) { - return native::xpu::unique_dim_kernel( - self, dim, return_inverse, return_counts); -} - -std::tuple XPUNativeFunctions::_unique( - const Tensor& self, - const bool sorted, - const bool return_inverse) { - return native::xpu::_unique_kernel(self, return_inverse); + const bool return_counts, + std::optional dim) { + if (!dim.has_value()) { + return xpu::unique_consecutive_kernel( + self, return_inverse, return_counts, dim); + } + return xpu::unique_dim_consecutive_kernel( + self, dim.value(), return_inverse, return_counts); } -std::tuple XPUNativeFunctions::_unique2( +std::tuple _unique2_xpu( const Tensor& self, const bool sorted, const bool return_inverse, const bool return_counts) { - return native::xpu::_unique2_kernel(self, return_inverse, return_counts); + return xpu::_unique2_kernel(self, return_inverse, return_counts); } +} // namespace native + } // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/UpSampleBicubic2d.cpp b/src/ATen/native/xpu/UpSampleBicubic2d.cpp index 509d6e449..8ab810eb9 100644 --- a/src/ATen/native/xpu/UpSampleBicubic2d.cpp +++ b/src/ATen/native/xpu/UpSampleBicubic2d.cpp @@ -2,66 +2,20 @@ #include #include #include -#include #include +#include namespace at { - -void upsample_bicubic2d_meta( - Tensor& output, - const Tensor& input, - IntArrayRef output_size, - bool align_corners, - std::optional scales_h, - std::optional scales_w) { - auto full_output_size = - native::xpu::upsample_2d_common_check(input.sizes(), output_size); - - // Allow for empty batch size but not other dimensions - TORCH_CHECK( - input.numel() != 0 || - c10::multiply_integers( - input.sizes().begin() + 1, input.sizes().end()), - "Non-empty 4D data tensor expected but got a tensor with sizes ", - 
input.sizes()); - auto memory_format = input.suggest_memory_format(); - if (output.defined()) { - xpu::resize_out( - output, - full_output_size, - {}, - input.options().memory_format(memory_format)); - } else { - output = at::xpu::create_out( - full_output_size, {}, input.options().memory_format(memory_format)); - } -} - -Tensor& XPUNativeFunctions::upsample_bicubic2d_out( - const Tensor& self, - IntArrayRef output_size, - bool align_corners, - std::optional scales_h, - std::optional scales_w, - Tensor& output) { - upsample_bicubic2d_meta( - output, self, output_size, align_corners, scales_h, scales_w); - native::xpu::upsample_bicubic2d_kernel( - output, self, output_size, align_corners, scales_h, scales_w); - return output; +namespace native { +TORCH_IMPL_FUNC(upsample_bicubic2d_out_xpu) +(const Tensor& input, + IntArrayRef output_size, + bool align_corners, + std::optional scales_h, + std::optional scales_w, + const Tensor& output) { + xpu::upsample_bicubic2d_kernel( + output, input, output_size, align_corners, scales_h, scales_w); } - -Tensor XPUNativeFunctions::upsample_bicubic2d( - const Tensor& self, - IntArrayRef output_size, - bool align_corners, - std::optional scales_h, - std::optional scales_w) { - Tensor output; - upsample_bicubic2d_out( - self, output_size, align_corners, scales_h, scales_w, output); - - return output; -} - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/UpSampleBilinear2d.cpp b/src/ATen/native/xpu/UpSampleBilinear2d.cpp index f0ace4344..67fed551c 100644 --- a/src/ATen/native/xpu/UpSampleBilinear2d.cpp +++ b/src/ATen/native/xpu/UpSampleBilinear2d.cpp @@ -1,133 +1,34 @@ #include #include -#include - #include #include #include -namespace at { - -void upsample_bilinear2d_meta( - const Tensor& input, - IntArrayRef output_size, - bool align_corners, - std::optional scales_h, - std::optional scales_w, - Tensor& output) { - auto full_output_size = - native::xpu::upsample_2d_common_check(input.sizes(), output_size); - - // Allow for empty batch size but not other dimensions - TORCH_CHECK( - input.numel() != 0 || - c10::multiply_integers( - input.sizes().begin() + 1, input.sizes().end()), - "Non-empty 4D data tensor expected but got a tensor with sizes ", - input.sizes()); - - auto memory_format = input.suggest_memory_format(); - if (output.defined()) { - xpu::resize_out( - output, - full_output_size, - {}, - input.options().memory_format(memory_format)); - } else { - output = at::xpu::create_out( - full_output_size, {}, input.options().memory_format(memory_format)); - } -} - -void upsample_bilinear2d_backward_meta( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - bool align_corners, - std::optional scales_h, - std::optional scales_w, - Tensor& grad_input) { - auto full_output_size = - native::xpu::upsample_2d_common_check(input_size, output_size); +#include +#include - TORCH_CHECK( - grad_output.dim() == 4, - "Expected grad_output to be a tensor of dimension 4 but got: dimension ", - grad_output.dim()); - - for (const auto i : c10::irange(4)) { - TORCH_CHECK( - grad_output.size(i) == full_output_size[i], - "Expected grad_output to have the same shape as output;", - " output.size(", - i, - ") = ", - full_output_size[i], - " but got grad_output.size(", - i, - ") = ", - grad_output.size(i)); - } - - auto memory_format = grad_output.suggest_memory_format(); - if (grad_input.defined()) { - xpu::resize_out( - grad_input, - input_size, - {}, - grad_output.options().memory_format(memory_format)); - } else { - 
grad_input = at::xpu::create_out( - input_size, {}, grad_output.options().memory_format(memory_format)); - } -} - -Tensor& XPUNativeFunctions::upsample_bilinear2d_out( - const Tensor& self, - IntArrayRef output_size, - bool align_corners, - std::optional scales_h, - std::optional scales_w, - Tensor& output) { - upsample_bilinear2d_meta( - self, output_size, align_corners, scales_h, scales_w, output); - native::xpu::upsample_bilinear2d_out_kernel( - output, self, output_size, align_corners, scales_h, scales_w); - return output; -} - -Tensor XPUNativeFunctions::upsample_bilinear2d( - const Tensor& self, - IntArrayRef output_size, - bool align_corners, - std::optional scales_h, - std::optional scales_w) { - Tensor output; - upsample_bilinear2d_out( - self, output_size, align_corners, scales_h, scales_w, output); - return output; +namespace at { +namespace native { +TORCH_IMPL_FUNC(upsample_bilinear2d_out_xpu) +(const Tensor& input, + IntArrayRef output_size, + bool align_corners, + std::optional scales_h, + std::optional scales_w, + const Tensor& output) { + xpu::upsample_bilinear2d_out_kernel( + output, input, output_size, align_corners, scales_h, scales_w); } -Tensor& XPUNativeFunctions::upsample_bilinear2d_backward_out( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - bool align_corners, - c10::optional scales_h, - c10::optional scales_w, - Tensor& grad_input) { - globalContext().alertNotDeterministic("upsample_bilinear2d_backward_xpu"); - - upsample_bilinear2d_backward_meta( - grad_output, - output_size, - input_size, - align_corners, - scales_h, - scales_w, - grad_input); - - native::xpu::upsample_bilinear2d_backward_out_kernel( +TORCH_IMPL_FUNC(upsample_bilinear2d_backward_out_xpu) +(const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners, + std::optional scales_h, + std::optional scales_w, + const Tensor& grad_input) { + xpu::upsample_bilinear2d_backward_out_kernel( grad_input, grad_output, output_size, @@ -135,26 +36,7 @@ Tensor& XPUNativeFunctions::upsample_bilinear2d_backward_out( align_corners, scales_h, scales_w); - return grad_input; -} - -Tensor XPUNativeFunctions::upsample_bilinear2d_backward( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { - Tensor grad_input; - upsample_bilinear2d_backward_out( - grad_output, - output_size, - input_size, - align_corners, - scales_h, - scales_w, - grad_input); - return grad_input; } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/UpSampleLinear1d.cpp b/src/ATen/native/xpu/UpSampleLinear1d.cpp index fcce31524..13dfa33de 100644 --- a/src/ATen/native/xpu/UpSampleLinear1d.cpp +++ b/src/ATen/native/xpu/UpSampleLinear1d.cpp @@ -1,111 +1,42 @@ #include #include #include -#include + #include #include "ATen/core/ATen_fwd.h" -namespace at { - -void upsample_linear1d_meta( - const Tensor& input, - IntArrayRef output_size, - bool align_corners, - std::optional scales, - Tensor& output) { - auto full_output_size = - at::native::xpu::upsample_1d_common_check(input.sizes(), output_size); - - // Allow for empty batch size but not other dimensions - TORCH_CHECK( - (input.size(1) != 0 && input.size(2) != 0) && input.dim() == 3, - "Non-empty 3D data tensor expected but got a tensor with sizes ", - input.sizes()); - - if (output.defined()) { - at::xpu::resize_out(output, full_output_size, {}, input.options()); - } else { - output = 
at::xpu::create_out(full_output_size, {}, input.options()); - } -} -void upsample_linear1d_backward_meta( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - bool align_corners, - std::optional scales, - Tensor& grad_input) { - auto full_output_size = - at::native::xpu::upsample_1d_common_check(input_size, output_size); - - TORCH_CHECK( - input_size.size() == 3, - "It is expected input_size equals to 3, but got size ", - input_size.size()); - - check_dim_size(grad_output, 3, 0, full_output_size[0]); - check_dim_size(grad_output, 3, 1, full_output_size[1]); - check_dim_size(grad_output, 3, 2, full_output_size[2]); - - if (grad_input.defined()) { - at::xpu::resize_out(grad_input, input_size, {}, grad_output.options()); - } else { - grad_input = at::xpu::create_out(input_size, {}, grad_output.options()); - } -} - -Tensor XPUNativeFunctions::upsample_linear1d( - const Tensor& input, - IntArrayRef output_size, - bool align_corners, - std::optional scales) { - Tensor output; - return upsample_linear1d_out( - input, output_size, align_corners, scales, output); -} - -Tensor& XPUNativeFunctions::upsample_linear1d_out( - const Tensor& input, - IntArrayRef output_size, - bool align_corners, - std::optional scales, - Tensor& output) { - upsample_linear1d_meta(input, output_size, align_corners, scales, output); +#include +#include +namespace at { +namespace native { + +TORCH_IMPL_FUNC(upsample_linear1d_out_xpu) +(const Tensor& input, + IntArrayRef output_size, + bool align_corners, + std::optional scales, + const Tensor& output) { TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; checkAllSameGPU(__func__, {input_arg, output_arg}); - native::xpu::upsample_linear1d_kernel( + xpu::upsample_linear1d_kernel( input, output_size, align_corners, scales, output); - return output; } -Tensor XPUNativeFunctions::upsample_linear1d_backward( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - bool align_corners, - std::optional scales) { - Tensor grad_input; - return upsample_linear1d_backward_out( - grad_output, output_size, input_size, align_corners, scales, grad_input); -} - -Tensor& XPUNativeFunctions::upsample_linear1d_backward_out( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - bool align_corners, - std::optional scales, - Tensor& grad_input) { - upsample_linear1d_backward_meta( - grad_output, output_size, input_size, align_corners, scales, grad_input); +TORCH_IMPL_FUNC(upsample_linear1d_backward_out_xpu) +(const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners, + std::optional scales, + const Tensor& grad_input) { TensorArg grad_output_arg{grad_output, "grad_output", 1}, grad_input_arg{grad_input, "grad_input", 2}; checkAllSameGPU(__func__, {grad_output_arg, grad_input_arg}); - native::xpu::upsample_linear1d_backward_kernel( + xpu::upsample_linear1d_backward_kernel( grad_output, output_size, input_size, align_corners, scales, grad_input); - return grad_input; } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/UpSampleNearest1d.cpp b/src/ATen/native/xpu/UpSampleNearest1d.cpp index e2684fcac..30287e4b2 100644 --- a/src/ATen/native/xpu/UpSampleNearest1d.cpp +++ b/src/ATen/native/xpu/UpSampleNearest1d.cpp @@ -1,173 +1,51 @@ -#include #include #include -#include +#include -namespace at { - -Tensor& upsample_nearest1d_meta( - const Tensor& input, - Tensor& output, - IntArrayRef output_size) { - auto input_size = 
input.sizes(); - TORCH_CHECK( - output_size.size() == 1, - "It is expected output_size equals to 1, but got size ", - output_size.size()); - - TORCH_CHECK( - input_size.size() == 3, - "It is expected input_size equals to 3, but got size ", - input_size.size()); - - int64_t output_width = output_size[0]; - - int64_t nbatch = input_size[0]; - int64_t channels = input_size[1]; - int64_t input_width = input_size[2]; - - TORCH_CHECK( - input_width > 0 && output_width > 0, - "Input and output sizes should be greater than 0, but got input (W: ", - input_width, - ") and output (W: ", - output_width, - ")"); - TORCH_CHECK( - (input.size(1) != 0 && input.size(2) != 0) && input.dim() == 3, - "Non-empty 3D data tensor expected but got a tensor with sizes ", - input.sizes()); - - if (!output.defined()) - output = at::empty({nbatch, channels, output_width}, input.options()); - return output; -} - -Tensor& upsample_nearest1d_backward_meta( - const Tensor& grad_output, - Tensor& grad_input, - IntArrayRef input_size, - IntArrayRef output_size) { - TORCH_CHECK( - output_size.size() == 1, - "It is expected output_size equals to 1, but got size ", - output_size.size()); - TORCH_CHECK( - input_size.size() == 3, - "It is expected input_size equals to 3, but got size ", - input_size.size()); - int64_t output_width = output_size[0]; - int64_t nbatch = input_size[0]; - int64_t channels = input_size[1]; - int64_t input_width = input_size[2]; - TORCH_CHECK( - input_width > 0 && output_width > 0, - "Input and output sizes should be greater than 0, but got input (W: ", - input_width, - ") and output (W: ", - output_width, - ")"); - check_dim_size(grad_output, 3, 0, nbatch); - check_dim_size(grad_output, 3, 1, channels); - check_dim_size(grad_output, 3, 2, output_width); - if (!grad_input.defined()) - grad_input = at::empty(input_size, grad_output.options()); - return grad_input; -} - -Tensor XPUNativeFunctions::_upsample_nearest_exact1d( - const Tensor& input, - IntArrayRef output_size, - c10::optional scales) { - Tensor output; - output = upsample_nearest1d_meta(input, output, output_size); - at::native::xpu::upsample_nearest1d_kernel( - output, input, output_size, scales, true); - return output; -} +#include +#include +#include +#include -Tensor& XPUNativeFunctions::_upsample_nearest_exact1d_out( - const Tensor& input, - IntArrayRef output_size, - c10::optional scales, - Tensor& output) { - upsample_nearest1d_meta(input, output, output_size); - at::native::xpu::upsample_nearest1d_kernel( - output, input, output_size, scales, true); - return output; -} - -Tensor XPUNativeFunctions::upsample_nearest1d( - const Tensor& input, - IntArrayRef output_size, - c10::optional scales) { - Tensor output; - output = upsample_nearest1d_meta(input, output, output_size); - at::native::xpu::upsample_nearest1d_kernel( - output, input, output_size, scales, false); - return output; -} - -Tensor& XPUNativeFunctions::upsample_nearest1d_out( - const Tensor& input, - IntArrayRef output_size, - c10::optional scales, - Tensor& output) { - upsample_nearest1d_meta(input, output, output_size); - at::native::xpu::upsample_nearest1d_kernel( - output, input, output_size, scales, false); - return output; -} - -Tensor XPUNativeFunctions::_upsample_nearest_exact1d_backward( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - std::optional scales) { - Tensor grad_input; - grad_input = upsample_nearest1d_backward_meta( - grad_output, grad_input, input_size, output_size); - 
at::native::xpu::upsample_nearest1d_backward_kernel( - grad_input, grad_output, output_size, input_size, scales, true); - return grad_input; -} -Tensor& XPUNativeFunctions::_upsample_nearest_exact1d_backward_out( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - std::optional scales, - Tensor& grad_input) { - upsample_nearest1d_backward_meta( - grad_output, grad_input, input_size, output_size); - at::native::xpu::upsample_nearest1d_backward_kernel( +namespace at { +namespace native { +TORCH_IMPL_FUNC(_upsample_nearest_exact1d_out_xpu) +(const Tensor& input, + IntArrayRef output_size, + std::optional scales, + const Tensor& output) { + xpu::upsample_nearest1d_kernel(output, input, output_size, scales, true); +} + +TORCH_IMPL_FUNC(upsample_nearest1d_out_xpu) +(const Tensor& input, + IntArrayRef output_size, + std::optional scales, + const Tensor& output) { + xpu::upsample_nearest1d_kernel(output, input, output_size, scales, false); +} + +TORCH_IMPL_FUNC(_upsample_nearest_exact1d_backward_out_xpu) +(const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + std::optional scales, + const Tensor& grad_input) { + grad_input.zero_(); + xpu::upsample_nearest1d_backward_kernel( grad_input, grad_output, output_size, input_size, scales, true); - return grad_input; } -Tensor XPUNativeFunctions::upsample_nearest1d_backward( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - std::optional scales) { - Tensor grad_input; - grad_input = upsample_nearest1d_backward_meta( - grad_output, grad_input, input_size, output_size); - at::native::xpu::upsample_nearest1d_backward_kernel( +TORCH_IMPL_FUNC(upsample_nearest1d_backward_out_xpu) +(const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + std::optional scales, + const Tensor& grad_input) { + grad_input.zero_(); + xpu::upsample_nearest1d_backward_kernel( grad_input, grad_output, output_size, input_size, scales, false); - return grad_input; -} - -Tensor& XPUNativeFunctions::upsample_nearest1d_backward_out( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - std::optional scales, - Tensor& grad_input) { - upsample_nearest1d_backward_meta( - grad_output, grad_input, input_size, output_size); - at::native::xpu::upsample_nearest1d_backward_kernel( - grad_input, grad_output, output_size, input_size, scales, true); - return grad_input; } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/UpSampleNearest2d.cpp b/src/ATen/native/xpu/UpSampleNearest2d.cpp index da9f9474f..9ebbd74b1 100644 --- a/src/ATen/native/xpu/UpSampleNearest2d.cpp +++ b/src/ATen/native/xpu/UpSampleNearest2d.cpp @@ -1,224 +1,44 @@ -#include #include #include -#include +#include +#include +#include +#include +#include namespace at { -Tensor& upsample_nearest2d_meta( - const Tensor& input, - Tensor& output, - IntArrayRef output_size) { - auto input_size = input.sizes(); +namespace native { - TORCH_CHECK( - output_size.size() == 2, - "It is expected output_size equals to 2, but got size ", - output_size.size()); - - TORCH_CHECK( - input_size.size() == 4, - "It is expected input_size equals to 4, but got size ", - input_size.size()); - - int64_t output_height = output_size[0]; - int64_t output_width = output_size[1]; - - int64_t nbatch = input_size[0]; - int64_t channels = input_size[1]; - int64_t input_height = input_size[2]; - int64_t input_width = input_size[3]; - - TORCH_CHECK( - input_height > 0 && input_width > 0 && output_height 
> 0 && - output_width > 0, - "Input and output sizes should be greater than 0," - " but got input (H: ", - input_height, - ", W: ", - input_width, - ") output (H: ", - output_height, - ", W: ", - output_width, - ")"); - - // Allow for empty batch size but not other dimensions - TORCH_CHECK( - input.numel() != 0 || - c10::multiply_integers( - input.sizes().begin() + 1, input.sizes().end()), - "Non-empty 4D data tensor expected but got a tensor with sizes ", - input.sizes()); - - if (!output.defined()) - output = at::empty( - {nbatch, channels, output_height, output_width}, - input.options().memory_format(input.suggest_memory_format())); - return output; -} - -Tensor& upsample_nearest2d_backward_meta( - const Tensor& grad_output, - Tensor& grad_input, - IntArrayRef input_size, - IntArrayRef output_size) { - TORCH_CHECK( - output_size.size() == 2, - "It is expected output_size equals to 2, but got size ", - output_size.size()); - - TORCH_CHECK( - input_size.size() == 4, - "It is expected input_size equals to 4, but got size ", - input_size.size()); - - int64_t output_height = output_size[0]; - int64_t output_width = output_size[1]; - - int64_t nbatch = input_size[0]; - int64_t channels = input_size[1]; - int64_t input_height = input_size[2]; - int64_t input_width = input_size[3]; - - TORCH_CHECK( - input_height > 0 && input_width > 0 && output_height > 0 && - output_width > 0, - "Input and output sizes should be greater than 0," - " but got input (H: ", - input_height, - ", W: ", - input_width, - ") output (H: ", - output_height, - ", W: ", - output_width, - ")"); - - TORCH_CHECK( - grad_output.dim() == 4, - "Expected grad_output to be a tensor of dimension 4 but got: dimension ", - grad_output.dim()); - std::array full_output_size = { - nbatch, channels, output_height, output_width}; - for (const auto i : c10::irange(4)) { - TORCH_CHECK( - grad_output.size(i) == full_output_size[i], - "Expected grad_output to have the same shape as output;", - " output.size(", - i, - ") = ", - full_output_size[i], - " but got grad_output.size(", - i, - ") = ", - grad_output.size(i)); - } - if (!grad_input.defined()) - grad_input = at::empty( - input_size, - grad_output.options().memory_format( - grad_output.suggest_memory_format())); - return grad_input; -} - -Tensor XPUNativeFunctions::_upsample_nearest_exact2d( - const Tensor& input, - IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w) { - Tensor output; - output = upsample_nearest2d_meta(input, output, output_size); - at::native::xpu::upsample_nearest2d_kernel( - output, input, output_size, scales_h, scales_w, true); - return output; -} - -Tensor& XPUNativeFunctions::_upsample_nearest_exact2d_out( - const Tensor& input, - IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w, - Tensor& output) { - upsample_nearest2d_meta(input, output, output_size); - at::native::xpu::upsample_nearest2d_kernel( - output, input, output_size, scales_h, scales_w, true); - return output; -} - -Tensor XPUNativeFunctions::upsample_nearest2d( - const Tensor& input, - IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w) { - Tensor output; - output = upsample_nearest2d_meta(input, output, output_size); - at::native::xpu::upsample_nearest2d_kernel( +TORCH_IMPL_FUNC(upsample_nearest2d_out_xpu) +(const Tensor& input, + IntArrayRef output_size, + std::optional scales_h, + std::optional scales_w, + const Tensor& output) { + xpu::upsample_nearest2d_kernel( output, input, output_size, scales_h, scales_w, false); - 
return output;
 }
-Tensor& XPUNativeFunctions::upsample_nearest2d_out(
-    const Tensor& input,
-    IntArrayRef output_size,
-    c10::optional<double> scales_h,
-    c10::optional<double> scales_w,
-    Tensor& output) {
-  upsample_nearest2d_meta(input, output, output_size);
-  at::native::xpu::upsample_nearest2d_kernel(
-      output, input, output_size, scales_h, scales_w, false);
-  return output;
-}
-
-Tensor XPUNativeFunctions::_upsample_nearest_exact2d_backward(
-    const Tensor& grad_output,
-    IntArrayRef output_size,
-    IntArrayRef input_size,
-    c10::optional<double> scales_h,
-    c10::optional<double> scales_w) {
-  Tensor grad_input;
-  grad_input = upsample_nearest2d_backward_meta(
-      grad_output, grad_input, input_size, output_size);
-  at::native::xpu::upsample_nearest2d_backward_kernel(
-      grad_input,
-      grad_output,
-      output_size,
-      input_size,
-      scales_h,
-      scales_w,
-      true);
-  return grad_input;
-}
-Tensor& XPUNativeFunctions::_upsample_nearest_exact2d_backward_out(
-    const Tensor& grad_output,
-    IntArrayRef output_size,
-    IntArrayRef input_size,
-    c10::optional<double> scales_h,
-    c10::optional<double> scales_w,
-    Tensor& grad_input) {
-  upsample_nearest2d_backward_meta(
-      grad_output, grad_input, input_size, output_size);
-  at::native::xpu::upsample_nearest2d_backward_kernel(
-      grad_input,
-      grad_output,
-      output_size,
-      input_size,
-      scales_h,
-      scales_w,
-      true);
-  return grad_input;
+TORCH_IMPL_FUNC(_upsample_nearest_exact2d_out_xpu)
+(const Tensor& input,
+ IntArrayRef output_size,
+ std::optional<double> scales_h,
+ std::optional<double> scales_w,
+ const Tensor& output) {
+  xpu::upsample_nearest2d_kernel(
+      output, input, output_size, scales_h, scales_w, true);
 }
-Tensor XPUNativeFunctions::upsample_nearest2d_backward(
-    const Tensor& grad_output,
-    IntArrayRef output_size,
-    IntArrayRef input_size,
-    c10::optional<double> scales_h,
-    c10::optional<double> scales_w) {
-  Tensor grad_input;
-  grad_input = upsample_nearest2d_backward_meta(
-      grad_output, grad_input, input_size, output_size);
-  at::native::xpu::upsample_nearest2d_backward_kernel(
+TORCH_IMPL_FUNC(upsample_nearest2d_backward_out_xpu)
+(const Tensor& grad_output,
+ IntArrayRef output_size,
+ IntArrayRef input_size,
+ std::optional<double> scales_h,
+ std::optional<double> scales_w,
+ const Tensor& grad_input) {
+  grad_input.zero_();
+  xpu::upsample_nearest2d_backward_kernel(
       grad_input,
       grad_output,
       output_size,
@@ -226,19 +46,17 @@ Tensor XPUNativeFunctions::upsample_nearest2d_backward(
       scales_h,
       scales_w,
       false);
-  return grad_input;
 }
-Tensor& XPUNativeFunctions::upsample_nearest2d_backward_out(
-    const Tensor& grad_output,
-    IntArrayRef output_size,
-    IntArrayRef input_size,
-    c10::optional<double> scales_h,
-    c10::optional<double> scales_w,
-    Tensor& grad_input) {
-  upsample_nearest2d_backward_meta(
-      grad_output, grad_input, input_size, output_size);
-  at::native::xpu::upsample_nearest2d_backward_kernel(
+TORCH_IMPL_FUNC(_upsample_nearest_exact2d_backward_out_xpu)
+(const Tensor& grad_output,
+ IntArrayRef output_size,
+ IntArrayRef input_size,
+ std::optional<double> scales_h,
+ std::optional<double> scales_w,
+ const Tensor& grad_input) {
+  grad_input.zero_();
+  xpu::upsample_nearest2d_backward_kernel(
       grad_input,
       grad_output,
       output_size,
@@ -246,7 +64,7 @@ Tensor& XPUNativeFunctions::upsample_nearest2d_backward_out(
      scales_h,
      scales_w,
      true);
-  return grad_input;
 }
+} // namespace native
 } // namespace at
diff --git a/src/ATen/native/xpu/WeightNorm.cpp b/src/ATen/native/xpu/WeightNorm.cpp
index 7fec9ecfe..81f5288ab 100644
--- a/src/ATen/native/xpu/WeightNorm.cpp
+++ b/src/ATen/native/xpu/WeightNorm.cpp
@@ -1,14 +1,15 @@
 #include
-#include
+
 namespace at {
-std::tuple<Tensor, Tensor> XPUNativeFunctions::_weight_norm_interface(
+namespace native {
+std::tuple<Tensor, Tensor> weight_norm_xpu(
     const Tensor& v,
     const Tensor& g,
     int64_t dim) {
   return native::xpu::weight_norm_kernel(v, g, dim);
 }
-std::tuple<Tensor, Tensor> XPUNativeFunctions::_weight_norm_interface_backward(
+std::tuple<Tensor, Tensor> weight_norm_backward_xpu(
     const Tensor& grad_w,
     const Tensor& saved_v,
     const Tensor& saved_g,
@@ -24,4 +25,6 @@ std::tuple<Tensor, Tensor> XPUNativeFunctions::_weight_norm_interface_backward(
   return native::xpu::weight_norm_backward_kernel(
       grad_w, saved_v, saved_g, saved_norms, dim);
 }
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/ATen/native/xpu/XPUScalar.cpp b/src/ATen/native/xpu/XPUScalar.cpp
index 17cbe66a3..d47dd7871 100644
--- a/src/ATen/native/xpu/XPUScalar.cpp
+++ b/src/ATen/native/xpu/XPUScalar.cpp
@@ -2,13 +2,12 @@
 #include
 #include
 #include
-#include
-
 #include
+#include
-namespace at {
+namespace at::native {
-Scalar XPUNativeFunctions::_local_scalar_dense(const Tensor& self) {
+Scalar _local_scalar_dense_xpu(const Tensor& self) {
   Scalar r;
   AT_DISPATCH_V2(
       self.scalar_type(),
@@ -41,4 +40,4 @@ Scalar XPUNativeFunctions::_local_scalar_dense(const Tensor& self) {
   return r;
 }
-} // namespace at
+} // namespace at::native
diff --git a/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp
index 57b5e34b3..ee73856ee 100644
--- a/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp
@@ -1,5 +1,5 @@
-#include
 #include
+#include
 #include
diff --git a/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp b/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp
index 5bbaa1ab0..2bfcccb69 100644
--- a/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp
+++ b/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp
@@ -1,11 +1,10 @@
-#include
 #include
 #include
 #include
 #include
-
 #include
 #include
+#include
 #include
diff --git a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp
index 6399e6b23..c3fe41fd8 100644
--- a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp
@@ -1,6 +1,6 @@
-#include
 #include
 #include
+#include
 #include
diff --git a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp
index 7dadbf3aa..8c6e47f77 100644
--- a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp
@@ -1,10 +1,10 @@
-#include
 #include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
diff --git a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp
index 2f009757b..1dcd78b88 100644
--- a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp
@@ -1,8 +1,8 @@
-#include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
diff --git a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp
index 81e2cb5ba..88d170352 100644
--- a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp
@@ -1,8 +1,8 @@
-#include
 #include
 #include
 #include
 #include
+#include
 #include
diff --git a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp
index
09487462e..f4051184e 100644 --- a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp @@ -45,7 +45,7 @@ struct LogSigmoidBackwardFunctor { } }; -void log_sigmoid_backward_kernel(TensorIteratorBase& iter) { +void log_sigmoid_backward_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, diff --git a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h index be8e7266c..42ee9dbea 100644 --- a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h @@ -6,6 +6,6 @@ namespace at::native::xpu { TORCH_XPU_API void log_sigmoid_forward_kernel(TensorIteratorBase& iter); -TORCH_XPU_API void log_sigmoid_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log_sigmoid_backward_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp b/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp index 5e517911b..9033b103c 100644 --- a/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp @@ -42,7 +42,7 @@ struct MishBackwardFunctor { } }; -void mish_backward_kernel(TensorIteratorBase& iter) { +void mish_backward_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, diff --git a/src/ATen/native/xpu/sycl/ActivationMishKernels.h b/src/ATen/native/xpu/sycl/ActivationMishKernels.h index a2c7e499d..7c4efc0af 100644 --- a/src/ATen/native/xpu/sycl/ActivationMishKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationMishKernels.h @@ -6,6 +6,6 @@ namespace at::native::xpu { TORCH_XPU_API void mish_kernel(TensorIteratorBase& iter); -TORCH_XPU_API void mish_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void mish_backward_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp index 924b75d81..7d9f0872a 100644 --- a/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp @@ -1,8 +1,8 @@ -#include #include #include #include #include +#include #include #include diff --git a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp index 175db7753..3915a5560 100644 --- a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp +++ b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp @@ -1,4 +1,3 @@ -#include #include #include #include diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernel.cpp b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernel.cpp new file mode 100644 index 000000000..08a1456ad --- /dev/null +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernel.cpp @@ -0,0 +1,322 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native::xpu { + +using namespace at::xpu; + +template +struct AdaptiveAvgPool2dBwdKernelFunctor { + void operator()(sycl::nd_item<1> item) const { + int64_t gi = item.get_global_linear_id(); + + for (int64_t i = gi; i < numel; i += global_range) { + int64_t _iw, _ih, _ic, _ib; + if constexpr (is_channels_last) { + _ic = i % ic; + _iw = i / ic % iw; + _ih = i / ic / iw % ih; + _ib = i / ic / iw / ih; + } else { + _iw = i % iw; + _ih = i / iw % ih; + _ic = i / iw / ih % 
ic; + _ib = i / iw / ih / ic; + } + + int64_t _oh0 = native::start_index(_ih, ih, oh); + int64_t _oh1 = native::end_index(_ih, ih, oh); + int64_t _ow0 = native::start_index(_iw, iw, ow); + int64_t _ow1 = native::end_index(_iw, iw, ow); + int64_t _ob = _ib; + int64_t _oc = _ic; + + accscalar_t gx = 0; + accscalar_t _ikh, _ikw; + for (int _oh = _oh0; _oh < _oh1; _oh++) { + _ikh = accscalar_t(1.0) / + (accscalar_t)(native::end_index(_oh, oh, ih) - native::start_index(_oh, oh, ih)); + for (int _ow = _ow0; _ow < _ow1; _ow++) { + _ikw = accscalar_t(1.0) / + (accscalar_t)(native::end_index(_ow, ow, iw) - native::start_index(_ow, ow, iw)); + gx += gyacc[_ob][_oc][_oh][_ow] * _ikh * _ikw; + } + } + + const auto store = [](PackedTensorAccessor64 gxacc, + int64_t _ib, + int64_t _ic, + int64_t _ih, + int64_t _iw, + scalar_t res) { gxacc[_ib][_ic][_ih][_iw] = res; }; + store(gxacc, _ib, _ic, _ih, _iw, (scalar_t)gx); + } + } + + AdaptiveAvgPool2dBwdKernelFunctor( + PackedTensorAccessor64 gyacc_, + PackedTensorAccessor64 gxacc_) + : gyacc(gyacc_), gxacc(gxacc_) { + ib = gxacc.size(0); + ic = gxacc.size(1); + ih = gxacc.size(2); + iw = gxacc.size(3); + oh = gyacc.size(2); + ow = gyacc.size(3); + + numel = ib * ic * ih * iw; + int total_item = std::min(numel, syclMaxWorkItemsPerTile()); + local_range = syclMaxWorkItemsPerEU(); + global_range = total_item < local_range + ? local_range + : (total_item / local_range) * local_range; + } + + sycl::range<1> glb_range() { + return sycl::range<1>(global_range); + } + + sycl::range<1> loc_range() { + return sycl::range<1>(local_range); + } + + private: + int ib; + int ic; + int ih; + int iw; + int oh; + int ow; + int64_t numel; + int global_range; + int local_range; + PackedTensorAccessor64 gyacc; + PackedTensorAccessor64 gxacc; +}; + +template +struct AdaptiveAvgPool2dBwdSLMKernelFunctor + : public __SYCL_KER_CONFIG_CONVENTION__ { + void operator()(sycl::nd_item<1> item) const { + int64_t gi = item.get_global_linear_id(); + int64_t li = item.get_local_id(0); + + // for-loop order: oh*ow->ih->iw + // reuse oh*ow(oh0, oh1, ow0, ow1), ih(ikh), iw(ikw) in inner loop. 
+ for (int _ih = li; _ih < ih; _ih += local_range) { + _oh0_cached[_ih] = (int)native::start_index(_ih, ih, oh); + _oh1_cached[_ih] = (int)native::end_index(_ih, ih, oh); + } + for (int _iw = li; _iw < iw; _iw += local_range) { + _ow0_cached[_iw] = (int)native::start_index(_iw, iw, ow); + _ow1_cached[_iw] = (int)native::end_index(_iw, iw, ow); + } + for (int _oh = li; _oh < oh; _oh += local_range) { + _ikh_cached[_oh] = accscalar_t(1.0) / + (accscalar_t)(native::end_index(_oh, oh, ih) - + native::start_index(_oh, oh, ih)); + } + for (int _ow = li; _ow < ow; _ow += local_range) { + _ikw_cached[_ow] = accscalar_t(1.0) / + (accscalar_t)(native::end_index(_ow, ow, iw) - + native::start_index(_ow, ow, iw)); + } + + item.barrier(sycl_local_fence); + + for (int64_t i = gi; i < numel; i += global_range) { + int64_t _iw, _ih, _ic, _ib; + if constexpr (is_channels_last) { + _ic = i % ic; + _iw = i / ic % iw; + _ih = i / ic / iw % ih; + _ib = i / ic / iw / ih; + } else { + _iw = i % iw; + _ih = i / iw % ih; + _ic = i / iw / ih % ic; + _ib = i / iw / ih / ic; + } + + int64_t _oh0, _oh1, _ow0, _ow1; + _oh0 = _oh0_cached[_ih]; + _oh1 = _oh1_cached[_ih]; + _ow0 = _ow0_cached[_iw]; + _ow1 = _ow1_cached[_iw]; + int64_t _ob = _ib; + int64_t _oc = _ic; + + accscalar_t gx = 0; + accscalar_t _ikh, _ikw; + for (int _oh = _oh0; _oh < _oh1; _oh++) { + _ikh = _ikh_cached[_oh]; + for (int _ow = _ow0; _ow < _ow1; _ow++) { + _ikw = _ikw_cached[_ow]; + gx += gyacc[_ob][_oc][_oh][_ow] * _ikh * _ikw; + } + } + + const auto store = [](PackedTensorAccessor64 gxacc, + int64_t _ib, + int64_t _ic, + int64_t _ih, + int64_t _iw, + scalar_t res) { gxacc[_ib][_ic][_ih][_iw] = res; }; + store(gxacc, _ib, _ic, _ih, _iw, (scalar_t)gx); + } + } + + void sycl_ker_config_convention(sycl::handler& cgh) { + _oh0_cached = sycl_local_acc_t(ih, cgh); + _oh1_cached = sycl_local_acc_t(ih, cgh); + _ow0_cached = sycl_local_acc_t(iw, cgh); + _ow1_cached = sycl_local_acc_t(iw, cgh); + _ikh_cached = sycl_local_acc_t(oh, cgh); + _ikw_cached = sycl_local_acc_t(ow, cgh); + } + + AdaptiveAvgPool2dBwdSLMKernelFunctor( + PackedTensorAccessor64 gyacc_, + PackedTensorAccessor64 gxacc_) + : gyacc(gyacc_), gxacc(gxacc_) { + ib = gxacc.size(0); + ic = gxacc.size(1); + ih = gxacc.size(2); + iw = gxacc.size(3); + oh = gyacc.size(2); + ow = gyacc.size(3); + + numel = ib * ic * ih * iw; + int total_item = std::min(numel, syclMaxWorkItemsPerTile()); + + local_range = syclMaxWorkGroupSize(*this); + global_range = total_item < local_range + ? 
local_range + : (total_item / local_range) * local_range; + } + + sycl::range<1> glb_range() { + return sycl::range<1>(global_range); + } + + sycl::range<1> loc_range() { + return sycl::range<1>(local_range); + } + + private: + int ib; + int ic; + int ih; + int iw; + int oh; + int ow; + int64_t numel; + int local_range; + int global_range; + PackedTensorAccessor64 gyacc; + PackedTensorAccessor64 gxacc; + sycl_local_acc_t _oh0_cached; + sycl_local_acc_t _oh1_cached; + sycl_local_acc_t _ow0_cached; + sycl_local_acc_t _ow1_cached; + sycl_local_acc_t _ikh_cached; + sycl_local_acc_t _ikw_cached; +}; + +void adaptive_avg_pool2d_backward_out_kernel( + Tensor& gradInput, + const Tensor& gradOutput, + const Tensor& input) { + TensorArg grad_input_arg{gradInput, "gradInput", 1}, + grad_output_arg{gradOutput, "gradOutput", 2}, + input_arg{input, "input", 3}; + adaptive_pool_empty_output_check(gradOutput, "adaptive_avg_pool2d_backward"); + checkAllSameGPU(__func__, {grad_input_arg, grad_output_arg, input_arg}); + + TORCH_CHECK( + (input.ndimension() == 3 || input.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + + auto outputHeight = gradOutput.size(-2); + auto outputWidth = gradOutput.size(-1); + + const auto nInputPlane = input.size(-3); + const auto inputHeight = input.size(-2); + const auto inputWidth = input.size(-1); + + int dH = std::floor((float)2 * inputHeight / outputHeight) - + (inputHeight / outputHeight); + int dW = std::floor((float)2 * inputWidth / outputWidth) - + (inputWidth / outputWidth); + std::vector stride_vec = {dH, dW}; + + int kH = std::ceil((float)2 * inputHeight / outputHeight) - + (inputHeight / outputHeight); + int kW = std::ceil((float)2 * inputWidth / outputWidth) - + (inputWidth / outputWidth); + std::vector kernel_size_vec = {kH, kW}; + + int padH = (dH * (outputHeight - 1) + kH - inputHeight) / 2; + int padW = (dW * (outputWidth - 1) + kW - inputWidth) / 2; + std::vector padding_vec = {padH, padW}; + + bool is_3d = gradOutput.ndimension() == 3; + if (is_3d) { + gradOutput.resize_({1, nInputPlane, outputHeight, outputWidth}); + gradInput.resize_({1, nInputPlane, inputHeight, inputWidth}); + } + + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::BFloat16, + at::ScalarType::Half, + gradOutput.scalar_type(), + "adaptive_avg_pool2d_backward_xpu", + [&]() { + using accscalar_t = acc_type; + auto gyacc = gradOutput.packed_accessor64(); + auto gxacc = gradInput.packed_accessor64(); + + int64_t ohw01_shared_size = + ((inputHeight + inputWidth) * 2) * sizeof(int); + int64_t ikhw_shared_size = + (outputHeight + outputWidth) * sizeof(accscalar_t); + bool using_shared = + syclLocalMemSize() >= ohw01_shared_size + ikhw_shared_size; + + auto& q = getCurrentSYCLQueue(); + if (is_smf_channels_last(gradOutput)) { + if (using_shared) { + AdaptiveAvgPool2dBwdSLMKernelFunctor + kfn(gyacc, gxacc); + sycl_kernel_submit(kfn.glb_range(), kfn.loc_range(), q, kfn); + } else { + AdaptiveAvgPool2dBwdKernelFunctor kfn( + gyacc, gxacc); + sycl_kernel_submit(kfn.glb_range(), kfn.loc_range(), q, kfn); + } + } else { + if (using_shared) { + AdaptiveAvgPool2dBwdSLMKernelFunctor + kfn(gyacc, gxacc); + sycl_kernel_submit(kfn.glb_range(), kfn.loc_range(), q, kfn); + } else { + AdaptiveAvgPool2dBwdKernelFunctor kfn( + gyacc, gxacc); + sycl_kernel_submit(kfn.glb_range(), kfn.loc_range(), q, kfn); + } + } + }); + + if (is_3d) { + gradOutput.resize_({nInputPlane, outputHeight, outputWidth}); + gradInput.resize_({nInputPlane, inputHeight, inputWidth}); + } +} + +} // 
namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp index 0221ceec0..86fd7edfe 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp @@ -1,9 +1,9 @@ -#include #include #include #include #include #include +#include #include #include diff --git a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp index cb2c6b083..dacae3e68 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp @@ -153,8 +153,8 @@ void launch_adaptive_max_pool2d_kernel( void adaptive_max_pool2d_kernel( const Tensor& input, IntArrayRef output_size, - Tensor& output, - Tensor& indices) { + const Tensor& output, + const Tensor& indices) { int64_t osizeH = output_size[0]; int64_t osizeW = output_size[1]; @@ -327,7 +327,7 @@ void adaptive_max_pool2d_backward_kernel( const Tensor& grad_output, const Tensor& input, const Tensor& indices, - Tensor& grad_input) { + const Tensor& grad_input) { globalContext().alertNotDeterministic("adaptive_max_pool2d_backward_xpu"); const at::Tensor grad_output_ = grad_output.contiguous(); diff --git a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h index abba4e354..2714e6627 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h +++ b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h @@ -7,13 +7,13 @@ namespace at::native::xpu { TORCH_XPU_API void adaptive_max_pool2d_kernel( const Tensor& input, IntArrayRef output_size, - Tensor& output, - Tensor& indices); + const Tensor& output, + const Tensor& indices); TORCH_XPU_API void adaptive_max_pool2d_backward_kernel( const Tensor& grad_output, const Tensor& input, const Tensor& indices, - Tensor& grad_input); + const Tensor& grad_input); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/AmpKernels.h b/src/ATen/native/xpu/sycl/AmpKernels.h index 4f828c1f2..7c703e70e 100644 --- a/src/ATen/native/xpu/sycl/AmpKernels.h +++ b/src/ATen/native/xpu/sycl/AmpKernels.h @@ -1,5 +1,5 @@ #pragma once -#include +#include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp index 7373935aa..e7a2c2e1f 100644 --- a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp @@ -243,7 +243,7 @@ void launch_avg_pool2d_channels_last_kernel( const int stride_w, const int pad_h, const int pad_w, - Tensor& output, + const Tensor& output, const int divisor_override, const bool count_include_pad, const bool use_divisor) { @@ -291,7 +291,7 @@ void launch_avg_pool2d_kernel( const int stride_w, const int pad_h, const int pad_w, - Tensor& output, + const Tensor& output, const int divisor_override, const bool count_include_pad, const bool use_divisor) { @@ -548,7 +548,7 @@ void launch_avg_pool2d_backward_channels_last_kernel( const int stride_w, const int pad_h, const int pad_w, - Tensor& grad_input, + const Tensor& grad_input, const int divisor_override, bool count_include_pad, bool use_divisor) { @@ -599,7 +599,7 @@ void launch_avg_pool2d_backward_kernel( const int stride_w, const int pad_h, const int pad_w, - Tensor& grad_input, + const Tensor& grad_input, const int divisor_override, bool count_include_pad, bool 
use_divisor) { @@ -634,35 +634,24 @@ void launch_avg_pool2d_backward_kernel( void avg_pool2d_kernel( const Tensor& input_, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, + int64_t kH_, + int64_t kW_, + int64_t dH_, + int64_t dW_, + int64_t padH_, + int64_t padW_, bool ceil_mode, bool count_include_pad, c10::optional divisor_override, - Tensor& output) { - const int kH = safe_downcast(kernel_size[0]); - const int kW = kernel_size.size() == 1 - ? kH - : safe_downcast(kernel_size[1]); - - const int dH = stride.empty() ? kH : safe_downcast(stride[0]); - const int dW = stride.empty() ? kW - : stride.size() == 1 ? dH - : safe_downcast(stride[1]); - - const int padH = safe_downcast(padding[0]); - const int padW = - padding.size() == 1 ? padH : safe_downcast(padding[1]); - + const Tensor& output) { const int64_t nInputPlane = input_.size(-3); const int64_t inputHeight = input_.size(-2); const int64_t inputWidth = input_.size(-1); int64_t outputWidth = - pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode); + pooling_output_shape(inputWidth, kW_, padW_, dW_, 1, ceil_mode); int64_t outputHeight = - pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode); + pooling_output_shape(inputHeight, kH_, padH_, dH_, 1, ceil_mode); const auto memory_format = input_.suggest_memory_format(); Tensor input = input_.contiguous(memory_format); @@ -688,12 +677,12 @@ void avg_pool2d_kernel( inputWidth, outputHeight, outputWidth, - kH, - kW, - dH, - dW, - padH, - padW, + kH_, + kW_, + dH_, + dW_, + padH_, + padW_, output, divisor_override_value, count_include_pad, @@ -709,12 +698,12 @@ void avg_pool2d_kernel( inputWidth, outputHeight, outputWidth, - kH, - kW, - dH, - dW, - padH, - padW, + kH_, + kW_, + dH_, + dW_, + padH_, + padW_, output, divisor_override_value, count_include_pad, @@ -740,7 +729,7 @@ void avg_pool2d_backward_kernel( bool ceil_mode, bool count_include_pad, c10::optional divisor_override, - Tensor& gradInput) { + const Tensor& gradInput) { const int kH = safe_downcast(kernel_size[0]); const int kW = kernel_size.size() == 1 ? 
kH diff --git a/src/ATen/native/xpu/sycl/AveragePool2dKernels.h b/src/ATen/native/xpu/sycl/AveragePool2dKernels.h index 84842355d..7667fe021 100644 --- a/src/ATen/native/xpu/sycl/AveragePool2dKernels.h +++ b/src/ATen/native/xpu/sycl/AveragePool2dKernels.h @@ -1,18 +1,19 @@ -#pragma once - -#include +#include namespace at::native::xpu { TORCH_XPU_API void avg_pool2d_kernel( const Tensor& input_, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, + int64_t kH_, + int64_t kW_, + int64_t dH_, + int64_t dW_, + int64_t padH_, + int64_t padW_, bool ceil_mode, bool count_include_pad, c10::optional divisor_override, - Tensor& output); + const Tensor& output); TORCH_XPU_API void avg_pool2d_backward_kernel( const Tensor& gradOutput_, @@ -23,6 +24,6 @@ TORCH_XPU_API void avg_pool2d_backward_kernel( bool ceil_mode, bool count_include_pad, c10::optional divisor_override, - Tensor& gradInput); + const Tensor& gradInput); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp index 084a9b65f..5e14a0a94 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp @@ -1,8 +1,8 @@ -#include #include #include #include #include +#include #include #include #include @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -3060,7 +3061,7 @@ void batch_norm_mean_var( } // For some reason this isn't an actual operator but it exists anyway... - var_mean_out( + at::native::var_mean_out( save_var, save_mean, self, diff --git a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp index 34e50222e..2c2dd1550 100644 --- a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp @@ -1,6 +1,6 @@ -#include #include #include +#include #include diff --git a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp index 3224ba54f..e0e8487f6 100644 --- a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp @@ -1,6 +1,7 @@ -#include #include #include +#include + #include #include diff --git a/src/ATen/native/xpu/sycl/BinaryKernels.cpp b/src/ATen/native/xpu/sycl/BinaryKernels.cpp index 2902486bf..daafadd23 100644 --- a/src/ATen/native/xpu/sycl/BinaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryKernels.cpp @@ -1,6 +1,6 @@ -#include #include #include +#include #include #include diff --git a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp index 18bcffca6..3268ab90f 100644 --- a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp @@ -17,7 +17,7 @@ struct LogicalAndFunctor { } }; -void logical_and_kernel(TensorIteratorBase& iter) { +void logical_and_kernel(TensorIterator& iter) { auto dtype = iter.common_dtype(); if (at::isComplexType(dtype)) { AT_DISPATCH_COMPLEX_TYPES(dtype, "logical_and_xpu", [&]() { @@ -40,7 +40,7 @@ struct LogicalOrFunctor { } }; -void logical_or_kernel(TensorIteratorBase& iter) { +void logical_or_kernel(TensorIterator& iter) { auto dtype = iter.common_dtype(); if (at::isComplexType(dtype)) { AT_DISPATCH_COMPLEX_TYPES(dtype, "logical_or_xpu", [&]() { @@ -62,7 +62,7 @@ struct LogicalXorFunctor { } }; -void logical_xor_kernel(TensorIteratorBase& iter) { +void logical_xor_kernel(TensorIterator& iter) { auto dtype = 
iter.common_dtype(); if (at::isComplexType(dtype)) { AT_DISPATCH_COMPLEX_TYPES(dtype, "logical_xor_xpu", [&]() { diff --git a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h index 82271e6a1..dce1a1a5e 100644 --- a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -TORCH_XPU_API void logical_and_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logical_and_kernel(TensorIterator& iter); -TORCH_XPU_API void logical_or_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logical_or_kernel(TensorIterator& iter); -TORCH_XPU_API void logical_xor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logical_xor_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp index a7a676675..73732e02f 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp @@ -1,4 +1,3 @@ -#include #include #include #include diff --git a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp index 3e0989b24..d96e5064e 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp @@ -1,6 +1,6 @@ -#include #include #include +#include #include diff --git a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp index 9a5320c68..92eac3da2 100644 --- a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp +++ b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp @@ -1,4 +1,3 @@ -#include #include #include #include diff --git a/src/ATen/native/xpu/sycl/Col2ImKernel.cpp b/src/ATen/native/xpu/sycl/Col2ImKernel.cpp index 1d64e34d6..d52a65fdf 100644 --- a/src/ATen/native/xpu/sycl/Col2ImKernel.cpp +++ b/src/ATen/native/xpu/sycl/Col2ImKernel.cpp @@ -1,5 +1,5 @@ -#include #include +#include #include #include diff --git a/src/ATen/native/xpu/sycl/Col2ImKernel.h b/src/ATen/native/xpu/sycl/Col2ImKernel.h index a910112e0..62d87b406 100644 --- a/src/ATen/native/xpu/sycl/Col2ImKernel.h +++ b/src/ATen/native/xpu/sycl/Col2ImKernel.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/CompareKernels.cpp b/src/ATen/native/xpu/sycl/CompareKernels.cpp index 1096dfa40..91c3ac614 100644 --- a/src/ATen/native/xpu/sycl/CompareKernels.cpp +++ b/src/ATen/native/xpu/sycl/CompareKernels.cpp @@ -1,7 +1,7 @@ -#include #include #include #include +#include #include diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp index bdddd3f44..dc2991cc5 100644 --- a/src/ATen/native/xpu/sycl/CopyKernel.cpp +++ b/src/ATen/native/xpu/sycl/CopyKernel.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -18,7 +18,7 @@ struct CopyScalarFunc { } }; -void copy_kernel(TensorIterator& iter) { +void copy_kernel(TensorIteratorBase& iter) { ScalarType dtype = iter.common_dtype(); if (isQIntType(dtype)) { AT_DISPATCH_QINT_TYPES(dtype, "copy_xpu", [&] { diff --git a/src/ATen/native/xpu/sycl/CopyKernel.h b/src/ATen/native/xpu/sycl/CopyKernel.h index 30232b27a..3a8e4d263 100644 --- a/src/ATen/native/xpu/sycl/CopyKernel.h +++ b/src/ATen/native/xpu/sycl/CopyKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -TORCH_XPU_API void copy_kernel(TensorIterator& 
iter); +TORCH_XPU_API void copy_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index 8c130a0b9..ba0283b8b 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -4,10 +4,10 @@ #pragma clang diagnostic ignored "-Wreturn-type" #pragma GCC diagnostic ignored "-Wreturn-type" -#include #include #include #include +#include #include #include @@ -498,8 +498,8 @@ void max_pool2d_with_indices_kernel( IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, - Tensor& output_, - Tensor& indices_) { + const Tensor& output_, + const Tensor& indices_) { NoNamesGuard guard; TensorArg output_arg{output_, "output", 1}; @@ -614,8 +614,8 @@ void max_pool2d_with_indices_kernel( } } -Tensor& max_pool2d_with_indices_backward_kernel( - Tensor& gradInput_, +void max_pool2d_with_indices_backward_kernel( + const Tensor& gradInput_, const Tensor& gradOutput_, const Tensor& input_, const Tensor& indices_, @@ -733,8 +733,6 @@ Tensor& max_pool2d_with_indices_backward_kernel( (!is_3d && !gradInput_.is_contiguous(smf))) { gradInput_.copy_(gradInput); } - - return gradInput_; } } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h index 9d827c642..d530560e6 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace at::native::xpu { @@ -11,11 +11,11 @@ TORCH_XPU_API void max_pool2d_with_indices_kernel( IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, - Tensor& output, - Tensor& indices); + const Tensor& output, + const Tensor& indices); -TORCH_XPU_API Tensor& max_pool2d_with_indices_backward_kernel( - Tensor& gradInput, +TORCH_XPU_API void max_pool2d_with_indices_backward_kernel( + const Tensor& gradInput, const Tensor& gradOutput, const Tensor& input, const Tensor& indices, diff --git a/src/ATen/native/xpu/sycl/DistanceKernels.cpp b/src/ATen/native/xpu/sycl/DistanceKernels.cpp index 3deddb8cf..5006dd8b4 100644 --- a/src/ATen/native/xpu/sycl/DistanceKernels.cpp +++ b/src/ATen/native/xpu/sycl/DistanceKernels.cpp @@ -1,6 +1,6 @@ -#include #include #include +#include #include diff --git a/src/ATen/native/xpu/sycl/DistanceKernels.h b/src/ATen/native/xpu/sycl/DistanceKernels.h index a53b84cdf..46a34d031 100644 --- a/src/ATen/native/xpu/sycl/DistanceKernels.h +++ b/src/ATen/native/xpu/sycl/DistanceKernels.h @@ -1,5 +1,5 @@ #pragma once -#include +#include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp b/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp index 1a01a7cfc..c3de4c593 100644 --- a/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp +++ b/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -6,6 +5,7 @@ #include #include #include +#include #include diff --git a/src/ATen/native/xpu/sycl/DistributionNormal.cpp b/src/ATen/native/xpu/sycl/DistributionNormal.cpp index 93b938210..3aacf6639 100644 --- a/src/ATen/native/xpu/sycl/DistributionNormal.cpp +++ b/src/ATen/native/xpu/sycl/DistributionNormal.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -8,6 +7,7 @@ #include #include #include +#include #include diff --git a/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp b/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp index 
index 3c15de1d9..e4698a723 100644
--- a/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp
+++ b/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp
@@ -1,4 +1,3 @@
-#include
 #include
 #include
 #include
@@ -8,6 +7,7 @@
 #include
 #include
 #include
+#include
 #include
diff --git a/src/ATen/native/xpu/sycl/DistributionTemplates.h b/src/ATen/native/xpu/sycl/DistributionTemplates.h
index db117c021..f5a5efdb5 100644
--- a/src/ATen/native/xpu/sycl/DistributionTemplates.h
+++ b/src/ATen/native/xpu/sycl/DistributionTemplates.h
@@ -13,6 +13,8 @@
 #include
 #include
+#include
+
 namespace at {
 namespace native {
 namespace xpu {
diff --git a/src/ATen/native/xpu/sycl/DistributionUniform.cpp b/src/ATen/native/xpu/sycl/DistributionUniform.cpp
index c38626909..17ff4d698 100644
--- a/src/ATen/native/xpu/sycl/DistributionUniform.cpp
+++ b/src/ATen/native/xpu/sycl/DistributionUniform.cpp
@@ -1,4 +1,3 @@
-#include
 #include
 #include
 #include
@@ -8,6 +7,7 @@
 #include
 #include
 #include
+#include
 #include
diff --git a/src/ATen/native/xpu/sycl/Dropout.cpp b/src/ATen/native/xpu/sycl/Dropout.cpp
index ddaee4d4c..54b1b4d4c 100644
--- a/src/ATen/native/xpu/sycl/Dropout.cpp
+++ b/src/ATen/native/xpu/sycl/Dropout.cpp
@@ -1,4 +1,3 @@
-#include
 #include
 #include
 #include
@@ -9,6 +8,10 @@
 #include
 #include
 #include
+#include
+
+#include
+#include
 #include
diff --git a/src/ATen/native/xpu/sycl/Embedding.cpp b/src/ATen/native/xpu/sycl/Embedding.cpp
index d905a4d97..4ea45b4c2 100644
--- a/src/ATen/native/xpu/sycl/Embedding.cpp
+++ b/src/ATen/native/xpu/sycl/Embedding.cpp
@@ -1,4 +1,3 @@
-#include
 #include
 #include
 #include
diff --git a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp
index f4f35061f..0e2d02b6a 100644
--- a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp
+++ b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp
@@ -1,6 +1,6 @@
-#include
 #include
 #include
+#include
 #include
 #include
diff --git a/src/ATen/native/xpu/sycl/EmbeddingBag.h b/src/ATen/native/xpu/sycl/EmbeddingBag.h
index b1ac0038c..07f9de97a 100644
--- a/src/ATen/native/xpu/sycl/EmbeddingBag.h
+++ b/src/ATen/native/xpu/sycl/EmbeddingBag.h
@@ -1,7 +1,7 @@
 #pragma once
-#include
 #include
+#include
 #include
 #include
diff --git a/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h b/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h
index 97ad6f0d0..f73dabb6b 100644
--- a/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h
+++ b/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h
@@ -1,6 +1,6 @@
 #pragma once
-#include
+#include
 namespace at::native::xpu {
diff --git a/src/ATen/native/xpu/sycl/FillKernel.cpp b/src/ATen/native/xpu/sycl/FillKernel.cpp
index 22e845364..6dbb55c67 100644
--- a/src/ATen/native/xpu/sycl/FillKernel.cpp
+++ b/src/ATen/native/xpu/sycl/FillKernel.cpp
@@ -1,6 +1,6 @@
-#include
 #include
 #include
+#include
 #include
diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.cpp b/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.cpp
index 11046d93f..94571607c 100644
--- a/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.cpp
@@ -7,8 +7,9 @@
 #include
 #include
-namespace at::native::xpu {
+#include
+namespace at::native::xpu {
 template class Op>
 std::vector foreach_tensor_list_op(
     TensorList tensors1,
diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h b/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h
index d4448ca1b..2c501a2bc 100644
--- a/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h
+++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h
@@ -1,5 +1,5 @@
 #pragma once
-#include
+#include
 namespace at::native::xpu {
diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.cpp b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.cpp
index b37a4e786..fe8c06cd0 100644
--- a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.cpp
@@ -7,8 +7,9 @@
 #include
 #include
-namespace at::native::xpu {
+#include
+namespace at::native::xpu {
 template class Op>
 std::vector foreach_binary_op(
     TensorList tensors,
diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h
index 4aac91b47..00044fa7d 100644
--- a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h
+++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h
@@ -1,5 +1,5 @@
 #pragma once
-#include
+#include
 namespace at::native::xpu {
diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.cpp b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.cpp
index 21af7e81d..7f6c0ee99 100644
--- a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.cpp
@@ -7,6 +7,7 @@
 #include
 #include
+#include
 namespace at::native::xpu {
 template class Op>
diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h
index a863983dd..bafd220c2 100644
--- a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h
+++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h
@@ -1,5 +1,5 @@
 #pragma once
-#include
+#include
 namespace at::native::xpu {
diff --git a/src/ATen/native/xpu/sycl/ForeachPointwiseKernels.cpp b/src/ATen/native/xpu/sycl/ForeachPointwiseKernels.cpp
index 97e5f8245..7385ab7e0 100644
--- a/src/ATen/native/xpu/sycl/ForeachPointwiseKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ForeachPointwiseKernels.cpp
@@ -1,14 +1,15 @@
-#include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
-namespace at::native::xpu {
+#include
+namespace at::native::xpu {
 template