diff --git a/.lintrunner.toml b/.lintrunner.toml
index bd49e3721..9be45a043 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -56,8 +56,8 @@ code = 'CLANGFORMAT'
 include_patterns = [
     'src/aten/*.h',
     'src/aten/*.cpp',
-    'src/aten/sycl/*.h',
-    'src/aten/sycl/*.cpp',
+    'src/ATen/native/xpu/sycl/*.h',
+    'src/ATen/native/xpu/sycl/*.cpp',
     'aten/src/ATen/*.h',
     'aten/src/ATen/mps/**/*.mm',
     'aten/src/ATen/xpu/**/*.h',
diff --git a/cmake/BuildFlags.cmake b/cmake/BuildFlags.cmake
index f85598b07..0ee38df7f 100644
--- a/cmake/BuildFlags.cmake
+++ b/cmake/BuildFlags.cmake
@@ -47,6 +47,10 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
     list(APPEND SYCL_HOST_FLAGS -O0)
   endif(CMAKE_BUILD_TYPE MATCHES Debug)
 
+  if(USE_PER_OPERATOR_HEADERS)
+    list(APPEND SYCL_HOST_FLAGS -DAT_PER_OPERATOR_HEADERS)
+  endif()
+
   # -- Kernel flags (SYCL_KERNEL_OPTIONS)
   # The fast-math will be enabled by default in SYCL compiler.
   # Refer to [https://clang.llvm.org/docs/UsersManual.html#cmdoption-fno-fast-math]
diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake
index e579576ff..090f924ab 100644
--- a/cmake/Codegen.cmake
+++ b/cmake/Codegen.cmake
@@ -3,7 +3,7 @@ if(Codegen_GPU_cmake_included)
 endif()
 set(Codegen_GPU_cmake_included true)
 
-set(BUILD_TORCH_XPU_ATEN_GENERATED "${CMAKE_BINARY_DIR}/aten/src/ATen/xpu")
+set(BUILD_TORCH_XPU_ATEN_GENERATED "${CMAKE_BINARY_DIR}/xpu/ATen/")
 file(MAKE_DIRECTORY ${BUILD_TORCH_XPU_ATEN_GENERATED})
 
 set(RegisterXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU.cpp)
@@ -43,10 +43,64 @@ function(GEN_BACKEND file_yaml)
   )
 endfunction(GEN_BACKEND)
 
-GEN_BACKEND(
-  xpu_functions.yaml
-  XPUNativeFunctions.h
-  RegisterXPU.cpp)
+
+set(RegisterXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU.cpp)
+set(XPUFallback_PATH ${TORCH_XPU_OPS_ROOT}/src/ATen/native/xpu/XPUFallback.template)
+function(GEN_XPU file_yaml)
+  set(generated_files "")
+  foreach(f ${ARGN})
+    list(APPEND generated_files "${BUILD_TORCH_XPU_ATEN_GENERATED}/${f}")
+  endforeach()
+  file(GLOB_RECURSE depend_files ${TORCH_XPU_OPS_ROOT}/yaml/${file_yaml})
+  set(CODEGEN_TEMPLATE ${TORCH_XPU_OPS_ROOT}/yaml/)
+
+  # Codegen prepare process
+  if(WIN32)
+    string(REPLACE "/" "\\" LinkPATH "${CODEGEN_TEMPLATE}templates")
+    string(REPLACE "/" "\\" TargetPATH "${CMAKE_SOURCE_DIR}/aten/src/ATen/templates")
+    execute_process(COMMAND cmd /c mklink /D ${LinkPATH} ${TargetPATH})
+    string(REPLACE "/" "\\" RegisterXPU_PATH_BACKSLASH "${RegisterXPU_PATH}")
+    string(REPLACE "/" "\\" XPUFallback_PATH_BACKSLASH "${XPUFallback_PATH}")
+    set(REGISTER_FALLBACK_CMD ${FILE_DISPLAY_CMD} ${XPUFallback_PATH_BACKSLASH} ">>" ${RegisterXPU_PATH_BACKSLASH})
+  else()
+    execute_process(COMMAND ln -s ${CMAKE_SOURCE_DIR}/aten/src/ATen/templates ${CODEGEN_TEMPLATE}) # soft link to pytorch templates
+    set(REGISTER_FALLBACK_CMD ${FILE_DISPLAY_CMD} ${XPUFallback_PATH} ">>" ${RegisterXPU_PATH})
+  endif()
+
+  add_custom_command(
+    OUTPUT ${generated_files}
+    COMMAND
+    "${PYTHON_EXECUTABLE}" -m torchgen.gen
+    --source-path ${TORCH_XPU_OPS_ROOT}/yaml/
+    --install-dir ${BUILD_TORCH_XPU_ATEN_GENERATED}
+    --per-operator-headers
+    --static-dispatch-backend
+    --backend-whitelist=XPU
+    COMMAND
+    ${REGISTER_FALLBACK_CMD}
+    # Codegen post-process
+    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterXPU_PATH}
+    ${SIMPLE_TRACE}
+    WORKING_DIRECTORY ${TORCH_ROOT}
+    DEPENDS
+    ${depend_files}
+    ${TORCH_XPU_OPS_ROOT}/yaml/native/${file_yaml}
+    ${XPUFallback_PATH}
+  )
+endfunction(GEN_XPU)
+
+# GEN_BACKEND(
+#   xpu_functions.yaml
+#   XPUNativeFunctions.h
+#   RegisterXPU.cpp)
+
+GEN_XPU(
+  native_functions.yaml
+  XPUFunctions.h
+  RegisterXPU.cpp
+)
+
 list(APPEND xpu_generated_src ${RegisterXPU_PATH})
diff --git a/src/ATen/native/sparse/SparseTensor.cpp b/src/ATen/native/sparse/SparseTensor.cpp
index b842b9839..cd9a755fe 100644
--- a/src/ATen/native/sparse/SparseTensor.cpp
+++ b/src/ATen/native/sparse/SparseTensor.cpp
@@ -5,13 +5,9 @@
 #include
 #include
-#ifndef AT_PER_OPERATOR_HEADERS
-#include
-#include
-#else
 #include
 #include
-#endif
+#include
 
 namespace at::native::xpu {
diff --git a/src/ATen/native/xpu/Activation.cpp b/src/ATen/native/xpu/Activation.cpp
index 38aa44dc6..ce000752f 100644
--- a/src/ATen/native/xpu/Activation.cpp
+++ b/src/ATen/native/xpu/Activation.cpp
@@ -1,7 +1,15 @@
 #include
 #include
+#include
+#include
 #include
-#include
+
+#include
+
+#include
+#include
+#include
+#include
 #include
 #include
@@ -13,659 +21,57 @@
 #include
 #include
 #include
+
 #include
 #include
 #include
 
 namespace at {
-Tensor XPUNativeFunctions::relu(const Tensor& self) {
-  TORCH_CHECK(
-      self.scalar_type() != at::kBool, "Boolean inputs not supported for relu");
-  return at::clamp_min(self, 0);
-}
-
-Tensor& XPUNativeFunctions::relu_(Tensor& self) {
-  TORCH_CHECK(
-      self.scalar_type() != at::kBool, "Boolean inputs not supported for relu");
-  return at::clamp_min_(self, 0);
-}
-
-Tensor& XPUNativeFunctions::relu_out(const Tensor& self, Tensor& out) {
-  TORCH_CHECK(
-      self.scalar_type() != at::kBool, "Boolean inputs not supported for relu");
-  return at::clamp_min_out(out, self, 0);
-}
-
-TensorIterator threshold_meta(
-    const Tensor& self,
-    const Scalar& threshold,
-    const Scalar& value,
-    Tensor& out) {
-  TensorIterator iter;
-  iter.build(TensorIteratorConfig()
-                 .set_check_mem_overlap(
-                     false) // threshold is idempotent, so overlap is okay
-                 .add_output(out)
-                 .add_const_input(self)
-                 .add_const_input(self) // other
-                 .allow_cpu_scalars(true)
-                 .promote_inputs_to_common_dtype(true)
-                 .cast_common_dtype_to_outputs(true)
-                 .enforce_safe_casting_to_output(true));
-  return iter;
-}
-
-Tensor XPUNativeFunctions::threshold(
-    const Tensor& self,
-    const Scalar& threshold,
-    const Scalar& value) {
-  Tensor out;
-  auto iter = threshold_meta(self, threshold, value, out);
-  native::xpu::threshold_kernel(iter, threshold, value);
-  return iter.output();
-}
-
-Tensor& XPUNativeFunctions::threshold_(
-    Tensor& self,
-    const Scalar& threshold,
-    const Scalar& value) {
-  auto iter = threshold_meta(self, threshold, value, self);
-  native::xpu::threshold_kernel(iter, threshold, value);
-  return self;
-}
-
-Tensor& XPUNativeFunctions::threshold_out(
-    const Tensor& self,
-    const Scalar& threshold,
-    const Scalar& value,
-    Tensor& out) {
-  auto iter = threshold_meta(self, threshold, value, out);
-  native::xpu::threshold_kernel(iter, threshold, value);
-  return out;
-}
-
-TensorIterator threshold_backward_meta(
-    const Tensor& grad,
-    const Tensor& self,
-    const Scalar& threshold,
-    Tensor& gradInput) {
-  TensorIterator iter;
-  iter.build(TensorIteratorConfig()
-                 .set_check_mem_overlap(
-                     false) // threshold is idempotent, so overlap is okay
-                 .add_output(gradInput)
-                 .add_input(self)
-                 .add_input(grad) // other
-                 .allow_cpu_scalars(true)
-                 .promote_inputs_to_common_dtype(true)
-                 .cast_common_dtype_to_outputs(true)
-                 .enforce_safe_casting_to_output(true));
-  return iter;
-}
-
-Tensor XPUNativeFunctions::threshold_backward(
-    const Tensor& grad_output,
-    const Tensor& self,
-    const Scalar& threshold) {
-
Tensor grad_input; - auto iter = threshold_backward_meta(grad_output, self, threshold, grad_input); - native::xpu::threshold_kernel(iter, threshold, 0); - return iter.output(); -} - -Tensor& XPUNativeFunctions::threshold_backward_out( - const Tensor& grad_output, - const Tensor& self, - const Scalar& threshold, - Tensor& grad_input) { - auto iter = threshold_backward_meta(grad_output, self, threshold, grad_input); - native::xpu::threshold_kernel(iter, threshold, 0); - return grad_input; -} - -Tensor XPUNativeFunctions::gelu( - const Tensor& self, - c10::string_view approximate) { - Tensor out; - auto iter = TensorIterator::unary_op(out, self); - native::xpu::gelu_kernel(iter, approximate); - return iter.output(); -} - -Tensor& XPUNativeFunctions::gelu_(Tensor& self, c10::string_view approximate) { - auto iter = TensorIterator::unary_op(self, self); - native::xpu::gelu_kernel(iter, approximate); - return self; -} - -Tensor& XPUNativeFunctions::gelu_out( - const Tensor& self, - c10::string_view approximate, - Tensor& out) { - auto iter = TensorIterator::unary_op(out, self); - native::xpu::gelu_kernel(iter, approximate); - return out; -} - -Tensor XPUNativeFunctions::gelu_backward( - const Tensor& grad_output, - const Tensor& self, - c10::string_view approximate) { - Tensor grad_input; - auto iter = - TensorIterator::borrowing_binary_op(grad_input, grad_output, self); - native::xpu::gelu_backward_kernel(iter, approximate); - return iter.output(); -} - -Tensor& XPUNativeFunctions::gelu_backward_out( - const Tensor& grad_output, - const Tensor& self, - c10::string_view approximate, - Tensor& grad_input) { - auto iter = - TensorIterator::borrowing_binary_op(grad_input, grad_output, self); - native::xpu::gelu_backward_kernel(iter, approximate); - return grad_input; -} - -TensorIterator elu_meta( - const Tensor& self, - const Scalar& alpha, - const Scalar& scale, - const Scalar& input_scale, - Tensor& out) { - TensorIterator iter; - iter = TensorIterator::unary_op(out, self); - return iter; -} - -Tensor& XPUNativeFunctions::elu_out( - const Tensor& self, - const Scalar& alpha, - const Scalar& scale, - const Scalar& input_scale, - Tensor& out) { - auto iter = elu_meta(self, alpha, scale, input_scale, out); - native::xpu::elu_kernel(iter, alpha, scale, input_scale); - return out; -} - -Tensor XPUNativeFunctions::elu( - const Tensor& self, - const Scalar& alpha, - const Scalar& scale, - const Scalar& input_scale) { - Tensor out; - auto iter = elu_meta(self, alpha, scale, input_scale, out); - native::xpu::elu_kernel(iter, alpha, scale, input_scale); - return iter.output(); -} - -Tensor& XPUNativeFunctions::elu_( - Tensor& self, - const Scalar& alpha, - const Scalar& scale, - const Scalar& input_scale) { - auto iter = elu_meta(self, alpha, scale, input_scale, self); - native::xpu::elu_kernel(iter, alpha, scale, input_scale); - return self; -} - -TensorIterator elu_backward_meta( - const Tensor& grad_output, - const Scalar& alpha, - const Scalar& scale, - const Scalar& input_scale, - bool is_result, - const Tensor& self_or_result, - Tensor& grad_input) { - TORCH_CHECK( - !is_result || alpha.to() >= 0.0, - "In-place elu backward calculation is triggered with a negative slope which is not supported. 
" - "This is caused by calling in-place forward function with a negative slope, " - "please call out-of-place version instead."); - - TensorIterator iter; - iter = TensorIterator::borrowing_binary_op( - grad_input, grad_output, self_or_result); - return iter; -} - -Tensor& XPUNativeFunctions::elu_backward_out( - const Tensor& grad_output, - const Scalar& alpha, - const Scalar& scale, - const Scalar& input_scale, - bool is_result, - const Tensor& self_or_result, - Tensor& grad_input) { - auto iter = elu_backward_meta( - grad_output, - alpha, - scale, - input_scale, - is_result, - self_or_result, - grad_input); - native::xpu::elu_backward_kernel(iter, alpha, scale, input_scale, is_result); - return grad_input; -} - -Tensor XPUNativeFunctions::elu_backward( - const Tensor& grad_output, - const Scalar& alpha, - const Scalar& scale, - const Scalar& input_scale, - bool is_result, - const Tensor& self_or_result) { - Tensor grad_input; - auto iter = elu_backward_meta( - grad_output, - alpha, - scale, - input_scale, - is_result, - self_or_result, - grad_input); - native::xpu::elu_backward_kernel(iter, alpha, scale, input_scale, is_result); - return iter.output(); -} - -Tensor XPUNativeFunctions::silu(const Tensor& self) { - Tensor out; - auto iter = TensorIterator::unary_op(out, self); - native::xpu::silu_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::silu_(Tensor& self) { - auto iter = TensorIterator::unary_op(self, self); - native::xpu::silu_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::silu_out(const Tensor& self, Tensor& out) { - auto iter = TensorIterator::unary_op(out, self); - native::xpu::silu_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::silu_backward( - const Tensor& grad_output, - const Tensor& self) { - Tensor grad_input; - auto iter = - TensorIterator::borrowing_binary_op(grad_input, grad_output, self); - native::xpu::silu_backward_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::silu_backward_out( - const Tensor& grad_output, - const Tensor& self, - Tensor& grad_input) { - auto iter = - TensorIterator::borrowing_binary_op(grad_input, grad_output, self); - native::xpu::silu_backward_kernel(iter); - return grad_input; -} - -Tensor XPUNativeFunctions::hardtanh( - const Tensor& self, - const Scalar& min, - const Scalar& max) { - Tensor result = at::empty_like(self); - return at::hardtanh_out(result, self, min, max); -} - -Tensor& XPUNativeFunctions::hardtanh_out( - const Tensor& self, - const Scalar& min, - const Scalar& max, - Tensor& result) { - TORCH_CHECK( - self.scalar_type() != at::kBool, - "Boolean inputs not supported for hardtanh"); - Scalar min_, max_; - if (at::isIntegralType(self.scalar_type(), /*include_bool*/ false)) { - int64_t minval = min.toLong(); - int64_t maxval = max.toLong(); - TORCH_CHECK( - self.dtype() != at::kByte || (minval >= 0 && maxval >= 0), - "cannot do hardtanh on an unsigned type with negative limits"); - min_ = minval; - max_ = maxval; - } else { - min_ = min; - max_ = max; - } - return at::clamp_out(result, self, min_, max_); -} - -Tensor& XPUNativeFunctions::hardtanh_( - Tensor& self, - const Scalar& min, - const Scalar& max) { - return at::hardtanh_out(self, self, min, max); -} - -Tensor& XPUNativeFunctions::hardtanh_backward_out( - const Tensor& grad_output, - const Tensor& self, - const Scalar& min, - const Scalar& max, - Tensor& grad_input) { - auto iter = - TensorIterator::borrowing_binary_op(grad_input, grad_output, self); - native::xpu::hardtanh_backward_kernel(iter, 
min, max); - return grad_input; -} - -Tensor XPUNativeFunctions::hardtanh_backward( - const Tensor& grad_output, - const Tensor& self, - const Scalar& min, - const Scalar& max) { - Tensor result; - auto iter = TensorIterator::borrowing_binary_op(result, grad_output, self); - native::xpu::hardtanh_backward_kernel(iter, min, max); - return iter.output(); -} - -Tensor XPUNativeFunctions::hardswish(const Tensor& self) { - Tensor result; - auto iter = TensorIterator::unary_op(result, self); - native::xpu::hardswish_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::hardswish_out(const Tensor& self, Tensor& result) { - auto iter = TensorIterator::unary_op(result, self); - native::xpu::hardswish_kernel(iter); - return result; -} - -Tensor& XPUNativeFunctions::hardswish_(Tensor& self) { - auto iter = TensorIterator::unary_op(self, self); - native::xpu::hardswish_kernel(iter); - return self; -} - -Tensor XPUNativeFunctions::hardswish_backward( - const Tensor& grad_output, - const Tensor& self) { - Tensor grad_input; - auto iter = - TensorIterator::borrowing_binary_op(grad_input, grad_output, self); - native::xpu::hardswish_backward_kernel(iter); - return iter.output(); -} - -Tensor XPUNativeFunctions::hardsigmoid(const Tensor& self) { - Tensor out; - auto iter = TensorIterator::unary_op(out, self); - native::xpu::hardsigmoid_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::hardsigmoid_(Tensor& self) { - auto iter = TensorIterator::unary_op(self, self); - native::xpu::hardsigmoid_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::hardsigmoid_out(const Tensor& self, Tensor& out) { - auto iter = TensorIterator::unary_op(out, self); - native::xpu::hardsigmoid_kernel(iter); - return out; -} - -TensorIterator hardsigmoid_backward_meta( - const Tensor& grad_output, - const Tensor& self, - Tensor& grad_input) { - auto iter = - TensorIterator::borrowing_binary_op(grad_input, grad_output, self); - return iter; -} - -Tensor XPUNativeFunctions::hardsigmoid_backward( - const Tensor& grad_output, - const Tensor& self) { - Tensor grad_input; - auto iter = hardsigmoid_backward_meta(grad_output, self, grad_input); - native::xpu::hardsigmoid_backward_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::hardsigmoid_backward_out( - const Tensor& grad_output, - const Tensor& self, - Tensor& grad_input) { - auto iter = hardsigmoid_backward_meta(grad_output, self, grad_input); - native::xpu::hardsigmoid_backward_kernel(iter); - return grad_input; -} - -Tensor XPUNativeFunctions::leaky_relu( - const Tensor& self, - const Scalar& negval) { - Tensor out; - auto iter = TensorIterator::unary_op(out, self); - native::xpu::leaky_relu_kernel(iter, negval); - return iter.output(); -} - -Tensor& XPUNativeFunctions::leaky_relu_(Tensor& self, const Scalar& negval) { - auto iter = TensorIterator::unary_op(self, self); - native::xpu::leaky_relu_kernel(iter, negval); - return self; -} - -Tensor& XPUNativeFunctions::leaky_relu_out( - const Tensor& self, - const Scalar& negval, - Tensor& out) { - auto iter = TensorIterator::unary_op(out, self); - native::xpu::leaky_relu_kernel(iter, negval); - return out; -} - -TensorIterator leaky_relu_backward_meta( - const Tensor& grad_output, - const Tensor& self, - const Scalar& negval, - bool is_result, - const Tensor& grad_input) { - TORCH_CHECK( - !is_result || negval.to() >= 0.0, - "In-place leakyReLu backward calculation is triggered with a negative slope which is not supported. 
" - "This is caused by calling in-place forward function with a negative slope, " - "please call out-of-place version instead. File an issue at https://github.com/pytorch/pytorch if you do " - "require supporting in-place leakRelu backward calculation with negative slope"); - - return TensorIterator::borrowing_binary_op(grad_input, self, grad_output); -} - -Tensor XPUNativeFunctions::leaky_relu_backward( - const Tensor& grad_output, - const Tensor& self, - const Scalar& negval, - bool is_result) { - Tensor grad_input; - auto iter = leaky_relu_backward_meta( - grad_output, self, negval, is_result, grad_input); - native::xpu::leaky_relu_backward_kernel(iter, negval); - return iter.output(); -} - -Tensor& XPUNativeFunctions::leaky_relu_backward_out( - const Tensor& grad_output, - const Tensor& self, - const Scalar& negval, - bool is_result, - Tensor& grad_input) { - auto iter = leaky_relu_backward_meta( - grad_output, self, negval, is_result, grad_input); - native::xpu::leaky_relu_backward_kernel(iter, negval); - return grad_input; -} - -TensorIterator softplus_meta( - const Tensor& self, - const Scalar& beta, - const Scalar& threshold, - Tensor& out) { - return TensorIterator::unary_op(out, self); -} - -Tensor XPUNativeFunctions::softplus( - const Tensor& self, - const Scalar& beta, - const Scalar& threshold) { - Tensor out; - auto iter = softplus_meta(self, beta, threshold, out); - native::xpu::softplus_kernel(iter, beta, threshold); - return iter.output(); -} - -Tensor& XPUNativeFunctions::softplus_out( - const Tensor& self, - const Scalar& beta, - const Scalar& threshold, - Tensor& out) { - auto iter = softplus_meta(self, beta, threshold, out); - native::xpu::softplus_kernel(iter, beta, threshold); - return out; -} - -TensorIterator softplus_backward_meta( - const Tensor& grad_output, - const Tensor& self, - const Scalar& beta, - const Scalar& threshold, - Tensor& grad_input) { - return TensorIterator::borrowing_binary_op(grad_input, grad_output, self); -} - -Tensor XPUNativeFunctions::softplus_backward( - const Tensor& grad_output, - const Tensor& self, - const Scalar& beta, - const Scalar& threshold) { - Tensor grad_input; - auto iter = - softplus_backward_meta(grad_output, self, beta, threshold, grad_input); - native::xpu::softplus_backward_kernel(iter, beta, threshold); - return iter.output(); -} - -Tensor& XPUNativeFunctions::softplus_backward_out( - const Tensor& grad_output, - const Tensor& self, - const Scalar& beta, - const Scalar& threshold, - Tensor& grad_input) { - auto iter = - softplus_backward_meta(grad_output, self, beta, threshold, grad_input); - native::xpu::softplus_backward_kernel(iter, beta, threshold); - return grad_input; -} - -static inline void softshrink_check(const Scalar& lambd) { - double lamb = lambd.to(); - TORCH_CHECK( - lamb >= 0, - "lambda must be greater or equal to 0, but found to be ", - lamb, - "."); -} - -TensorIterator softshrink_meta( - const Tensor& self, - const Scalar& lambd, - Tensor& out) { - softshrink_check(lambd); - return TensorIterator::unary_op(out, self); -} - -Tensor XPUNativeFunctions::softshrink(const Tensor& self, const Scalar& lambd) { - Tensor out; - auto iter = softshrink_meta(self, lambd, out); - native::xpu::softshrink_kernel(iter, lambd); - return iter.output(); -} - -Tensor& XPUNativeFunctions::softshrink_out( - const Tensor& self, - const Scalar& lambd, - Tensor& out) { - auto iter = softshrink_meta(self, lambd, out); - native::xpu::softshrink_kernel(iter, lambd); - return out; -} - -TensorIterator softshrink_backward_meta( 
- const Tensor& grad_output, - const Tensor& self, - const Scalar& lambd, - Tensor& grad_input) { - return TensorIterator::borrowing_binary_op(grad_input, grad_output, self); -} - -Tensor XPUNativeFunctions::softshrink_backward( - const Tensor& grad_output, - const Tensor& self, - const Scalar& lambd) { - Tensor grad_input; - auto iter = softshrink_backward_meta(grad_output, self, lambd, grad_input); - native::xpu::softshrink_backward_kernel(iter, lambd); - return iter.output(); -} - -Tensor& XPUNativeFunctions::softshrink_backward_out( - const Tensor& grad_output, - const Tensor& self, - const Scalar& lambd, - Tensor& grad_input) { - auto iter = softshrink_backward_meta(grad_output, self, lambd, grad_input); - native::xpu::softshrink_backward_kernel(iter, lambd); - return grad_input; -} - -Tensor XPUNativeFunctions::_prelu_kernel( - const Tensor& self, - const Tensor& weight) { - // Weight broadcasts over self and they have the same dtype - auto result = at::empty_like(self); - auto iter = TensorIteratorConfig() - .add_output(result) - .add_const_input(self) - .add_const_input(weight) - .build(); - native::xpu::prelu_kernel(iter); - return result; -} - -std::tuple XPUNativeFunctions::_prelu_kernel_backward( - const Tensor& grad_out, - const Tensor& self, - const Tensor& weight) { - Tensor grad_self = at::empty({0}, self.options()); - Tensor grad_weight = at::empty({0}, weight.options()); - auto iter = TensorIteratorConfig() - .add_output(grad_self) - .add_output(grad_weight) - .add_const_input(self) - .add_const_input(weight) - .add_const_input(grad_out) - .build(); - native::xpu::prelu_backward_kernel(iter); - return {grad_self, grad_weight}; -} - -std::tuple XPUNativeFunctions::log_sigmoid_forward_out( +namespace native { +REGISTER_XPU_DISPATCH(threshold_stub, &xpu::threshold_kernel); +REGISTER_XPU_DISPATCH(elu_stub, &xpu::elu_kernel); +REGISTER_XPU_DISPATCH(elu_backward_stub, &xpu::elu_backward_kernel); +REGISTER_XPU_DISPATCH(silu_stub, &xpu::silu_kernel); +REGISTER_XPU_DISPATCH(silu_backward_stub, &xpu::silu_backward_kernel); +REGISTER_XPU_DISPATCH(hardswish_stub, &xpu::hardswish_kernel); +REGISTER_XPU_DISPATCH(hardswish_backward_stub, &xpu::hardswish_backward_kernel); +REGISTER_XPU_DISPATCH(hardtanh_backward_stub, &xpu::hardtanh_backward_kernel); +REGISTER_XPU_DISPATCH(hardsigmoid_stub, &xpu::hardsigmoid_kernel); +REGISTER_XPU_DISPATCH( + hardsigmoid_backward_stub, + &xpu::hardsigmoid_backward_kernel); +REGISTER_XPU_DISPATCH(leaky_relu_stub, &xpu::leaky_relu_kernel); +REGISTER_XPU_DISPATCH( + leaky_relu_backward_stub, + &xpu::leaky_relu_backward_kernel); +REGISTER_XPU_DISPATCH(softplus_stub, &xpu::softplus_kernel); +REGISTER_XPU_DISPATCH(softplus_backward_stub, &xpu::softplus_backward_kernel); +REGISTER_XPU_DISPATCH(softshrink_stub, &xpu::softshrink_kernel); +REGISTER_XPU_DISPATCH(shrink_backward_stub, &xpu::softshrink_backward_kernel); +REGISTER_XPU_DISPATCH(mish_stub, &xpu::mish_kernel); +REGISTER_XPU_DISPATCH(mish_backward_stub, &xpu::mish_backward_kernel); +REGISTER_XPU_DISPATCH( + log_sigmoid_backward_stub, + &xpu::log_sigmoid_backward_kernel); +REGISTER_XPU_DISPATCH(prelu_stub, &xpu::prelu_kernel); +REGISTER_XPU_DISPATCH(prelu_backward_stub, &xpu::prelu_backward_kernel); + +TORCH_IMPL_FUNC(gelu_backward_out_xpu) +(const Tensor& /*grad*/, + const Tensor& /*self*/, + c10::string_view approximate, + const Tensor& /*grad_input*/ +) { + xpu::gelu_backward_kernel(*this, approximate); +} + +TORCH_IMPL_FUNC(gelu_out_xpu) +(const Tensor& /*self*/, c10::string_view approximate, const 
Tensor& /*result*/ +) { + xpu::gelu_kernel(*this, approximate); +} + +std::tuple log_sigmoid_forward_out_xpu( const Tensor& input, Tensor& result, Tensor& buffer) { @@ -675,72 +81,41 @@ std::tuple XPUNativeFunctions::log_sigmoid_forward_out( return std::forward_as_tuple(result, buffer); } -std::tuple XPUNativeFunctions::log_sigmoid_forward( - const Tensor& input) { +std::tuple log_sigmoid_forward_xpu(const Tensor& input) { auto result = at::empty_like(input); auto buffer = at::empty({0}, input.options()); - log_sigmoid_forward_out(input, result, buffer); + log_sigmoid_forward_out_xpu(input, result, buffer); return std::forward_as_tuple(result, buffer); } -TensorIterator log_sigmoid_backward_meta( +Tensor& log_sigmoid_backward_xpu_out( const Tensor& grad_output, const Tensor& input, - const Tensor& grad_input) { - TensorIterator iter; - iter.build(TensorIteratorConfig() - .add_output(grad_input) - .add_const_input(input) - .add_const_input(grad_output)); - return iter; + const Tensor& buffer, + Tensor& grad_input) { + auto iter = TensorIteratorConfig() + .add_output(grad_input) + .add_const_input(input) + .add_const_input(grad_output) + .build(); + log_sigmoid_backward_stub(kXPU, iter); + return grad_input; } -Tensor XPUNativeFunctions::log_sigmoid_backward( +Tensor log_sigmoid_backward_xpu( const Tensor& grad_output, const Tensor& input, const Tensor& buffer) { auto grad_input = at::empty_like(grad_output); - auto iter = log_sigmoid_backward_meta(grad_output, input, grad_input); - native::xpu::log_sigmoid_backward_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::log_sigmoid_backward_out( - const Tensor& grad_output, - const Tensor& input, - const Tensor& buffer, - Tensor& grad_input) { - auto iter = log_sigmoid_backward_meta(grad_output, input, grad_input); - native::xpu::log_sigmoid_backward_kernel(iter); - return grad_input; -} - -Tensor XPUNativeFunctions::mish(const Tensor& self) { - Tensor out; - auto iter = TensorIterator::unary_op(out, self); - native::xpu::mish_kernel(iter); + // NOTE: buffer is only used by CPU dispatch, we just ignore it here + auto iter = at::TensorIteratorConfig() + .add_output(grad_input) + .add_const_input(input) + .add_const_input(grad_output) + .build(); + log_sigmoid_backward_stub(kXPU, iter); return iter.output(); } -Tensor& XPUNativeFunctions::mish_out(const Tensor& self, Tensor& out) { - auto iter = TensorIterator::unary_op(out, self); - native::xpu::mish_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::mish_(Tensor& self) { - auto iter = TensorIterator::unary_op(self, self); - native::xpu::mish_kernel(iter); - return self; -} - -Tensor XPUNativeFunctions::mish_backward( - const Tensor& grad_output, - const Tensor& input) { - Tensor grad_input = at::empty({0}, input.options()); - auto iter = TensorIterator::binary_op(grad_input, grad_output, input); - native::xpu::mish_backward_kernel(iter); - return grad_input; -} - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp index 44ca61805..00aba1011 100644 --- a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp +++ b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp @@ -1,15 +1,14 @@ -#include + #include #include #include -#include -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else -#include -#endif +#include + +#include +#include +#include +#include #include @@ -89,7 +88,8 @@ Tensor mean_backward( } } // namespace -Tensor 
XPUNativeFunctions::_adaptive_avg_pool2d_backward( +namespace native { +Tensor adaptive_avg_pool2d_backward_xpu( const Tensor& grad_output, const Tensor& input) { TensorArg grad_output_arg{grad_output, "grad_output", 1}, @@ -127,7 +127,7 @@ Tensor XPUNativeFunctions::_adaptive_avg_pool2d_backward( return grad_input; } -Tensor& XPUNativeFunctions::adaptive_avg_pool2d_out( +Tensor& adaptive_avg_pool2d_out_xpu( const Tensor& input, IntArrayRef output_size, Tensor& output) { @@ -166,17 +166,18 @@ Tensor& XPUNativeFunctions::adaptive_avg_pool2d_out( output.as_strided__symint({n, c, 1, 1}, {c, 1, c, c}); } } else { - native::xpu::adaptive_avg_pool2d_kernel(output, input, output_size); + xpu::adaptive_avg_pool2d_kernel(output, input, output_size); } return output; } -Tensor XPUNativeFunctions::_adaptive_avg_pool2d( +Tensor adaptive_avg_pool2d_xpu( at::Tensor const& input, IntArrayRef output_size) { auto output = at::empty({0}, input.options()); - adaptive_avg_pool2d_out(input, output_size, output); + adaptive_avg_pool2d_out_xpu(input, output_size, output); return output; } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp b/src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp index 55c6a2964..6098072ac 100644 --- a/src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp +++ b/src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp @@ -1,214 +1,50 @@ #include #include -#include #include #include -namespace at { - -void adaptive_max_pool2d_meta( - const Tensor& input, - IntArrayRef output_size, - Tensor& output, - Tensor& indices) { - int ndim = input.ndimension(); - TORCH_CHECK( - ndim == 3 || ndim == 4, - "adaptive_max_pool2d(): Expected 3D or 4D tensor, but got: ", - input.sizes()); - for (const auto i : c10::irange(1, ndim)) { - TORCH_CHECK( - input.size(i) > 0, - "adaptive_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, " - "but input has sizes ", - input.sizes(), - " with dimension ", - i, - " being empty"); - } - - TORCH_CHECK( - output_size.size() == 2, - "adaptive_max_pool2d(): internal error: output_size.size() must be 2"); - - int dimH = 1; - int64_t sizeB = 1; - int64_t sizeD = 0; - - if (input.ndimension() == 4) { - sizeB = input.size(0); - dimH++; - } - - sizeD = input.size(dimH - 1); - - int64_t osizeH = output_size[0]; - int64_t osizeW = output_size[1]; +#include +#include - /* resize output */ - if (input.ndimension() == 3) { - if (output.defined()) { - at::xpu::resize_out(output, {sizeD, osizeH, osizeW}, {}, input.options()); - } else { - output = - at::xpu::create_out({sizeD, osizeH, osizeW}, {}, input.options()); - } - if (indices.defined()) { - at::xpu::resize_out( - indices, {sizeD, osizeH, osizeW}, {}, input.options()); - } else { - indices = at::xpu::create_out( - {sizeD, osizeH, osizeW}, {}, input.options().dtype(kLong)); - } - } else { - if (output.defined()) { - at::xpu::resize_out( - output, - {sizeB, sizeD, osizeH, osizeW}, - {}, - input.options().memory_format(input.suggest_memory_format())); - } else { - output = at::xpu::create_out( - {sizeB, sizeD, osizeH, osizeW}, - {}, - input.options().memory_format(input.suggest_memory_format())); - } - if (indices.defined()) { - at::xpu::resize_out( - indices, - {sizeB, sizeD, osizeH, osizeW}, - {}, - input.options() - .memory_format(input.suggest_memory_format()) - .dtype(kLong)); - } else { - indices = at::xpu::create_out( - {sizeB, sizeD, osizeH, osizeW}, - {}, - input.options() - .memory_format(input.suggest_memory_format()) - .dtype(kLong)); - } - } -} - -std::tuple 
XPUNativeFunctions::adaptive_max_pool2d( - const Tensor& input, - IntArrayRef output_size) { - TensorArg input_arg{input, "input", 1}; - checkAllSameGPU(__func__, {input_arg}); - - Tensor output, indices; - adaptive_max_pool2d_meta(input, output_size, output, indices); - - if (input.numel() == 0) { - return {output, indices}; - } - - native::xpu::adaptive_max_pool2d_kernel(input, output_size, output, indices); - return {output, indices}; -} +namespace at { +namespace native { -std::tuple XPUNativeFunctions::adaptive_max_pool2d_out( - const Tensor& input, - IntArrayRef output_size, - Tensor& output, - Tensor& indices) { +TORCH_IMPL_FUNC(adaptive_max_pool2d_out_xpu) +(const Tensor& input, + IntArrayRef output_size, + const Tensor& output, + const Tensor& indices) { TensorArg output_arg{output, "output", 1}; TensorArg indices_arg{indices, "indices", 2}; TensorArg input_arg{input, "input", 3}; checkAllSameGPU(__func__, {output_arg, indices_arg, input_arg}); - - adaptive_max_pool2d_meta(input, output_size, output, indices); - if (input.numel() == 0) { - return {output, indices}; - } - - native::xpu::adaptive_max_pool2d_kernel(input, output_size, output, indices); - return {output, indices}; -} - -void adaptive_max_pool2d_backward_meta( - const Tensor& grad_output, - const Tensor& input, - const Tensor& indices, - Tensor& grad_input) { - int64_t ndim = grad_output.ndimension(); - TORCH_CHECK( - ndim == 3 || ndim == 4, - "adaptive_max_pooling2d_backward(): Expected 3D or 4D grad_output, but got: ", - grad_output.sizes()); - - at::native::adaptive_pool_empty_output_check( - grad_output, "adaptive_max_pool2d_backward"); - - TORCH_CHECK( - input.dtype() == grad_output.dtype(), - "expected dtype ", - input.dtype(), - " for `grad_output` but got dtype ", - grad_output.dtype()); - - if (grad_input.defined()) { - at::xpu::resize_out( - grad_input, - input.sizes(), - {}, - input.options().memory_format(input.suggest_memory_format())); - } else { - grad_input = at::xpu::create_out( - input.sizes(), - {}, - input.options().memory_format(input.suggest_memory_format())); + return; } -} - -Tensor XPUNativeFunctions::adaptive_max_pool2d_backward( - const Tensor& grad_output, - const Tensor& input, - const Tensor& indices) { - TensorArg grad_output_arg{grad_output, "grad_output", 1}; - TensorArg input_arg{input, "input", 2}; - TensorArg indices_arg{indices, "indices", 3}; - - checkAllSameGPU(__func__, {grad_output_arg, input_arg, indices_arg}); - - Tensor grad_input; - adaptive_max_pool2d_backward_meta(grad_output, input, indices, grad_input); - if (grad_output.numel() == 0) { - return grad_input; - } - - native::xpu::adaptive_max_pool2d_backward_kernel( - grad_output, input, indices, grad_input); - return grad_input; + xpu::adaptive_max_pool2d_kernel(input, output_size, output, indices); } -Tensor& XPUNativeFunctions::adaptive_max_pool2d_backward_out( - const Tensor& grad_output, - const Tensor& input, - const Tensor& indices, - Tensor& grad_input) { - TensorArg grad_input_arg{grad_input, "grad_input", 1}; - TensorArg grad_output_arg{grad_output, "grad_output", 2}; +TORCH_IMPL_FUNC(adaptive_max_pool2d_backward_out_xpu) +(const Tensor& gradOutput, + const Tensor& input, + const Tensor& indices, + const Tensor& gradInput) { + TensorArg grad_input_arg{gradInput, "grad_input", 1}; + TensorArg grad_output_arg{gradOutput, "grad_output", 2}; TensorArg input_arg{input, "input", 3}; TensorArg indices_arg{indices, "indices", 4}; checkAllSameGPU( __func__, {grad_input_arg, grad_output_arg, input_arg, indices_arg}); - 
adaptive_max_pool2d_backward_meta(grad_output, input, indices, grad_input); - - if (grad_output.numel() == 0) { - return grad_input; + if (gradOutput.numel() == 0) { + return; } - - native::xpu::adaptive_max_pool2d_backward_kernel( - grad_output, input, indices, grad_input); - return grad_input; + xpu::adaptive_max_pool2d_backward_kernel( + gradOutput, input, indices, gradInput); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/AmpKernels.cpp b/src/ATen/native/xpu/AmpKernels.cpp index 5ff9705d0..32216d354 100644 --- a/src/ATen/native/xpu/AmpKernels.cpp +++ b/src/ATen/native/xpu/AmpKernels.cpp @@ -1,12 +1,11 @@ -#include #include -#include +#include #include namespace at { - -void XPUNativeFunctions::_amp_foreach_non_finite_check_and_unscale_( +namespace native { +void _amp_foreach_non_finite_check_and_unscale_xpu_( TensorList scaled_grads, Tensor& found_inf, const Tensor& inv_scale) { @@ -79,7 +78,7 @@ void XPUNativeFunctions::_amp_foreach_non_finite_check_and_unscale_( tensor_lists, found_inf, inv_scale); } -Tensor& XPUNativeFunctions::_amp_update_scale_( +Tensor& _amp_update_scale_xpu_( Tensor& current_scale, Tensor& growth_tracker, const Tensor& found_inf, @@ -115,5 +114,5 @@ Tensor& XPUNativeFunctions::_amp_update_scale_( return current_scale; } - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/AveragePool2d.cpp b/src/ATen/native/xpu/AveragePool2d.cpp index 4d3cc1c0e..326ad8a51 100644 --- a/src/ATen/native/xpu/AveragePool2d.cpp +++ b/src/ATen/native/xpu/AveragePool2d.cpp @@ -1,314 +1,63 @@ -#include #include #include -#include +#include #include #include -namespace at { -using namespace at::native; -using namespace at::native::xpu; - -Tensor& avg_pool2d_meta( - const Tensor& input, - Tensor& output, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - bool ceil_mode, - bool count_include_pad, - std::optional divisor_override) { - TORCH_CHECK( - kernel_size.size() == 1 || kernel_size.size() == 2, - "avg_pool2d: kernel_size must either be a single int, or a tuple " - "of two ints"); - const int64_t kH = kernel_size[0]; - const int64_t kW = kernel_size.size() == 1 ? kH : kernel_size[1]; - - TORCH_CHECK( - stride.empty() || stride.size() == 1 || stride.size() == 2, - "avg_pool2d: stride must either be omitted, a single int, or a " - "tuple of two ints"); - const int64_t dH = stride.empty() ? kH : stride[0]; - const int64_t dW = stride.empty() ? kW : stride.size() == 1 ? dH : stride[1]; - - TORCH_CHECK( - padding.size() == 1 || padding.size() == 2, - "avg_pool2d: padding must either be a single int, or a tuple of " - "two ints"); - const int64_t padH = padding[0]; - const int64_t padW = padding.size() == 1 ? padH : padding[1]; - - TORCH_CHECK( - !divisor_override.has_value() || divisor_override.value() != 0, - "divisor must be not zero"); - - const int64_t nbatch = input.ndimension() == 4 ? 
input.size(-4) : 1; - const int64_t nInputPlane = input.size(-3); - const int64_t inputHeight = input.size(-2); - const int64_t inputWidth = input.size(-1); - - const int64_t outputHeight = - pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode); - const int64_t outputWidth = - pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode); - - auto memory_format = input.suggest_memory_format(); - pool2d_shape_check( - input, - kH, - kW, - dH, - dW, - padH, - padW, - 1, - 1, - nInputPlane, - inputHeight, - inputWidth, - outputHeight, - outputWidth, - memory_format); - - /* resize output */ - if (input.ndimension() == 3) { - if (output.defined()) { - at::xpu::resize_out( - output, - {nInputPlane, outputHeight, outputWidth}, - {}, - input.options()); - } else { - output = at::xpu::create_out( - {nInputPlane, outputHeight, outputWidth}, {}, input.options()); - } - } else { - if (output.defined()) { - at::xpu::resize_out( - output, - {nbatch, nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format)); - } else { - output = at::xpu::create_out( - {nbatch, nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format)); - } - } - - return output; -} - -Tensor& avg_pool2d_backward_meta( - const Tensor& gradOutput_, - Tensor& grad_input, - const Tensor& input, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - bool ceil_mode, - bool count_include_pad, - std::optional divisor_override) { - TORCH_CHECK( - kernel_size.size() == 1 || kernel_size.size() == 2, - "avg_pool2d: kernel_size must either be a single int, or a tuple " - "of two ints"); - const int kH = safe_downcast(kernel_size[0]); - const int kW = kernel_size.size() == 1 - ? kH - : safe_downcast(kernel_size[1]); - - TORCH_CHECK( - stride.empty() || stride.size() == 1 || stride.size() == 2, - "avg_pool2d: stride must either be omitted, a single int, or a " - "tuple of two ints"); - const int dH = stride.empty() ? kH : safe_downcast(stride[0]); - const int dW = stride.empty() ? kW - : stride.size() == 1 ? dH - : safe_downcast(stride[1]); - - TORCH_CHECK( - padding.size() == 1 || padding.size() == 2, - "avg_pool2d: padding must either be a single int, or a tuple of " - "two ints"); - const int padH = safe_downcast(padding[0]); - const int padW = - padding.size() == 1 ? padH : safe_downcast(padding[1]); - - TORCH_CHECK( - !divisor_override.has_value() || divisor_override.value() != 0, - "divisor must be not zero"); - - /* sizes */ - const int64_t nbatch = input.ndimension() == 4 ? 
input.size(-4) : 1; - const int64_t nInputPlane = input.size(-3); // number of channels (or colors) - const int64_t inputHeight = input.size(-2); - const int64_t inputWidth = input.size(-1); - const int64_t outputWidth = - pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode); - const int64_t outputHeight = - pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode); - - auto memory_format = input.suggest_memory_format(); - avg_pool2d_backward_shape_check( - input, - gradOutput_, - nbatch, - kH, - kW, - dH, - dW, - padH, - padW, - nInputPlane, - inputHeight, - inputWidth, - outputHeight, - outputWidth, - memory_format); - - if (grad_input.defined()) { - at::xpu::resize_out( - grad_input, - input.sizes(), - {}, - input.options().memory_format(memory_format)); - } else { - grad_input = at::xpu::create_out( - input.sizes(), {}, input.options().memory_format(memory_format)); - } - return grad_input; -} - -Tensor XPUNativeFunctions::avg_pool2d( - const Tensor& input, - at::IntArrayRef kernel_size, - at::IntArrayRef stride, - at::IntArrayRef padding, - bool ceil_mode, - bool count_include_pad, - c10::optional divisor_override) { - Tensor output; - output = avg_pool2d_meta( - input, - output, - kernel_size, - stride, - padding, - ceil_mode, - count_include_pad, - divisor_override); - - at::native::xpu::avg_pool2d_kernel( - input, - kernel_size, - stride, - padding, - ceil_mode, - count_include_pad, - divisor_override, - output); - return output; -} - -Tensor& XPUNativeFunctions::avg_pool2d_out( - const Tensor& input, - at::IntArrayRef kernel_size, - at::IntArrayRef stride, - at::IntArrayRef padding, - bool ceil_mode, - bool count_include_pad, - c10::optional divisor_override, - Tensor& output) { - avg_pool2d_meta( - input, - output, - kernel_size, - stride, - padding, - ceil_mode, - count_include_pad, - divisor_override); +#include +#include - at::native::xpu::avg_pool2d_kernel( - input, - kernel_size, - stride, - padding, +namespace at { +namespace native { + +TORCH_IMPL_FUNC(avg_pool2d_out_xpu) +(const Tensor& input_, + int64_t kH_, + int64_t kW_, + int64_t dH_, + int64_t dW_, + int64_t padH_, + int64_t padW_, + bool ceil_mode, + bool count_include_pad, + std::optional divisor_override, + const Tensor& output) { + xpu::avg_pool2d_kernel( + input_, + kH_, + kW_, + dH_, + dW_, + padH_, + padW_, ceil_mode, count_include_pad, divisor_override, output); - return output; -} - -Tensor XPUNativeFunctions::avg_pool2d_backward( - const Tensor& grad_output, - const Tensor& input, - at::IntArrayRef kernel_size, - at::IntArrayRef stride, - at::IntArrayRef padding, - bool ceil_mode, - bool count_include_pad, - c10::optional divisor_override) { - Tensor grad_input; - grad_input = avg_pool2d_backward_meta( - grad_output, - grad_input, - input, - kernel_size, - stride, - padding, - ceil_mode, - count_include_pad, - divisor_override); - at::native::xpu::avg_pool2d_backward_kernel( - grad_output, - input, - kernel_size, - stride, - padding, - ceil_mode, - count_include_pad, - divisor_override, - grad_input); - return grad_input; } -Tensor& XPUNativeFunctions::avg_pool2d_backward_out( - const Tensor& grad_output, - const Tensor& input, - at::IntArrayRef kernel_size, - at::IntArrayRef stride, - at::IntArrayRef padding, - bool ceil_mode, - bool count_include_pad, - c10::optional divisor_override, - Tensor& grad_input) { - avg_pool2d_backward_meta( - grad_output, - grad_input, - input, - kernel_size, - stride, - padding, - ceil_mode, - count_include_pad, - divisor_override); - 
at::native::xpu::avg_pool2d_backward_kernel( - grad_output, - input, +TORCH_IMPL_FUNC(avg_pool2d_backward_out_xpu) +(const Tensor& gradOutput_, + const Tensor& input_, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + bool ceil_mode, + bool count_include_pad, + std::optional divisor_override, + const Tensor& gradInput) { + xpu::avg_pool2d_backward_kernel( + gradOutput_, + input_, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override, - grad_input); - return grad_input; + gradInput); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/BatchNorm.cpp b/src/ATen/native/xpu/BatchNorm.cpp index 93018263d..63e04365a 100644 --- a/src/ATen/native/xpu/BatchNorm.cpp +++ b/src/ATen/native/xpu/BatchNorm.cpp @@ -1,19 +1,19 @@ -#include #include #include #include #include -#include +#include namespace at { +namespace native { -std::tuple XPUNativeFunctions::batch_norm_stats( +std::tuple batch_norm_stats_xpu( const Tensor& input, double eps) { - return native::xpu::batch_norm_stats_kernel(input, eps); + return xpu::batch_norm_stats_kernel(input, eps); } -Tensor XPUNativeFunctions::batch_norm_elemt( +Tensor batch_norm_elemt_xpu( const Tensor& input, const std::optional& weight, const std::optional& bias, @@ -21,12 +21,11 @@ Tensor XPUNativeFunctions::batch_norm_elemt( const Tensor& invstd, double eps) { auto output = at::empty_like(input); - native::xpu::batch_norm_elemt_kernel( - output, input, weight, bias, mean, invstd); + xpu::batch_norm_elemt_kernel(output, input, weight, bias, mean, invstd); return output; } -Tensor& XPUNativeFunctions::batch_norm_elemt_out( +Tensor& batch_norm_elemt_xpu_out( const Tensor& input, const std::optional& weight, const std::optional& bias, @@ -34,25 +33,24 @@ Tensor& XPUNativeFunctions::batch_norm_elemt_out( const Tensor& invstd, double eps, Tensor& out) { - native::xpu::batch_norm_elemt_kernel(out, input, weight, bias, mean, invstd); + xpu::batch_norm_elemt_kernel(out, input, weight, bias, mean, invstd); return out; } -std::tuple XPUNativeFunctions:: - batch_norm_backward_reduce( - const Tensor& grad_out, - const Tensor& input, - const Tensor& mean, - const Tensor& invstd, - const std::optional& weight, - bool input_g, - bool weight_g, - bool bias_g) { - return native::xpu::batch_norm_backward_reduce_kernel( +std::tuple batch_norm_backward_reduce_xpu( + const Tensor& grad_out, + const Tensor& input, + const Tensor& mean, + const Tensor& invstd, + const std::optional& weight, + bool input_g, + bool weight_g, + bool bias_g) { + return xpu::batch_norm_backward_reduce_kernel( grad_out, input, mean, invstd, weight, input_g, weight_g, bias_g); } -Tensor XPUNativeFunctions::batch_norm_backward_elemt( +Tensor batch_norm_backward_elemt_xpu( const Tensor& grad_out, const Tensor& input, const Tensor& mean, @@ -61,20 +59,20 @@ Tensor XPUNativeFunctions::batch_norm_backward_elemt( const Tensor& sum_dy, const Tensor& sum_dy_xmu, const Tensor& count) { - return native::xpu::batch_norm_backward_elemt_kernel( + return xpu::batch_norm_backward_elemt_kernel( grad_out, input, mean, invstd, weight, sum_dy, sum_dy_xmu, count); } -std::tuple XPUNativeFunctions::batch_norm_update_stats( +std::tuple batch_norm_update_stats_xpu( const Tensor& input, const std::optional& running_mean, const std::optional& running_var, double momentum) { - return native::xpu::batch_norm_update_stats_kernel( + return xpu::batch_norm_update_stats_kernel( input, running_mean, running_var, momentum); } -std::tuple 
XPUNativeFunctions::native_batch_norm( +std::tuple batch_norm_xpu( const Tensor& input, const std::optional& weight, const std::optional& bias, @@ -90,7 +88,7 @@ std::tuple XPUNativeFunctions::native_batch_norm( auto save_mean = at::empty({n_input}, options); auto save_invstd = at::empty({n_input}, options); - native::xpu::batch_norm_kernel( + xpu::batch_norm_kernel( input, weight, bias, @@ -106,7 +104,7 @@ std::tuple XPUNativeFunctions::native_batch_norm( return std::make_tuple(output, save_mean, save_invstd); } -std::tuple XPUNativeFunctions::native_batch_norm_out( +std::tuple batch_norm_xpu_out( const Tensor& input, const std::optional& weight, const std::optional& bias, @@ -118,7 +116,7 @@ std::tuple XPUNativeFunctions::native_batch_norm_out( Tensor& out, Tensor& save_mean, Tensor& save_invstd) { - return native::xpu::batch_norm_kernel( + return xpu::batch_norm_kernel( input, weight, bias, @@ -132,19 +130,18 @@ std::tuple XPUNativeFunctions::native_batch_norm_out( save_invstd); } -std::tuple XPUNativeFunctions:: - native_batch_norm_backward( - const Tensor& grad_out, - const Tensor& input, - const std::optional& weight, - const std::optional& running_mean, - const std::optional& running_var, - const std::optional& save_mean, - const std::optional& save_invstd, - bool train, - double eps, - std::array output_mask) { - return native::xpu::batch_norm_backward_kernel( +std::tuple batch_norm_backward_xpu( + const Tensor& grad_out, + const Tensor& input, + const std::optional& weight, + const std::optional& running_mean, + const std::optional& running_var, + const std::optional& save_mean, + const std::optional& save_invstd, + bool train, + double eps, + std::array output_mask) { + return xpu::batch_norm_backward_kernel( grad_out, input, weight, @@ -157,7 +154,7 @@ std::tuple XPUNativeFunctions:: output_mask); } -std::tuple XPUNativeFunctions::_native_batch_norm_legit( +std::tuple _batch_norm_legit_xpu( const Tensor& input, const std::optional& weight, const std::optional& bias, @@ -166,24 +163,23 @@ std::tuple XPUNativeFunctions::_native_batch_norm_legit( bool training, double momentum, double eps) { - return XPUNativeFunctions::native_batch_norm( + return batch_norm_xpu( input, weight, bias, running_mean, running_var, training, momentum, eps); } -std::tuple XPUNativeFunctions:: - _native_batch_norm_legit_out( - const Tensor& input, - const std::optional& weight, - const std::optional& bias, - Tensor& running_mean, - Tensor& running_var, - bool training, - double momentum, - double eps, - Tensor& out, - Tensor& save_mean, - Tensor& save_invstd) { - return XPUNativeFunctions::native_batch_norm_out( +std::tuple _batch_norm_legit_xpu_out( + const Tensor& input, + const std::optional& weight, + const std::optional& bias, + Tensor& running_mean, + Tensor& running_var, + bool training, + double momentum, + double eps, + Tensor& out, + Tensor& save_mean, + Tensor& save_invstd) { + return batch_norm_xpu_out( input, weight, bias, @@ -197,29 +193,29 @@ std::tuple XPUNativeFunctions:: save_invstd); } -std::tuple XPUNativeFunctions::_native_batch_norm_legit( +std::tuple _batch_norm_legit_no_stats_xpu( const Tensor& input, const std::optional& weight, const std::optional& bias, bool training, double momentum, double eps) { - return XPUNativeFunctions::native_batch_norm( + return batch_norm_xpu( input, weight, bias, Tensor(), Tensor(), training, momentum, eps); } -std::tuple XPUNativeFunctions:: - _native_batch_norm_legit_out( - const at::Tensor& input, - const std::optional& weight, - const 
std::optional& bias, - bool training, - double momentum, - double eps, - at::Tensor& out, - at::Tensor& save_mean, - at::Tensor& save_invstd) { - return XPUNativeFunctions::native_batch_norm_out( +std::tuple +_batch_norm_legit_no_stats_xpu_out( + const at::Tensor& input, + const std::optional& weight, + const std::optional& bias, + bool training, + double momentum, + double eps, + at::Tensor& out, + at::Tensor& save_mean, + at::Tensor& save_invstd) { + return batch_norm_xpu_out( input, weight, bias, @@ -233,7 +229,7 @@ std::tuple XPUNativeFunctions:: save_invstd); } -inline std::tuple batch_norm_with_update( +std::tuple _batch_norm_with_update_xpu( const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, @@ -256,7 +252,7 @@ inline std::tuple batch_norm_with_update( auto save_mean = at::empty({n_input}, options); auto save_invstd = at::empty({n_input}, options); - native::xpu::batch_norm_kernel( + xpu::batch_norm_kernel( input, weight, bias, @@ -273,7 +269,7 @@ inline std::tuple batch_norm_with_update( output, save_mean, save_invstd, reserve); } -inline std::tuple batch_norm_with_update_out( +std::tuple _batch_norm_with_update_xpu_out( const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, @@ -290,7 +286,7 @@ inline std::tuple batch_norm_with_update_out const Tensor& weight = *weight_maybe_owned; const Tensor& bias = c10::value_or_else(bias_opt, [] { return Tensor(); }); - std::tie(out, save_mean, save_var) = native::xpu::batch_norm_kernel( + std::tie(out, save_mean, save_var) = xpu::batch_norm_kernel( input, weight, bias, @@ -307,47 +303,7 @@ inline std::tuple batch_norm_with_update_out out, save_mean, save_var, reserve); } -std::tuple XPUNativeFunctions:: - _batch_norm_with_update( - const Tensor& input, - const std::optional& weight, - const std::optional& bias, - Tensor& running_mean, - Tensor& running_var, - double momentum, - double eps) { - return batch_norm_with_update( - input, weight, bias, running_mean, running_var, momentum, eps); -} - -std::tuple XPUNativeFunctions:: - _batch_norm_with_update_out( - const Tensor& input, - const std::optional& weight, - const std::optional& bias, - Tensor& running_mean, - Tensor& running_var, - double momentum, - double eps, - Tensor& out, - Tensor& save_mean, - Tensor& save_invstd, - Tensor& reserve) { - return batch_norm_with_update_out( - input, - weight, - bias, - running_mean, - running_var, - momentum, - eps, - out, - save_mean, - save_invstd, - reserve); -} - -std::tuple XPUNativeFunctions::batch_norm_backward( +std::tuple _new_batch_norm_backward_xpu( const Tensor& grad_output, const Tensor& input, const Tensor& weight, @@ -367,7 +323,7 @@ std::tuple XPUNativeFunctions::batch_norm_backward( c10::value_or_else(save_mean_opt, [] { return Tensor(); }); const Tensor& save_var = c10::value_or_else(save_var_opt, [] { return Tensor(); }); - return native::xpu::batch_norm_backward_kernel( + return xpu::batch_norm_backward_kernel( grad_output, input, weight, @@ -380,4 +336,5 @@ std::tuple XPUNativeFunctions::batch_norm_backward( grad_input_mask); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/BinaryOps.cpp b/src/ATen/native/xpu/BinaryOps.cpp index e17309841..31c6dd984 100644 --- a/src/ATen/native/xpu/BinaryOps.cpp +++ b/src/ATen/native/xpu/BinaryOps.cpp @@ -1,8 +1,10 @@ #include #include #include +#include #include -#include + +#include #include #include @@ -18,770 +20,46 @@ #include namespace at { -Tensor XPUNativeFunctions::add( - const Tensor& self, - const 
Tensor& other, - const Scalar& alpha) { - Tensor out; - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::alpha_check(iter.dtype(), alpha); - native::xpu::add_kernel(iter, alpha); - return iter.output(); -} - -Tensor& XPUNativeFunctions::add_( - Tensor& self, - const Tensor& other, - const Scalar& alpha) { - auto iter = TensorIterator::borrowing_binary_op(self, self, other); - native::alpha_check(iter.dtype(), alpha); - native::xpu::add_kernel(iter, alpha); - return self; -} - -Tensor& XPUNativeFunctions::add_out( - const Tensor& self, - const Tensor& other, - const Scalar& alpha, - Tensor& out) { - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::alpha_check(iter.dtype(), alpha); - native::xpu::add_kernel(iter, alpha); - return out; -} - -Tensor XPUNativeFunctions::sub( - const Tensor& self, - const Tensor& other, - const Scalar& alpha) { - Tensor out; - native::sub_check(self, other); - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::alpha_check(iter.dtype(), alpha); - native::xpu::sub_kernel(iter, alpha); - return iter.output(); -} - -Tensor& XPUNativeFunctions::sub_( - Tensor& self, - const Tensor& other, - const Scalar& alpha) { - native::sub_check(self, other); - auto iter = TensorIterator::borrowing_binary_op(self, self, other); - native::alpha_check(iter.dtype(), alpha); - native::xpu::sub_kernel(iter, alpha); - return self; -} - -Tensor& XPUNativeFunctions::sub_out( - const Tensor& self, - const Tensor& other, - const Scalar& alpha, - Tensor& out) { - native::sub_check(self, other); - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::alpha_check(iter.dtype(), alpha); - native::xpu::sub_kernel(iter, alpha); - return out; -} - -Tensor XPUNativeFunctions::mul(const Tensor& self, const Tensor& other) { - Tensor out; - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::mul_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::mul_(Tensor& self, const Tensor& other) { - auto iter = TensorIterator::borrowing_binary_op(self, self, other); - native::xpu::mul_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::mul_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::mul_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::div(const Tensor& self, const Tensor& other) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_binary_float_op(out, self, other); - native::xpu::div_true_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::div_(Tensor& self, const Tensor& other) { - TensorIterator iter; - iter.build_borrowing_binary_float_op(self, self, other); - native::xpu::div_true_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::div_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_binary_float_op(out, self, other); - native::xpu::div_true_kernel(iter); - return out; -} - -static inline TensorIterator meta_func_div_Tensor_mode( - const Tensor& self, - const Tensor& other, - const Tensor& output, - c10::optional rounding_mode) { - TensorIterator iter; - if (!rounding_mode.has_value()) { - iter.build_borrowing_binary_float_op(output, self, other); - // NOLINTNEXTLINE(bugprone-branch-clone) - } else if (*rounding_mode == "trunc") { - iter.build_borrowing_binary_op(output, self, other); - } else if (*rounding_mode == "floor") { 
- iter.build_borrowing_binary_op(output, self, other); - } else { - TORCH_CHECK( - false, - "div expected rounding_mode to be one of None, 'trunc', or 'floor' " - "but found '", - *rounding_mode, - "'"); - } - return iter; -} - -static inline void impl_func_div_Tensor_mode( - TensorIterator& iter, - ::std::optional rounding_mode) { - if (!rounding_mode.has_value()) { - native::xpu::div_true_kernel(iter); - } else if (*rounding_mode == "trunc") { - native::xpu::div_trunc_kernel(iter); - } else if (*rounding_mode == "floor") { - native::xpu::div_floor_kernel(iter); - } -} - -Tensor XPUNativeFunctions::div( - const at::Tensor& self, - const at::Tensor& other, - ::std::optional rounding_mode) { - Tensor output; - TensorIterator iter = - meta_func_div_Tensor_mode(self, other, output, rounding_mode); - impl_func_div_Tensor_mode(iter, rounding_mode); - return iter.output(); -} - -Tensor& XPUNativeFunctions::div_( - at::Tensor& self, - const at::Tensor& other, - ::std::optional rounding_mode) { - TensorIterator iter = - meta_func_div_Tensor_mode(self, other, self, rounding_mode); - impl_func_div_Tensor_mode(iter, rounding_mode); - return self; -} - -Tensor& XPUNativeFunctions::div_out( - const at::Tensor& self, - const at::Tensor& other, - ::std::optional rounding_mode, - at::Tensor& output) { - TensorIterator iter = - meta_func_div_Tensor_mode(self, other, output, rounding_mode); - impl_func_div_Tensor_mode(iter, rounding_mode); - return output; -} - -Tensor XPUNativeFunctions::rsub( - const Tensor& self, - const Tensor& other, - const Scalar& alpha) { - return XPUNativeFunctions::sub(other, self, alpha); -} - -Tensor XPUNativeFunctions::remainder(const Tensor& self, const Tensor& other) { - Tensor out; - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::remainder_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::remainder_(Tensor& self, const Tensor& other) { - auto iter = TensorIterator::borrowing_binary_op(self, self, other); - native::xpu::remainder_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::remainder_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::remainder_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::remainder(const Scalar& self, const Tensor& other) { - auto wrapper = native::wrapped_scalar_tensor(self); - return XPUNativeFunctions::remainder(wrapper, other); -} - -Tensor XPUNativeFunctions::fmod(const Tensor& self, const Tensor& other) { - Tensor out; - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::fmod_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::fmod_(Tensor& self, const Tensor& other) { - auto iter = TensorIterator::borrowing_binary_op(self, self, other); - native::xpu::fmod_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::fmod_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::fmod_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::tanh_backward( - const Tensor& grad_output, - const Tensor& output) { - Tensor out; - auto iter = TensorIterator::borrowing_binary_op(out, grad_output, output); - native::xpu::tanh_backward_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::tanh_backward_out( - const Tensor& grad_output, - const Tensor& output, - Tensor& grad_input) { - auto iter = - 
TensorIterator::borrowing_binary_op(grad_input, grad_output, output); - native::xpu::tanh_backward_kernel(iter); - return grad_input; -} - -Tensor& XPUNativeFunctions::bitwise_and_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::bitwise_and_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::bitwise_or_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::bitwise_or_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::bitwise_xor_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::bitwise_xor_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::__lshift__(const Tensor& self, const Tensor& other) { - Tensor result; - auto iter = TensorIterator::binary_op(result, self, other); - native::xpu::lshift_kernel(iter); - return iter.output(); -} - -Tensor XPUNativeFunctions::__lshift__(const Tensor& self, const Scalar& other) { - Tensor result; - auto wrapper = native::wrapped_scalar_tensor(other); - auto iter = TensorIterator::binary_op(result, self, wrapper); - native::xpu::lshift_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::__ilshift__(Tensor& self, const Tensor& other) { - auto iter = TensorIterator::binary_op(self, self, other); - native::xpu::lshift_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::__ilshift__(Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - auto iter = TensorIterator::binary_op(self, self, wrapper); - native::xpu::lshift_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::bitwise_left_shift_out( - const Tensor& self, - const Tensor& other, - Tensor& result) { - auto iter = TensorIterator::borrowing_binary_op(result, self, other); - native::xpu::lshift_kernel(iter); - return result; -} - -Tensor XPUNativeFunctions::__rshift__(const Tensor& self, const Tensor& other) { - Tensor result; - auto iter = TensorIterator::binary_op(result, self, other); - native::xpu::rshift_kernel(iter); - return iter.output(); -} - -Tensor XPUNativeFunctions::__rshift__(const Tensor& self, const Scalar& other) { - Tensor result; - auto wrapper = native::wrapped_scalar_tensor(other); - auto iter = TensorIterator::binary_op(result, self, wrapper); - native::xpu::rshift_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::__irshift__(Tensor& self, const Tensor& other) { - auto iter = TensorIterator::binary_op(self, self, other); - native::xpu::rshift_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::__irshift__(Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - auto iter = TensorIterator::binary_op(self, self, wrapper); - native::xpu::rshift_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::bitwise_right_shift_out( - const Tensor& self, - const Tensor& other, - Tensor& result) { - auto iter = TensorIterator::borrowing_binary_op(result, self, other); - native::xpu::rshift_kernel(iter); - return result; -} - -Tensor XPUNativeFunctions::gcd(const Tensor& self, const Tensor& other) { - Tensor out; - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::gcd_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::gcd_(Tensor& self, const Tensor& other) { - auto 
iter = TensorIterator::borrowing_binary_op(self, self, other);
-  native::xpu::gcd_kernel(iter);
-  return self;
-}
-
-Tensor& XPUNativeFunctions::gcd_out(
-    const Tensor& self,
-    const Tensor& other,
-    Tensor& out) {
-  auto iter = TensorIterator::borrowing_binary_op(out, self, other);
-  native::xpu::gcd_kernel(iter);
-  return out;
-}
-
-Tensor XPUNativeFunctions::nextafter(const Tensor& self, const Tensor& other) {
-  Tensor out;
-  auto iter = TensorIterator::borrowing_binary_op(out, self, other);
-  native::xpu::nextafter_kernel(iter);
-  return iter.output();
-}
-
-Tensor& XPUNativeFunctions::nextafter_(Tensor& self, const Tensor& other) {
-  auto iter = TensorIterator::borrowing_binary_op(self, self, other);
-  native::xpu::nextafter_kernel(iter);
-  return self;
-}
-
-Tensor& XPUNativeFunctions::nextafter_out(
-    const Tensor& self,
-    const Tensor& other,
-    Tensor& out) {
-  auto iter = TensorIterator::borrowing_binary_op(out, self, other);
-  native::xpu::nextafter_kernel(iter);
-  return out;
-}
-
-Tensor XPUNativeFunctions::hypot(const Tensor& self, const Tensor& other) {
-  Tensor out;
-  auto iter = TensorIterator::borrowing_binary_op(out, self, other);
-  native::xpu::hypot_kernel(iter);
-  return iter.output();
-}
-
-Tensor& XPUNativeFunctions::hypot_(Tensor& self, const Tensor& other) {
-  auto iter = TensorIterator::borrowing_binary_op(self, self, other);
-  native::xpu::hypot_kernel(iter);
-  return self;
-}
-
-Tensor& XPUNativeFunctions::hypot_out(
-    const Tensor& self,
-    const Tensor& other,
-    Tensor& out) {
-  auto iter = TensorIterator::borrowing_binary_op(out, self, other);
-  native::xpu::hypot_kernel(iter);
-  return out;
-}
-
-static inline TensorIterator meta_func_maximum(
-    const Tensor& self,
-    const Tensor& other,
-    Tensor& output) {
-  TORCH_CHECK(
-      !self.is_complex() && !other.is_complex(),
-      "maximum not implemented for complex tensors.");
+namespace native {
+REGISTER_XPU_DISPATCH(add_stub, &xpu::add_kernel)
+REGISTER_XPU_DISPATCH(sub_stub, &xpu::sub_kernel);
+REGISTER_XPU_DISPATCH(mul_stub, &xpu::mul_kernel);
+REGISTER_XPU_DISPATCH(div_true_stub, &xpu::div_true_kernel);
+REGISTER_XPU_DISPATCH(div_trunc_stub, &xpu::div_trunc_kernel);
+REGISTER_XPU_DISPATCH(div_floor_stub, &xpu::div_floor_kernel);
+REGISTER_XPU_DISPATCH(remainder_stub, &xpu::remainder_kernel);
+REGISTER_XPU_DISPATCH(fmod_stub, &xpu::fmod_kernel);
+REGISTER_XPU_DISPATCH(tanh_backward_stub, &xpu::tanh_backward_kernel);
+REGISTER_XPU_DISPATCH(bitwise_and_stub, &xpu::bitwise_and_kernel);
+REGISTER_XPU_DISPATCH(bitwise_or_stub, &xpu::bitwise_or_kernel);
+REGISTER_XPU_DISPATCH(bitwise_xor_stub, &xpu::bitwise_xor_kernel);
+REGISTER_XPU_DISPATCH(gcd_stub, &xpu::gcd_kernel);
+REGISTER_XPU_DISPATCH(maximum_stub, &xpu::maximum_kernel);
+REGISTER_XPU_DISPATCH(minimum_stub, &xpu::minimum_kernel);
+REGISTER_XPU_DISPATCH(sigmoid_backward_stub, &xpu::sigmoid_backward_kernel);
+REGISTER_XPU_DISPATCH(nextafter_stub, &xpu::nextafter_kernel);
+REGISTER_XPU_DISPATCH(hypot_stub, &xpu::hypot_kernel);
+REGISTER_XPU_DISPATCH(atan2_stub, &xpu::atan2_kernel);
+REGISTER_XPU_DISPATCH(copysign_stub, &xpu::copysign_kernel);
+REGISTER_XPU_DISPATCH(logical_and_stub, &xpu::logical_and_kernel);
+REGISTER_XPU_DISPATCH(logical_or_stub, &xpu::logical_or_kernel);
+REGISTER_XPU_DISPATCH(logical_xor_stub, &xpu::logical_xor_kernel);
+REGISTER_XPU_DISPATCH(logit_backward_stub, &xpu::logit_backward_kernel);
+REGISTER_XPU_DISPATCH(logaddexp_stub, &xpu::logaddexp_kernel);
+REGISTER_XPU_DISPATCH(logaddexp2_stub, &xpu::logaddexp2_kernel);
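(Editor's aside, placed between the registration lines above and below; it is not part of the patch. The REGISTER_XPU_DISPATCH block and the TORCH_IMPL_FUNC(add_out_xpu) that follows replace the hand-written XPUNativeFunctions wrappers this file deletes. The sketch below is a minimal, self-contained illustration of the dispatch-stub idea only; every name in it (FakeIter, AddStub, fake_xpu_add_kernel, RegisterAdd) is invented for the example, and the real ATen DispatchStub and REGISTER_XPU_DISPATCH macros differ in detail.)

#include <cstdio>

struct FakeIter {};  // stand-in for at::TensorIterator

using binary_fn = void (*)(FakeIter&, double alpha);

struct AddStub {
  binary_fn xpu_fn = nullptr;       // the real stub keeps one slot per backend
  void operator()(FakeIter& it, double alpha) const {
    if (xpu_fn) xpu_fn(it, alpha);  // shared code only ever calls through the stub
  }
};

AddStub add_stub;                   // declared once, shared by all backends

// what a backend provides (here, what the sycl/ kernels play the role of)
void fake_xpu_add_kernel(FakeIter&, double alpha) {
  std::printf("xpu add kernel, alpha = %g\n", alpha);
}

// a registrar of this general shape is what a REGISTER_*_DISPATCH-style macro
// boils down to: a static object whose constructor plugs the kernel into the
// stub at load time
struct RegisterAdd {
  RegisterAdd() { add_stub.xpu_fn = &fake_xpu_add_kernel; }
};
static RegisterAdd register_add;

int main() {
  FakeIter it;
  add_stub(it, 1.0);  // shared wrapper -> stub -> backend kernel
  return 0;
}

With this pattern the shared at::native wrapper builds the TensorIterator and calls the stub, so registering a backend kernel is the only per-backend glue needed; that is why most of the per-operator wrapper code in this file is deleted by the patch.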
+REGISTER_XPU_DISPATCH(fmax_stub, &xpu::fmax_kernel);
+REGISTER_XPU_DISPATCH(fmin_stub, &xpu::fmin_kernel);
+REGISTER_XPU_DISPATCH(lshift_stub, &xpu::lshift_kernel);
+REGISTER_XPU_DISPATCH(rshift_stub, &xpu::rshift_kernel);
+
+TORCH_IMPL_FUNC(add_out_xpu)
+(const Tensor& self,
+ const Tensor& other,
+ const Scalar& alpha,
+ const Tensor& output) {
   auto iter = TensorIterator::borrowing_binary_op(output, self, other);
-  return iter;
-}
-
-Tensor XPUNativeFunctions::maximum(const Tensor& self, const Tensor& other) {
-  Tensor output;
-  auto iter = meta_func_maximum(self, other, output);
-  native::xpu::maximum_kernel(iter);
-  return iter.output();
-}
-
-Tensor& XPUNativeFunctions::maximum_out(
-    const Tensor& self,
-    const Tensor& other,
-    Tensor& output) {
-  auto iter = meta_func_maximum(self, other, output);
-  native::xpu::maximum_kernel(iter);
-  return output;
-}
-
-static inline TensorIterator meta_func_minimum(
-    const Tensor& self,
-    const Tensor& other,
-    Tensor& output) {
-  TORCH_CHECK(
-      !self.is_complex() && !other.is_complex(),
-      "minimum not implemented for complex tensors.");
-  auto iter = TensorIterator::borrowing_binary_op(output, self, other);
-  return iter;
-}
-
-Tensor XPUNativeFunctions::minimum(const Tensor& self, const Tensor& other) {
-  Tensor output;
-  auto iter = meta_func_minimum(self, other, output);
-  native::xpu::minimum_kernel(iter);
-  return iter.output();
-}
-
-Tensor& XPUNativeFunctions::minimum_out(
-    const Tensor& self,
-    const Tensor& other,
-    Tensor& output) {
-  auto iter = meta_func_minimum(self, other, output);
-  native::xpu::minimum_kernel(iter);
-  return output;
-}
-
-Tensor& XPUNativeFunctions::logit_backward_out(
-    const Tensor& grad_output,
-    const Tensor& input,
-    std::optional eps,
-    Tensor& grad_input) {
-  TensorIterator iter;
-  iter.build_borrowing_binary_op(grad_input, grad_output, input);
-  native::xpu::logit_backward_kernel(iter, Scalar(eps ? eps.value() : -1.0));
-  return grad_input;
-}
-
-Tensor XPUNativeFunctions::logit_backward(
-    const Tensor& grad_output,
-    const Tensor& input,
-    std::optional eps) {
-  Tensor grad_input;
-  TensorIterator iter;
-  iter.build_borrowing_binary_op(grad_input, grad_output, input);
-  native::xpu::logit_backward_kernel(iter, Scalar(eps ?
eps.value() : -1.0)); - return iter.output(); -} - -Tensor& XPUNativeFunctions::sigmoid_backward_out( - const Tensor& grad_output, - const Tensor& output, - Tensor& grad_input) { - TensorIterator iter; - iter.build_borrowing_binary_op(grad_input, grad_output, output); - native::xpu::sigmoid_backward_kernel(iter); - return grad_input; -} - -Tensor XPUNativeFunctions::sigmoid_backward( - const Tensor& grad_output, - const Tensor& output) { - Tensor grad_input; - TensorIterator iter; - iter.build_borrowing_binary_op(grad_input, grad_output, output); - native::xpu::sigmoid_backward_kernel(iter); - return iter.output(); -} - -Tensor XPUNativeFunctions::logaddexp(const Tensor& self, const Tensor& other) { - Tensor out; - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::logaddexp_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::logaddexp_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::logaddexp_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::logaddexp2(const Tensor& self, const Tensor& other) { - Tensor out; - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::logaddexp2_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::logaddexp2_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::borrowing_binary_op(out, self, other); - native::xpu::logaddexp2_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::floor_divide_out( - const Tensor& self, - const Tensor& other, - Tensor& output) { - auto iter = TensorIterator::binary_op(output, self, other); - native::xpu::div_floor_kernel(iter); - if (!output.defined()) { - output = iter.output(); - } - return output; -} - -Tensor XPUNativeFunctions::floor_divide( - const Tensor& self, - const Tensor& other) { - Tensor output; - auto iter = TensorIterator::binary_op(output, self, other); - native::xpu::div_floor_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::floor_divide_(Tensor& self, const Tensor& other) { - return XPUNativeFunctions::floor_divide_out(self, other, self); -} - -TensorIterator meta_fmin_fmax( - const char* const name, - const Tensor& self, - const Tensor& other, - Tensor& output) { - TORCH_CHECK( - !self.is_complex() && !other.is_complex(), - name, - " not implemented for complex tensors."); - TensorIterator iter; - iter.build_binary_op(output, self, other); - return iter; -} - -Tensor& XPUNativeFunctions::fmax_out( - const Tensor& self, - const Tensor& other, - Tensor& output) { - auto iter = meta_fmin_fmax("fmax", self, other, output); - native::xpu::fmax_kernel(iter); - return output; -} - -Tensor XPUNativeFunctions::fmax(const Tensor& self, const Tensor& other) { - Tensor output; - auto iter = meta_fmin_fmax("fmax", self, other, output); - native::xpu::fmax_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::fmin_out( - const Tensor& self, - const Tensor& other, - Tensor& output) { - auto iter = meta_fmin_fmax("fmin", self, other, output); - native::xpu::fmin_kernel(iter); - return output; -} - -Tensor XPUNativeFunctions::fmin(const Tensor& self, const Tensor& other) { - Tensor output; - auto iter = meta_fmin_fmax("fmin", self, other, output); - native::xpu::fmin_kernel(iter); - return iter.output(); -} - -Tensor XPUNativeFunctions::atan2(const Tensor& self, const Tensor& other) { - Tensor out; - TensorIterator iter; - 
iter.build_borrowing_binary_float_op(out, self, other); - native::xpu::atan2_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::atan2_(Tensor& self, const Tensor& other) { - TensorIterator iter; - iter.build_borrowing_binary_float_op(self, self, other); - native::xpu::atan2_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::atan2_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_binary_float_op(out, self, other); - native::xpu::atan2_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::copysign_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_binary_float_op(out, self, other); - native::xpu::copysign_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::copysign_(Tensor& self, const Tensor& other) { - return XPUNativeFunctions::copysign_out(self, other, self); -} - -Tensor XPUNativeFunctions::copysign(const Tensor& self, const Tensor& other) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_binary_float_op(out, self, other); - native::xpu::copysign_kernel(iter); - return iter.output(); -} - -// We need explicit cast to OutFunc because each *_out func is overloaded twice. -// Without An explicit cast, merely referring to *_out function is ambiguous. -using OutFunc = - std::add_const::type; - -template -Tensor comparison_op( - const Tensor& self, - const Tensor& other, - OutImpl& out_impl) { - Tensor result = at::empty({0}, self.options().dtype(kBool)); - return out_impl(result, self, other); -} - -template -Tensor& comparison_op_(Tensor& self, const Tensor& other, OutImpl& out_impl) { - return out_impl(self, self, other); -} - -template -Tensor& comparison_op_out( - Tensor& result, - const Tensor& self, - const Scalar& other, - OutImpl& out_impl) { - return out_impl(result, self, native::wrapped_scalar_tensor(other)); -} - -template -Tensor comparison_op( - const Tensor& self, - const Scalar& other, - OutImpl& out_impl) { - return comparison_op(self, native::wrapped_scalar_tensor(other), out_impl); -} - -template -Tensor& comparison_op_(Tensor& self, const Scalar& other, OutImpl& out_impl) { - return out_impl(self, self, native::wrapped_scalar_tensor(other)); -} - -Tensor& XPUNativeFunctions::logical_and_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::comparison_op(out, self, other); - native::xpu::logical_and_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::logical_and( - const Tensor& self, - const Tensor& other) { - return comparison_op(self, other, static_cast(at::logical_and_out)); -} - -Tensor& XPUNativeFunctions::logical_and_(Tensor& self, const Tensor& other) { - return comparison_op_(self, other, static_cast(at::logical_and_out)); -} - -Tensor& XPUNativeFunctions::logical_or_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::comparison_op(out, self, other); - native::xpu::logical_or_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::logical_or(const Tensor& self, const Tensor& other) { - return comparison_op(self, other, static_cast(at::logical_or_out)); -} - -Tensor& XPUNativeFunctions::logical_or_(Tensor& self, const Tensor& other) { - return comparison_op_(self, other, static_cast(at::logical_or_out)); -} - -Tensor& XPUNativeFunctions::logical_xor_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - auto iter = TensorIterator::comparison_op(out, self, other); - 
native::xpu::logical_xor_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::logical_xor( - const Tensor& self, - const Tensor& other) { - return comparison_op(self, other, static_cast(at::logical_xor_out)); -} - -Tensor& XPUNativeFunctions::logical_xor_(Tensor& self, const Tensor& other) { - return comparison_op_(self, other, static_cast(at::logical_xor_out)); + xpu::add_kernel(iter, alpha); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Bucketization.cpp b/src/ATen/native/xpu/Bucketization.cpp index 0d6c2a9f5..3394f87bd 100644 --- a/src/ATen/native/xpu/Bucketization.cpp +++ b/src/ATen/native/xpu/Bucketization.cpp @@ -1,11 +1,11 @@ #include #include #include -#include namespace at { +namespace native { -Tensor& XPUNativeFunctions::searchsorted_out( +Tensor& searchsorted_out_xpu( const Tensor& sorted_sequence, const Tensor& self, bool out_int32, @@ -17,9 +17,9 @@ Tensor& XPUNativeFunctions::searchsorted_out( c10::MaybeOwned sorter_maybe_owned = at::borrow_from_optional_tensor(sorter_opt); const Tensor& sorter = *sorter_maybe_owned; - at::native::searchsorted_pre_check( + searchsorted_pre_check( sorted_sequence, self, result, out_int32, right, side_opt, sorter); - at::native::resize_output(result, self.sizes()); + resize_output(result, self.sizes()); if (self.numel() == 0) { return result; @@ -28,12 +28,12 @@ Tensor& XPUNativeFunctions::searchsorted_out( // we have two inputs to set right, pre_check checks that they aren't set to // opposites bool is_right = (side_opt && *side_opt == "right") || right; - at::native::xpu::searchsorted_kernel( + xpu::searchsorted_kernel( result, self, sorted_sequence, out_int32, is_right, sorter); return result; } -Tensor& XPUNativeFunctions::searchsorted_out( +Tensor& searchsorted_out_xpu( const Tensor& sorted_sequence, const Scalar& self, bool out_int32, @@ -42,8 +42,8 @@ Tensor& XPUNativeFunctions::searchsorted_out( const std::optional& sorter_opt, Tensor& result) { const Tensor& scalar_tensor = - at::native::searchsorted_scalar_tensor(self, sorted_sequence.device()); - return searchsorted_out( + searchsorted_scalar_tensor(self, sorted_sequence.device()); + return searchsorted_out_xpu( sorted_sequence, scalar_tensor, out_int32, @@ -53,7 +53,7 @@ Tensor& XPUNativeFunctions::searchsorted_out( result); } -Tensor XPUNativeFunctions::searchsorted( +Tensor searchsorted_xpu( const Tensor& sorted_sequence, const Tensor& self, bool out_int32, @@ -64,12 +64,12 @@ Tensor XPUNativeFunctions::searchsorted( c10::TensorOptions options = TensorOptions().device(self.options().device()).dtype(scalar_type); Tensor result = at::empty({0}, options, MemoryFormat::Contiguous); - searchsorted_out( + searchsorted_out_xpu( sorted_sequence, self, out_int32, right, side_opt, sorter, result); return result; } -Tensor XPUNativeFunctions::searchsorted( +Tensor searchsorted_xpu( const Tensor& sorted_sequence, const Scalar& self, bool out_int32, @@ -77,12 +77,12 @@ Tensor XPUNativeFunctions::searchsorted( const std::optional side_opt, const std::optional& sorter) { const Tensor& scalar_tensor = - at::native::searchsorted_scalar_tensor(self, sorted_sequence.device()); - return searchsorted( + searchsorted_scalar_tensor(self, sorted_sequence.device()); + return searchsorted_xpu( sorted_sequence, scalar_tensor, out_int32, right, side_opt, sorter); } -Tensor& XPUNativeFunctions::bucketize_out( +Tensor& bucketize_out_xpu( const Tensor& self, const Tensor& boundaries, bool out_int32, @@ -93,12 +93,12 @@ Tensor& XPUNativeFunctions::bucketize_out( "boundaries 
tensor must be 1 dimension, but got dim(", boundaries.dim(), ")"); - searchsorted_out( + searchsorted_out_xpu( boundaries, self, out_int32, right, nullopt, nullopt, result); return result; } -Tensor XPUNativeFunctions::bucketize( +Tensor bucketize_xpu( const Tensor& self, const Tensor& boundaries, bool out_int32, @@ -107,19 +107,20 @@ Tensor XPUNativeFunctions::bucketize( c10::TensorOptions options = TensorOptions().device(self.options().device()).dtype(scalar_type); Tensor result = at::empty({0}, options, MemoryFormat::Contiguous); - bucketize_out(self, boundaries, out_int32, right, result); + bucketize_out_xpu(self, boundaries, out_int32, right, result); return result; } -Tensor XPUNativeFunctions::bucketize( +Tensor bucketize_xpu( const Scalar& self, const Tensor& boundaries, bool out_int32, bool right) { - return bucketize( - at::native::searchsorted_scalar_tensor(self, boundaries.device()), + return bucketize_xpu( + searchsorted_scalar_tensor(self, boundaries.device()), boundaries, out_int32, right); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Col2Im.cpp b/src/ATen/native/xpu/Col2Im.cpp index 3a46fd8ad..2a6742e5e 100644 --- a/src/ATen/native/xpu/Col2Im.cpp +++ b/src/ATen/native/xpu/Col2Im.cpp @@ -1,15 +1,17 @@ -#include + #include #include #include -#include #include #include -namespace at { +#include +#include + +namespace at::native { -Tensor& XPUNativeFunctions::col2im_out( +Tensor& col2im_out_xpu( const Tensor& self, IntArrayRef output_size, IntArrayRef kernel_size, @@ -27,7 +29,7 @@ Tensor& XPUNativeFunctions::col2im_out( return out; } -Tensor XPUNativeFunctions::col2im( +Tensor col2im_xpu( const Tensor& self, IntArrayRef output_size, IntArrayRef kernel_size, @@ -43,4 +45,4 @@ Tensor XPUNativeFunctions::col2im( return output; } -} // namespace at +} // namespace at::native diff --git a/src/ATen/native/xpu/CompareOps.cpp b/src/ATen/native/xpu/CompareOps.cpp index 6f84e68fc..ee0798ad0 100644 --- a/src/ATen/native/xpu/CompareOps.cpp +++ b/src/ATen/native/xpu/CompareOps.cpp @@ -2,336 +2,16 @@ #include #include #include -#include - #include namespace at { -Tensor XPUNativeFunctions::eq(const Tensor& self, const Tensor& other) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::eq_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::eq_(Tensor& self, const Tensor& other) { - TensorIterator iter; - iter.build_borrowing_comparison_op(self, self, other); - native::xpu::eq_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::eq_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::eq_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::eq(const Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - Tensor out; - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::eq_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::eq_(Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(self, self, wrapper); - native::xpu::eq_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::eq_out( - const Tensor& self, - const Scalar& other, - Tensor& out) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - 
iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::eq_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::ne(const Tensor& self, const Tensor& other) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::ne_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::ne_(Tensor& self, const Tensor& other) { - TensorIterator iter; - iter.build_borrowing_comparison_op(self, self, other); - native::xpu::ne_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::ne_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::ne_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::ne(const Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - Tensor out; - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::ne_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::ne_(Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(self, self, wrapper); - native::xpu::ne_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::ne_out( - const Tensor& self, - const Scalar& other, - Tensor& out) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::ne_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::lt(const Tensor& self, const Tensor& other) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::lt_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::lt_(Tensor& self, const Tensor& other) { - TensorIterator iter; - iter.build_borrowing_comparison_op(self, self, other); - native::xpu::lt_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::lt_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::lt_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::lt(const Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - Tensor out; - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::lt_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::lt_(Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(self, self, wrapper); - native::xpu::lt_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::lt_out( - const Tensor& self, - const Scalar& other, - Tensor& out) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::lt_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::le(const Tensor& self, const Tensor& other) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::le_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::le_(Tensor& self, const Tensor& other) { - TensorIterator iter; - 
iter.build_borrowing_comparison_op(self, self, other); - native::xpu::le_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::le_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::le_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::le(const Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - Tensor out; - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::le_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::le_(Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(self, self, wrapper); - native::xpu::le_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::le_out( - const Tensor& self, - const Scalar& other, - Tensor& out) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::le_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::gt(const Tensor& self, const Tensor& other) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::gt_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::gt_(Tensor& self, const Tensor& other) { - TensorIterator iter; - iter.build_borrowing_comparison_op(self, self, other); - native::xpu::gt_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::gt_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::gt_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::gt(const Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - Tensor out; - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::gt_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::gt_(Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(self, self, wrapper); - native::xpu::gt_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::gt_out( - const Tensor& self, - const Scalar& other, - Tensor& out) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::gt_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::ge(const Tensor& self, const Tensor& other) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::ge_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::ge_(Tensor& self, const Tensor& other) { - TensorIterator iter; - iter.build_borrowing_comparison_op(self, self, other); - native::xpu::ge_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::ge_out( - const Tensor& self, - const Tensor& other, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_comparison_op(out, self, other); - native::xpu::ge_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::ge(const Tensor& self, const Scalar& other) { - auto wrapper = 
native::wrapped_scalar_tensor(other); - Tensor out; - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::ge_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::ge_(Tensor& self, const Scalar& other) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(self, self, wrapper); - native::xpu::ge_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::ge_out( - const Tensor& self, - const Scalar& other, - Tensor& out) { - auto wrapper = native::wrapped_scalar_tensor(other); - TensorIterator iter; - iter.build_borrowing_except_last_argument_comparison_op(out, self, wrapper); - native::xpu::ge_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::isnan(const Tensor& self) { - return XPUNativeFunctions::ne(self, self); -} - -Tensor& XPUNativeFunctions::isnan_out(const Tensor& self, Tensor& out) { - return XPUNativeFunctions::ne_out(self, self, out); -} - +namespace native { +REGISTER_XPU_DISPATCH(eq_stub, &xpu::eq_kernel); +REGISTER_XPU_DISPATCH(ne_stub, &xpu::ne_kernel); +REGISTER_XPU_DISPATCH(le_stub, &xpu::le_kernel); +REGISTER_XPU_DISPATCH(lt_stub, &xpu::lt_kernel); +REGISTER_XPU_DISPATCH(ge_stub, &xpu::ge_kernel); +REGISTER_XPU_DISPATCH(gt_stub, &xpu::gt_kernel); +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Copy.cpp b/src/ATen/native/xpu/Copy.cpp index c95aa9cee..fea67164c 100644 --- a/src/ATen/native/xpu/Copy.cpp +++ b/src/ATen/native/xpu/Copy.cpp @@ -1,20 +1,22 @@ -#include #include +#include #include #include #include #include #include -#include #include #include #include +#include #include #include #include #include +#include + using namespace at; using namespace at::xpu; @@ -295,72 +297,7 @@ void _copy_xpu(TensorIterator& iter, bool non_blocking) { } // namespace native::xpu -Tensor& XPUNativeFunctions::copy_( - Tensor& self, - const Tensor& src, - bool non_blocking) { - if (self._is_zerotensor()) { - TORCH_CHECK( - false, - "ZeroTensors are immutable. 
Please materialize the tensor using `.clone()`, if you want a mutable zero tensor."); - } - if (src._is_zerotensor()) { - return self.zero_(); - } - - TORCH_CHECK(self.defined(), "self is undefined"); - TORCH_CHECK(src.defined(), "src is undefined"); - - if (self.is_same(src)) { - return self; - } - - // TODO: Support quantization - - // Exit early if self and src are views of the same data - const bool is_same_data = - (self.is_alias_of(src) && self.storage_offset() == src.storage_offset() && - self.strides().equals(src.strides()) && - self.sizes().equals(src.sizes()) && - self.scalar_type() == src.scalar_type() && - self.is_conj() == src.is_conj() && self.is_neg() == src.is_neg()); - if (is_same_data) { - return self; - } - - auto iter = TensorIteratorConfig() - .set_check_mem_overlap(true) - .add_output(self) - .add_input(src) - .resize_outputs(false) - .check_all_same_dtype(false) - .check_all_same_device(false) - .build(); - - if (iter.numel() == 0) { - return self; - } - - native::xpu::_copy_xpu(iter, non_blocking); - - return self; -} - -Tensor XPUNativeFunctions::_to_copy( - const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - bool non_blocking, - c10::optional optional_memory_format) { - return at::native::_to_copy( - self, - dtype, - layout, - device, - pin_memory, - non_blocking, - optional_memory_format); +namespace native { +REGISTER_XPU_DISPATCH(copy_stub, &native::xpu::_copy_xpu); } } // namespace at diff --git a/src/ATen/native/xpu/Cross.cpp b/src/ATen/native/xpu/Cross.cpp index 757f088f9..a155cf034 100644 --- a/src/ATen/native/xpu/Cross.cpp +++ b/src/ATen/native/xpu/Cross.cpp @@ -1,64 +1,12 @@ #include #include +#include +#include #include -#include #include namespace at { -void linalg_cross_meta( - const Tensor& input, - const Tensor& other, - int64_t dim, - Tensor& output) { - auto x_d = input.dim(); - auto y_d = other.dim(); - // This is to avoid things like - // linalg.cross(torch.randn(2, 3), torch.randn(5, 2, 3), dim=2) - TORCH_CHECK( - x_d == y_d, - "linalg.cross: inputs must have the same number of dimensions."); - TORCH_CHECK( - input.size(dim) == 3 && other.size(dim) == 3, - "linalg.cross: inputs dimension ", - dim, - " must have length 3. Got ", - input.size(dim), - " and ", - other.size(dim)); - - // Broadcast the batch dimension of input and other. 
- // Since the non-batch dimensions agree, this is the same as broadcast all the - // inputs - auto out_size = infer_size(input.sizes(), other.sizes()); - - if (output.defined()) { - at::xpu::resize_out(output, out_size, {}, input.options()); - } else { - output = at::xpu::create_out(out_size, {}, input.options()); - } -} - -Tensor& XPUNativeFunctions::linalg_cross_out( - const Tensor& self, - const Tensor& other, - int64_t dim, - Tensor& out) { - linalg_cross_meta(self, other, dim, out); - - dim = maybe_wrap_dim(dim, self.dim()); - auto out_size = out.sizes(); - Tensor input_broadcasted = self.expand(out_size); - Tensor other_broadcasted = other.expand(out_size); - native::xpu::linalg_cross_kernel( - out, input_broadcasted, other_broadcasted, dim); - return out; -} - -Tensor XPUNativeFunctions::linalg_cross( - const Tensor& self, - const Tensor& other, - int64_t dim) { - Tensor out; - return linalg_cross_out(self, other, dim, out); -} -} // namespace at \ No newline at end of file +namespace native { +REGISTER_XPU_DISPATCH(cross_stub, &xpu::linalg_cross_kernel); +} // namespace native +} // namespace at diff --git a/src/ATen/native/xpu/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/DilatedMaxPool2d.cpp index af0c0cfd6..600d29e85 100644 --- a/src/ATen/native/xpu/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/DilatedMaxPool2d.cpp @@ -2,305 +2,45 @@ #include #include #include -#include #include -namespace at { - -using namespace at::native; - -void max_pool2d_with_indices_meta( - const Tensor& input, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - bool ceil_mode, - Tensor& output, - Tensor& indices) { - TORCH_CHECK( - kernel_size.size() == 1 || kernel_size.size() == 2, - "max_pool2d: kernel_size must either be a single int, or a tuple of two ints") - const int kH = safe_downcast(kernel_size[0]); - const int kW = kernel_size.size() == 1 - ? kH - : safe_downcast(kernel_size[1]); - - // NB: stride default is not expressible as an integer constant, so we - // accept empty stride for this case - TORCH_CHECK( - stride.empty() || stride.size() == 1 || stride.size() == 2, - "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints") - const int dH = stride.empty() ? kH : safe_downcast(stride[0]); - const int dW = stride.empty() ? kW - : stride.size() == 1 ? dH - : safe_downcast(stride[1]); - - TORCH_CHECK( - padding.size() == 1 || padding.size() == 2, - "max_pool2d: padding must either be a single int, or a tuple of two ints"); - const int padH = safe_downcast(padding[0]); - const int padW = - padding.size() == 1 ? padH : safe_downcast(padding[1]); - - TORCH_CHECK( - dilation.size() == 1 || dilation.size() == 2, - "max_pool2d: dilation must be either a single int, or a tuple of two ints"); - const int dilationH = safe_downcast(dilation[0]); - const int dilationW = dilation.size() == 1 - ? dilationH - : safe_downcast(dilation[1]); - - const auto memory_format = input.suggest_memory_format(); - if (memory_format == at::MemoryFormat::ChannelsLast) { - TORCH_CHECK( - input.ndimension() == 4, - "non-empty 4D (batch mode) tensor expected for input with channels_last layout"); - } else if (memory_format == at::MemoryFormat::Contiguous) { - TORCH_CHECK( - (input.ndimension() == 3 || input.ndimension() == 4), - "non-empty 3D or 4D (batch mode) tensor expected for input"); - } else { - TORCH_CHECK( - false, - "Unsupport memory format. 
Supports only ChannelsLast, Contiguous"); - } - - /* sizes */ - const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1; - const int64_t nInputPlane = input.size(-3); - const int64_t inputHeight = input.size(-2); - const int64_t inputWidth = input.size(-1); - - const int64_t outputHeight = pooling_output_shape( - inputHeight, kH, padH, dH, dilationH, ceil_mode); - const int64_t outputWidth = pooling_output_shape( - inputWidth, kW, padW, dW, dilationW, ceil_mode); - - pool2d_shape_check( - input, - kH, - kW, - dH, - dW, - padH, - padW, - dilationH, - dilationW, - nInputPlane, - inputHeight, - inputWidth, - outputHeight, - outputWidth, - memory_format); - - /* resize output and indices */ - if (input.ndimension() == 3) { - if (output.defined()) { - at::xpu::resize_out( - output, - {nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format)); - } else { - output = at::xpu::create_out( - {nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format)); - } - - /* indices will contain the locations for each output point */ - if (indices.defined()) { - at::xpu::resize_out( - indices, - {nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format).dtype(kLong)); - } else { - indices = at::xpu::create_out( - {nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format).dtype(kLong)); - } - - } else { - if (output.defined()) { - at::xpu::resize_out( - output, - {nbatch, nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format)); - } else { - output = at::xpu::create_out( - {nbatch, nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format)); - } - - /* indices will contain the locations for each output point */ - if (indices.defined()) { - at::xpu::resize_out( - indices, - {nbatch, nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format).dtype(kLong)); - } else { - indices = at::xpu::create_out( - {nbatch, nInputPlane, outputHeight, outputWidth}, - {}, - input.options().memory_format(memory_format).dtype(kLong)); - } - } -} - -Tensor& max_pool2d_with_indices_backward_meta( - const Tensor& gradOutput, - const Tensor& input, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - bool ceil_mode, - const Tensor& indices, - Tensor& gradInput) { - // #20866, #22032: Guarantee this for the official C++ API? - TORCH_CHECK( - kernel_size.size() == 1 || kernel_size.size() == 2, - "max_pool2d: kernel_size must either be a single int, or a tuple of two ints") - const int kH = safe_downcast(kernel_size[0]); - const int kW = kernel_size.size() == 1 - ? kH - : safe_downcast(kernel_size[1]); - - // NB: stride default is not expressible as an integer constant, so we accept - // empty stride for this case - TORCH_CHECK( - stride.empty() || stride.size() == 1 || stride.size() == 2, - "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints") - const int dH = stride.empty() ? kH : safe_downcast(stride[0]); - const int dW = stride.empty() ? kW - : stride.size() == 1 ? dH - : safe_downcast(stride[1]); - - TORCH_CHECK( - padding.size() == 1 || padding.size() == 2, - "max_pool2d: padding must either be a single int, or a tuple of two ints"); - const int padH = safe_downcast(padding[0]); - const int padW = - padding.size() == 1 ? 
padH : safe_downcast(padding[1]); - - TORCH_CHECK( - dilation.size() == 1 || dilation.size() == 2, - "max_pool2d: dilation must be either a single int, or a tuple of two ints"); - const int dilationH = safe_downcast(dilation[0]); - const int dilationW = dilation.size() == 1 - ? dilationH - : safe_downcast(dilation[1]); - - TORCH_CHECK( - input.dtype() == gradOutput.dtype(), - "expected dtype ", - input.dtype(), - " for `gradOutput` but got dtype ", - gradOutput.dtype()); +#include +#include - const auto memory_format = input.suggest_memory_format(); - if (memory_format == at::MemoryFormat::ChannelsLast) { - TORCH_CHECK( - input.ndimension() == 4, - "non-empty 4D (batch mode) tensor expected for input with channels_last layout"); - } else if (memory_format == at::MemoryFormat::Contiguous) { - TORCH_CHECK( - (input.ndimension() == 3 || input.ndimension() == 4), - "non-empty 3D or 4D (batch mode) tensor expected for input"); - } else { - TORCH_CHECK( - false, - "Unsupport memory format. Supports only ChannelsLast, Contiguous"); - } - - /* sizes */ - const int64_t nInputPlane = input.size(-3); - const int64_t inputHeight = input.size(-2); - const int64_t inputWidth = input.size(-1); - - /* XXX preserve the existing shape check behavior */ - const int64_t outputHeight_for_shape_check = pooling_output_shape( - inputHeight, kH, padH, dH, dilationH, ceil_mode); - const int64_t outputWidth_for_shape_check = pooling_output_shape( - inputWidth, kW, padW, dW, dilationW, ceil_mode); - - max_pool2d_backward_shape_check( - input, +namespace at { +namespace native { +TORCH_IMPL_FUNC(max_pool2d_with_indices_backward_out_xpu) +(const Tensor& gradOutput, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode, + const Tensor& indices, + const Tensor& gradInput) { + xpu::max_pool2d_with_indices_backward_kernel( + gradInput, gradOutput, - indices, - kH, - kW, - dH, - dW, - padH, - padW, - dilationH, - dilationW, - nInputPlane, - inputHeight, - inputWidth, - outputHeight_for_shape_check, - outputWidth_for_shape_check, - memory_format); - - auto options = input.options().memory_format(memory_format); - if (gradInput.defined()) { - at::xpu::resize_out(gradInput, input.sizes(), {}, options); - } else { - gradInput = at::xpu::create_out(input.sizes(), {}, options); - } - - return gradInput; -} - -std::tuple XPUNativeFunctions::max_pool2d_with_indices( - const Tensor& input, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - bool ceil_mode) { - Tensor output; - Tensor indices; - max_pool2d_with_indices_meta( - input, - kernel_size, - stride, - padding, - dilation, - ceil_mode, - output, - indices); - - at::native::xpu::max_pool2d_with_indices_kernel( input, + indices, kernel_size, stride, padding, dilation, - ceil_mode, - output, - indices); - - return std::tuple(output, indices); + ceil_mode); } -std::tuple XPUNativeFunctions::max_pool2d_with_indices_out( - const Tensor& input, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - bool ceil_mode, - Tensor& output, - Tensor& indices) { - max_pool2d_with_indices_meta( +TORCH_IMPL_FUNC(max_pool2d_with_indices_out_xpu) +(const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode, + const Tensor& output, + const Tensor& indices) { + xpu::max_pool2d_with_indices_kernel( input, kernel_size, stride, @@ -309,77 +49,6 @@ std::tuple 
XPUNativeFunctions::max_pool2d_with_indices_out( ceil_mode, output, indices); - - at::native::xpu::max_pool2d_with_indices_kernel( - input, - kernel_size, - stride, - padding, - dilation, - ceil_mode, - output, - indices); - - return std::tuple(output, indices); } - -Tensor& XPUNativeFunctions::max_pool2d_with_indices_backward_out( - const Tensor& grad_output, - const Tensor& self, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - bool ceil_mode, - const Tensor& indices, - Tensor& grad_input) { - grad_input = max_pool2d_with_indices_backward_meta( - grad_output, - self, - kernel_size, - stride, - padding, - dilation, - ceil_mode, - indices, - grad_input); - - at::native::xpu::max_pool2d_with_indices_backward_kernel( - grad_input, - grad_output, - self, - indices, - kernel_size, - stride, - padding, - dilation, - ceil_mode); - - return grad_input; -} - -Tensor XPUNativeFunctions::max_pool2d_with_indices_backward( - const Tensor& grad_output, - const Tensor& self, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - bool ceil_mode, - const Tensor& indices) { - Tensor grad_input; - max_pool2d_with_indices_backward_out( - grad_output, - self, - kernel_size, - stride, - padding, - dilation, - ceil_mode, - indices, - grad_input); - - return grad_input; -} - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Distance.cpp b/src/ATen/native/xpu/Distance.cpp index 613eb542c..63ae0cced 100644 --- a/src/ATen/native/xpu/Distance.cpp +++ b/src/ATen/native/xpu/Distance.cpp @@ -1,123 +1,10 @@ +#include +#include #include -#include namespace at { -Tensor cdist_impl( - const Tensor& x1, - const Tensor& x2, - const double p, - c10::optional compute_mode) { - TORCH_CHECK( - at::isFloatingType(x1.scalar_type()), - "cdist only supports floating-point dtypes, X1 got: ", - x1.scalar_type()); - auto device1 = x1.device().type(); - TORCH_CHECK( - at::isFloatingType(x2.scalar_type()), - "cdist only supports floating-point dtypes, X2 got: ", - x2.scalar_type()); - auto device2 = x2.device().type(); - TORCH_CHECK(p >= 0, "cdist only supports non-negative p values"); - TORCH_CHECK( - device1 == device2, - "X1 and X2 must have the same device type. X1: ", - device1, - " X2: ", - device2); - // TODO: This is bad; this test should apply universally - TORCH_CHECK( - !x1.is_xpu() || x1.get_device() == x2.get_device(), - "device of X1 (", - x1.get_device(), - ") must match device of X2 (", - x2.get_device(), - ")"); - SymInt c1 = x1.sym_size(-1); - SymInt c2 = x2.sym_size(-1); - // 0 - default value. 
If p = 2 and r1 > 25 or r2 > 25 (these values are based - // on performance metrics), it will try to compute distance using matrix - // multiplication approach 1 - force to use matrix multiplication for p = 2 2 - // - do not use matrix multiplication for p = 2 - int64_t mode = compute_mode.value_or(0); - TORCH_CHECK( - mode >= 0 && mode <= 2, "possible modes: 0, 1, 2, but was: ", mode); - SymInt r1 = x1.size(-2); - SymInt r2 = x2.size(-2); - if (!(p == 2 && (mode == 1 || (mode == 0 && (r1 > 25 || r2 > 25))))) { - TORCH_CHECK( - device1 == kCPU || device1 == kXPU, - "cdist only supports CPU and XPU devices, X1 got: ", - device1); - TORCH_CHECK( - device2 == kCPU || device2 == kXPU, - "cdist only supports CPU and XPU devices, X2 got: ", - device2); - } - int64_t dim1 = x1.dim(); - int64_t dim2 = x2.dim(); - SymIntArrayRef batch_tensor1(x1.sym_sizes().data(), dim1 - 2); - SymIntArrayRef batch_tensor2(x2.sym_sizes().data(), dim2 - 2); - std::vector expand_batch_portion = - at::infer_size_symint(batch_tensor1, batch_tensor2); - std::vector x1_expand_size(expand_batch_portion); - x1_expand_size.insert(x1_expand_size.end(), {r1, c1}); - std::vector x2_expand_size(expand_batch_portion); - x2_expand_size.insert(x2_expand_size.end(), {r2, c2}); - - const SymInt expand_batch_product = - c10::multiply_integers(expand_batch_portion); - std::vector x1_view{expand_batch_product, r1, c1}; - std::vector x2_view{expand_batch_product, r2, c2}; - - Tensor x1_expanded = - x1.expand_symint(x1_expand_size).contiguous().view_symint(x1_view); - Tensor x2_expanded = - x2.expand_symint(x2_expand_size).contiguous().view_symint(x2_view); - - std::vector output_shape(std::move(expand_batch_portion)); - output_shape.insert(output_shape.end(), {r1, r2}); - - Tensor result; - if (r1 == 0 || r2 == 0 || expand_batch_product == 0) { - result = at::empty_symint(output_shape, x1.options()); - } else if (c1 == 0) { - result = at::zeros_symint(output_shape, x1.options()); - } else if (p == 2 && (mode == 1 || (mode == 0 && (r1 > 25 || r2 > 25)))) { - Tensor dist = (expand_batch_product == 1) - ? at::_euclidean_dist(x1, x2) - : at::_euclidean_dist(x1_expanded, x2_expanded); - result = dist.view_symint(output_shape); - } else { - result = at::empty_symint(output_shape, x1.options()); - native::xpu::cdist_kernel(result, x1_expanded, x2_expanded, p); - } - return result; +namespace native { +REGISTER_XPU_DISPATCH(cdist_stub, &xpu::cdist_kernel); } - -Tensor XPUNativeFunctions::_cdist_forward( - const Tensor& x1, - const Tensor& x2, - const double p, - c10::optional compute_mode) { - TORCH_CHECK( - x1.dim() >= 2, - "cdist only supports at least 2D tensors, X1 got: ", - x1.dim(), - "D"); - TORCH_CHECK( - x2.dim() >= 2, - "cdist only supports at least 2D tensors, X2 got: ", - x2.dim(), - "D"); - TORCH_CHECK( - x1.size(-1) == x2.size(-1), - "X1 and X2 must have the same number of columns. 
X1: ", - x1.size(-1), - " X2: ", - x2.size(-1)); - - return cdist_impl(x1, x2, p, compute_mode); -} - } // namespace at diff --git a/src/ATen/native/xpu/Distributions.cpp b/src/ATen/native/xpu/Distributions.cpp index 51ff727cc..bce51bbf8 100644 --- a/src/ATen/native/xpu/Distributions.cpp +++ b/src/ATen/native/xpu/Distributions.cpp @@ -1,314 +1,32 @@ -#include #include #include #include #include +#include #include +#include #include -#include +#include #include #include #include +#include namespace at { - -template -struct NormalStub { - void operator()( - Tensor& self, - double mean, - double std, - c10::optional gen) { - native::xpu::normal_kernel(self, mean, std, gen); - } -}; - -Tensor& XPUNativeFunctions::normal_( - Tensor& self, - double mean, - double std, - ::std::optional generator) { - return native::templates::normal_impl_( - self, mean, std, std::move(generator)); -} - -// out tensor float -Tensor& XPUNativeFunctions::normal_out( - const Tensor& mean, - double std, - c10::optional gen, - Tensor& output) { - return at::native::templates::normal_out_impl( - output, mean, std, std::move(gen)); -} - -// functional tensor float -Tensor XPUNativeFunctions::normal( - const Tensor& mean, - double std, - c10::optional gen) { - return at::native::templates::normal_impl( - mean, std, std::move(gen)); -} - -// out float tensor -Tensor& XPUNativeFunctions::normal_out( - double mean, - const Tensor& std, - c10::optional gen, - Tensor& output) { - return at::native::templates::normal_out_impl( - output, mean, std, std::move(gen)); -} - -// functional float tensor -Tensor XPUNativeFunctions::normal( - double mean, - const Tensor& std, - c10::optional gen) { - return at::native::templates::normal_impl( - mean, std, std::move(gen)); -} - -// out tensor tensor -Tensor& XPUNativeFunctions::normal_out( - const Tensor& mean, - const Tensor& std, - c10::optional gen, - Tensor& output) { - return at::native::templates::normal_out_impl( - output, mean, std, std::move(gen)); -} - -// functional tensor tensor -Tensor XPUNativeFunctions::normal( - const Tensor& mean, - const Tensor& std, - c10::optional gen) { - return at::native::templates::normal_impl( - mean, std, std::move(gen)); -} - -template -struct UniformStub { - void operator()( - TensorIteratorBase& iter, - double from, - double to, - c10::optional gen) { - native::xpu::uniform_kernel(iter, from, to, gen); - } -}; - -Tensor& XPUNativeFunctions::uniform_( - Tensor& self, - double from, - double to, - ::std::optional generator) { - return native::templates::uniform_impl_( - self, from, to, std::move(generator)); -} - -template -struct BernoulliStub { - void operator()( - Tensor& self, - const Tensor& p_, - c10::optional gen) { - native::xpu::bernoulli_tensor_kernel(self, p_, gen); - } - void operator()(Tensor& self, double p, c10::optional gen) { - native::xpu::bernoulli_scalar_kernel(self, p, gen); - } -}; - -Tensor& XPUNativeFunctions::bernoulli_( - Tensor& self, - const Tensor& p_, - ::std::optional generator) { - return native::templates::bernoulli_impl_( - self, p_, std::move(generator)); -} - -Tensor& XPUNativeFunctions::bernoulli_( - Tensor& self, - double p, - ::std::optional generator) { - return native::templates::bernoulli_impl_( - self, p, std::move(generator)); -} - -Tensor& XPUNativeFunctions::bernoulli_out( - const Tensor& self, - c10::optional gen, - Tensor& result) { - return native::templates::bernoulli_out_impl( - result, self, std::move(gen)); -} - -template -struct RandomStub { - void operator()(TensorIteratorBase& iter, 
c10::optional gen) { - native::xpu::random_kernel(iter, gen); - } -}; - -Tensor& XPUNativeFunctions::random_( - Tensor& self, - ::std::optional generator) { - return native::templates::random_impl( - self, std::move(generator)); -} - -template -struct RandomFromToStub { - void operator()( - TensorIteratorBase& iter, - uint64_t range, - int64_t from, - c10::optional gen) { - native::xpu::random_from_to_kernel(iter, range, from, gen); - } - void operator()(TensorIteratorBase& iter, c10::optional gen) { - native::xpu::random_full_64_bits_range_kernel(iter, gen); - } -}; - -Tensor& XPUNativeFunctions::random_( - Tensor& self, - int64_t from, - c10::optional to_opt, - ::std::optional generator) { - return native::templates::random_from_to_impl( - self, from, to_opt, std::move(generator)); -} - -Tensor& XPUNativeFunctions::random_( - Tensor& self, - int64_t to, - ::std::optional generator) { - return random_(self, 0, to, std::move(generator)); -} - -template -struct ExponentialStub { - void operator()( - TensorIteratorBase& iter, - double lambda, - c10::optional gen) { - native::xpu::exponential_kernel(iter, lambda, gen); - } -}; - -Tensor& XPUNativeFunctions::exponential_( - Tensor& self, - double lambda, - std::optional generator) { - return native::templates::exponential_impl_( - self, lambda, std::move(generator)); -} - -/* The largest consecutive integer representable in float32 (2^24) */ -constexpr int64_t FLOAT32_MAX_CONSECUTIVE_INT = 1 << (24); - -Tensor& XPUNativeFunctions::multinomial_out( - const Tensor& self, - int64_t n_sample, - bool with_replacement, - ::std::optional gen, - at::Tensor& result) { - TORCH_CHECK( - result.device() == self.device(), - "multinomial arguments must have the same device"); - TORCH_CHECK( - self.dim() > 0 && self.dim() <= 2, "prob_dist must be 1 or 2 dim"); - TORCH_CHECK( - at::isFloatingType(self.scalar_type()), - "multinomial only supports floating-point dtypes for input, got: ", - self.scalar_type()); - TORCH_CHECK( - result.scalar_type() == ScalarType::Long, - "multinomial expects Long tensor out, got: ", - result.scalar_type()); - TORCH_CHECK(n_sample > 0, "cannot sample n_sample <= 0 samples"); - int64_t n_categories = self.size(-1); - TORCH_CHECK( - with_replacement || (n_sample <= n_categories), - "cannot sample n_sample > prob_dist.size(-1) samples without replacement"); - // Since the index tensor is float, numCategories cannot exceed max - // float integer precision - TORCH_CHECK( - n_categories <= FLOAT32_MAX_CONSECUTIVE_INT, - "number of categories cannot exceed 2^24"); - - if (self.dim() == 1) { - result.resize_({n_sample}); - } else { - const int64_t n_dist = self.size(0); - result.resize_({n_dist, n_sample}); - } - if (result.numel() == 0) { - return result; - } - - // Fast-path for no replacement or if only one sample is drawn. - // Reference: - // https://github.com/pytorch/pytorch/issues/11931#issuecomment-625882503 - if (!with_replacement || n_sample == 1) { - // Sanity checks on `self`. 
- auto is_valid = ((self.max() < INFINITY) & (self.min() >= 0)).item(); - TORCH_CHECK( - is_valid.to(), - "probability tensor contains either `inf`, `nan` or element < 0"); - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - bool zero_prob_condition; - if (self.dim() == 1) { - zero_prob_condition = (self.sum() == 0).item().to(); - } else { - zero_prob_condition = (self.sum(1) == 0).sum().item().to(); - } - TORCH_CHECK( - !zero_prob_condition, - "invalid multinomial distribution (sum of probabilities <= 0)"); - - // The algorithm is from gumbel softmax. - // s = argmax( logp - log(-log(eps)) ) where eps ~ U(0, 1) - // Here we can apply exp to the formula which will not affect result of - // argmax or topk. Then we have - // s = argmax( p / (-log(eps)) ) where eps ~ U(0, 1). - // We can also simplify the formula above by - // s = argmax( p / q ) where q ~ Exp(1) - Tensor q = at::empty_like(self).exponential_(1, std::move(gen)); - // In theory the probability to generate 0 from exponential distribution is - // 0. However, on CUDA side there is a protection to avoid 0s, but on CPU - // side, there is a very low probability to generate 0 from - // exponential. The probability is about 2^(-DBL_MANT_DIG). We just - // ignore it here, but there may be some risk to get invalid output on CPU. - at::div_out(q, self, q); - if (n_sample == 1) { - at::argmax_out(result, q, /*dim=*/-1, /*keepdim=*/true); - } else { - Tensor vals = at::empty(result.sizes(), self.options()); - at::topk_out(vals, result, q, n_sample); - } - return result; - } - - at::native::xpu::multinomial_kernel(result, self, n_sample, gen); - return result; -} - -Tensor XPUNativeFunctions::multinomial( - const Tensor& self, - int64_t n_sample, - bool with_replacement, - ::std::optional gen) { - Tensor result = at::empty({0}, self.options().dtype(kLong)); - - XPUNativeFunctions::multinomial_out( - self, n_sample, with_replacement, std::move(gen), result); - return result; -} - +namespace native { +REGISTER_XPU_DISPATCH(normal_stub, &xpu::normal_kernel); +REGISTER_XPU_DISPATCH(uniform_stub, &xpu::uniform_kernel); +REGISTER_XPU_DISPATCH(bernoulli_scalar_stub, &xpu::bernoulli_scalar_kernel); +REGISTER_XPU_DISPATCH(bernoulli_tensor_stub, &xpu::bernoulli_tensor_kernel); +REGISTER_XPU_DISPATCH(random_stub, &xpu::random_kernel); +REGISTER_XPU_DISPATCH(random_from_to_stub, &xpu::random_from_to_kernel); +REGISTER_XPU_DISPATCH(exponential_stub, &xpu::exponential_kernel); +REGISTER_XPU_DISPATCH( + random_full_64_bits_range_stub, + &xpu::random_full_64_bits_range_kernel); +REGISTER_XPU_DISPATCH( + multinomial_with_replacement_stub, + &xpu::multinomial_kernel); +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Dropout.cpp b/src/ATen/native/xpu/Dropout.cpp index d3d74dbaf..950afccc0 100644 --- a/src/ATen/native/xpu/Dropout.cpp +++ b/src/ATen/native/xpu/Dropout.cpp @@ -1,20 +1,24 @@ -#include #include #include #include #include -#include + +#include +#include + +#include namespace at { -::std::tuple XPUNativeFunctions::native_dropout( +namespace native { +::std::tuple native_dropout_xpu( const Tensor& input, double p, ::std::optional train) { return at::native::xpu::dropout_kernel(input, p, train); } -Tensor XPUNativeFunctions::native_dropout_backward( +Tensor native_dropout_backward_xpu( const Tensor& grad_output, const Tensor& mask, double scale) { @@ -29,4 +33,5 @@ Tensor XPUNativeFunctions::native_dropout_backward( return at::native::xpu::dropout_backward_kernel(grad_output, mask, scale); } +} // namespace native } // 
namespace at diff --git a/src/ATen/native/xpu/Embedding.cpp b/src/ATen/native/xpu/Embedding.cpp index 1eb073e43..41901fb10 100644 --- a/src/ATen/native/xpu/Embedding.cpp +++ b/src/ATen/native/xpu/Embedding.cpp @@ -1,12 +1,13 @@ -#include #include +#include + #include -#include +#include namespace at { - -Tensor XPUNativeFunctions::embedding_dense_backward( +namespace native { +Tensor embedding_dense_backward_xpu( const Tensor& grad_output, const Tensor& indices, int64_t num_weights, @@ -20,9 +21,9 @@ Tensor XPUNativeFunctions::embedding_dense_backward( "grad_output"); c10::impl::check_and_update_common_device( common_device, indices, "xpu::embedding_dense_backward", "indices"); - return native::xpu::embedding_dense_backward_kernel( + return xpu::embedding_dense_backward_kernel( grad_output, indices, num_weights, padding_idx, scale_grad_by_freq); ; } - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/EmbeddingBag.cpp b/src/ATen/native/xpu/EmbeddingBag.cpp index 7300157d5..0786a9061 100644 --- a/src/ATen/native/xpu/EmbeddingBag.cpp +++ b/src/ATen/native/xpu/EmbeddingBag.cpp @@ -1,11 +1,13 @@ -#include -#include +#include +#include #include +#include namespace at { +namespace native { -std::tuple XPUNativeFunctions::_embedding_bag( +std::tuple _embedding_bag_xpu( const Tensor& weight, const Tensor& indices, const Tensor& offsets, @@ -46,55 +48,27 @@ std::tuple XPUNativeFunctions::_embedding_bag( padding_idx); } -std::tuple XPUNativeFunctions:: - _embedding_bag_forward_only( - const Tensor& weight, - const Tensor& indices, - const Tensor& offsets, - bool scale_grad_by_freq, - int64_t mode, - bool sparse, - const c10::optional& per_sample_weights_opt, - bool include_last_offset, - int64_t padding_idx) { - return _embedding_bag( - weight, - indices, - offsets, - scale_grad_by_freq, - mode, - sparse, - per_sample_weights_opt, - include_last_offset, - padding_idx); -} - -Tensor XPUNativeFunctions::_embedding_bag_backward( - const Tensor& grad, +std::tuple _embedding_bag_forward_only_xpu( + const Tensor& weight, const Tensor& indices, const Tensor& offsets, - const Tensor& offset2bag, - const Tensor& bag_size, - const Tensor& maximum_indices, - int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse, - const c10::optional& per_sample_weights, + const c10::optional& per_sample_weights_opt, + bool include_last_offset, int64_t padding_idx) { - return at::native::_embedding_bag_backward_symint( - grad, + return _embedding_bag_xpu( + weight, indices, offsets, - offset2bag, - bag_size, - maximum_indices, - num_weights, scale_grad_by_freq, mode, sparse, - per_sample_weights, + per_sample_weights_opt, + include_last_offset, padding_idx); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Equal.cpp b/src/ATen/native/xpu/Equal.cpp index 107936c72..dcee9b380 100644 --- a/src/ATen/native/xpu/Equal.cpp +++ b/src/ATen/native/xpu/Equal.cpp @@ -1,15 +1,16 @@ #include -#include -#ifndef AT_PER_OPERATOR_HEADERS -#include -#else -#include -#endif +#include namespace at { - -bool XPUNativeFunctions::equal(const Tensor& self, const Tensor& src) { +namespace xpu { +// Note: +// Seems {op}_xpu_dispatch.h is not generated in codegen via +// backendwhitelist mode. We have to manually add a declaration here. 
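+// (This hand-written prototype mirrors what the generated per-operator
+// dispatch header would normally declare; at::xpu::eq is the XPU-dispatched
+// eq entry point that xpu_equal() below calls. If codegen later emits the
+// header in backend-whitelist mode, this declaration can presumably be
+// replaced by including it; the exact header path is an assumption here.)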
+at::Tensor eq(const at::Tensor& self, const at::Tensor& other); +} // namespace xpu +namespace native { +bool xpu_equal(const Tensor& self, const Tensor& src) { if (!at::namedinference::are_names_equal( self.unsafeGetTensorImpl(), src.unsafeGetTensorImpl())) { return false; @@ -38,7 +39,7 @@ bool XPUNativeFunctions::equal(const Tensor& self, const Tensor& src) { return true; } - return at::XPUNativeFunctions::eq(self, src).all().item().to(); + return at::xpu::eq(self, src).all().item().to(); } - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Fill.cpp b/src/ATen/native/xpu/Fill.cpp index 025b2853f..88b3f765d 100644 --- a/src/ATen/native/xpu/Fill.cpp +++ b/src/ATen/native/xpu/Fill.cpp @@ -1,50 +1,10 @@ #include #include +#include #include #include -#include #include - -namespace at { - -Tensor& fill_out(Tensor& self, const Scalar& value) { - auto iter = TensorIteratorConfig() - .set_check_mem_overlap( - false) // Fill is idempotent, so overlap is okay - .check_all_same_dtype(false) - .add_output(self) - .resize_outputs(false) - .build(); - native::xpu::fill_kernel(iter, value); - return self; -} - -Tensor& XPUNativeFunctions::fill_(Tensor& self, const Scalar& value) { - return fill_out(self, value); -} - -Tensor& XPUNativeFunctions::fill_(Tensor& self, const Tensor& value) { - TORCH_CHECK( - value.dim() == 0, - "fill_ only supports 0-dimension value tensor but got tensor with ", - value.dim(), - " dimensions."); - if (self.device() != value.device()) { - return fill_out(self, value.item()); - } - // Check if value is a view of self and if it is we clone - // it to avoid overwriting self prematurely - if (self.is_alias_of(value)) { - self.copy_(value.clone()); - } else { - self.copy_(value); - } - return self; -} - -Tensor& XPUNativeFunctions::zero_(Tensor& self) { - return self.fill_(0); -} - -} // namespace at +namespace at::native { +REGISTER_XPU_DISPATCH(fill_stub, &native::xpu::fill_kernel); +} // namespace at::native diff --git a/src/ATen/native/xpu/ForeachOpList.cpp b/src/ATen/native/xpu/ForeachOpList.cpp index 9d9e01af8..6813a91ae 100644 --- a/src/ATen/native/xpu/ForeachOpList.cpp +++ b/src/ATen/native/xpu/ForeachOpList.cpp @@ -1,141 +1,186 @@ #include -#include - #include #include #include +#include + namespace at { +namespace native { + +::std::vector foreach_tensor_mul_list_kernel_slow( + at::TensorList self, + at::TensorList other); +void foreach_tensor_mul_list_kernel_slow_( + at::TensorList self, + at::TensorList other); + +::std::vector foreach_tensor_div_list_kernel_slow( + at::TensorList self, + at::TensorList other); +void foreach_tensor_div_list_kernel_slow_( + at::TensorList self, + at::TensorList other); + +::std::vector foreach_tensor_add_list_kernel_slow( + at::TensorList self, + at::TensorList other, + const at::Scalar& alpha); +void foreach_tensor_add_list_kernel_slow_( + at::TensorList self, + at::TensorList other, + const at::Scalar& alpha); #define FOREACH_BINARY_OP_LIST(NAME, DIVISION_OP) \ - void XPUNativeFunctions::_foreach_##NAME##_( \ + void foreach_tensor_##NAME##_list_kernel_xpu_( \ TensorList tensors1, TensorList tensors2) { \ - at::native::check_foreach_api_restrictions(tensors1, tensors2); \ - if (!at::native::can_use_fast_route(tensors1, tensors2, DIVISION_OP)) { \ - return at::native::foreach_tensor_##NAME##_list_kernel_slow_( \ - tensors1, tensors2); \ + check_foreach_api_restrictions(tensors1, tensors2); \ + if (!can_use_fast_route(tensors1, tensors2, DIVISION_OP)) { \ + return 
foreach_tensor_##NAME##_list_kernel_slow_(tensors1, tensors2); \ } \ \ - at::native::xpu::FOREACH_BINARY_LIST_INPLACE_KERNEL_NAME(NAME)( \ - tensors1, tensors2); \ + xpu::FOREACH_BINARY_LIST_INPLACE_KERNEL_NAME(NAME)(tensors1, tensors2); \ } \ \ - std::vector XPUNativeFunctions::_foreach_##NAME( \ + std::vector foreach_tensor_##NAME##_list_kernel_xpu( \ TensorList tensors1, TensorList tensors2) { \ - at::native::check_foreach_api_restrictions(tensors1, tensors2); \ - if (!at::native::can_use_fast_route(tensors1, tensors2, DIVISION_OP)) { \ - return at::native::foreach_tensor_##NAME##_list_kernel_slow( \ - tensors1, tensors2); \ + check_foreach_api_restrictions(tensors1, tensors2); \ + if (!can_use_fast_route(tensors1, tensors2, DIVISION_OP)) { \ + return foreach_tensor_##NAME##_list_kernel_slow(tensors1, tensors2); \ } \ \ - return at::native::xpu::FOREACH_BINARY_LIST_KERNEL_NAME(NAME)( \ - tensors1, tensors2); \ + return xpu::FOREACH_BINARY_LIST_KERNEL_NAME(NAME)(tensors1, tensors2); \ } -#define FOREACH_BINARY_OP_LIST_ALPHA(NAME) \ - void XPUNativeFunctions::_foreach_##NAME##_( \ - TensorList tensors1, TensorList tensors2, const Scalar& alpha) { \ - at::native::check_foreach_api_restrictions(tensors1, tensors2); \ - if (!at::native::can_use_fast_route({tensors1, tensors2}, alpha)) { \ - return at::native::foreach_tensor_##NAME##_list_kernel_slow_( \ - tensors1, tensors2, alpha); \ - } \ - \ - at::native::xpu::FOREACH_BINARY_LIST_ALPHA_INPLACE_KERNEL_NAME(NAME)( \ - tensors1, tensors2, alpha); \ - } \ - \ - std::vector XPUNativeFunctions::_foreach_##NAME( \ - TensorList tensors1, TensorList tensors2, const Scalar& alpha) { \ - at::native::check_foreach_api_restrictions(tensors1, tensors2); \ - if (!at::native::can_use_fast_route({tensors1, tensors2}, alpha)) { \ - return at::native::foreach_tensor_##NAME##_list_kernel_slow( \ - tensors1, tensors2, alpha); \ - } \ - \ - return at::native::xpu::FOREACH_BINARY_LIST_ALPHA_KERNEL_NAME(NAME)( \ - tensors1, tensors2, alpha); \ +#define FOREACH_BINARY_OP_LIST_ALPHA(NAME) \ + void foreach_tensor_##NAME##_list_kernel_xpu_( \ + TensorList tensors1, TensorList tensors2, const Scalar& alpha) { \ + check_foreach_api_restrictions(tensors1, tensors2); \ + if (!can_use_fast_route({tensors1, tensors2}, alpha)) { \ + return foreach_tensor_##NAME##_list_kernel_slow_( \ + tensors1, tensors2, alpha); \ + } \ + \ + xpu::FOREACH_BINARY_LIST_ALPHA_INPLACE_KERNEL_NAME(NAME)( \ + tensors1, tensors2, alpha); \ + } \ + \ + std::vector foreach_tensor_##NAME##_list_kernel_xpu( \ + TensorList tensors1, TensorList tensors2, const Scalar& alpha) { \ + check_foreach_api_restrictions(tensors1, tensors2); \ + if (!can_use_fast_route({tensors1, tensors2}, alpha)) { \ + return foreach_tensor_##NAME##_list_kernel_slow( \ + tensors1, tensors2, alpha); \ + } \ + \ + return xpu::FOREACH_BINARY_LIST_ALPHA_KERNEL_NAME(NAME)( \ + tensors1, tensors2, alpha); \ } FOREACH_BINARY_OP_LIST_ALPHA(add); FOREACH_BINARY_OP_LIST(mul, false); FOREACH_BINARY_OP_LIST(div, true); -#define FOREACH_POINTWISE_OP_TENSOR(NAME) \ - std::vector XPUNativeFunctions::_foreach_##NAME( \ - TensorList input, \ - TensorList tensors1, \ - TensorList tensors2, \ - const Tensor& scalars_) { \ - auto scalars = \ - at::native::convert_tensor_to_scalar_list(scalars_, input.size()); \ - at::native::check_foreach_api_restrictions( \ - input, tensors1, tensors2, scalars); \ - if (!at::native::can_use_fast_route({input, tensors1, tensors2}) || \ - at::native::has_integral_tensor(input, /* includeBool */ true)) { \ - 
return at::native::foreach_tensor_##NAME##_scalarlist_slow( \ - input, tensors1, tensors2, scalars); \ - } \ - \ - return native::xpu::foreach_##NAME##_kernel( \ - input, tensors1, tensors2, scalars); \ - } \ - \ - void XPUNativeFunctions::_foreach_##NAME##_( \ - TensorList input, \ - TensorList tensors1, \ - TensorList tensors2, \ - const Tensor& scalars_) { \ - auto scalars = \ - at::native::convert_tensor_to_scalar_list(scalars_, input.size()); \ - at::native::check_foreach_api_restrictions( \ - input, tensors1, tensors2, scalars); \ - if (!at::native::can_use_fast_route( \ - {input, tensors1, tensors2}, scalars) || \ - at::native::has_integral_tensor(input, /* includeBool */ true)) { \ - return at::native::foreach_tensor_##NAME##_scalarlist_slow_( \ - input, tensors1, tensors2, scalars); \ - } \ - \ - native::xpu::foreach_##NAME##_kernel_(input, tensors1, tensors2, scalars); \ +::std::vector foreach_tensor_addcmul_scalarlist_slow( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + at::ArrayRef scalars); +void foreach_tensor_addcmul_scalarlist_slow_( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + at::ArrayRef scalars); + +::std::vector foreach_tensor_addcdiv_scalarlist_slow( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + at::ArrayRef scalars); +void foreach_tensor_addcdiv_scalarlist_slow_( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + at::ArrayRef scalars); + +#define FOREACH_POINTWISE_OP_TENSOR(NAME) \ + std::vector foreach_tensor_##NAME##_list_kernel_xpu( \ + TensorList input, \ + TensorList tensors1, \ + TensorList tensors2, \ + const Tensor& scalars_) { \ + auto scalars = \ + at::native::convert_tensor_to_scalar_list(scalars_, input.size()); \ + at::native::check_foreach_api_restrictions( \ + input, tensors1, tensors2, scalars); \ + if (!at::native::can_use_fast_route({input, tensors1, tensors2}) || \ + at::native::has_integral_tensor(input, /* includeBool */ true)) { \ + return at::native::foreach_tensor_##NAME##_scalarlist_slow( \ + input, tensors1, tensors2, scalars); \ + } \ + \ + return native::xpu::foreach_##NAME##_kernel( \ + input, tensors1, tensors2, scalars); \ + } \ + \ + void foreach_tensor_##NAME##_list_kernel_xpu_( \ + TensorList input, \ + TensorList tensors1, \ + TensorList tensors2, \ + const Tensor& scalars_) { \ + auto scalars = convert_tensor_to_scalar_list(scalars_, input.size()); \ + check_foreach_api_restrictions(input, tensors1, tensors2, scalars); \ + if (!can_use_fast_route({input, tensors1, tensors2}, scalars) || \ + has_integral_tensor(input, /* includeBool */ true)) { \ + return foreach_tensor_##NAME##_scalarlist_slow_( \ + input, tensors1, tensors2, scalars); \ + } \ + \ + xpu::foreach_##NAME##_kernel_(input, tensors1, tensors2, scalars); \ } FOREACH_POINTWISE_OP_TENSOR(addcmul) FOREACH_POINTWISE_OP_TENSOR(addcdiv) -std::vector XPUNativeFunctions::_foreach_lerp( +::std::vector foreach_tensor_ternary_lerp_slow( + at::TensorList self, + at::TensorList tensors1, + at::TensorList weights); + +std::vector foreach_tensor_lerp_ternary_xpu( TensorList tensors1, TensorList tensors2, TensorList tensors3) { - at::native::check_foreach_api_restrictions(tensors1, tensors2, tensors3); - if (!at::native::can_use_fast_route( - {tensors1, tensors2, tensors3}, {}, true)) { - return at::native::foreach_tensor_ternary_lerp_slow( - tensors1, tensors2, tensors3); + check_foreach_api_restrictions(tensors1, tensors2, tensors3); + if 
(!can_use_fast_route({tensors1, tensors2, tensors3}, {}, true)) { + return foreach_tensor_ternary_lerp_slow(tensors1, tensors2, tensors3); } std::vector vec_res; vec_res.reserve(tensors1.size()); for (const auto& t : tensors1) { - vec_res.emplace_back(at::native::empty_like(t)); + vec_res.emplace_back(at::empty_like(t)); } - native::xpu::foreach_lerp_list_kernel(tensors1, tensors2, tensors3, vec_res); + xpu::foreach_lerp_list_kernel(tensors1, tensors2, tensors3, vec_res); return vec_res; } -void XPUNativeFunctions::_foreach_lerp_( +void foreach_tensor_ternary_lerp_slow_( + at::TensorList self, + at::TensorList tensors1, + at::TensorList weights); + +void foreach_tensor_lerp_ternary_xpu_( TensorList tensors1, TensorList tensors2, TensorList tensors3) { - at::native::check_foreach_api_restrictions(tensors1, tensors2, tensors3); - if (!at::native::can_use_fast_route( - {tensors1, tensors2, tensors3}, {}, true)) { - return at::native::foreach_tensor_ternary_lerp_slow_( - tensors1, tensors2, tensors3); + check_foreach_api_restrictions(tensors1, tensors2, tensors3); + if (!can_use_fast_route({tensors1, tensors2, tensors3}, {}, true)) { + return foreach_tensor_ternary_lerp_slow_(tensors1, tensors2, tensors3); } - native::xpu::foreach_lerp_list_kernel_(tensors1, tensors2, tensors3); + xpu::foreach_lerp_list_kernel_(tensors1, tensors2, tensors3); // TODO: Handle version bump in codegen. // increment_version @@ -144,4 +189,5 @@ void XPUNativeFunctions::_foreach_lerp_( } } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/ForeachOpScalar.cpp b/src/ATen/native/xpu/ForeachOpScalar.cpp index 95238d0dc..46b908ced 100644 --- a/src/ATen/native/xpu/ForeachOpScalar.cpp +++ b/src/ATen/native/xpu/ForeachOpScalar.cpp @@ -3,109 +3,155 @@ #include #include #include -#include namespace at { -#define FOREACH_BINARY_OP_SCALAR(NAME, DIV_OP) \ - void XPUNativeFunctions::_foreach_##NAME##_( \ - TensorList tensors, const Scalar& scalar) { \ - at::native::check_foreach_api_restrictions(tensors); \ - if (!at::native::can_use_fast_route(tensors, scalar, DIV_OP)) { \ - return at::native::foreach_tensor_##NAME##_scalar_kernel_slow_( \ - tensors, scalar); \ - } \ - \ - at::native::xpu::FOREACH_BINARY_SCALAR_INPLACE_KERNEL_NAME(NAME)( \ - tensors, scalar); \ - } \ - \ - std::vector XPUNativeFunctions::_foreach_##NAME( \ - TensorList tensors, const Scalar& scalar) { \ - at::native::check_foreach_api_restrictions(tensors); \ - if (!at::native::can_use_fast_route(tensors, scalar, DIV_OP)) { \ - return at::native::foreach_tensor_##NAME##_scalar_kernel_slow( \ - tensors, scalar); \ - } \ - \ - return at::native::xpu::FOREACH_BINARY_SCALAR_KERNEL_NAME(NAME)( \ - tensors, scalar); \ +namespace native { + +::std::vector foreach_tensor_add_scalar_kernel_slow( + at::TensorList self, + const at::Scalar& scalar); +void foreach_tensor_add_scalar_kernel_slow_( + at::TensorList self, + const at::Scalar& scalar); + +::std::vector foreach_tensor_mul_scalar_kernel_slow( + at::TensorList self, + const at::Scalar& scalar); +void foreach_tensor_mul_scalar_kernel_slow_( + at::TensorList self, + const at::Scalar& scalar); + +::std::vector foreach_tensor_div_scalar_kernel_slow( + at::TensorList self, + const at::Scalar& scalar); +void foreach_tensor_div_scalar_kernel_slow_( + at::TensorList self, + const at::Scalar& scalar); + +#define FOREACH_BINARY_OP_SCALAR(NAME, DIV_OP) \ + void foreach_tensor_##NAME##_scalar_kernel_xpu_( \ + TensorList tensors, const Scalar& scalar) { \ + check_foreach_api_restrictions(tensors); \ + if 
(!can_use_fast_route(tensors, scalar, DIV_OP)) { \ + return foreach_tensor_##NAME##_scalar_kernel_slow_(tensors, scalar); \ + } \ + \ + xpu::FOREACH_BINARY_SCALAR_INPLACE_KERNEL_NAME(NAME)(tensors, scalar); \ + } \ + \ + std::vector foreach_tensor_##NAME##_scalar_kernel_xpu( \ + TensorList tensors, const Scalar& scalar) { \ + check_foreach_api_restrictions(tensors); \ + if (!can_use_fast_route(tensors, scalar, DIV_OP)) { \ + return foreach_tensor_##NAME##_scalar_kernel_slow(tensors, scalar); \ + } \ + \ + return xpu::FOREACH_BINARY_SCALAR_KERNEL_NAME(NAME)(tensors, scalar); \ } FOREACH_BINARY_OP_SCALAR(add, /*div_op*/ false); FOREACH_BINARY_OP_SCALAR(mul, /*div_op*/ false); FOREACH_BINARY_OP_SCALAR(div, /*div_op*/ true); -#define FOREACH_POINTWISE_OP_SCALAR(NAME) \ - std::vector XPUNativeFunctions::_foreach_##NAME( \ - TensorList input, \ - TensorList tensors1, \ - TensorList tensors2, \ - const Scalar& scalar) { \ - at::native::check_foreach_api_restrictions(input, tensors1, tensors2); \ - \ - if (!at::native::can_use_fast_route( \ - {input, tensors1, tensors2}, scalar) || \ - at::native::has_integral_tensor(input, /* includeBool */ true)) { \ - return at::native::foreach_tensor_##NAME##_scalar_slow( \ - input, tensors1, tensors2, scalar); \ - } \ - \ - return native::xpu::foreach_##NAME##_kernel( \ - input, tensors1, tensors2, scalar); \ - } \ - \ - void XPUNativeFunctions::_foreach_##NAME##_( \ - TensorList input, \ - TensorList tensors1, \ - TensorList tensors2, \ - const Scalar& scalar) { \ - at::native::check_foreach_api_restrictions(input, tensors1, tensors2); \ - \ - if (!at::native::can_use_fast_route( \ - {input, tensors1, tensors2}, scalar) || \ - at::native::has_integral_tensor(input, /* includeBool */ true)) { \ - return at::native::foreach_tensor_##NAME##_scalar_slow_( \ - input, tensors1, tensors2, scalar); \ - } \ - \ - native::xpu::foreach_##NAME##_kernel_(input, tensors1, tensors2, scalar); \ +::std::vector foreach_tensor_addcmul_scalar_slow( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + const at::Scalar& value); +void foreach_tensor_addcmul_scalar_slow_( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + const at::Scalar& value); + +::std::vector foreach_tensor_addcdiv_scalar_slow( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + const at::Scalar& value); +void foreach_tensor_addcdiv_scalar_slow_( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + const at::Scalar& value); + +#define FOREACH_POINTWISE_OP_SCALAR(NAME) \ + std::vector foreach_tensor_##NAME##_scalar_xpu( \ + TensorList input, \ + TensorList tensors1, \ + TensorList tensors2, \ + const Scalar& scalar) { \ + check_foreach_api_restrictions(input, tensors1, tensors2); \ + \ + if (!can_use_fast_route({input, tensors1, tensors2}, scalar) || \ + has_integral_tensor(input, /* includeBool */ true)) { \ + return foreach_tensor_##NAME##_scalar_slow( \ + input, tensors1, tensors2, scalar); \ + } \ + \ + return xpu::foreach_##NAME##_kernel(input, tensors1, tensors2, scalar); \ + } \ + \ + void foreach_tensor_##NAME##_scalar_xpu_( \ + TensorList input, \ + TensorList tensors1, \ + TensorList tensors2, \ + const Scalar& scalar) { \ + check_foreach_api_restrictions(input, tensors1, tensors2); \ + \ + if (!can_use_fast_route({input, tensors1, tensors2}, scalar) || \ + has_integral_tensor(input, /* includeBool */ true)) { \ + return foreach_tensor_##NAME##_scalar_slow_( \ + input, tensors1, tensors2, scalar); \ + } 
\ + \ + xpu::foreach_##NAME##_kernel_(input, tensors1, tensors2, scalar); \ } FOREACH_POINTWISE_OP_SCALAR(addcmul) FOREACH_POINTWISE_OP_SCALAR(addcdiv) -std::vector XPUNativeFunctions::_foreach_lerp( +::std::vector foreach_tensor_lerp_list_kernel_slow( + at::TensorList self, + at::TensorList tensors1, + const at::Scalar& weight); +void foreach_tensor_lerp_list_kernel_slow_( + at::TensorList self, + at::TensorList tensors1, + const at::Scalar& weight); + +std::vector foreach_tensor_lerp_list_xpu( TensorList tensors1, TensorList tensors2, const Scalar& weight) { - at::native::check_foreach_api_restrictions(tensors1, tensors2); - if (!at::native::can_use_fast_route({tensors1, tensors2}, {}, true)) { - return at::native::foreach_tensor_lerp_list_kernel_slow( - tensors1, tensors2, weight); + check_foreach_api_restrictions(tensors1, tensors2); + if (!can_use_fast_route({tensors1, tensors2}, {}, true)) { + return foreach_tensor_lerp_list_kernel_slow(tensors1, tensors2, weight); } std::vector vec_res; vec_res.reserve(tensors1.size()); for (const auto& t : tensors1) { - vec_res.emplace_back(at::native::empty_like(t)); + vec_res.emplace_back(at::empty_like(t)); } - native::xpu::foreach_lerp_scalar_kernel(tensors1, tensors2, weight, vec_res); + xpu::foreach_lerp_scalar_kernel(tensors1, tensors2, weight, vec_res); return vec_res; } -void XPUNativeFunctions::_foreach_lerp_( +void foreach_tensor_lerp_list_xpu_( TensorList tensors1, TensorList tensors2, const Scalar& weight) { - at::native::check_foreach_api_restrictions(tensors1, tensors2); - if (!at::native::can_use_fast_route({tensors1, tensors2}, {}, true)) { - return at::native::foreach_tensor_lerp_list_kernel_slow_( - tensors1, tensors2, weight); + check_foreach_api_restrictions(tensors1, tensors2); + if (!can_use_fast_route({tensors1, tensors2}, {}, true)) { + return foreach_tensor_lerp_list_kernel_slow_(tensors1, tensors2, weight); } - native::xpu::foreach_lerp_scalar_kernel_(tensors1, tensors2, weight); + xpu::foreach_lerp_scalar_kernel_(tensors1, tensors2, weight); } + +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/ForeachOpScalarList.cpp b/src/ATen/native/xpu/ForeachOpScalarList.cpp index 7752395db..6ac047476 100644 --- a/src/ATen/native/xpu/ForeachOpScalarList.cpp +++ b/src/ATen/native/xpu/ForeachOpScalarList.cpp @@ -2,78 +2,154 @@ #include #include -#include + +#include +#include namespace at { +namespace native { +::std::vector foreach_tensor_add_scalar_kernel_slow( + at::TensorList self, + const at::Scalar& scalar); +void foreach_tensor_add_scalar_kernel_slow_( + at::TensorList self, + const at::Scalar& scalar); +::std::vector foreach_tensor_mul_scalar_kernel_slow( + at::TensorList self, + const at::Scalar& scalar); +void foreach_tensor_mul_scalar_kernel_slow_( + at::TensorList self, + const at::Scalar& scalar); -#define FOREACH_BINARY_OP_SCALARLIST(NAME, DIV_OP) \ - void XPUNativeFunctions::_foreach_##NAME##_( \ - TensorList tensors, at::ArrayRef scalars) { \ - at::native::check_foreach_api_restrictions(tensors, scalars); \ - if (!at::native::can_use_fast_route(tensors, scalars, DIV_OP)) { \ - return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow_( \ - tensors, scalars); \ - } \ - \ - at::native::xpu::FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL_NAME(NAME)( \ - tensors, scalars); \ - } \ - \ - std::vector XPUNativeFunctions::_foreach_##NAME( \ - TensorList tensors, at::ArrayRef scalars) { \ - at::native::check_foreach_api_restrictions(tensors, scalars); \ - if (!at::native::can_use_fast_route(tensors, 
scalars, DIV_OP)) { \ - return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow( \ - tensors, scalars); \ - } \ - \ - return at::native::xpu::FOREACH_BINARY_SCALARLIST_KERNEL_NAME(NAME)( \ - tensors, scalars); \ - } +::std::vector foreach_tensor_add_scalarlist_kernel_slow( + at::TensorList self, + at::ArrayRef scalars); +void foreach_tensor_add_scalarlist_kernel_slow_( + at::TensorList self, + at::ArrayRef scalars); +::std::vector foreach_tensor_mul_scalarlist_kernel_slow( + at::TensorList self, + at::ArrayRef scalars); +void foreach_tensor_mul_scalarlist_kernel_slow_( + at::TensorList self, + at::ArrayRef scalars); -FOREACH_BINARY_OP_SCALARLIST(add, /*div_op*/ false); -FOREACH_BINARY_OP_SCALARLIST(mul, /*div_op*/ false); -FOREACH_BINARY_OP_SCALARLIST(div, /*div_op*/ true); +::std::vector foreach_tensor_div_scalar_kernel_slow( + at::TensorList self, + const at::Scalar& scalar); +void foreach_tensor_div_scalar_kernel_slow_( + at::TensorList self, + const at::Scalar& scalar); +::std::vector foreach_tensor_div_scalarlist_kernel_slow( + at::TensorList self, + at::ArrayRef scalars); +void foreach_tensor_div_scalarlist_kernel_slow_( + at::TensorList self, + at::ArrayRef scalars); -#define FOREACH_POINTWISE_OP_SCALARLIST(NAME) \ - std::vector XPUNativeFunctions::_foreach_##NAME( \ - TensorList input, \ - TensorList tensors1, \ - TensorList tensors2, \ - at::ArrayRef scalars) { \ - at::native::check_foreach_api_restrictions( \ - input, tensors1, tensors2, scalars); \ - \ - if (!at::native::can_use_fast_route( \ - {input, tensors1, tensors2}, scalars) || \ - at::native::has_integral_tensor(input, /* includeBool */ true)) { \ - return at::native::foreach_tensor_##NAME##_scalarlist_slow( \ - input, tensors1, tensors2, scalars); \ +#define FOREACH_BINARY_OP_SCALARLIST(NAME, DIV_OP) \ + void foreach_tensor_##NAME##_scalar_kernel_xpu_( \ + TensorList tensors, at::ArrayRef scalars) { \ + check_foreach_api_restrictions(tensors, scalars); \ + if (!can_use_fast_route(tensors, scalars, DIV_OP)) { \ + return foreach_tensor_##NAME##_scalarlist_kernel_slow_( \ + tensors, scalars); \ } \ \ - return native::xpu::foreach_##NAME##_kernel( \ - input, tensors1, tensors2, scalars); \ + xpu::FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL_NAME(NAME)( \ + tensors, scalars); \ } \ \ - void XPUNativeFunctions::_foreach_##NAME##_( \ - TensorList input, \ - TensorList tensors1, \ - TensorList tensors2, \ - at::ArrayRef scalars) { \ - at::native::check_foreach_api_restrictions( \ - input, tensors1, tensors2, scalars); \ - \ - if (!at::native::can_use_fast_route( \ - {input, tensors1, tensors2}, scalars) || \ - at::native::has_integral_tensor(input, /* includeBool */ true)) { \ - return at::native::foreach_tensor_##NAME##_scalarlist_slow_( \ - input, tensors1, tensors2, scalars); \ + std::vector foreach_tensor_##NAME##_scalar_kernel_xpu( \ + TensorList tensors, at::ArrayRef scalars) { \ + check_foreach_api_restrictions(tensors, scalars); \ + if (!can_use_fast_route(tensors, scalars, DIV_OP)) { \ + return foreach_tensor_##NAME##_scalarlist_kernel_slow(tensors, scalars); \ } \ \ - native::xpu::foreach_##NAME##_kernel_(input, tensors1, tensors2, scalars); \ + return xpu::FOREACH_BINARY_SCALARLIST_KERNEL_NAME(NAME)(tensors, scalars); \ + } + +FOREACH_BINARY_OP_SCALARLIST(add, /*div_op*/ false); +FOREACH_BINARY_OP_SCALARLIST(mul, /*div_op*/ false); +FOREACH_BINARY_OP_SCALARLIST(div, /*div_op*/ true); + +void foreach_tensor_addcmul_scalar_slow_( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + const 
at::Scalar& value = 1); +::std::vector foreach_tensor_addcmul_scalar_slow( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + const at::Scalar& value = 1); +::std::vector foreach_tensor_addcmul_scalarlist_slow( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + at::ArrayRef scalars); +void foreach_tensor_addcmul_scalarlist_slow_( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + at::ArrayRef scalars); +void foreach_tensor_addcdiv_scalar_slow_( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + const at::Scalar& value = 1); +::std::vector foreach_tensor_addcdiv_scalar_slow( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + const at::Scalar& value = 1); +::std::vector foreach_tensor_addcdiv_scalarlist_slow( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + at::ArrayRef scalars); +void foreach_tensor_addcdiv_scalarlist_slow_( + at::TensorList self, + at::TensorList tensor1, + at::TensorList tensor2, + at::ArrayRef scalars); + +#define FOREACH_POINTWISE_OP_SCALARLIST(NAME) \ + std::vector foreach_tensor_##NAME##_scalarlist_xpu( \ + TensorList input, \ + TensorList tensors1, \ + TensorList tensors2, \ + at::ArrayRef scalars) { \ + check_foreach_api_restrictions(input, tensors1, tensors2, scalars); \ + \ + if (!can_use_fast_route({input, tensors1, tensors2}, scalars) || \ + has_integral_tensor(input, /* includeBool */ true)) { \ + return foreach_tensor_##NAME##_scalarlist_slow( \ + input, tensors1, tensors2, scalars); \ + } \ + \ + return xpu::foreach_##NAME##_kernel(input, tensors1, tensors2, scalars); \ + } \ + \ + void foreach_tensor_##NAME##_scalarlist_xpu_( \ + TensorList input, \ + TensorList tensors1, \ + TensorList tensors2, \ + at::ArrayRef scalars) { \ + check_foreach_api_restrictions(input, tensors1, tensors2, scalars); \ + \ + if (!can_use_fast_route({input, tensors1, tensors2}, scalars) || \ + has_integral_tensor(input, /* includeBool */ true)) { \ + return foreach_tensor_##NAME##_scalarlist_slow_( \ + input, tensors1, tensors2, scalars); \ + } \ + \ + xpu::foreach_##NAME##_kernel_(input, tensors1, tensors2, scalars); \ } FOREACH_POINTWISE_OP_SCALARLIST(addcmul) FOREACH_POINTWISE_OP_SCALARLIST(addcdiv) +}; // namespace native } // namespace at diff --git a/src/ATen/native/xpu/ForeachReduceOp.cpp b/src/ATen/native/xpu/ForeachReduceOp.cpp index b67314db3..003f6ae14 100644 --- a/src/ATen/native/xpu/ForeachReduceOp.cpp +++ b/src/ATen/native/xpu/ForeachReduceOp.cpp @@ -1,9 +1,10 @@ #include -#include #include +#include namespace at { +namespace native { static inline void check_foreach_norm_dtype( optional opt_dtype, @@ -39,7 +40,7 @@ static inline void check_foreach_norm_dtype( } } -std::vector XPUNativeFunctions::_foreach_norm( +std::vector foreach_tensor_norm_xpu( TensorList tensors, const Scalar& ord, c10::optional dtype) { @@ -68,5 +69,5 @@ std::vector XPUNativeFunctions::_foreach_norm( return native::xpu::foreach_norm_kernel(tensors, ord, p, dtype); } - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/ForeachUnaryOp.cpp b/src/ATen/native/xpu/ForeachUnaryOp.cpp index 4287488dd..89cd0ab4e 100644 --- a/src/ATen/native/xpu/ForeachUnaryOp.cpp +++ b/src/ATen/native/xpu/ForeachUnaryOp.cpp @@ -1,31 +1,33 @@ #include #include -#include namespace at { - +namespace native { // given a functor and a "dispatch function", creates the outplace and inplace // operations -#define FOREACH_UNARY_OP(op_name) \ - 
std::vector XPUNativeFunctions::_foreach_##op_name( \ - TensorList tensors) { \ - native::check_foreach_api_restrictions(tensors); \ - if (!native::can_use_fast_route(tensors) || \ - native::has_integral_tensor(tensors, /* includeBool */ true)) { \ - return at::native::foreach_tensor_##op_name##_slow(tensors); \ - } \ - return native::xpu::foreach_##op_name##_kernel(tensors); \ - } \ - void XPUNativeFunctions::_foreach_##op_name##_(TensorList tensors) { \ - native::check_foreach_api_restrictions(tensors); \ - if (!native::can_use_fast_route(tensors) || \ - native::has_integral_tensor(tensors, /* includeBool */ true)) { \ - return at::native::foreach_tensor_##op_name##_slow_(tensors); \ - } \ - \ - native::xpu::foreach_##op_name##_kernel_(tensors); \ + +::std::vector foreach_tensor_sqrt_slow(at::TensorList self); +void foreach_tensor_sqrt_slow_(at::TensorList self); + +#define FOREACH_UNARY_OP(op_name) \ + std::vector foreach_tensor_##op_name##_xpu(TensorList tensors) { \ + check_foreach_api_restrictions(tensors); \ + if (!can_use_fast_route(tensors) || \ + has_integral_tensor(tensors, /* includeBool */ true)) { \ + return foreach_tensor_##op_name##_slow(tensors); \ + } \ + return xpu::foreach_##op_name##_kernel(tensors); \ + } \ + void foreach_tensor_##op_name##_xpu_(TensorList tensors) { \ + check_foreach_api_restrictions(tensors); \ + if (!can_use_fast_route(tensors) || \ + has_integral_tensor(tensors, /* includeBool */ true)) { \ + return foreach_tensor_##op_name##_slow_(tensors); \ + } \ + \ + xpu::foreach_##op_name##_kernel_(tensors); \ } FOREACH_UNARY_OP(sqrt); - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/GatedLinearUnit.cpp b/src/ATen/native/xpu/GatedLinearUnit.cpp index ef45a3bd7..5872ecbb2 100644 --- a/src/ATen/native/xpu/GatedLinearUnit.cpp +++ b/src/ATen/native/xpu/GatedLinearUnit.cpp @@ -1,49 +1,15 @@ #include #include +#include +#include #include -#include - #include namespace at { +namespace native { +REGISTER_XPU_DISPATCH(glu_stub, &xpu::glu_kernel); -TensorIterator glu_meta(const Tensor& self, int64_t dim, Tensor& out) { - // this can't pass anyway because a 0-dimensional tensor has "size" 1, which - // can't be evenly halved, but give a nicer error message here. 
- TORCH_CHECK(self.dim() > 0, "glu does not support 0-dimensional tensors"); - auto wrap_dim = maybe_wrap_dim(dim, self.dim()); - const int64_t nIn = self.size(wrap_dim); - TORCH_CHECK( - nIn % 2 == 0, - "Halving dimension must be even, but dimension ", - wrap_dim, - " is size ", - nIn); - - // size output to half of input - const int64_t selfSize = nIn / 2; - Tensor firstHalf = self.narrow(wrap_dim, 0, selfSize); - Tensor secondHalf = self.narrow(wrap_dim, selfSize, selfSize); - return TensorIterator::borrowing_binary_op(out, firstHalf, secondHalf); -} - -Tensor& XPUNativeFunctions::glu_out( - const Tensor& self, - int64_t dim, - Tensor& out) { - auto iter = glu_meta(self, dim, out); - native::xpu::glu_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::glu(const Tensor& self, int64_t dim) { - Tensor out; - auto iter = glu_meta(self, dim, out); - native::xpu::glu_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::glu_backward_out( +Tensor& glu_backward_xpu_out( const Tensor& grad_output, const Tensor& input, int64_t dim, @@ -91,12 +57,13 @@ Tensor& XPUNativeFunctions::glu_backward_out( return grad_input; } -Tensor XPUNativeFunctions::glu_backward( +Tensor glu_backward_xpu( const Tensor& grad_output, const Tensor& input, int64_t dim) { auto grad_input = at::empty({0}, input.options()); - return glu_backward_out(grad_output, input, dim, grad_input); + return glu_backward_xpu_out(grad_output, input, dim, grad_input); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/GridSampler.cpp b/src/ATen/native/xpu/GridSampler.cpp index 17e8baf21..fa9a5d17e 100644 --- a/src/ATen/native/xpu/GridSampler.cpp +++ b/src/ATen/native/xpu/GridSampler.cpp @@ -1,22 +1,24 @@ -#include #include -#include #include +#include +#include +#include namespace at { +namespace native { -Tensor XPUNativeFunctions::grid_sampler_2d( +Tensor grid_sampler_2d_xpu( const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { - return native::xpu::grid_sampler_2d_kernel( + return xpu::grid_sampler_2d_kernel( input, grid, interpolation_mode, padding_mode, align_corners); } -std::tuple XPUNativeFunctions::grid_sampler_2d_backward( +std::tuple grid_sampler_2d_backward_xpu( const Tensor& grad_output, const Tensor& input, const Tensor& grid, @@ -33,7 +35,7 @@ std::tuple XPUNativeFunctions::grid_sampler_2d_backward( } })(); auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - native::xpu::grid_sampler_2d_backward_kernel( + xpu::grid_sampler_2d_backward_kernel( grad_input, grad_grid, grad_output, @@ -45,5 +47,5 @@ std::tuple XPUNativeFunctions::grid_sampler_2d_backward( output_mask); return std::make_tuple(grad_input, grad_grid); } - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/GroupNorm.cpp b/src/ATen/native/xpu/GroupNorm.cpp index 0e0a2e558..77d788059 100644 --- a/src/ATen/native/xpu/GroupNorm.cpp +++ b/src/ATen/native/xpu/GroupNorm.cpp @@ -1,145 +1,15 @@ -#include #include +#include #include +#include #include -#include +#include namespace at { - -template -void check_group_norm_inputs( - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - T C, - int64_t num_groups) { - TORCH_CHECK( - num_groups > 0, - "Expected num groups to be greater than 0, got ", - num_groups); - TORCH_CHECK( - C % num_groups == 0, - "Expected number of channels in input to be divisible by ", - "num_groups, but got input of shape ", - input.sizes(), - " and " - "num_groups=", - num_groups); - 
TORCH_CHECK( - !weight.defined() || - (weight.dim() == 1 && at::symint::numel(weight) == C), - "Expected weight to be a vector of size equal to the number of ", - "channels in input, but got weight of shape ", - weight.sizes(), - " and input of shape ", - input.sizes()); - TORCH_CHECK( - !bias.defined() || (bias.dim() == 1 && at::symint::numel(bias) == C), - "Expected bias to be a vector of size equal to the number of ", - "channels in input, but got bias of shape ", - weight.sizes(), - " and input of shape ", - input.sizes()); -} - -std::tuple XPUNativeFunctions::native_group_norm( - const Tensor& X, - const std::optional& gamma_opt, - const std::optional& beta_opt, - int64_t N, - int64_t C, - int64_t HxW, - int64_t group, - double eps) { - // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned gamma_maybe_owned = - at::borrow_from_optional_tensor(gamma_opt); - const Tensor& gamma = *gamma_maybe_owned; - const Tensor& beta = c10::value_or_else(beta_opt, [] { return Tensor(); }); - - // repeated check so expanded weights can call native_group_norm directly but - // save mean and variance from forward - check_group_norm_inputs(X, gamma, beta, C, group); - - bool mixed_type = at::native::is_mixed_type(X, gamma, beta); - if (mixed_type) { - at::native::check_mixed_data_type(X, gamma, beta); - } - - Tensor Y = at::native::empty_like( - X, - c10::nullopt /* dtype */, - c10::nullopt /* layout */, - c10::nullopt /* device */, - c10::nullopt /* pin_memory */, - MemoryFormat::Contiguous); - const auto dtype = at::native::param_scalar_type(X, mixed_type); - Tensor mean = at::empty({N, group}, X.options().dtype(dtype)); - Tensor rstd = at::empty({N, group}, X.options().dtype(dtype)); - native::xpu::group_norm_kernel( - X, gamma, beta, N, C, HxW, group, eps, Y, mean, rstd, dtype); - return std::make_tuple(Y, mean, rstd); -} - -std::tuple XPUNativeFunctions:: - native_group_norm_backward( - const Tensor& dY, - const Tensor& X, - const Tensor& mean, - const Tensor& rstd, - const c10::optional& gamma_opt, - int64_t N, - int64_t C, - int64_t HxW, - int64_t group, - std::array grad_input_mask) { - // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned gamma_maybe_owned = - at::borrow_from_optional_tensor(gamma_opt); - const Tensor& gamma = *gamma_maybe_owned; - TORCH_CHECK( - X.scalar_type() == dY.scalar_type(), - "Expected scalar types of X and dY are same."); - bool mixed_type = at::native::is_mixed_type(X, mean, rstd); - if (mixed_type) { - at::native::check_mixed_data_type(X, mean, rstd); - } - auto memory_format = X.device().is_cpu() ? 
X.suggest_memory_format() - : at::MemoryFormat::Contiguous; - - Tensor dX; - Tensor dgamma; - Tensor dbeta; - if (grad_input_mask[0]) { - dX = at::native::empty_like( - X, - c10::nullopt /* dtype */, - c10::nullopt /* layout */, - c10::nullopt /* device */, - c10::nullopt /* pin_memory */, - memory_format); - } - if (grad_input_mask[1]) { - dgamma = at::native::empty_like( - gamma, - c10::nullopt /* dtype */, - c10::nullopt /* layout */, - c10::nullopt /* device */, - c10::nullopt /* pin_memory */, - at::MemoryFormat::Contiguous); - } - if (grad_input_mask[2]) { - dbeta = at::native::empty_like( - gamma, - c10::nullopt /* dtype */, - c10::nullopt /* layout */, - c10::nullopt /* device */, - c10::nullopt /* pin_memory */, - at::MemoryFormat::Contiguous); - } - native::xpu::group_norm_backward_kernel( - dY, X, mean, rstd, gamma, N, C, HxW, group, dX, dgamma, dbeta); - return std::make_tuple(dX, dgamma, dbeta); -} - +namespace native { +REGISTER_XPU_DISPATCH(GroupNormKernel, &xpu::group_norm_kernel); +REGISTER_XPU_DISPATCH( + GroupNormBackwardKernel, + &xpu::group_norm_backward_kernel); +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Histogram.cpp b/src/ATen/native/xpu/Histogram.cpp index f1d675286..3b0dcf5c1 100644 --- a/src/ATen/native/xpu/Histogram.cpp +++ b/src/ATen/native/xpu/Histogram.cpp @@ -1,330 +1,17 @@ #include #include +#include #include #include -#include namespace at { -/* Checks properties of input tensors input, bins, and weight. - */ -void histogramdd_check_inputs( - const Tensor& input, - const TensorList& bins, - const std::optional& weight) { - TORCH_CHECK( - input.dim() >= 2, - "torch.histogramdd: input tensor should have at least 2 dimensions, but got ", - input.dim()); - - const int64_t N = input.size(-1); - - TORCH_CHECK( - static_cast(bins.size()) == N, - "torch.histogramdd: expected ", - N, - " sequences of bin edges for a ", - N, - "-dimensional histogram but got ", - bins.size()); - - auto input_dtype = input.dtype(); - for (const auto dim : c10::irange(N)) { - const Tensor& dim_bins = bins[dim]; - - auto bins_dtype = dim_bins.dtype(); - TORCH_CHECK( - input_dtype == bins_dtype, - "torch.histogramdd: input tensor and bins tensors should", - " have the same dtype, but got input with dtype ", - input_dtype, - " and bins for dimension ", - dim, - " with dtype ", - bins_dtype); - - const int64_t dim_bins_dim = dim_bins.dim(); - TORCH_CHECK( - dim_bins_dim == 1, - "torch.histogramdd: bins tensor should have one dimension,", - " but got ", - dim_bins_dim, - " dimensions in the bins tensor for dimension ", - dim); - - const int64_t numel = dim_bins.numel(); - TORCH_CHECK( - numel > 0, - "torch.histogramdd: bins tensor should have at least 1 element,", - " but got ", - numel, - " elements in the bins tensor for dimension ", - dim); - } - - if (weight.has_value()) { - TORCH_CHECK( - input.dtype() == weight.value().dtype(), - "torch.histogramdd: if weight tensor is provided," - " input tensor and weight tensor should have the same dtype, but got input(", - input.dtype(), - ")", - ", and weight(", - weight.value().dtype(), - ")"); - - /* If a weight tensor is provided, we expect its shape to match that of - * the input tensor excluding its innermost dimension N. 
- */ - auto input_sizes = input.sizes().vec(); - input_sizes.pop_back(); - - auto weight_sizes = weight.value().sizes().vec(); - if (weight_sizes.empty()) { - // correctly handle scalars - weight_sizes = {1}; - } - - TORCH_CHECK( - input_sizes == weight_sizes, - "torch.histogramdd: if weight tensor is provided it should have" - " the same shape as the input tensor excluding its innermost dimension, but got input with shape ", - input.sizes(), - " and weight with shape ", - weight.value().sizes()); - } -} - -/* Checks properties of output tensors hist and bin_edges, then resizes them. - */ -void histogramdd_prepare_out( - const Tensor& input, - const std::vector& bin_ct, - const Tensor& hist, - const TensorList& bin_edges) { - const int64_t N = input.size(-1); - - TORCH_INTERNAL_ASSERT((int64_t)bin_ct.size() == N); - TORCH_INTERNAL_ASSERT((int64_t)bin_edges.size() == N); - - TORCH_CHECK( - input.dtype() == hist.dtype(), - "torch.histogram: input tensor and hist tensor should", - " have the same dtype, but got input ", - input.dtype(), - " and hist ", - hist.dtype()); - - for (const auto dim : c10::irange(N)) { - TORCH_CHECK( - input.dtype() == bin_edges[dim].dtype(), - "torch.histogram: input tensor and bin_edges tensor should", - " have the same dtype, but got input ", - input.dtype(), - " and bin_edges ", - bin_edges[dim].dtype(), - " for dimension ", - dim); - - TORCH_CHECK( - bin_ct[dim] > 0, - "torch.histogram(): bins must be > 0, but got ", - bin_ct[dim], - " for dimension ", - dim); - - at::native::resize_output(bin_edges[dim], bin_ct[dim] + 1); - } - - at::native::resize_output(hist, bin_ct); -} - -void histogramdd_prepare_out( - const Tensor& input, - TensorList bins, - const Tensor& hist, - const TensorList& bin_edges) { - std::vector bin_ct(bins.size()); - std::transform(bins.begin(), bins.end(), bin_ct.begin(), [](Tensor t) { - return t.numel() - 1; - }); - histogramdd_prepare_out(input, bin_ct, hist, bin_edges); -} - -void histogram_select_outer_bin_edges_kernel( - const Tensor& input, - const int64_t N, - std::vector& leftmost_edges, - std::vector& rightmost_edges) { - auto [min, max] = at::aminmax(input, 0); - - for (const auto i : c10::irange(N)) { - leftmost_edges[i] = min[i].item().to(); - rightmost_edges[i] = max[i].item().to(); - } -} - -/* Determines the outermost bin edges. For simplicity when calling into aminmax, - * assumes that input has already been reshaped to (M, N). 
- */ -std::pair, std::vector> select_outer_bin_edges( - const Tensor& input, - std::optional> range) { - TORCH_INTERNAL_ASSERT( - input.dim() == 2, "expected input to have shape (M, N)"); - const int64_t N = input.size(-1); - - // Default ranges for empty input matching numpy.histogram's default - std::vector leftmost_edges(N, 0.); - std::vector rightmost_edges(N, 1.); - - if (range.has_value()) { - // range is specified - TORCH_CHECK( - (int64_t)range.value().size() == 2 * N, - "torch.histogramdd: for a ", - N, - "-dimensional histogram", - " range should have ", - 2 * N, - " elements, but got ", - range.value().size()); - - for (const auto dim : c10::irange(N)) { - leftmost_edges[dim] = range.value()[2 * dim]; - rightmost_edges[dim] = range.value()[2 * dim + 1]; - } - } else if (input.numel() > 0) { - // non-empty input - - histogram_select_outer_bin_edges_kernel( - input, N, leftmost_edges, rightmost_edges); - } - - for (const auto dim : c10::irange(N)) { - double leftmost_edge = leftmost_edges[dim]; - double rightmost_edge = rightmost_edges[dim]; - - TORCH_CHECK( - std::isfinite(leftmost_edge) && std::isfinite(rightmost_edge), - "torch.histogramdd: dimension ", - dim, - "'s range [", - leftmost_edge, - ", ", - rightmost_edge, - "] is not finite"); - - TORCH_CHECK( - leftmost_edge <= rightmost_edge, - "torch.histogramdd: min should not exceed max, but got", - " min ", - leftmost_edge, - " max ", - rightmost_edge, - " for dimension ", - dim); - - // Expand empty range to match numpy behavior and avoid division by 0 in - // normalization - if (leftmost_edge == rightmost_edge) { - leftmost_edges[dim] -= 0.5; - rightmost_edges[dim] += 0.5; - } - } - - return std::make_pair(leftmost_edges, rightmost_edges); -} - -static Tensor& histogramdd_out( - const Tensor& self, - TensorList bins, - const std::optional& weight, - bool density, - Tensor& hist, - TensorList& bin_edges) { - histogramdd_check_inputs(self, bins, weight); - histogramdd_prepare_out(self, bins, hist, bin_edges); - - for (const auto dim : c10::irange(bins.size())) { - bin_edges[dim].copy_(bins[dim]); - } - - at::native::xpu::histogramdd_kernel(self, weight, density, hist, bin_edges); - return hist; -} - -std::tuple XPUNativeFunctions::histogram_out( - const Tensor& self, - const Tensor& bins, - const std::optional& weight, - bool density, - Tensor& hist, - Tensor& bin_edges) { - Tensor reshaped_self = self.reshape({self.numel(), 1}); - std::optional reshaped_weight = weight.has_value() - ? weight.value().reshape({weight.value().numel()}) - : weight; - TensorList bins_in = bins; - TensorList bins_out = bin_edges; - - histogramdd_out( - reshaped_self, bins_in, reshaped_weight, density, hist, bins_out); - - return std::forward_as_tuple(hist, bin_edges); -} - -std::tuple XPUNativeFunctions::histogram( - const Tensor& self, - const Tensor& bins, - const std::optional& weight, - bool density) { - Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); - Tensor bin_edges = at::empty({0}, bins.options(), MemoryFormat::Contiguous); - return histogram_out(self, bins, weight, density, hist, bin_edges); -} - -std::tuple XPUNativeFunctions::histogram_out( - const Tensor& self, - int64_t bin_ct, - std::optional> range, - const std::optional& weight, - bool density, - Tensor& hist, - Tensor& bin_edges) { - Tensor reshaped_self = self.reshape({self.numel(), 1}); - std::optional reshaped_weight = weight.has_value() - ? 
weight.value().reshape({weight.value().numel()}) - : weight; - TensorList bins_in = bin_edges; - TensorList bins_out = bin_edges; - - histogramdd_prepare_out( - reshaped_self, std::vector{bin_ct}, hist, bins_out); - auto outer_bin_edges = select_outer_bin_edges(reshaped_self, range); - at::linspace_out( - bin_edges, - outer_bin_edges.first[0], - outer_bin_edges.second[0], - bin_ct + 1); - - histogramdd_check_inputs(reshaped_self, bins_in, reshaped_weight); - - at::native::xpu::histogramdd_linear_kernel( - reshaped_self, reshaped_weight, density, hist, bin_edges, true); - return std::forward_as_tuple(hist, bin_edges); -} - -std::tuple XPUNativeFunctions::histogram( - const Tensor& self, - int64_t bin_ct, - std::optional> range, - const std::optional& weight, - bool density) { - Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); - Tensor bin_edges_out = at::empty({0}, self.options()); - return histogram_out( - self, bin_ct, range, weight, density, hist, bin_edges_out); -} +namespace native { +REGISTER_XPU_DISPATCH(histogramdd_stub, &xpu::histogramdd_kernel); +REGISTER_XPU_DISPATCH(histogramdd_linear_stub, &xpu::histogramdd_linear_kernel); +REGISTER_XPU_DISPATCH( + histogram_select_outer_bin_edges_stub, + &xpu::histogram_select_outer_bin_edges_kernel); +} // namespace native } // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/Im2Col.cpp b/src/ATen/native/xpu/Im2Col.cpp index 1bfc498da..eb9f4077a 100644 --- a/src/ATen/native/xpu/Im2Col.cpp +++ b/src/ATen/native/xpu/Im2Col.cpp @@ -1,15 +1,16 @@ -#include + #include #include #include -#include #include -#include +#include -namespace at { +#include +#include -Tensor& XPUNativeFunctions::im2col_out( +namespace at::native { +Tensor& im2col_out_xpu( const Tensor& self, IntArrayRef kernel_size, IntArrayRef dilation, @@ -26,7 +27,7 @@ Tensor& XPUNativeFunctions::im2col_out( return out; } -Tensor XPUNativeFunctions::im2col( +Tensor im2col_xpu( const Tensor& self, IntArrayRef kernel_size, IntArrayRef dilation, @@ -40,5 +41,4 @@ Tensor XPUNativeFunctions::im2col( output, self, kernel_size, dilation, padding, stride); return output; } - -} // namespace at +} // namespace at::native diff --git a/src/ATen/native/xpu/Indexing.cpp b/src/ATen/native/xpu/Indexing.cpp index d4d5598e6..6ba148607 100644 --- a/src/ATen/native/xpu/Indexing.cpp +++ b/src/ATen/native/xpu/Indexing.cpp @@ -1,16 +1,21 @@ -#include + #include #include #include #include #include + #include -#include #include +#include + +#include +#include namespace at { +namespace native { -Tensor& XPUNativeFunctions::index_select_out( +Tensor& index_select_out_xpu( const Tensor& self, int64_t dim, const Tensor& index, @@ -32,20 +37,17 @@ Tensor& XPUNativeFunctions::index_select_out( dim = at::maybe_wrap_dim(dim, self); TORCH_CHECK(self.dim() <= XPU_MAX_TENSORINFO_DIMS, DIM_WARNING); TORCH_CHECK(index.dim() <= XPU_MAX_TENSORINFO_DIMS, DIM_WARNING); - native::xpu::index_select_kernel(self, dim, index, out); + xpu::index_select_kernel(self, dim, index, out); return out; } -Tensor XPUNativeFunctions::index_select( - const Tensor& self, - int64_t dim, - const Tensor& index) { - auto out = at::empty({0}, self.options()); - return index_select_out(self, dim, index, out); +Tensor index_select_xpu_(const Tensor& self, int64_t dim, const Tensor& index) { + Tensor result = at::empty({0}, self.options()); + return at::native::index_select_out_xpu(self, dim, index, result); } -Tensor& XPUNativeFunctions::masked_scatter_( +Tensor& masked_scatter__xpu( Tensor& self, const 
Tensor& mask, const Tensor& source) { @@ -99,29 +101,27 @@ static Tensor& masked_select_out_impl( // owning and expand_outplace returns a borrow, the returned borrow // would dangle. auto mask_self_expanded = expand_outplace(*mask_temp, *self_temp); - XPUNativeFunctions::index_out( + at::index_out( + result, *std::get<1>(mask_self_expanded), c10::List>( - {*std::move(std::get<0>(mask_self_expanded))}), - result); + {*std::move(std::get<0>(mask_self_expanded))})); return result; } -Tensor XPUNativeFunctions::masked_select( - const Tensor& self, - const Tensor& mask) { +Tensor masked_select_xpu(const Tensor& self, const Tensor& mask) { namedinference::compute_broadcast_outnames(self, mask); Tensor result = at::empty({0}, self.options()); return masked_select_out_impl(result, self, mask); } -Tensor& XPUNativeFunctions::masked_select_out( +Tensor& masked_select_out_xpu( const Tensor& self, const Tensor& mask, Tensor& result) { namedinference::compute_broadcast_outnames(self, mask); return masked_select_out_impl(result, self, mask); } - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/LayerNorm.cpp b/src/ATen/native/xpu/LayerNorm.cpp index 8a467a9f4..0addcd718 100644 --- a/src/ATen/native/xpu/LayerNorm.cpp +++ b/src/ATen/native/xpu/LayerNorm.cpp @@ -1,29 +1,29 @@ -#include #include #include #include #include #include +#include -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else -#include -#endif +// #ifndef AT_PER_OPERATOR_HEADERS +// #include +// #include +// #else +// #include +// #endif #include -#include +#include +#include namespace at { - -::std::tuple XPUNativeFunctions:: - native_layer_norm( - const at::Tensor& input, - at::IntArrayRef normalized_shape, - const ::std::optional& weight_opt, - const ::std::optional& bias_opt, - double epsilon) { +namespace native { +::std::tuple layer_norm_xpu( + const at::Tensor& input, + at::IntArrayRef normalized_shape, + const ::std::optional& weight_opt, + const ::std::optional& bias_opt, + double epsilon) { std::optional common_device = std::nullopt; c10::impl::check_and_update_common_device( common_device, input, "xpu::native_layer_norm", "input"); @@ -79,16 +79,15 @@ ::std::tuple XPUNativeFunctions:: return std::make_tuple(std::move(Y), std::move(mean), std::move(rstd)); } -::std::tuple XPUNativeFunctions:: - native_layer_norm_backward( - const at::Tensor& grad_output, - const at::Tensor& input, - at::IntArrayRef normalized_shape, - const at::Tensor& mean, - const at::Tensor& rstd, - const ::std::optional& weight_opt, - const ::std::optional& bias_opt, - ::std::array grad_input_mask) { +::std::tuple layer_norm_backward_xpu( + const at::Tensor& grad_output, + const at::Tensor& input, + at::IntArrayRef normalized_shape, + const at::Tensor& mean, + const at::Tensor& rstd, + const ::std::optional& weight_opt, + const ::std::optional& bias_opt, + ::std::array grad_input_mask) { std::optional common_device = std::nullopt; c10::impl::check_and_update_common_device( common_device, grad_output, "xpu::native_layer_norm_backward", "goutput"); @@ -177,5 +176,6 @@ ::std::tuple XPUNativeFunctions:: grad_bias, grad_input_mask); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Lerp.cpp b/src/ATen/native/xpu/Lerp.cpp index 272417b39..d64fc1acb 100644 --- a/src/ATen/native/xpu/Lerp.cpp +++ b/src/ATen/native/xpu/Lerp.cpp @@ -1,110 +1,16 @@ #include #include +#include +#include #include -#include - #include namespace at { +namespace native { -TensorIterator lerp_tensor_meta( - const Tensor& self, - const 
Tensor& end, - const Tensor& weight, - Tensor& out) { - TORCH_CHECK( - self.dtype() == end.dtype(), - "expected dtype ", - self.dtype(), - " for `end` but got dtype ", - end.dtype()); - TORCH_CHECK( - self.dtype() == weight.dtype(), - "expected dtype ", - self.dtype(), - " for `weight` but got dtype ", - weight.dtype()); - TensorIterator iter; - iter.build(TensorIteratorConfig() - .add_output(out) - .add_const_input(self) - .add_const_input(end) - .add_const_input(weight)); - return iter; -} - -Tensor XPUNativeFunctions::lerp( - const Tensor& self, - const Tensor& end, - const Tensor& weight) { - Tensor out; - auto iter = lerp_tensor_meta(self, end, weight, out); - native::xpu::lerp_tensor_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::lerp_( - Tensor& self, - const Tensor& end, - const Tensor& weight) { - auto iter = lerp_tensor_meta(self, end, weight, self); - native::xpu::lerp_tensor_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::lerp_out( - const Tensor& self, - const Tensor& end, - const Tensor& weight, - Tensor& out) { - auto iter = lerp_tensor_meta(self, end, weight, out); - native::xpu::lerp_tensor_kernel(iter); - return out; -} - -TensorIterator lerp_scalar_meta( - const Tensor& self, - const Tensor& end, - const Scalar& /*weight*/, - Tensor& out) { - TORCH_CHECK( - self.dtype() == end.dtype(), - "expected dtype ", - self.dtype(), - " for `end` but got dtype ", - end.dtype()); - TensorIterator iter; - iter.build_binary_op(out, self, end); - return iter; -} - -Tensor XPUNativeFunctions::lerp( - const Tensor& self, - const Tensor& end, - const Scalar& weight) { - Tensor out; - auto iter = lerp_scalar_meta(self, end, weight, out); - native::xpu::lerp_scalar_kernel(iter, weight); - return iter.output(); -} - -Tensor& XPUNativeFunctions::lerp_( - Tensor& self, - const Tensor& end, - const Scalar& weight) { - auto iter = lerp_scalar_meta(self, end, weight, self); - native::xpu::lerp_scalar_kernel(iter, weight); - return self; -} +REGISTER_XPU_DISPATCH(lerp_kernel_tensor_weight, &xpu::lerp_tensor_kernel); +REGISTER_XPU_DISPATCH(lerp_kernel_scalar_weight, &xpu::lerp_scalar_kernel); -Tensor& XPUNativeFunctions::lerp_out( - const Tensor& self, - const Tensor& end, - const Scalar& weight, - Tensor& out) { - auto iter = lerp_scalar_meta(self, end, weight, out); - native::xpu::lerp_scalar_kernel(iter, weight); - return out; -} +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/LinearAlgebra.cpp b/src/ATen/native/xpu/LinearAlgebra.cpp index 2f857f18b..719f23f6f 100644 --- a/src/ATen/native/xpu/LinearAlgebra.cpp +++ b/src/ATen/native/xpu/LinearAlgebra.cpp @@ -1,273 +1,22 @@ -#include +#include +#include +#include +#include +#include +#include #include +#include #include +#include +#include + #include #include -#include #include namespace at { -namespace detail { - -static void check_linalg_norm_dtype( - optional opt_dtype, - ScalarType self_dtype, - const char* const name) { - if (opt_dtype.has_value()) { - auto dtype = opt_dtype.value(); - TORCH_CHECK( - isFloatingType(dtype) || isComplexType(dtype), - name, - ": dtype should" - " be floating point or complex, but got ", - dtype); - TORCH_CHECK( - isComplexType(self_dtype) == isComplexType(dtype), - name, - ": dtype should be ", - isComplexType(self_dtype) ? "complex" : "real", - " for ", - isComplexType(self_dtype) ? 
"complex" : "real", - " inputs, but got ", - dtype); - TORCH_CHECK( - promoteTypes(self_dtype, dtype) == dtype, - name, - ": the dtype of the input ", - "(", - self_dtype, - ") should be convertible ", - "without narrowing to the specified dtype (", - dtype, - ")"); - } -} - -} // namespace detail - -Tensor& linalg_vector_norm_meta( - const Tensor& self, - const Scalar& scalar_ord, - OptionalIntArrayRef opt_dim, - bool keepdim, - optional opt_dtype, - Tensor& output) { - at::native::checkFloatingOrComplex(self, "linalg.vector_norm"); - - auto dim = opt_dim.value_or(IntArrayRef{}); - // Casting a large integer to a double will just introduce an error for - // values larger than 10^53 (same for negative numbers), so that's fine. - auto ord = scalar_ord.toDouble(); - - // For more context, see issue 52783 - // If the tensor is empty and norm < 0 || norm == infty - // - We cannot reduce the whole tensor - // - We cannot reduce over an empty dimension - if (self.numel() == 0 && (ord < 0. || ord == INFINITY)) { - // dim=None or dim=() reduces the whole tensor - TORCH_CHECK( - opt_dim.has_value() && !opt_dim->empty(), - "linalg.vector_norm cannot compute the ", - scalar_ord, - " norm on an empty ", - "tensor because the operation does not have an identity"); - for (auto dim_num : dim) { - TORCH_CHECK( - self.size(dim_num) != 0, - "linalg.vector_norm cannot compute the ", - scalar_ord, - " norm on the dimension ", - dim_num, - "because this dimension is empty and the operation does not have an identity"); - } - } - - at::detail::check_linalg_norm_dtype( - opt_dtype, self.scalar_type(), "linalg.vector_norm"); - - auto mask = at::native::make_dim_mask(dim, self.dim()); - auto shape = at::native::shape_from_dim_mask(self, std::move(mask), keepdim); - auto options = self.options().dtype( - toRealValueType(opt_dtype.value_or(self.scalar_type()))); - if (output.defined()) { - at::xpu::resize_out(output, shape, {}, options); - } else { - output = at::xpu::create_out(shape, {}, options); - } - return output; -} - -static void check_1d(const Tensor& t, const char* arg, const char* fn) { - TORCH_CHECK( - t.dim() == 1, - fn, - ": Expected 1-D argument ", - arg, - ", but got ", - t.dim(), - "-D"); -} - -static void check_addr_scalar( - const ScalarType dtype, - const Scalar& scalar, - const std::string& scalar_name) { - TORCH_CHECK( - !scalar.isBoolean() || dtype == ScalarType::Bool, - "Boolean ", - scalar_name, - " only supported for Boolean results."); - TORCH_CHECK( - isFloatingType(dtype) || isComplexType(dtype) || scalar.isIntegral(true), - "For integral input tensors, " - "argument ", - scalar_name, - " must not be a floating point number."); -} - -static TensorIterator build_addr_iter( - Tensor& result, - const Tensor& self, - const Tensor& vec1, - const Tensor& vec2) { - check_1d(vec1, "vec1", "addr"); - check_1d(vec2, "vec2", "addr"); - - const auto vec1_size0 = vec1.sizes()[0]; - const auto vec2_size0 = vec2.sizes()[0]; - auto self_ = &result == &self - ? 
c10::MaybeOwned::borrowed(self) - : expand_size(self, {vec1_size0, vec2_size0}, "addr"); - TORCH_CHECK( - self_->dim() == 2, - "2D tensor expected, got ", - self_->dim(), - "D tensor for input"); - TORCH_CHECK( - self_->sizes()[0] == vec1_size0 && self_->sizes()[1] == vec2_size0, - "size mismatch, input: ", - self_->sizes(), - ", v1: ", - vec1.sizes(), - ", v2: ", - vec2.sizes()); - - auto iter = TensorIteratorConfig() - .set_check_mem_overlap(true) - .add_output(result) - .add_owned_const_input(*self_) - .add_owned_const_input(vec1.reshape({vec1_size0, 1})) - .add_const_input(vec2) - .allow_cpu_scalars(true) - .promote_inputs_to_common_dtype(true) - .cast_common_dtype_to_outputs(true) - .enforce_safe_casting_to_output(true) - .build(); - return iter; -} - -Tensor XPUNativeFunctions::addr( - const Tensor& self, - const Tensor& vec1, - const Tensor& vec2, - const Scalar& beta, - const Scalar& alpha) { - Tensor result; - auto iter = build_addr_iter(result, self, vec1, vec2); - - check_addr_scalar(iter.dtype(), beta, "beta"); - check_addr_scalar(iter.dtype(), alpha, "alpha"); - - native::xpu::addr_kernel(iter, beta, alpha); - return iter.output(); -} - -Tensor& XPUNativeFunctions::addr_out( - const Tensor& self, - const Tensor& vec1, - const Tensor& vec2, - const Scalar& beta, - const Scalar& alpha, - Tensor& out) { - auto iter = build_addr_iter(out, self, vec1, vec2); - check_addr_scalar(iter.dtype(), beta, "beta"); - check_addr_scalar(iter.dtype(), alpha, "alpha"); - - native::xpu::addr_kernel(iter, beta, alpha); - return out; -} - -Tensor XPUNativeFunctions::linalg_vector_norm( - const Tensor& self, - const Scalar& scalar_ord, - OptionalIntArrayRef opt_dim, - bool keepdim, - optional opt_dtype) { - Tensor result; - linalg_vector_norm_out(self, scalar_ord, opt_dim, keepdim, opt_dtype, result); - return result; -} - -Tensor& XPUNativeFunctions::linalg_vector_norm_out( - const Tensor& self, - const Scalar& scalar_ord, - OptionalIntArrayRef opt_dim, - bool keepdim, - optional opt_dtype, - Tensor& result) { - result = linalg_vector_norm_meta( - self, scalar_ord, opt_dim, keepdim, opt_dtype, result); - auto ord = scalar_ord.toDouble(); - auto dim = opt_dim.value_or(IntArrayRef{}); - auto size = self.sizes(); - auto ndim = self.dim(); - - auto opt_dim_ = dim.vec(); - maybe_wrap_dims(opt_dim_, ndim); - - using Int = IntArrayRef::value_type; - std::vector all_dim(ndim); - std::iota(all_dim.begin(), all_dim.end(), 0); - - bool is_all_reduce = !opt_dim.has_value() || opt_dim.value().empty(); - auto reduce_dim = is_all_reduce ? all_dim : opt_dim_; - - bool is_reduce_over_1D_vector = true; - for (auto i : reduce_dim) { - if (size[i] != 1) { - is_reduce_over_1D_vector = false; - break; - } - } - - if (is_reduce_over_1D_vector) { - Tensor self_; - if (opt_dtype.has_value()) { - self_ = self.to(*opt_dtype); - } else { - self_ = self; - } - if (ord != 0.0) { - keepdim ? at::abs_outf(self_, const_cast(result)) - : at::abs_outf( - self_.squeeze(reduce_dim), const_cast(result)); - } else { - keepdim ? 
at::ne_outf(self_, 0, const_cast(result)) - : at::ne_outf( - self_.squeeze(reduce_dim), 0, const_cast(result)); - } - return result; - } - - auto iter = at::native::make_reduction( - "vector_norm", - const_cast(result), - self, - dim, - keepdim, - result.scalar_type()); - native::xpu::norm_kernel(iter, ord); - return result; -} - +namespace native { +REGISTER_XPU_DISPATCH(addr_stub, &xpu::addr_kernel); +REGISTER_XPU_DISPATCH(norm_stub, &xpu::norm_kernel); +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Loss.cpp b/src/ATen/native/xpu/Loss.cpp index f2ca7d9c0..c3ca675dc 100644 --- a/src/ATen/native/xpu/Loss.cpp +++ b/src/ATen/native/xpu/Loss.cpp @@ -1,143 +1,25 @@ -#include #include #include +#include +#include +#include +#include + #include #include #include -#include #include namespace at { - -static inline at::Tensor apply_loss_reduction( - const at::Tensor& unreduced, - int64_t reduction) { - if (reduction == at::Reduction::Mean) { - return unreduced.mean(); - } else if (reduction == at::Reduction::Sum) { - return unreduced.sum(); - } - return unreduced; -} - -Tensor& XPUNativeFunctions::mse_loss_out( - const Tensor& input, - const Tensor& target, - int64_t reduction, - Tensor& result) { - if (reduction != Reduction::None) { - TORCH_INTERNAL_ASSERT( - reduction == Reduction::Mean || reduction == Reduction::Sum); - result.resize_({}); - Tensor loss; - auto iter = TensorIterator::borrowing_binary_op(loss, input, target); - native::xpu::mse_kernel(iter); - if (reduction == Reduction::Mean) { - at::mean_out(const_cast(result), iter.output(), IntArrayRef{}); - } else { - at::sum_out(const_cast(result), iter.output(), IntArrayRef{}); - } - } else { - auto iter = TensorIterator::borrowing_binary_op(result, input, target); - native::xpu::mse_kernel(iter); - } - return result; -} - -Tensor XPUNativeFunctions::mse_loss( - const Tensor& input, - const Tensor& target, - int64_t reduction) { - Tensor result = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - result = XPUNativeFunctions::mse_loss_out(input, target, reduction, result); - return result; -} - -Tensor XPUNativeFunctions::mse_loss_backward( - const Tensor& grad_output, - const Tensor& input, - const Tensor& target, - int64_t reduction) { - Tensor grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - return at::mse_loss_backward_out( - grad_input, grad_output, input, target, reduction); -} - -Tensor& XPUNativeFunctions::mse_loss_backward_out( - const Tensor& grad_output, - const Tensor& input, - const Tensor& target, - int64_t reduction, - Tensor& grad_input) { - auto norm = reduction == Reduction::Mean ? 2. 
/ input.numel() : 2.; - auto iter = at::TensorIteratorConfig() - .add_output(grad_input) - .add_const_input(input) - .add_const_input(target) - .add_const_input(grad_output) - .build(); - native::xpu::mse_backward_kernel(iter, norm); - return grad_input; -} - -Tensor& XPUNativeFunctions::smooth_l1_loss_out( - const Tensor& input, - const Tensor& target, - int64_t reduction, - double beta, - Tensor& result) { - if (reduction != Reduction::None) { - TORCH_INTERNAL_ASSERT( - reduction == Reduction::Mean || reduction == Reduction::Sum); - result.resize_({}); - Tensor loss; - auto iter = TensorIterator::borrowing_binary_op(loss, input, target); - native::xpu::smooth_l1_kernel(iter, beta); - if (reduction == Reduction::Mean) { - at::mean_out(const_cast(result), iter.output(), IntArrayRef{}); - } else { - at::sum_out(const_cast(result), iter.output(), IntArrayRef{}); - } - } else { - auto iter = TensorIterator::borrowing_binary_op(result, input, target); - native::xpu::smooth_l1_kernel(iter, beta); - } - return result; -} - -Tensor XPUNativeFunctions::smooth_l1_loss( - const Tensor& input, - const Tensor& target, - int64_t reduction, - double beta) { - Tensor result = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - result = XPUNativeFunctions::smooth_l1_loss_out( - input, target, reduction, beta, result); - return result; -} - -Tensor& XPUNativeFunctions::smooth_l1_loss_backward_out( - const Tensor& grad_output, - const Tensor& input, - const Tensor& target, - int64_t reduction, - double beta, - Tensor& grad_input) { - auto norm = reduction == Reduction::Mean ? 1. / input.numel() : 1.; - auto iter = at::TensorIteratorConfig() - .add_output(grad_input) - .add_const_input(input) - .add_const_input(target) - .add_const_input(grad_output) - .promote_inputs_to_common_dtype(true) - .cast_common_dtype_to_outputs(true) - .enforce_safe_casting_to_output(true) - .build(); - native::xpu::smooth_l1_backward_kernel(iter, norm, beta); - return grad_input; -} - -Tensor XPUNativeFunctions::binary_cross_entropy( +namespace native { +REGISTER_XPU_DISPATCH(mse_stub, &xpu::mse_kernel); +REGISTER_XPU_DISPATCH(mse_backward_stub, &xpu::mse_backward_kernel); +REGISTER_XPU_DISPATCH(huber_stub, &xpu::huber_kernel); +REGISTER_XPU_DISPATCH(huber_backward_stub, &xpu::huber_backward_kernel); +REGISTER_XPU_DISPATCH(smooth_l1_stub, &xpu::smooth_l1_kernel); +REGISTER_XPU_DISPATCH(smooth_l1_backward_stub, &xpu::smooth_l1_backward_kernel); + +Tensor binary_cross_entropy_xpu( const Tensor& self, const Tensor& target, const std::optional& weight_opt, @@ -150,7 +32,7 @@ Tensor XPUNativeFunctions::binary_cross_entropy( self, target, weight, reduction, loss); } -Tensor& XPUNativeFunctions::binary_cross_entropy_out( +Tensor& binary_cross_entropy_out_xpu( const Tensor& self, const Tensor& target, const std::optional& weight_opt, @@ -163,7 +45,7 @@ Tensor& XPUNativeFunctions::binary_cross_entropy_out( self, target, weight, reduction, loss); } -Tensor XPUNativeFunctions::binary_cross_entropy_backward( +Tensor binary_cross_entropy_backward_xpu( const Tensor& grad_output, const Tensor& self, const Tensor& target, @@ -177,7 +59,7 @@ Tensor XPUNativeFunctions::binary_cross_entropy_backward( grad_output, self, target, weight, reduction, grad_input); } -Tensor& XPUNativeFunctions::binary_cross_entropy_backward_out( +Tensor& binary_cross_entropy_backward_out_xpu( const Tensor& grad_output, const Tensor& self, const Tensor& target, @@ -191,53 +73,5 @@ Tensor& XPUNativeFunctions::binary_cross_entropy_backward_out( grad_output, self, 
target, weight, reduction, grad_input); } -Tensor XPUNativeFunctions::huber_loss( - const Tensor& input, - const Tensor& target, - int64_t reduction, - double delta) { - TORCH_CHECK( - delta > 0, "huber_loss does not support non-positive values for delta.") - Tensor loss = at::empty_like(input); - auto iter = TensorIterator::borrowing_binary_op(loss, input, target); - native::xpu::huber_kernel(iter, delta); - return apply_loss_reduction(loss, reduction); -} - -Tensor& XPUNativeFunctions::huber_loss_out( - const Tensor& input, - const Tensor& target, - int64_t reduction, - double delta, - Tensor& result) { - TORCH_CHECK( - delta > 0, "huber_loss does not support non-positive values for delta.") - auto iter = TensorIterator::borrowing_binary_op(result, input, target); - native::xpu::huber_kernel(iter, delta); - if (reduction != Reduction::None) { - auto reduced = apply_loss_reduction(result, reduction); - result.resize_({}); - result.copy_(reduced); - } - return result; -} - -Tensor& XPUNativeFunctions::huber_loss_backward_out( - const Tensor& grad_output, - const Tensor& input, - const Tensor& target, - int64_t reduction, - double delta, - Tensor& grad_input) { - auto norm = (reduction == Reduction::Mean) ? (1. / input.numel()) : 1.; - auto iter = at::TensorIteratorConfig() - .add_output(grad_input) - .add_const_input(input) - .add_const_input(target) - .add_const_input(grad_output) - .build(); - native::xpu::huber_backward_kernel(iter, norm, delta); - return grad_input; -} - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/LossNLL.cpp b/src/ATen/native/xpu/LossNLL.cpp index d30eb7258..d80fef746 100644 --- a/src/ATen/native/xpu/LossNLL.cpp +++ b/src/ATen/native/xpu/LossNLL.cpp @@ -1,105 +1,28 @@ -#include #include #include #include #include -#include #include +#include -namespace at { -void nll_loss_forward_meta( - const Tensor& self, - const Tensor& target, - const OptionalTensorRef weight_opt, - int64_t reduction, - int64_t ignore_index, - Tensor& output, - Tensor& total_weight) { - const Tensor& weight = weight_opt.getTensorRef(); - - TORCH_CHECK( - self.dim() > 0 && self.dim() <= 2, "input tensor should be 1D or 2D"); - TORCH_CHECK( - target.dim() <= 1, - "0D or 1D target tensor expected, multi-target not supported"); - - auto no_batch_dim = self.dim() == 1 && target.dim() == 0; - TORCH_CHECK( - no_batch_dim || (self.size(0) == target.size(0)), - "size mismatch (got input: ", - self.sizes(), - ", target: ", - target.sizes(), - ")") - - const auto n_classes = self.size(-1); - - TORCH_CHECK( - !weight.defined() || (weight.dim() == 1 && weight.numel() == n_classes), - "weight tensor should be defined either for all ", - n_classes, - " classes or no classes" - " but got weight tensor of shape: ", - weight.sizes()); - - const auto n_dims = self.dim(); - const auto batch_size = self.size(0); +#include +#include - if (reduction == Reduction::None && n_dims == 2) { - if (output.defined()) { - at::xpu::resize_out(output, {batch_size}, {}, self.options()); - } else { - output = at::xpu::create_out({batch_size}, {}, self.options()); - } - } else { - // produce scalar output when reducing or input is 1d - if (output.defined()) { - at::xpu::resize_out(output, {}, {}, self.options()); - } else { - output = at::xpu::create_out({}, {}, self.options()); - } - } - if (total_weight.defined()) { - at::xpu::resize_out(total_weight, {}, {}, self.options()); - } else { - total_weight = at::xpu::create_out({}, {}, self.options()); - } -} - -std::tuple 
XPUNativeFunctions::nll_loss_forward_out( - const Tensor& self, - const Tensor& target, - const c10::optional& weight, - int64_t reduction, - int64_t ignore_index, - Tensor& output, - Tensor& total_weight) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, output, "xpu::nll_loss_forward_out", "output"); - c10::impl::check_and_update_common_device( - common_device, total_weight, "xpu::nll_loss_forward_out", "total_weight"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::nll_loss_forward_out", "self"); - c10::impl::check_and_update_common_device( - common_device, target, "xpu::nll_loss_forward_out", "target"); - c10::impl::check_and_update_common_device( - common_device, weight, "xpu::nll_loss_forward_out", "weight"); - nll_loss_forward_meta( - self, - target, - ((weight.has_value() && (*weight).defined()) - ? at::OptionalTensorRef(*weight) - : at::OptionalTensorRef()), - reduction, - ignore_index, - output, - total_weight); - return native::xpu::nll_loss_forward_kernel( +namespace at { +namespace native { +TORCH_IMPL_FUNC(nll_loss_forward_out_xpu) +(const Tensor& self, + const Tensor& target, + const OptionalTensorRef weight_opt, + int64_t reduction, + int64_t ignore_index, + const Tensor& output, + const Tensor& total_weight) { + xpu::nll_loss_forward_kernel( self, target, - ((weight.has_value() && (*weight).defined()) - ? at::OptionalTensorRef(*weight) + ((weight_opt.has_value() && (*weight_opt).defined()) + ? at::OptionalTensorRef(*weight_opt) : at::OptionalTensorRef()), reduction, ignore_index, @@ -107,122 +30,22 @@ std::tuple XPUNativeFunctions::nll_loss_forward_out( total_weight); } -std::tuple XPUNativeFunctions::nll_loss_forward( - const Tensor& self, - const Tensor& target, - const c10::optional& weight, - int64_t reduction, - int64_t ignore_index) { - Tensor output; - Tensor total_weight; - return nll_loss_forward_out( - self, target, weight, reduction, ignore_index, output, total_weight); -} - -void nll_loss_backward_meta( - const Tensor& grad_output, - const Tensor& self, - const Tensor& target, - OptionalTensorRef weight_opt, - int64_t reduction, - int64_t ignore_index, - const Tensor& total_weight, - Tensor& grad_input) { - TORCH_CHECK( - self.dim() > 0 && self.dim() <= 2, "input tensor should be 1D or 2D"); - TORCH_CHECK( - target.dim() <= 1, - "0D or 1D target tensor expected, multi-target not supported"); - - auto no_batch_dim = self.dim() == 1 && target.dim() == 0; - TORCH_CHECK( - no_batch_dim || (self.size(0) == target.size(0)), - "size mismatch (got input: ", - self.sizes(), - ", target: ", - target.sizes(), - ")") - TORCH_CHECK( - total_weight.numel() == 1, - "expected total_weight to be a single element tensor, got: ", - total_weight.sizes(), - " (", - total_weight.numel(), - " elements)"); - - const auto& weight = weight_opt.getTensorRef(); - - TORCH_CHECK( - !weight.defined() || weight.numel() == self.size(-1), - "weight tensor should be defined either for all or no classes"); - - const auto n_dims = self.dim(); - - if (reduction == Reduction::None && n_dims == 2) { - const auto batch_size = self.size(0); - check_dim_size(grad_output, 1, 0, batch_size); - } else { - TORCH_CHECK( - grad_output.dim() <= 1 && grad_output.numel() == 1, - "Expected a single element grad_output tensor, but got: ", - grad_output.sizes()); - } - if (grad_input.defined()) { - at::xpu::resize_out( - grad_input, - self.sizes(), - {}, - self.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT)); - } else { 
- grad_input = at::xpu::create_out( - self.sizes(), - {}, - self.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT)); - } -} - -Tensor& XPUNativeFunctions::nll_loss_backward_out( - const Tensor& grad_output, - const Tensor& self, - const Tensor& target, - const c10::optional& weight, - int64_t reduction, - int64_t ignore_index, - const Tensor& total_weight, - Tensor& grad_input) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, grad_input, "xpu::nll_loss_backward_out", "grad_input"); - c10::impl::check_and_update_common_device( - common_device, grad_output, "xpu::nll_loss_backward_out", "grad_output"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::nll_loss_backward_out", "self"); - c10::impl::check_and_update_common_device( - common_device, target, "xpu::nll_loss_backward_out", "target"); - c10::impl::check_and_update_common_device( - common_device, weight, "xpu::nll_loss_backward_out", "weight"); - c10::impl::check_and_update_common_device( - common_device, - total_weight, - "xpu::nll_loss_backward_out", - "total_weight"); - nll_loss_backward_meta( +TORCH_IMPL_FUNC(nll_loss_backward_out_xpu) +(const Tensor& grad_output, + const Tensor& self, + const Tensor& target, + OptionalTensorRef weight_opt, + int64_t reduction, + int64_t ignore_index, + const Tensor& total_weight, + const Tensor& grad_input) { + grad_input.zero_(); + xpu::nll_loss_backward_kernel( grad_output, self, target, - ((weight.has_value() && (*weight).defined()) - ? at::OptionalTensorRef(*weight) - : at::OptionalTensorRef()), - reduction, - ignore_index, - total_weight, - grad_input); - return native::xpu::nll_loss_backward_kernel( - grad_output, - self, - target, - ((weight.has_value() && (*weight).defined()) - ? at::OptionalTensorRef(*weight) + ((weight_opt.has_value() && (*weight_opt).defined()) + ? 
at::OptionalTensorRef(*weight_opt) : at::OptionalTensorRef()), reduction, ignore_index, @@ -230,23 +53,5 @@ Tensor& XPUNativeFunctions::nll_loss_backward_out( grad_input); } -Tensor XPUNativeFunctions::nll_loss_backward( - const Tensor& grad_output, - const Tensor& self, - const Tensor& target, - const c10::optional& weight, - int64_t reduction, - int64_t ignore_index, - const Tensor& total_weight) { - Tensor grad_input; - return nll_loss_backward_out( - grad_output, - self, - target, - weight, - reduction, - ignore_index, - total_weight, - grad_input); -} +} // namespace native } // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/LossNLL2d.cpp b/src/ATen/native/xpu/LossNLL2d.cpp index 08f9d464e..7aaca4911 100644 --- a/src/ATen/native/xpu/LossNLL2d.cpp +++ b/src/ATen/native/xpu/LossNLL2d.cpp @@ -1,9 +1,10 @@ #include #include -#include namespace at { +namespace native { +namespace { void check_inputs_nll_loss2d( const Tensor& input, const Tensor& target, @@ -30,8 +31,9 @@ void check_inputs_nll_loss2d( ", input ", input.sizes()); } +} // namespace -std::tuple XPUNativeFunctions::nll_loss2d_forward( +std::tuple nll_loss2d_forward_xpu( const Tensor& self, const Tensor& target, const ::std::optional& weight_opt, @@ -51,7 +53,7 @@ std::tuple XPUNativeFunctions::nll_loss2d_forward( return std::make_tuple(output, total_weight); } -std::tuple XPUNativeFunctions::nll_loss2d_forward_out( +std::tuple nll_loss2d_forward_out_xpu( const Tensor& self, const Tensor& target, const std::optional& weight_opt, @@ -70,7 +72,7 @@ std::tuple XPUNativeFunctions::nll_loss2d_forward_out( return std::tuple(output, total_weight); } -Tensor XPUNativeFunctions::nll_loss2d_backward( +Tensor nll_loss2d_backward_xpu( const Tensor& grad_output, const Tensor& self, const Tensor& target, @@ -97,7 +99,7 @@ Tensor XPUNativeFunctions::nll_loss2d_backward( return grad_input; } -Tensor& XPUNativeFunctions::nll_loss2d_backward_out( +Tensor& nll_loss2d_backward_out_xpu( const Tensor& grad_output, const Tensor& self, const Tensor& target, @@ -123,4 +125,5 @@ Tensor& XPUNativeFunctions::nll_loss2d_backward_out( return grad_input; } +} // namespace native } // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/NMS.cpp b/src/ATen/native/xpu/NMS.cpp index ea1ac4e9c..dc4fa666b 100644 --- a/src/ATen/native/xpu/NMS.cpp +++ b/src/ATen/native/xpu/NMS.cpp @@ -1,7 +1,7 @@ -#include #include #include #include +#include #include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/Nonzero.cpp b/src/ATen/native/xpu/Nonzero.cpp index 1aa45021f..deb646f6c 100644 --- a/src/ATen/native/xpu/Nonzero.cpp +++ b/src/ATen/native/xpu/Nonzero.cpp @@ -1,13 +1,12 @@ #include #include -#include #include #include namespace at { - -Tensor& XPUNativeFunctions::nonzero_out(const Tensor& self, Tensor& out) { +namespace native{ +Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) { TORCH_CHECK( self.numel() < std::numeric_limits::max(), "nonzero is not supported for tensors with more than INT_MAX elements, \ @@ -30,14 +29,14 @@ Tensor& XPUNativeFunctions::nonzero_out(const Tensor& self, Tensor& out) { MAX_DIMS, " dimensions"); - at::native::xpu::nonzero_kernel(self, out); + xpu::nonzero_kernel(self, out); return out; } -Tensor XPUNativeFunctions::nonzero(const Tensor& self) { +Tensor nonzero_xpu(const Tensor& self) { Tensor out = at::detail::empty_xpu({0}, self.options().dtype(kLong)); - XPUNativeFunctions::nonzero_out(self, out); + nonzero_out_xpu(self, out); return out; } - -} // namespace at +} +} // 
namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/Normalization.cpp b/src/ATen/native/xpu/Normalization.cpp index 3bc170da6..34422df11 100644 --- a/src/ATen/native/xpu/Normalization.cpp +++ b/src/ATen/native/xpu/Normalization.cpp @@ -1,113 +1,15 @@ -#include #include #include +#include +#include #include -#include + #include namespace at { - -void renorm_meta( - const Tensor& self, - const Scalar& p, - int64_t dim, - const Scalar& maxnorm, - Tensor& output) { - TORCH_CHECK(!p.isComplex(), "renorm: p must be real-valued"); - TORCH_CHECK(p.toDouble() > 0.0, "renorm: non-positive-norm not supported"); - TORCH_CHECK(!maxnorm.isComplex(), "renorm: maxnorm must be real-valued"); - TORCH_CHECK( - maxnorm.toDouble() >= 0.0, - "renorm: expected maxnorm to be >= 0 but got ", - maxnorm.toDouble()); - const auto ndim = self.dim(); - TORCH_CHECK( - ndim > 1, - "renorm: input needs at least 2 dimensions, got ", - ndim, - " dimensions"); - if (output.defined()) { - xpu::resize_out(output, self.sizes(), {}, self.options()); - } else { - output = xpu::create_out(self.sizes(), {}, self.options()); - } -} - -Tensor& renorm_impl( - const Tensor& self, - const Scalar& p, - int64_t dim, - const Scalar& maxnorm, - Tensor& out) { - auto self_sizes = self.sizes(); - dim = c10::maybe_wrap_dim(dim, self_sizes.size()); - - DimVector reduce_dims(self_sizes.size()); - std::iota(reduce_dims.begin(), reduce_dims.end(), 0); - reduce_dims.erase(reduce_dims.begin() + dim); - - auto dtype = self.scalar_type(); - - // This is a device-independent accumulate type, and we follow PyTorch's design. - auto acc_type = at::toAccumulateType(dtype, true); - - Tensor norm; - if (acc_type != dtype) { - norm = at::linalg_vector_norm( - self, - p.toDouble(), - reduce_dims, - /*keepdim=*/true, - /*dtype=*/acc_type); - } else { - norm = at::linalg_vector_norm( - self, - p.toDouble(), - reduce_dims, - /*keepdim=*/true); - } - - auto factor = (acc_type == c10::toRealValueType(dtype)) - ? 
norm - : at::empty(norm.sizes(), self.options()); - auto iter = TensorIteratorConfig() - .add_output(factor) - .add_input(norm) - .set_check_mem_overlap(false) - .cast_common_dtype_to_outputs(true) - .build(); - - at::native::xpu::renorm_scale_factor_kernel(iter, maxnorm.toDouble()); - return at::mul_outf(self, factor, const_cast(out)); -} - -Tensor& XPUNativeFunctions::renorm_( - Tensor& self, - const Scalar& p, - int64_t dim, - const Scalar& maxnorm) { - renorm_meta(self, p, dim, maxnorm, self); - renorm_impl(self, p, dim, maxnorm, self); - return self; -} -Tensor& XPUNativeFunctions::renorm_out( - const Tensor& self, - const Scalar& p, - int64_t dim, - const Scalar& maxnorm, - Tensor& out) { - renorm_meta(self, p, dim, maxnorm, out); - renorm_impl(self, p, dim, maxnorm, out); - return out; -} -Tensor XPUNativeFunctions::renorm( - const Tensor& self, - const Scalar& p, - int64_t dim, - const Scalar& maxnorm) { - Tensor out; - renorm_meta(self, p, dim, maxnorm, out); - renorm_impl(self, p, dim, maxnorm, out); - return out; +namespace native { +REGISTER_XPU_DISPATCH( + renorm_scale_factor_stub, + &xpu::renorm_scale_factor_kernel); } } // namespace at diff --git a/src/ATen/native/xpu/PinnedMemoryAllocator.cpp b/src/ATen/native/xpu/PinnedMemoryAllocator.cpp new file mode 100644 index 000000000..a12b686b2 --- /dev/null +++ b/src/ATen/native/xpu/PinnedMemoryAllocator.cpp @@ -0,0 +1,38 @@ +#include + +#include +#include + +#include + +namespace at { +namespace native { +// Note: The user must call is_pinned(device='xpu') to explicitly call here. +bool is_pinned_xpu(const Tensor& self, c10::optional device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + !device.has_value() || device->type() == c10::DeviceType::XPU); + + return at::detail::getXPUHooks().isPinnedPtr(self.storage().data()); +} + +// Note: The user must call tensor.pin_memory(device='xpu') to explicitly call +// here. +Tensor _pin_memory_xpu(const Tensor& self, c10::optional device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + !device.has_value() || device->type() == c10::DeviceType::XPU); + + auto* allocator = at::xpu::getPinnedMemoryAllocator(); + auto storage = c10::Storage( + c10::Storage::use_byte_size_t(), + at::detail::computeStorageNbytes( + self.sizes(), self.strides(), self.dtype().itemsize()), + allocator, + /*resizable=*/false); + auto tensor = at::cpu::empty({0}, self.options()) + .set_(storage, 0, self.sizes(), self.strides()); + tensor.copy_(self); + return tensor; +} + +} // namespace native +} // namespace at diff --git a/src/ATen/native/xpu/PointwiseOps.cpp b/src/ATen/native/xpu/PointwiseOps.cpp index a01bdc391..f95a90a93 100644 --- a/src/ATen/native/xpu/PointwiseOps.cpp +++ b/src/ATen/native/xpu/PointwiseOps.cpp @@ -1,109 +1,11 @@ #include +#include #include -#include - #include namespace at { - -TensorIterator addcdiv_meta( - const Tensor& self, - const Tensor& tensor1, - const Tensor& tensor2, - const Scalar& value, - Tensor& out) { - if (isIntegralType(tensor1.scalar_type(), /*includeBool=*/true) && - isIntegralType(tensor2.scalar_type(), /*includeBool=*/true)) { - TORCH_CHECK( - false, - "Integer division with addcdiv is no longer supported, and in a future ", - "release addcdiv will perform a true division of tensor1 and tensor2. ", - "The historic addcdiv behavior can be implemented as ", - "(input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) ", - "for integer inputs and as ", - "(input + value * tensor1 / tensor2) for float inputs. 
", - "The future addcdiv behavior is just the latter implementation: ", - "(input + value * tensor1 / tensor2), for all dtypes."); - } - - TensorIterator iter; - iter.build_ternary_op(out, self, tensor1, tensor2); - return iter; -} - -Tensor& XPUNativeFunctions::addcdiv_out( - const Tensor& self, - const Tensor& tensor1, - const Tensor& tensor2, - const Scalar& value, - Tensor& out) { - auto iter = addcdiv_meta(self, tensor1, tensor2, value, out); - native::xpu::addcdiv_kernel(iter, value); - return out; -} - -Tensor XPUNativeFunctions::addcdiv( - const Tensor& self, - const Tensor& tensor1, - const Tensor& tensor2, - const Scalar& value) { - Tensor out; - auto iter = addcdiv_meta(self, tensor1, tensor2, value, out); - native::xpu::addcdiv_kernel(iter, value); - return iter.output(); -} - -Tensor& XPUNativeFunctions::addcdiv_( - Tensor& self, - const Tensor& tensor1, - const Tensor& tensor2, - const Scalar& value) { - auto iter = addcdiv_meta(self, tensor1, tensor2, value, self); - native::xpu::addcdiv_kernel(iter, value); - return self; -} - -TensorIterator addcmul_meta( - const Tensor& self, - const Tensor& tensor1, - const Tensor& tensor2, - const Scalar& value, - Tensor& out) { - TensorIterator iter; - iter.build_ternary_op(out, self, tensor1, tensor2); - return iter; -} - -Tensor& XPUNativeFunctions::addcmul_out( - const Tensor& self, - const Tensor& tensor1, - const Tensor& tensor2, - const Scalar& value, - Tensor& out) { - auto iter = addcmul_meta(self, tensor1, tensor2, value, out); - native::xpu::addcmul_kernel(iter, value); - return out; -} - -Tensor XPUNativeFunctions::addcmul( - const Tensor& self, - const Tensor& tensor1, - const Tensor& tensor2, - const Scalar& value) { - Tensor out; - auto iter = addcmul_meta(self, tensor1, tensor2, value, out); - native::xpu::addcmul_kernel(iter, value); - return iter.output(); -} - -Tensor& XPUNativeFunctions::addcmul_( - Tensor& self, - const Tensor& tensor1, - const Tensor& tensor2, - const Scalar& value) { - auto iter = addcmul_meta(self, tensor1, tensor2, value, self); - native::xpu::addcmul_kernel(iter, value); - return self; -} - +namespace native { +REGISTER_XPU_DISPATCH(addcmul_stub, &xpu::addcmul_kernel); +REGISTER_XPU_DISPATCH(addcdiv_stub, &xpu::addcdiv_kernel); +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/Pow.cpp b/src/ATen/native/xpu/Pow.cpp index 97dc5a0c2..4b88036db 100644 --- a/src/ATen/native/xpu/Pow.cpp +++ b/src/ATen/native/xpu/Pow.cpp @@ -1,118 +1,13 @@ #include #include #include +#include #include #include -#include namespace at { - -TensorIterator pow_tensor_tensor_meta( - const Tensor& base, - const Tensor& exp, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_binary_op(out, base, exp); - return iter; -} - -TensorIterator pow_tensor_scalar_meta( - const Tensor& base, - const Scalar& exp, - Tensor& out) { - // Numpy compatibility check: - TORCH_CHECK( - !(isIntegralType(base.scalar_type(), true) && exp.isIntegral(true) && - exp.toLong() < 0), - "Integers to negative integer powers are not allowed."); - - auto common_dtype = at::result_type(base, exp); - TensorIterator iter; - iter.build_output_borrowing_argument_owning_unary_op( - out, base.to(common_dtype)); - return iter; -} - -Tensor XPUNativeFunctions::pow(const Tensor& self, const Tensor& exponent) { - Tensor out; - auto iter = pow_tensor_tensor_meta(self, exponent, out); - native::xpu::pow_tensor_tensor_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::pow_(Tensor& self, const Tensor& exponent) { - 
auto iter = pow_tensor_tensor_meta(self, exponent, self); - native::xpu::pow_tensor_tensor_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::pow_out( - const Tensor& base, - const Tensor& exp, - Tensor& out) { - auto iter = pow_tensor_tensor_meta(base, exp, out); - native::xpu::pow_tensor_tensor_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::pow(const Tensor& self, const Scalar& exponent) { - Tensor out; - auto iter = pow_tensor_scalar_meta(self, exponent, out); - if (exponent.equal(0.0) || exponent.equal(false)) { - iter.output().fill_(1); - } else if (exponent.equal(1.0) || exponent.equal(true)) { - iter.output().copy_(self); - } else { - native::xpu::pow_tensor_scalar_kernel(iter, exponent); - } - return iter.output(); -} - -Tensor& XPUNativeFunctions::pow_(Tensor& self, const Scalar& exponent) { - auto iter = pow_tensor_scalar_meta(self, exponent, self); - if (exponent.equal(0.0) || exponent.equal(false)) { - self.fill_(1); - } else if (exponent.equal(1.0) || exponent.equal(true)) { - } else { - native::xpu::pow_tensor_scalar_kernel(iter, exponent); - } - return self; -} - -Tensor& XPUNativeFunctions::pow_out( - const Tensor& self, - const Scalar& exponent, - Tensor& out) { - auto iter = pow_tensor_scalar_meta(self, exponent, out); - if (exponent.equal(0.0) || exponent.equal(false)) { - out.fill_(1); - } else if (exponent.equal(1.0) || exponent.equal(true)) { - out.copy_(self); - } else { - native::xpu::pow_tensor_scalar_kernel(iter, exponent); - } - return out; -} - -Tensor XPUNativeFunctions::pow(const Scalar& self, const Tensor& exponent) { - Tensor out; - auto iter = TensorIterator::binary_op( - out, native::wrapped_scalar_tensor(self), exponent); - native::xpu::pow_tensor_tensor_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::pow_out( - const Scalar& self, - const Tensor& exponent, - Tensor& out) { - if (self.equal(1.0)) { - out.fill_(1); - } else { - return XPUNativeFunctions::pow_out( - native::wrapped_scalar_tensor(self), exponent, out); - } - return out; -} - +namespace native { +REGISTER_XPU_DISPATCH(pow_tensor_tensor_stub, &xpu::pow_tensor_tensor_kernel); +REGISTER_XPU_DISPATCH(pow_tensor_scalar_stub, &xpu::pow_tensor_scalar_kernel); +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp index 3aca0d5c7..ad0a6ffc6 100644 --- a/src/ATen/native/xpu/RangeFactories.cpp +++ b/src/ATen/native/xpu/RangeFactories.cpp @@ -1,17 +1,19 @@ -#include #include #include #include #include #include +#include +#include #include #include -#include +#include #include namespace at { -Tensor& XPUNativeFunctions::arange_out( +namespace native { +Tensor& arange_out_xpu( const Scalar& start, const Scalar& end, const Scalar& step, @@ -83,10 +85,10 @@ Tensor& XPUNativeFunctions::arange_out( } }); - return at::native::xpu::arange_kernel(start, end, step, out); + return xpu::arange_kernel(start, end, step, out); } -Tensor& XPUNativeFunctions::range_out( +Tensor& range_xpu_out( const Scalar& start, const Scalar& end, const Scalar& step, @@ -113,4 +115,5 @@ Tensor& XPUNativeFunctions::range_out( return at::native::xpu::range_kernel(start, end, step, out); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/ReduceAllOps.cpp b/src/ATen/native/xpu/ReduceAllOps.cpp index 9d5e6a1d3..9719d51fa 100644 --- a/src/ATen/native/xpu/ReduceAllOps.cpp +++ b/src/ATen/native/xpu/ReduceAllOps.cpp @@ -1,11 +1,12 @@ -#include #include +#include +#include #include #include 
-#include - #include #include +#include +#include namespace at { @@ -16,28 +17,6 @@ void min_all_kernel_impl(Tensor& result, const Tensor& input) { native::xpu::min_all_kernel(iter); } -Tensor XPUNativeFunctions::min(const Tensor& self) { - TORCH_CHECK( - self.numel() > 0, - "min(): Expected reduction dim to be specified for input.numel() == 0. Specify the reduction dim with the 'dim' argument."); - Tensor result = at::empty({}, self.options()); - min_all_kernel_impl(result, self.contiguous()); - return result; -} - -Tensor& XPUNativeFunctions::min_out(const Tensor& self, Tensor& out) { - // First check if the devices match (CPU vs GPU) - TORCH_CHECK(self.device() == out.device()); - - TORCH_CHECK(canCast( - typeMetaToScalarType(self.dtype()), typeMetaToScalarType(out.dtype()))); - - at::native::resize_output(out, {}); - - min_all_kernel_impl(out, self.contiguous()); - return out; -} - void max_all_kernel_impl(Tensor& result, const Tensor& input) { auto dtype = input.scalar_type(); auto iter = native::make_reduction( @@ -45,33 +24,9 @@ void max_all_kernel_impl(Tensor& result, const Tensor& input) { native::xpu::max_all_kernel(iter); } -Tensor XPUNativeFunctions::max(const Tensor& self) { - TORCH_CHECK( - self.numel() > 0, - "max(): Expected reduction dim to be specified for input.numel() == 0. Specify the reduction dim with the 'dim' argument."); - Tensor result = at::empty({}, self.options()); - max_all_kernel_impl(result, self.contiguous()); - return result; -} - -Tensor& XPUNativeFunctions::max_out(const Tensor& self, Tensor& out) { - // First check if the devices match (CPU vs GPU) - TORCH_CHECK(self.device() == out.device()); - - TORCH_CHECK(canCast( - typeMetaToScalarType(self.dtype()), typeMetaToScalarType(out.dtype()))); - - at::native::resize_output(out, {}); - - max_all_kernel_impl(out, self.contiguous()); - return out; -} - -std::tuple XPUNativeFunctions::_aminmax(const Tensor& self) { - TORCH_WARN_ONCE( - "_aminmax is deprecated as of PyTorch 1.11 and will be removed in a future release. Use aminmax instead." - " This warning will only appear once per process."); - return XPUNativeFunctions::aminmax(self, {}, false); -} +namespace native { +REGISTER_XPU_DISPATCH(min_all_stub, &min_all_kernel_impl); +REGISTER_XPU_DISPATCH(max_all_stub, &max_all_kernel_impl); +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/ReduceOps.cpp b/src/ATen/native/xpu/ReduceOps.cpp index 834bb3a04..db72e5fbb 100644 --- a/src/ATen/native/xpu/ReduceOps.cpp +++ b/src/ATen/native/xpu/ReduceOps.cpp @@ -1,608 +1,48 @@ -#include + #include #include #include #include #include + +#include #include +#include +#include #include #include #include +#include #include #include #include #include #include -#include #include +#include -namespace at { - -using namespace at::xpu; - -template -void impl_func_cum_ops( - const Tensor& self, - int64_t dim, - const Tensor& result, - Stub& stub) { - NoNamesGuard guard; - if (self.dim() == 0) { - result.fill_(self); - } else if (self.numel() == 0) { - result.zero_(); - } else { - dim = maybe_wrap_dim(dim, self.dim()); - stub(result, self.to(result.scalar_type()), dim); - } -} - -static void cum_ops_meta( - const char* name, - const Tensor& self, - int64_t dim, - std::optional dtype, - Tensor& result) { - // Checking whether 'dim' is valid. 
- maybe_wrap_dim(dim, self.dim()); - - ScalarType out_dtype; - if (result.defined()) { - out_dtype = dtype.value_or(result.scalar_type()); - at::xpu::resize_out( - result, self.sizes(), {}, self.options().dtype(out_dtype)); - } else { - auto is_integral = - at::isIntegralType(self.scalar_type(), /*includeBool=*/true); - out_dtype = - dtype.value_or(is_integral ? ScalarType::Long : self.scalar_type()); - result = - at::xpu::create_out(self.sizes(), {}, self.options().dtype(out_dtype)); - } - - namedinference::propagate_names(result, self); -} - -Tensor& XPUNativeFunctions::cumsum_out( - const Tensor& self, - int64_t dim, - c10::optional dtype, - Tensor& result) { - cum_ops_meta("cumsum", self, dim, dtype, result); - - impl_func_cum_ops(self, dim, result, at::native::xpu::cumsum_kernel); - return result; -} - -Tensor XPUNativeFunctions::cumsum( - const Tensor& self, - int64_t dim, - c10::optional dtype) { - Tensor result; - return XPUNativeFunctions::cumsum_out(self, dim, dtype, result); -} - -Tensor& XPUNativeFunctions::cumsum_( - Tensor& self, - int64_t dim, - c10::optional dtype) { - return XPUNativeFunctions::cumsum_out(self, dim, dtype, self); -} - -Tensor& XPUNativeFunctions::cumprod_out( - const Tensor& self, - int64_t dim, - c10::optional dtype, - Tensor& result) { - cum_ops_meta("cumprod", self, dim, dtype, result); - - impl_func_cum_ops(self, dim, result, at::native::xpu::cumprod_kernel); - return result; -} - -Tensor XPUNativeFunctions::cumprod( - const Tensor& self, - int64_t dim, - c10::optional dtype) { - Tensor result; - return XPUNativeFunctions::cumprod_out(self, dim, dtype, result); -} - -Tensor& XPUNativeFunctions::cumprod_( - Tensor& self, - int64_t dim, - c10::optional dtype) { - return XPUNativeFunctions::cumprod_out(self, dim, dtype, self); -} - -static ScalarType infer_dtype_from_optional( - const Tensor& self, - const optional& opt_dtype, - const Tensor& result) { - // 'opt_dtype' has the priority for both cases. - if (result.defined()) { - // Otherwise, get the result type, if defined. - return opt_dtype.value_or(result.scalar_type()); - } else { - // Last case is to get the self type. - // If the self type is an integer, we promote it to kLong. - return at::native::get_dtype_from_self(self, opt_dtype, true); - } -} - -inline bool should_use_acc_buffer(at::TensorIterator& iter) { - const auto ndim = iter.ndim(); - if (!iter.device().is_cpu() || iter.noutputs() != 1) { - return false; - } - if (!at::isReducedFloatingType(iter.common_dtype())) { - return false; - } - if (ndim < 2) { - return false; - } - auto out_strides = iter.strides(0); - for (const auto dim : c10::irange(0, 2)) { - if (out_strides[dim] != 0) { - return false; - } - } - return true; -} - -Tensor& XPUNativeFunctions::sum_out( - const Tensor& self, - OptionalIntArrayRef opt_dim, - bool keepdim, - c10::optional opt_dtype, - Tensor& result) { - auto out_dtype = infer_dtype_from_optional(self, opt_dtype, result); - result = resize_reduction(result, self, opt_dim, keepdim, out_dtype); - auto iter = meta::make_reduction_from_out_ty( - self, result, opt_dim, keepdim, result.scalar_type()); - if (iter.numel() == 0) { - result.zero_(); - } else { - // Here is a limitation of TensorIterator reductions for permuted input with - // lower precision on CPU. Consider the case: TensorIterator coalesces such - // input and output to >= 2 dims tensors, and the output stride is [0, 0, x, - // x, ...] with x >= 0 (two reduced dimensions and non-reduced dims). 
Since - // the reduction loop only operates on two dimensions at a time, the - // intermediate sums is forced to do accumulation in the second reduced dim - // with lower precision. See https://github.com/pytorch/pytorch/issues/83149 - if (should_use_acc_buffer(iter)) { - auto tmp_output = - at::empty(result.sizes(), result.options().dtype(kFloat)); - at::sum_outf( - self.to(ScalarType::Float), - opt_dim, - keepdim, - /*dtype=*/c10::nullopt, - tmp_output); - result.copy_(tmp_output); - } else { - native::xpu::sum_kernel(iter); - } - } - return result; -} - -Tensor XPUNativeFunctions::sum( - const Tensor& self, - OptionalIntArrayRef dim, - bool keepdim, - c10::optional opt_dtype) { - Tensor out; - return XPUNativeFunctions::sum_out(self, dim, keepdim, opt_dtype, out); -} - -Tensor& prod_meta( - const Tensor& self, - int64_t dim, - bool keepdim, - std::optional dtype, - Tensor& result) { - auto out_dtype = infer_dtype_from_optional(self, dtype, result); - result = resize_reduction(result, self, dim, keepdim, out_dtype); - return result; -} - -static void impl_func_prod( - const Tensor& self, - IntArrayRef dims, - bool keepdim, - std::optional dtype, - Tensor& result) { - auto iter = meta::make_reduction_from_out_ty( - self, result, dims, keepdim, result.scalar_type()); - if (iter.numel() == 0) { - result.fill_(1); - } else { - native::xpu::prod_kernel(iter); - } -} - -Tensor& XPUNativeFunctions::prod_out( - const Tensor& self, - int64_t dim, - bool keepdim, - std::optional dtype, - Tensor& result) { - result = prod_meta(self, dim, keepdim, dtype, result); - impl_func_prod(self, dim, keepdim, dtype, result); - return result; -} - -Tensor XPUNativeFunctions::prod( - const Tensor& self, - std::optional opt_dtype) { - auto dtype = at::native::get_dtype_from_self(self, opt_dtype, true); - auto shape = meta::get_reduction_shape(self, {}, false); - Tensor result = at::empty(shape, self.options().dtype(dtype)); - impl_func_prod(self, {}, false, dtype, result); - return result; -} - -Tensor XPUNativeFunctions::prod( - const Tensor& self, - int64_t dim, - bool keepdim, - std::optional dtype) { - Tensor result; - result = prod_meta(self, dim, keepdim, dtype, result); - impl_func_prod(self, dim, keepdim, dtype, result); - return result; -} - -Tensor& mean_meta( - const Tensor& self, - OptionalIntArrayRef opt_dim, - bool keepdim, - optional opt_dtype, - Tensor& out) { - auto in_dtype = at::native::get_dtype_from_self(self, opt_dtype, true); - if (!at::isFloatingType(in_dtype) && !at::isComplexType(in_dtype)) { - std::string what = "Input"; - std::string dtype = toString(self.scalar_type()); - - if (opt_dtype.has_value()) { - what = "Optional"; - dtype = toString(opt_dtype.value()); - } - - TORCH_CHECK( - false, - "mean(): could not infer output dtype. ", - what, - " dtype must be either a floating point or complex dtype. 
", - "Got: ", - dtype); - } - - auto out_dtype = infer_dtype_from_optional(self, opt_dtype, out); - out = resize_reduction(out, self, opt_dim, keepdim, out_dtype); - return out; -} - -Tensor& XPUNativeFunctions::mean_out( - const Tensor& self, - OptionalIntArrayRef opt_dim, - bool keepdim, - c10::optional opt_dtype, - Tensor& result) { - result = mean_meta(self, opt_dim, keepdim, opt_dtype, result); - ScalarType dtype = result.scalar_type(); - // device is not CPU - auto iter = at::meta::make_reduction_from_out_ty( - self, result, opt_dim, keepdim, dtype); - if (iter.numel() == 0) { - result.fill_(std::numeric_limits::quiet_NaN()); - } else { - native::xpu::mean_kernel(iter); - } - return result; -} - -Tensor XPUNativeFunctions::mean( - const Tensor& self, - OptionalIntArrayRef dim, - bool keepdim, - ::std::optional dtype) { - Tensor out; - out = mean_meta(self, dim, keepdim, dtype, out); - out = XPUNativeFunctions::mean_out(self, dim, keepdim, dtype, out); - return out; -} - -inline TensorIterator get_allany_iter( - const Tensor& self, - const Tensor& result, - OptionalIntArrayRef dims, - bool keepdim) { - return meta::make_reduction_from_out_ty( - self, result, dims, keepdim, result.scalar_type()); -} - -template -inline void allany_impl( - const Tensor& self, - const Tensor& result, - OptionalIntArrayRef dims, - bool keepdim, - Stub& stub) { - if (self.numel() == 0) { - result.fill_(identity); - } else if (self.numel() == 1) { - result.copy_(self.view_as(result).to(at::kBool)); - } else { - auto iter = get_allany_iter(self, result, dims, keepdim); - stub(iter); - } -} - -static ScalarType get_result_or_bytebool_dtype( - const Tensor& self, - const Tensor& result) { - // Refer [all, any : uint8 compatibility] - if (result.defined()) { - return result.scalar_type(); - } else { - return (self.scalar_type() == kByte) ? 
kByte : kBool; - } -} - -static void check_result_is_bytebool( - const char* name, - const Tensor& self, - const Tensor& result) { - if (result.defined()) { - // Refer [all, any : uint8 compatibility] - TORCH_CHECK( - result.scalar_type() == ScalarType::Bool || - result.scalar_type() == ScalarType::Byte, - name, - " only supports bool tensor for result, got: ", - result.scalar_type()); - } -} - -Tensor& allany_meta( - Tensor& result, - const char* name, - const Tensor& self, - OptionalIntArrayRef dims, - bool keepdim) { - check_result_is_bytebool(name, self, result); - auto out_dtype = get_result_or_bytebool_dtype(self, result); - result = resize_reduction( - result, self, dims, keepdim, out_dtype, /*allow_empty_dims=*/true); - return result; -} - -// aten::all.dim -Tensor XPUNativeFunctions::all(const Tensor& self, int64_t dim, bool keepdim) { - Tensor out; - out = allany_meta(out, "all", self, dim, keepdim); - allany_impl<1>(self, out, dim, keepdim, native::xpu::and_kernel); - return out; -} - -// aten::all.out -Tensor& XPUNativeFunctions::all_out( - const Tensor& self, - int64_t dim, - bool keepdim, - Tensor& out) { - out = allany_meta(out, "all", self, dim, keepdim); - allany_impl<1>(self, out, dim, keepdim, native::xpu::and_kernel); - return out; -} - -// aten::all.dims -Tensor XPUNativeFunctions::all( - const Tensor& self, - OptionalIntArrayRef dim, - bool keepdim) { - Tensor out; - out = allany_meta(out, "all", self, dim, keepdim); - allany_impl<1>(self, out, dim, keepdim, native::xpu::and_kernel); - return out; -} - -// aten::all.dims_out -Tensor& XPUNativeFunctions::all_out( - const Tensor& self, - OptionalIntArrayRef dim, - bool keepdim, - Tensor& out) { - out = allany_meta(out, "all", self, dim, keepdim); - allany_impl<1>(self, out, dim, keepdim, native::xpu::and_kernel); - return out; -} - -// aten::all -Tensor XPUNativeFunctions::all(const Tensor& self) { - Tensor out; - out = allany_meta(out, "all", self, {}, false); - allany_impl<1>(self, out, {}, false, native::xpu::and_kernel); - return out; -} - -// aten::all.all_out -Tensor& XPUNativeFunctions::all_out(const Tensor& self, Tensor& out) { - out = allany_meta(out, "all", self, {}, false); - allany_impl<1>(self, out, {}, false, native::xpu::and_kernel); - return out; -} - -// aten::any.dim -Tensor XPUNativeFunctions::any(const Tensor& self, int64_t dim, bool keepdim) { - Tensor out; - out = allany_meta(out, "any", self, dim, keepdim); - allany_impl<0>(self, out, dim, keepdim, native::xpu::or_kernel); - return out; -} - -// aten::any.out -Tensor& XPUNativeFunctions::any_out( - const Tensor& self, - int64_t dim, - bool keepdim, - Tensor& out) { - out = allany_meta(out, "any", self, dim, keepdim); - allany_impl<0>(self, out, dim, keepdim, native::xpu::or_kernel); - return out; -} - -// aten::any.dims -Tensor XPUNativeFunctions::any( - const Tensor& self, - OptionalIntArrayRef dim, - bool keepdim) { - Tensor out; - out = allany_meta(out, "any", self, dim, keepdim); - allany_impl<0>(self, out, dim, keepdim, native::xpu::or_kernel); - return out; -} - -// aten::any.dims_out -Tensor& XPUNativeFunctions::any_out( - const Tensor& self, - OptionalIntArrayRef dim, - bool keepdim, - Tensor& out) { - out = allany_meta(out, "any", self, dim, keepdim); - allany_impl<0>(self, out, dim, keepdim, native::xpu::or_kernel); - return out; -} - -// aten::any -Tensor XPUNativeFunctions::any(const Tensor& self) { - Tensor out; - out = allany_meta(out, "any", self, {}, false); - allany_impl<0>(self, out, {}, false, native::xpu::or_kernel); - return 
out; -} +#include +#include +#include +#include +#include -// aten::any.any_out -Tensor& XPUNativeFunctions::any_out(const Tensor& self, Tensor& out) { - out = allany_meta(out, "any", self, {}, false); - allany_impl<0>(self, out, {}, false, native::xpu::or_kernel); - return out; -} - -template -void argmax_argmin_impl( - const Tensor& self, - c10::optional dim, - bool keepdim, - const Tensor& result, - Stub& stub) { - c10::MaybeOwned in; - DimVector dims; - int64_t _dim = 0; - - if (dim.has_value()) { - _dim = maybe_wrap_dim(dim.value(), self.dim()); - auto sizes = self.sizes(); - - if (sizes[_dim] == 1) { - result.fill_(0); - return; - } - - dims = IntArrayRef(_dim); - in = c10::MaybeOwned::borrowed(self); - } else { - in = c10::MaybeOwned::owned(self.reshape({-1})); - keepdim = false; - } - - auto iter = - meta::make_reduction(*in, result, dims, keepdim, self.scalar_type()); - - if (iter.numel() != 0) { - stub(iter); - } -} - -static void check_argmax_argmin( - const char* name, - const Tensor& self, - const c10::optional& dim) { - if (dim.has_value()) { - auto dim_ = maybe_wrap_dim(dim.value(), self.dim()); - native::zero_numel_check_dims(self, dim_, name); - } else { - TORCH_CHECK_INDEX( - self.numel() != 0, - name, - ": Expected reduction dim to be specified for input.numel() == 0."); - } -} - -static IntArrayRef optional_to_arrayref(const c10::optional& opt) { - return opt.has_value() ? opt.value() : IntArrayRef{}; -} - -Tensor& argmax_meta( - const Tensor& self, - c10::optional dim, - bool keepdim, - Tensor& out) { - check_argmax_argmin("argmax()", self, dim); - return resize_reduction(out, self, optional_to_arrayref(dim), keepdim, kLong); -} - -Tensor& XPUNativeFunctions::argmax_out( - const Tensor& self, - c10::optional dim, - bool keepdim, - Tensor& out) { - out = argmax_meta(self, dim, keepdim, out); - argmax_argmin_impl(self, dim, keepdim, out, native::xpu::argmax_kernel); - return out; -} - -Tensor XPUNativeFunctions::argmax( - const Tensor& self, - c10::optional dim, - bool keepdim) { - Tensor out; - out = argmax_meta(self, dim, keepdim, out); - argmax_argmin_impl(self, dim, keepdim, out, native::xpu::argmax_kernel); - return out; -} - -Tensor& argmin_meta( - const Tensor& self, - c10::optional dim, - bool keepdim, - Tensor& out) { - check_argmax_argmin("argmin()", self, dim); - return resize_reduction(out, self, optional_to_arrayref(dim), keepdim, kLong); -} - -Tensor& XPUNativeFunctions::argmin_out( - const Tensor& self, - c10::optional dim, - bool keepdim, - Tensor& out) { - out = argmin_meta(self, dim, keepdim, out); - argmax_argmin_impl(self, dim, keepdim, out, native::xpu::argmin_kernel); - return out; -} - -Tensor XPUNativeFunctions::argmin( - const Tensor& self, - c10::optional dim, - bool keepdim) { - Tensor out; - out = argmin_meta(self, dim, keepdim, out); - argmax_argmin_impl(self, dim, keepdim, out, native::xpu::argmin_kernel); - return out; -} +namespace at { +namespace native { +REGISTER_XPU_DISPATCH(sum_stub, &xpu::sum_kernel); +REGISTER_XPU_DISPATCH(mean_stub, &xpu::mean_kernel); +REGISTER_XPU_DISPATCH(prod_stub, &xpu::prod_kernel); +REGISTER_XPU_DISPATCH(argmax_stub, &xpu::argmax_kernel); +REGISTER_XPU_DISPATCH(argmin_stub, &xpu::argmin_kernel); +REGISTER_XPU_DISPATCH(and_stub, &xpu::and_kernel); +REGISTER_XPU_DISPATCH(or_stub, &xpu::or_kernel); +REGISTER_XPU_DISPATCH(max_values_stub, &xpu::max_values_kernel); +REGISTER_XPU_DISPATCH(min_values_stub, &xpu::min_values_kernel); +REGISTER_XPU_DISPATCH(std_var_stub, &xpu::std_var_kernel); 
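[Editor's note: the REGISTER_XPU_DISPATCH registrations continue in the hunk below. As a rough illustration of the pattern these added lines rely on, here is a minimal, self-contained sketch of a dispatch-stub mechanism. It is not ATen's actual DispatchStub implementation; FakeIter, DispatchStub, reduce_fn and RegisterSumXpu are simplified stand-ins, and only the overall shape (a shared stub, a backend slot filled at static-initialization time, generic code calling through the stub) mirrors what REGISTER_XPU_DISPATCH does here. The practical effect in this patch is that the shared at::native operator implementations pick up the XPU kernels without per-backend wrapper functions like the removed XPUNativeFunctions ones.]

// Minimal sketch of the dispatch-stub idea (NOT ATen's real DispatchStub).
#include <cstdio>
#include <stdexcept>

struct FakeIter {};                     // stand-in for at::TensorIterator
using reduce_fn = void (*)(FakeIter&);  // hypothetical kernel signature

struct DispatchStub {
  reduce_fn xpu_impl = nullptr;         // slot for the XPU backend
  void operator()(FakeIter& it) {
    if (!xpu_impl) throw std::runtime_error("no kernel registered");
    xpu_impl(it);                       // forward to whichever backend registered
  }
};

DispatchStub sum_stub;                  // shared stub used by generic operator code

namespace xpu {
void sum_kernel(FakeIter&) { std::puts("xpu sum kernel"); }
} // namespace xpu

// Roughly what a REGISTER_*_DISPATCH macro boils down to: a static object whose
// constructor plugs the backend kernel into the shared stub.
static struct RegisterSumXpu {
  RegisterSumXpu() { sum_stub.xpu_impl = &xpu::sum_kernel; }
} register_sum_xpu;

int main() {
  FakeIter it;
  sum_stub(it);                         // dispatches to xpu::sum_kernel
}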
+REGISTER_XPU_DISPATCH(cumsum_stub, &xpu::cumsum_kernel); +REGISTER_XPU_DISPATCH(cumprod_stub, &xpu::cumprod_kernel); +REGISTER_XPU_DISPATCH(nansum_stub, &xpu::nansum_kernel); static inline void warn_invalid_degrees_of_freedom( const char* fname, @@ -793,7 +233,7 @@ static inline TensorOptions options_to_value_type(TensorOptions opts) { return opts.dtype(c10::toRealValueType(scalar_type)); } -Tensor XPUNativeFunctions::std( +Tensor std_xpu( const Tensor& self, at::OptionalIntArrayRef dim, const std::optional& correction, @@ -802,7 +242,7 @@ Tensor XPUNativeFunctions::std( return std_var_out("std", result, self, dim, correction, keepdim, true); } -Tensor& XPUNativeFunctions::std_out( +Tensor& std_xpu_out( const Tensor& self, at::OptionalIntArrayRef dim, const std::optional& correction, @@ -811,7 +251,7 @@ Tensor& XPUNativeFunctions::std_out( return std_var_out("std", result, self, dim, correction, keepdim, true); } -Tensor& XPUNativeFunctions::var_out( +Tensor& var_xpu_out( const Tensor& self, at::OptionalIntArrayRef dim, const std::optional& correction, @@ -820,7 +260,7 @@ Tensor& XPUNativeFunctions::var_out( return std_var_out("var", result, self, dim, correction, keepdim, false); } -Tensor XPUNativeFunctions::var( +Tensor var_xpu( const Tensor& self, at::OptionalIntArrayRef dim, const std::optional& correction, @@ -829,7 +269,7 @@ Tensor XPUNativeFunctions::var( return std_var_out("var", result, self, dim, correction, keepdim, false); } -std::tuple XPUNativeFunctions::var_mean( +std::tuple var_mean_xpu( const Tensor& self, at::OptionalIntArrayRef dim, const std::optional& correction, @@ -840,7 +280,7 @@ std::tuple XPUNativeFunctions::var_mean( "var_mean", result1, result2, self, dim, correction, keepdim, false); } -std::tuple XPUNativeFunctions::std_mean( +std::tuple std_mean_xpu( const Tensor& self, at::OptionalIntArrayRef dim, const std::optional& correction, @@ -851,354 +291,31 @@ std::tuple XPUNativeFunctions::std_mean( "std_mean", result1, result2, self, dim, correction, keepdim, true); } -static Tensor& amax_amin_meta( - Tensor& result, - const char* name, - const Tensor& self, - IntArrayRef dim, - bool keepdim) { - if (result.defined()) { - TORCH_CHECK( - self.scalar_type() == result.scalar_type(), - "Expected the dtype for input and out to match, but got ", - self.scalar_type(), - " for input's dtype and ", - result.scalar_type(), - " for out's dtype."); - } - if (self.numel() == 0) { - at::native::zero_numel_check_dims(self, dim, "amax()"); - } - const ScalarType& out_dtype = - result.defined() ? 
result.scalar_type() : self.scalar_type(); - return resize_reduction(result, self, dim, keepdim, out_dtype); -} - -template -void amax_amin_impl( - const Tensor& self, - IntArrayRef dim, - bool keepdim, - const Tensor& result, - Stub& stub) { - auto iter = - meta::make_reduction(self, result, dim, keepdim, self.scalar_type()); - - if (iter.numel() != 0) { - stub(iter); - } -} - -Tensor& XPUNativeFunctions::amax_out( - const Tensor& self, - IntArrayRef dim, - bool keepdim, - Tensor& out) { - out = amax_amin_meta(out, "amax()", self, dim, keepdim); - amax_amin_impl(self, dim, keepdim, out, native::xpu::max_all_kernel); - return out; -} - -Tensor XPUNativeFunctions::amax( - const Tensor& self, - IntArrayRef dim, - bool keepdim) { - Tensor out; - out = amax_amin_meta(out, "amax()", self, dim, keepdim); - amax_amin_impl(self, dim, keepdim, out, native::xpu::max_all_kernel); - return out; -} - -Tensor& XPUNativeFunctions::amin_out( - const Tensor& self, - IntArrayRef dim, - bool keepdim, - Tensor& out) { - out = amax_amin_meta(out, "amin()", self, dim, keepdim); - amax_amin_impl(self, dim, keepdim, out, native::xpu::min_all_kernel); - return out; -} - -Tensor XPUNativeFunctions::amin( - const Tensor& self, - IntArrayRef dim, - bool keepdim) { - Tensor out; - out = amax_amin_meta(out, "amin()", self, dim, keepdim); - amax_amin_impl(self, dim, keepdim, out, native::xpu::min_all_kernel); - return out; -} - -Tensor& XPUNativeFunctions::nansum_out( - const Tensor& self, - at::OptionalIntArrayRef dim, - bool keepdim, - optional opt_dtype, - Tensor& result) { - // For integral types, use existing sum as - // integral types don't have `Nan`. - if (c10::isIntegralType(self.scalar_type(), true)) { - return at::sum_out(result, self, dim, keepdim, opt_dtype); - } - - auto out_dtype = infer_dtype_from_optional(self, opt_dtype, result); - result = resize_reduction(result, self, dim, keepdim, out_dtype); - auto iter = meta::make_reduction_from_out_ty( - self, result, dim, keepdim, result.scalar_type()); - - if (iter.numel() == 0) { - result = result.zero_(); - } else { - native::xpu::nansum_kernel(iter); - } - return result; -} - -Tensor XPUNativeFunctions::nansum( - const Tensor& self, - at::OptionalIntArrayRef dim, - bool keepdim, - std::optional opt_dtype) { - Tensor result; - return XPUNativeFunctions::nansum_out(self, dim, keepdim, opt_dtype, result); -} - -static ScalarType get_result_or_self_value_dtype( - const Tensor& self, - const Tensor& result, - const std::optional& dtype) { - if (result.defined()) { - return result.scalar_type(); - } else { - return dtype.value_or(toRealValueType(self.scalar_type())); - } -} - -Tensor& norm_scalaropt_dim_dtype_meta( - const Tensor& self, - const OptionalScalarRef p, - IntArrayRef dim, - bool keepdim, - ScalarType dtype, - Tensor& result) { - TORCH_CHECK( - at::isFloatingType(dtype) || at::isComplexType(dtype), - "norm(): the desired output dtype should be either floating point or complex. " - "Got ", - dtype, - " instead."); - auto out_dtype = get_result_or_self_value_dtype(self, result, dtype); - return resize_reduction(result, self, dim, keepdim, out_dtype); -} - -static void impl_func_norm( - const Tensor& self, - const OptionalScalarRef& opt_p, - IntArrayRef dim, - bool keepdim, - optional opt_dtype, - const Tensor& result) { - // Left this implementation without deprecating it as it is called in a number - // of places in the codebase. We should swap those by linalg_vector_norm - auto p = opt_p.has_value() ? 
opt_p.get() : Scalar(2.0).to(); - at::linalg_vector_norm_out( - const_cast(result), self, p, dim, keepdim, opt_dtype); -} - -Tensor XPUNativeFunctions::norm( - const Tensor& self, - const std::optional& p, - IntArrayRef dim, - bool keepdim, - ScalarType dtype) { - Tensor result; - auto p_ = - (p.has_value() ? at::OptionalScalarRef(&(p.value())) - : at::OptionalScalarRef()); - result = norm_scalaropt_dim_dtype_meta(self, p_, dim, keepdim, dtype, result); - impl_func_norm(self, p_, dim, keepdim, dtype, result); - return result; -} - -Tensor& XPUNativeFunctions::norm_out( - const Tensor& self, - const std::optional& p, - IntArrayRef dim, - bool keepdim, - ScalarType dtype, - Tensor& result) { - auto p_ = - (p.has_value() ? at::OptionalScalarRef(&(p.value())) - : at::OptionalScalarRef()); - result = norm_scalaropt_dim_dtype_meta(self, p_, dim, keepdim, dtype, result); - impl_func_norm(self, p_, dim, keepdim, dtype, result); - return result; -} - -Tensor& norm_scalaropt_dim_meta( - const Tensor& self, - const OptionalScalarRef p, - IntArrayRef dim, - bool keepdim, - Tensor& result) { - TORCH_CHECK( - at::isFloatingType(self.scalar_type()) || - at::isComplexType(self.scalar_type()), - "norm(): input dtype should be either floating point or complex. " - "Got ", - self.scalar_type(), - " instead."); - - auto out_dtype = get_result_or_self_value_dtype(self, result, c10::nullopt); - return resize_reduction(result, self, dim, keepdim, out_dtype); -} - -Tensor XPUNativeFunctions::norm( - const Tensor& self, - const std::optional& p, - IntArrayRef dim, - bool keepdim) { - auto p_ = - (p.has_value() ? at::OptionalScalarRef(&(p.value())) - : at::OptionalScalarRef()); - Tensor result; - result = norm_scalaropt_dim_meta(self, p_, dim, keepdim, result); - impl_func_norm(self, p_, dim, keepdim, c10::nullopt, result); - return result; -} - -Tensor& XPUNativeFunctions::norm_out( - const Tensor& self, - const std::optional& p, - IntArrayRef dim, - bool keepdim, - Tensor& result) { - auto p_ = - (p.has_value() ? at::OptionalScalarRef(&(p.value())) - : at::OptionalScalarRef()); - result = norm_scalaropt_dim_meta(self, p_, dim, keepdim, result); - impl_func_norm(self, p_, dim, keepdim, c10::nullopt, result); - return result; -} - -TensorIterator meta_aminmax( - const Tensor& self, - std::optional dim_opt, - bool keepdim, - Tensor& min, - Tensor& max) { - TensorIterator iter; - auto dtype = self.scalar_type(); - DimVector shape; - if (dim_opt.has_value()) { - auto dim = maybe_wrap_dim(dim_opt.value(), self.ndimension()); - native::zero_numel_check_dims(self, dim, "aminmax"); - shape = meta::get_reduction_shape(self, dim, keepdim); - iter = at::native::make_reduction( - "aminmax_xpu", min, max, self, dim, keepdim, dtype); - } else { - TORCH_CHECK( - self.numel() > 0, - "aminmax(): cannot compute aminmax over an empty dimension as the " - "operation has no identity."); - if (keepdim) { - shape = DimVector(self.ndimension(), 1); - } - iter = at::native::make_reduction( - "aminmax_xpu", - min, - max, - self.contiguous(), - IntArrayRef{}, - false, - dtype); - } - const auto options = self.options(); - iter.set_output_raw_strided( - 0, shape, {}, options, min.has_names() ? min.names() : DimnameList{}); - iter.set_output_raw_strided( - 1, shape, {}, options, max.has_names() ? 
max.names() : DimnameList{}); - return iter; -} - void aminmax_impl( const Tensor& self, - std::optional dim_opt, + int64_t dim_opt, bool keepdim, Tensor& min, Tensor& max) { - TensorIterator iter; - iter = meta_aminmax(self, dim_opt, keepdim, min, max); + auto dtype = self.scalar_type(); + TensorIterator iter = make_reduction( + "aminmax_xpu", min, max, self, dim_opt, keepdim, dtype); if (iter.numel() != 0) { native::xpu::aminmax_kernel(iter); } } void aminmax_allreduce_impl(const Tensor& self, Tensor& min, Tensor& max) { - TensorIterator iter; - iter = meta_aminmax(self, {}, false, min, max); + auto dtype = self.scalar_type(); + auto iter = make_reduction( + "aminmax_xpu", min, max, self, IntArrayRef{}, false, dtype); TORCH_CHECK( iter.numel() > 0, "min_max on a tensor with no elements is not defined."); native::xpu::aminmax_allreduce_kernel(iter); } -std::tuple XPUNativeFunctions::aminmax( - const Tensor& self, - std::optional dim_opt, - bool keepdim) { - Tensor min; - Tensor max; - return XPUNativeFunctions::aminmax_out(self, dim_opt, keepdim, min, max); -} - -std::tuple XPUNativeFunctions::aminmax_out( - const Tensor& self, - std::optional dim_opt, - bool keepdim, - Tensor& min, - Tensor& max) { - if (!min.defined()) { - min = native::create_reduction_result( - self, - dim_opt.has_value() ? dim_opt.value() : IntArrayRef{}, - false, - self.scalar_type()); - } - if (!max.defined()) { - max = native::create_reduction_result( - self, - dim_opt.has_value() ? dim_opt.value() : IntArrayRef{}, - false, - self.scalar_type()); - } - - TORCH_CHECK( - self.dtype() == min.dtype(), - "Expected out tensor to have dtype ", - self.dtype(), - ", but got ", - min.dtype(), - " instead"); - - TORCH_CHECK( - self.dtype() == max.dtype(), - "Expected out tensor to have dtype ", - self.dtype(), - ", but got ", - max.dtype(), - " instead"); - - if (dim_opt.has_value()) { - aminmax_impl( - self, - maybe_wrap_dim(dim_opt.value(), self.ndimension()), - keepdim, - min, - max); - } else { - aminmax_allreduce_impl(self.contiguous(), min, max); - } - return std::tuple(min, max); -} +REGISTER_XPU_DISPATCH(aminmax_stub, &aminmax_impl); +REGISTER_XPU_DISPATCH(aminmax_allreduce_stub, &aminmax_allreduce_impl) +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/ReflectionPad.cpp b/src/ATen/native/xpu/ReflectionPad.cpp index 2488ed229..a88151914 100644 --- a/src/ATen/native/xpu/ReflectionPad.cpp +++ b/src/ATen/native/xpu/ReflectionPad.cpp @@ -3,321 +3,35 @@ #include #include #include -#include -#include -namespace at { - -void reflection_pad1d_meta( - Tensor& output, - const Tensor& input, - IntArrayRef padding) { - int64_t dim_plane = 0; - int64_t dim_w = 1; - int64_t nbatch = 1; - - if (input.ndimension() == 3) { - nbatch = input.size(0); - dim_w++; - dim_plane++; - } - - at::native::padding::check_valid_input<1>(input, padding); - - /* sizes */ - auto pad_l = padding[0]; - auto pad_r = padding[1]; - - int64_t nplane = input.size(dim_plane); - int64_t input_w = input.size(dim_w); - int64_t output_w = input_w + pad_l + pad_r; - - TORCH_CHECK( - pad_l < input_w && pad_r < input_w, - "Argument #4: Padding size " - "should be less than the corresponding input dimension, but got: padding (", - pad_l, - ", ", - pad_r, - ") at dimension ", - dim_w, - " of input ", - input.sizes()); - - TORCH_CHECK( - output_w >= 1, - "input (W: ", - input_w, - ") is too small. 
Calculated output W: ", - output_w); - - if (output.defined()) { - if (input.ndimension() == 2) { - xpu::resize_out(output, {nplane, output_w}, {}, input.options()); - } else { - xpu::resize_out(output, {nbatch, nplane, output_w}, {}, input.options()); - } - } else { - if (input.ndimension() == 2) { - output = xpu::create_out({nplane, output_w}, {}, input.options()); - } else { - output = xpu::create_out({nbatch, nplane, output_w}, {}, input.options()); - } - } -} - -void reflection_pad1d_backward_meta( - Tensor& grad_input, - const Tensor& grad_output, - const Tensor& input, - IntArrayRef padding) { - int64_t dim_w = 1; - if (input.ndimension() == 3) { - dim_w++; - } - - /* sizes */ - auto pad_l = padding[0]; - auto pad_r = padding[1]; - int64_t input_w = input.size(dim_w); - int64_t output_w = input_w + pad_l + pad_r; - - TORCH_CHECK( - pad_l < input_w && pad_r < input_w, - "Argument #4: Padding size " - "should be less than the corresponding input dimension, but got: padding (", - pad_l, - ", ", - pad_r, - ") at dimension ", - dim_w, - " of input ", - input.sizes()); - - TORCH_CHECK( - output_w == grad_output.size(dim_w), - "grad_output width unexpected." - " Expected: ", - output_w, - ", Got: ", - grad_output.size(dim_w)); - - if (grad_input.defined()) { - xpu::resize_out(grad_input, input.sizes(), {}, input.options()); - } else { - grad_input = xpu::create_out(input.sizes(), {}, input.options()); - } -} - -void reflection_pad3d_meta( - Tensor& output, - const Tensor& input, - IntArrayRef padding) { - int64_t pad_left = padding[0]; - int64_t pad_right = padding[1]; - int64_t pad_top = padding[2]; - int64_t pad_bottom = padding[3]; - int64_t pad_front = padding[4]; - int64_t pad_back = padding[5]; - int64_t dim_w = 3; - int64_t dim_h = 2; - int64_t dim_d = 1; - int64_t dim_plane = 0; - - at::native::padding::check_valid_input<3>(input, padding); - - bool batch_mode = (input.dim() == 5); - if (batch_mode) { - dim_w++; - dim_h++; - dim_d++; - dim_plane++; - } - - int64_t nplane = input.size(dim_plane); - int64_t input_d = input.size(dim_d); - int64_t input_h = input.size(dim_h); - int64_t input_w = input.size(dim_w); - int64_t output_d = input_d + pad_front + pad_back; - int64_t output_h = input_h + pad_top + pad_bottom; - int64_t output_w = input_w + pad_left + pad_right; - - TORCH_CHECK( - pad_left < input_w && pad_right < input_w, - "Argument #4: Padding size " - "should be less than the corresponding input dimension, but got: padding (", - pad_left, - ", ", - pad_right, - ") at dimension ", - dim_w, - " of input ", - input.sizes()); - TORCH_CHECK( - pad_top < input_h && pad_bottom < input_h, - "Argument #6: Padding size " - "should be less than the corresponding input dimension, but got: padding (", - pad_top, - ", ", - pad_bottom, - ") at dimension ", - dim_h, - " of input ", - input.sizes()); - TORCH_CHECK( - pad_front < input_d && pad_back < input_d, - "Argument #8: Padding size " - "should be less than the corresponding input dimension, but got: padding (", - pad_front, - ", ", - pad_back, - ") at dimension ", - dim_d, - " of input ", - input.sizes()); - - TORCH_CHECK( - output_w >= 1 || output_h >= 1 || output_d >= 1, - "input (D: ", - input_d, - " H: ", - input_h, - ", W: ", - input_w, - ") is too small." 
- " Calculated output D: ", - output_d, - " H: ", - output_h, - " W: ", - output_w); - - if (output.defined()) { - if (batch_mode) { - xpu::resize_out( - output, - {input.size(0), nplane, output_d, output_h, output_w}, - {}, - input.options()); - } else { - xpu::resize_out( - output, {nplane, output_d, output_h, output_w}, {}, input.options()); - } - } else { - if (batch_mode) { - output = xpu::create_out( - {input.size(0), nplane, output_d, output_h, output_w}, - {}, - input.options()); - } else { - output = xpu::create_out( - {nplane, output_d, output_h, output_w}, {}, input.options()); - } - } -} - -void reflection_pad3d_backward_meta( - Tensor& grad_input, - const Tensor& grad_output, - const Tensor& input, - IntArrayRef padding) { - TORCH_CHECK(padding.size() == 6, "padding size is expected to be 6"); - TORCH_CHECK(input.dim() > 3); - TORCH_CHECK(grad_output.dim() == input.dim()); - - int64_t pad_left = padding[0]; - int64_t pad_right = padding[1]; - int64_t pad_top = padding[2]; - int64_t pad_bottom = padding[3]; - int64_t pad_front = padding[4]; - int64_t pad_back = padding[5]; - int64_t dim_w = 3; - int64_t dim_h = 2; - int64_t dim_d = 1; - - if (input.dim() == 5) { - // batch mode - dim_w++; - dim_h++; - dim_d++; - } - - int64_t input_d = input.size(dim_d); - int64_t input_h = input.size(dim_h); - int64_t input_w = input.size(dim_w); - int64_t output_d = input_d + pad_front + pad_back; - int64_t output_h = input_h + pad_top + pad_bottom; - int64_t output_w = input_w + pad_left + pad_right; - - TORCH_CHECK( - output_w == grad_output.size(dim_w), - "grad_output width unexpected." - " Expected: ", - output_w, - ", Got: ", - grad_output.size(dim_w)); - TORCH_CHECK( - output_h == grad_output.size(dim_h), - "grad_output height unexpected." - " Expected: ", - output_h, - ", Got: ", - grad_output.size(dim_h)); - TORCH_CHECK( - output_d == grad_output.size(dim_d), - "grad_output depth unexpected." 
- " Expected: ", - output_d, - ", Got: ", - grad_output.size(dim_d)); - - if (grad_input.defined()) { - xpu::resize_out(grad_input, input.sizes(), {}, input.options()); - } else { - grad_input = xpu::create_out(input.sizes(), {}, input.options()); - } -} - -Tensor XPUNativeFunctions::reflection_pad1d( - const Tensor& input, - IntArrayRef padding) { - Tensor output; - reflection_pad1d_meta(output, input, padding); - native::xpu::reflection_pad1d_kernel(output, input, padding); - return output; -} +#include +#include +#include +#include +#include +#include +#include +#include +#include "ATen/TensorMeta.h" -Tensor& XPUNativeFunctions::reflection_pad1d_out( - const Tensor& input, - IntArrayRef padding, - Tensor& output) { - reflection_pad1d_meta(output, input, padding); - native::xpu::reflection_pad1d_kernel(output, input, padding); - return output; -} +namespace at { +namespace native { -Tensor XPUNativeFunctions::reflection_pad1d_backward( - const Tensor& grad_output, - const Tensor& input, - IntArrayRef padding) { - Tensor grad_input; - reflection_pad1d_backward_meta(grad_input, grad_output, input, padding); - native::xpu::reflection_pad1d_backward_kernel( - grad_input, grad_output, input, padding); - return grad_input; +TORCH_IMPL_FUNC(reflection_pad1d_out_xpu) +(const Tensor& input_, IntArrayRef padding, const Tensor& output) { + xpu::reflection_pad1d_kernel(output, input_, padding); } -Tensor& XPUNativeFunctions::reflection_pad1d_backward_out( - const Tensor& grad_output, - const Tensor& input, - IntArrayRef padding, - Tensor& grad_input) { - native::xpu::reflection_pad1d_backward_kernel( +TORCH_IMPL_FUNC(reflection_pad1d_backward_out_xpu) +(const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + const Tensor& grad_input) { + xpu::reflection_pad1d_backward_kernel( grad_input, grad_output, input, padding); - return grad_input; } -Tensor& XPUNativeFunctions::reflection_pad2d_out( +Tensor& reflection_pad2d_out_xpu( const Tensor& input, IntArrayRef padding, Tensor& output) { @@ -325,15 +39,13 @@ Tensor& XPUNativeFunctions::reflection_pad2d_out( return output; } -Tensor XPUNativeFunctions::reflection_pad2d( - const Tensor& input, - IntArrayRef padding) { +Tensor reflection_pad2d_xpu(const Tensor& input, IntArrayRef padding) { auto output = at::empty({0}, input.options()); native::xpu::reflection_pad2d_kernel(output, input, padding); return output; } -Tensor& XPUNativeFunctions::reflection_pad2d_backward_out( +Tensor& reflection_pad2d_backward_out_xpu( const Tensor& grad_output, const Tensor& input, IntArrayRef padding, @@ -348,7 +60,7 @@ Tensor& XPUNativeFunctions::reflection_pad2d_backward_out( return grad_input; } -Tensor XPUNativeFunctions::reflection_pad2d_backward( +Tensor reflection_pad2d_backward_xpu( const Tensor& grad_output, const Tensor& input, IntArrayRef padding) { @@ -361,44 +73,19 @@ Tensor XPUNativeFunctions::reflection_pad2d_backward( return grad_input; } -Tensor XPUNativeFunctions::reflection_pad3d( - const Tensor& input, - IntArrayRef padding) { - Tensor output; - reflection_pad3d_meta(output, input, padding); - native::xpu::reflection_pad3d_kernel(output, input, padding); - return output; +TORCH_IMPL_FUNC(reflection_pad3d_out_xpu) +(const Tensor& input_, IntArrayRef padding, const Tensor& output) { + xpu::reflection_pad3d_kernel(output, input_, padding); } -Tensor& XPUNativeFunctions::reflection_pad3d_out( - const Tensor& input, - IntArrayRef padding, - Tensor& output) { - reflection_pad3d_meta(output, input, padding); - 
native::xpu::reflection_pad3d_kernel(output, input, padding); - return output; -} - -Tensor XPUNativeFunctions::reflection_pad3d_backward( - const Tensor& grad_output, - const Tensor& input, - at::IntArrayRef padding) { - Tensor grad_input; - reflection_pad3d_backward_meta(grad_input, grad_output, input, padding); - native::xpu::reflection_pad3d_backward_kernel( - grad_input, grad_output, input, padding); - return grad_input; -} - -Tensor& XPUNativeFunctions::reflection_pad3d_backward_out( - const Tensor& grad_output, - const Tensor& input, - IntArrayRef padding, - Tensor& grad_input) { - reflection_pad3d_backward_meta(grad_input, grad_output, input, padding); - native::xpu::reflection_pad3d_backward_kernel( +TORCH_IMPL_FUNC(reflection_pad3d_backward_out_xpu) +(const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + const Tensor& grad_input) { + xpu::reflection_pad3d_backward_kernel( grad_input, grad_output, input, padding); - return grad_input; } +} // namespace native } // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/Repeat.cpp b/src/ATen/native/xpu/Repeat.cpp index 38e5ae8da..c62348dd2 100644 --- a/src/ATen/native/xpu/Repeat.cpp +++ b/src/ATen/native/xpu/Repeat.cpp @@ -1,10 +1,13 @@ #include #include -#include + namespace at { -Tensor XPUNativeFunctions::repeat_interleave( +namespace native { +Tensor repeat_interleave_xpu( const Tensor& repeats, c10::optional output_size) { return at::native::xpu::repeat_interleave_kernel(repeats, output_size); } + +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/ReplicationPadding.cpp b/src/ATen/native/xpu/ReplicationPadding.cpp index 062d5bc1c..3f0093845 100644 --- a/src/ATen/native/xpu/ReplicationPadding.cpp +++ b/src/ATen/native/xpu/ReplicationPadding.cpp @@ -3,339 +3,82 @@ #include #include #include -#include -#include - -namespace at { - -void replication_pad1d_meta( - Tensor& output, - const Tensor& input, - IntArrayRef paddingSize) { - TORCH_CHECK(paddingSize.size() == 2, "padding size is expected to be 2"); - - int64_t dimw = 1; - int64_t dimslices = 0; - int64_t nbatch = 1; - - int64_t pad_l = paddingSize[0]; - int64_t pad_r = paddingSize[1]; - - at::native::padding::check_valid_input<1>(input, paddingSize); - - if (input.ndimension() == 3) { - nbatch = input.size(0); - dimw++; - dimslices++; - } - - /* sizes */ - int64_t nslices = input.size(dimslices); - int64_t iwidth = input.size(dimw); - int64_t owidth = iwidth + pad_l + pad_r; - - TORCH_CHECK( - owidth >= 1, - "input (W: ", - iwidth, - ") is too small." 
- " Calculated output W: ", - owidth); - - if (output.defined()) { - if (input.ndimension() == 2) { - xpu::resize_out(output, {nslices, owidth}, {}, input.options()); - } else { - xpu::resize_out(output, {nbatch, nslices, owidth}, {}, input.options()); - } - } else { - if (input.ndimension() == 2) { - output = xpu::create_out({nslices, owidth}, {}, input.options()); - } else { - output = xpu::create_out({nbatch, nslices, owidth}, {}, input.options()); - } - } -} - -void replication_pad1d_backward_meta( - Tensor& grad_input, - const Tensor& grad_output, - const Tensor& input, - IntArrayRef paddingSize) { - int64_t dimw = 1; - TORCH_CHECK(paddingSize.size() == 2, "padding size is expected to be 2"); - int64_t pad_l = paddingSize[0]; - int64_t pad_r = paddingSize[1]; - - if (input.ndimension() == 3) { - dimw++; - } - - /* sizes */ - int64_t iwidth = input.size(dimw); - int64_t owidth = iwidth + pad_l + pad_r; - - TORCH_CHECK( - owidth == grad_output.size(dimw), - "grad_output width unexpected. Expected: ", - owidth, - " Got: ", - grad_output.size(dimw)); - - if (grad_input.defined()) { - xpu::resize_out(grad_input, input.sizes(), {}, input.options()); - } else { - grad_input = xpu::create_out(input.sizes(), {}, input.options()); - } -} - -void replication_pad2d_meta( - Tensor& output, - const Tensor& input, - IntArrayRef paddingSize) { - TORCH_CHECK(paddingSize.size() == 4, "padding size is expected to be 4"); - int64_t pad_l = paddingSize[0]; - int64_t pad_r = paddingSize[1]; - int64_t pad_t = paddingSize[2]; - int64_t pad_b = paddingSize[3]; - int64_t dimw = 2; - int64_t dimh = 1; - int64_t dimslices = 0; - int64_t nbatch = 1; - - at::native::padding::check_valid_input<2>(input, paddingSize); - - if (input.dim() == 4) { - nbatch = input.size(0); - dimw++; - dimh++; - dimslices++; - } - - /* sizes */ - int64_t nslices = input.size(dimslices); - int64_t iheight = input.size(dimh); - int64_t iwidth = input.size(dimw); - int64_t oheight = iheight + pad_t + pad_b; - int64_t owidth = iwidth + pad_l + pad_r; - - TORCH_CHECK( - owidth >= 1 || oheight >= 1, - "input (H: ", - iheight, - ", W: ", - iwidth, - " ) is too small." 
- " Calculated output H: ", - oheight, - " W: ", - owidth); - - if (output.defined()) { - if (input.dim() == 3) { - xpu::resize_out(output, {nslices, oheight, owidth}, {}, input.options()); - } else { - xpu::resize_out( - output, {nbatch, nslices, oheight, owidth}, {}, input.options()); - } - } else { - if (input.dim() == 3) { - output = xpu::create_out({nslices, oheight, owidth}, {}, input.options()); - } else { - output = xpu::create_out( - {nbatch, nslices, oheight, owidth}, {}, input.options()); - } - } -} - -void replication_pad3d_meta( - Tensor& output, - const Tensor& input, - IntArrayRef paddingSize) { - TORCH_CHECK(paddingSize.size() == 6, "padding size is expected to be 6"); - int64_t pleft = paddingSize[0]; - int64_t pright = paddingSize[1]; - int64_t ptop = paddingSize[2]; - int64_t pbottom = paddingSize[3]; - int64_t pfront = paddingSize[4]; - int64_t pback = paddingSize[5]; - int64_t dimw = 3; - int64_t dimh = 2; - int64_t dimd = 1; - int64_t dimslices = 0; - int64_t nbatch = 1; - - at::native::padding::check_valid_input<3>(input, paddingSize); - - if (input.dim() == 5) { - nbatch = input.size(0); - dimw++; - dimh++; - dimd++; - dimslices++; - } - - /* sizes */ - int64_t nslices = input.size(dimslices); - int64_t idepth = input.size(dimd); - int64_t iheight = input.size(dimh); - int64_t iwidth = input.size(dimw); - int64_t odepth = idepth + pfront + pback; - int64_t oheight = iheight + ptop + pbottom; - int64_t owidth = iwidth + pleft + pright; - TORCH_CHECK( - owidth >= 1 || oheight >= 1 || odepth >= 1, - "input (D: ", - idepth, - " H: ", - iheight, - ", W: ", - iwidth, - ") is too small." - " Calculated output D: ", - odepth, - " H: ", - oheight, - " W: ", - owidth); +#include - if (output.defined()) { - if (input.dim() == 4) { - xpu::resize_out( - output, {nslices, odepth, oheight, owidth}, {}, input.options()); - } else { - xpu::resize_out( - output, - {nbatch, nslices, odepth, oheight, owidth}, - {}, - input.options()); - } - } else { - if (input.dim() == 4) { - output = xpu::create_out( - {nslices, odepth, oheight, owidth}, {}, input.options()); - } else { - output = xpu::create_out( - {nbatch, nslices, odepth, oheight, owidth}, {}, input.options()); - } - } -} +#include +#include +#include +#include +#include +#include -Tensor XPUNativeFunctions::replication_pad1d( - const Tensor& input, - IntArrayRef padding) { - Tensor output; - replication_pad1d_meta(output, input, padding); - native::xpu::replication_pad1d_kernel(output, input, padding); - return output; -} +namespace at { +namespace native { -Tensor& XPUNativeFunctions::replication_pad1d_out( - const Tensor& input, - IntArrayRef padding, - Tensor& output) { - replication_pad1d_meta(output, input, padding); - native::xpu::replication_pad1d_kernel(output, input, padding); - return output; +TORCH_IMPL_FUNC(replication_pad1d_out_xpu) +(const Tensor& input, IntArrayRef paddingSize, const Tensor& output) { + xpu::replication_pad1d_kernel(output, input, paddingSize); } -Tensor XPUNativeFunctions::replication_pad1d_backward( - const Tensor& grad_output, - const Tensor& input, - IntArrayRef padding) { - Tensor grad_input; - replication_pad1d_backward_meta(grad_input, grad_output, input, padding); - native::xpu::replication_pad1d_backward_kernel( - grad_input, grad_output, input, padding); - return grad_input; +TORCH_IMPL_FUNC(replication_pad1d_backward_out_xpu) +(const Tensor& gradOutput, + const Tensor& input, + IntArrayRef paddingSize, + const Tensor& gradInput) { + xpu::replication_pad1d_backward_kernel( + gradInput, 
gradOutput, input, paddingSize); } -Tensor& XPUNativeFunctions::replication_pad1d_backward_out( - const Tensor& grad_output, - const Tensor& input, - IntArrayRef padding, - Tensor& grad_input) { - replication_pad1d_backward_meta(grad_input, grad_output, input, padding); - native::xpu::replication_pad1d_backward_kernel( - grad_input, grad_output, input, padding); - return grad_input; -} - -Tensor& XPUNativeFunctions::replication_pad2d_out( - const Tensor& input, - IntArrayRef padding, - Tensor& output) { - replication_pad2d_meta(output, input, padding); - native::xpu::replication_pad2d_kernel(output, input, padding); - return output; -} - -Tensor XPUNativeFunctions::replication_pad2d( - const Tensor& input, - IntArrayRef padding) { - Tensor output; - replication_pad2d_meta(output, input, padding); - native::xpu::replication_pad2d_kernel(output, input, padding); - return output; +TORCH_IMPL_FUNC(replication_pad2d_out_xpu) +(const Tensor& input, IntArrayRef paddingSize, const Tensor& output) { + xpu::replication_pad2d_kernel(output, input, paddingSize); } -Tensor& XPUNativeFunctions::replication_pad2d_backward_out( +Tensor& replication_pad2d_backward_out_xpu( const Tensor& grad_output, const Tensor& input, IntArrayRef padding, Tensor& grad_input) { - native::xpu::replication_pad2d_backward_kernel( + xpu::replication_pad2d_backward_kernel( grad_input, grad_output, input, padding); return grad_input; } -Tensor XPUNativeFunctions::replication_pad2d_backward( +Tensor replication_pad2d_backward_xpu( const Tensor& grad_output, const Tensor& input, IntArrayRef padding) { auto grad_input = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - native::xpu::replication_pad2d_backward_kernel( + xpu::replication_pad2d_backward_kernel( grad_input, grad_output, input, padding); return grad_input; } -Tensor XPUNativeFunctions::replication_pad3d( - const Tensor& input, - IntArrayRef padding) { - Tensor output; - replication_pad3d_meta(output, input, padding); - native::xpu::replication_pad3d_kernel(output, input, padding); - return output; -} - -Tensor& XPUNativeFunctions::replication_pad3d_out( - const Tensor& input, - IntArrayRef padding, - Tensor& output) { - replication_pad3d_meta(output, input, padding); - native::xpu::replication_pad3d_kernel(output, input, padding); - return output; +TORCH_IMPL_FUNC(replication_pad3d_out_xpu) +(const Tensor& input, IntArrayRef paddingSize, const Tensor& output) { + xpu::replication_pad3d_kernel(output, input, paddingSize); } -Tensor XPUNativeFunctions::replication_pad3d_backward( +Tensor replication_pad3d_backward_xpu( const Tensor& grad_output, const Tensor& input, at::IntArrayRef padding) { auto grad_input = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - native::xpu::replication_pad3d_backward_kernel( + xpu::replication_pad3d_backward_kernel( grad_input, grad_output, input, padding); return grad_input; } -Tensor& XPUNativeFunctions::replication_pad3d_backward_out( +Tensor& replication_pad3d_backward_out_xpu( const Tensor& grad_output, const Tensor& input, IntArrayRef padding, Tensor& grad_input) { - native::xpu::replication_pad3d_backward_kernel( + xpu::replication_pad3d_backward_kernel( grad_input, grad_output, input, padding); return grad_input; } +} // namespace native } // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/Resize.cpp b/src/ATen/native/xpu/Resize.cpp index 719b7ea84..66c95302b 100644 --- a/src/ATen/native/xpu/Resize.cpp +++ b/src/ATen/native/xpu/Resize.cpp @@ -1,21 +1,25 @@ -#include #include #include 
#include -#include #include +#include #include -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else -#include -#endif +#include +#include +#include +#include #include namespace at { + +namespace native { +const at::Tensor& resize_( + const at::Tensor& self, + at::IntArrayRef size, + ::std::optional memory_format = ::std::nullopt); +} namespace native::xpu { const Tensor& resize_xpu_( @@ -50,7 +54,7 @@ const Tensor& resize_as_( const Tensor& self, const Tensor& the_template, c10::optional optional_memory_format = c10::nullopt) { - return resize_(self, the_template.sizes(), optional_memory_format); + return resize_xpu_(self, the_template.sizes(), optional_memory_format); } Tensor _copy_from_and_resize(const at::Tensor& self, const at::Tensor& dst) { @@ -60,17 +64,15 @@ Tensor _copy_from_and_resize(const at::Tensor& self, const at::Tensor& dst) { } else { at::native::resize_(dst, self.sizes()); } - return at::XPUNativeFunctions::copy_(const_cast(dst), self, false); + return const_cast(dst.copy_(self, false)); } -// For test infrastructure Tensor _copy_from(const Tensor& self, const Tensor& dst, bool non_blocking) { dst.resize_as_(self); - return at::XPUNativeFunctions::copy_( - const_cast(dst), self, non_blocking); + return const_cast(dst.copy_(self, non_blocking)); } -// Should not register the operator. Desc of +// Should not register the operator. Desc of resize_as_ and // _copy_from_and_resize native_function.yaml is simplistic since PyTorch // intends backend should not register it (e.g. CPU/CUDA) or handle // sanity check by backend (e.g. MPS). @@ -80,23 +82,18 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { TORCH_FN(_copy_from_and_resize)); m.impl(TORCH_SELECTIVE_NAME("aten::_copy_from"), TORCH_FN(_copy_from)); } - } // namespace native::xpu -const at::Tensor& XPUNativeFunctions::resize_( +namespace native { + +const at::Tensor& resize_xpu_( const at::Tensor& self, at::IntArrayRef size, c10::optional memory_format) { return native::xpu::resize_xpu_(self, size, memory_format); } -Tensor& XPUNativeFunctions::set_(Tensor& self, Storage source) { - int64_t new_size = - static_cast(source.nbytes() / self.dtype().itemsize()); - return self.set_(source, 0, new_size, {}); -} - -Tensor& XPUNativeFunctions::set_( +Tensor& set_storage_xpu_( Tensor& self, Storage source, int64_t storage_offset, @@ -112,16 +109,12 @@ Tensor& XPUNativeFunctions::set_( return self; } -Tensor& XPUNativeFunctions::set_(Tensor& self, const at::Tensor& source) { - return at::native::set_tensor_(self, source); -} - -Tensor& XPUNativeFunctions::set_(Tensor& result) { +Tensor& set_xpu_(Tensor& result) { caffe2::TypeMeta dtype = result.dtype(); Storage storage(Storage::use_byte_size_t(), 0, c10::GetAllocator(kXPU), true); result.set_(storage, 0, {0}, {}); TORCH_INTERNAL_ASSERT(dtype == result.dtype()); return result; } - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/SoftMax.cpp b/src/ATen/native/xpu/SoftMax.cpp index 3c469b23d..95824577a 100644 --- a/src/ATen/native/xpu/SoftMax.cpp +++ b/src/ATen/native/xpu/SoftMax.cpp @@ -1,148 +1,30 @@ -#include + #include #include #include -#include #include - -namespace at { - -Tensor& _softmax_meta( - const Tensor& input, - const int64_t dim, - const bool half_to_float, - Tensor& out) { - int64_t dim_ = maybe_wrap_dim(dim, input.dim()); - - auto output_options = - input.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT); - - if (half_to_float) { - output_options = output_options.dtype(ScalarType::Float); - } - - int64_t input_dim = input.dim() > 0 ? 
input.dim() : 1; - TORCH_CHECK( - dim_ >= 0 && dim_ < input_dim, - "dim must be non-negative and less than input dimensions"); - - if (out.defined()) { - xpu::resize_out(out, input.sizes(), {}, output_options); - } else { - out = xpu::create_out(input.sizes(), {}, output_options); - } - - return out; -} - -Tensor XPUNativeFunctions::_softmax( - const Tensor& self, - int64_t dim, - bool half_to_float) { - Tensor out; - out = _softmax_meta(self, dim, half_to_float, out); - native::xpu::_softmax_kernel(self, dim, half_to_float, out); - return out; -} - -Tensor& XPUNativeFunctions::_softmax_out( - const Tensor& self, - int64_t dim, - bool half_to_float, - Tensor& out) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, out, "xpu::_softmax_out_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::_softmax_out_out", "self"); - out = _softmax_meta(self, dim, half_to_float, out); - return native::xpu::_softmax_kernel(self, dim, half_to_float, out); +#include + +#include +#include +#include +#include +namespace at::native { + +TORCH_IMPL_FUNC(softmax_xpu_out) +(const Tensor& input, + const int64_t dim, + const bool half_to_float, + const Tensor& output) { + xpu::_softmax_kernel(input, dim, half_to_float, output); } -Tensor XPUNativeFunctions::_log_softmax( - const Tensor& self, - int64_t dim, - bool half_to_float) { - Tensor out; - out = _softmax_meta(self, dim, half_to_float, out); - native::xpu::_log_softmax_kernel(self, dim, half_to_float, out); - return out; -} - -Tensor& XPUNativeFunctions::_log_softmax_out( - const Tensor& self, - int64_t dim, - bool half_to_float, - Tensor& out) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, out, "xpu::_log_softmax_out_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::_log_softmax_out_out", "self"); - out = _softmax_meta(self, dim, half_to_float, out); - return native::xpu::_log_softmax_kernel(self, dim, half_to_float, out); -} - -Tensor& _softmax_backward_data_meta( - const Tensor& grad, - const Tensor& output, - int64_t dim, - ScalarType input_dtype, - Tensor& grad_input) { - TensorArg grad_arg{grad, "grad", 1}, output_arg{output, "output", 2}; - checkSameSize("softmax_backward", grad_arg, output_arg); - - int64_t dim_ = maybe_wrap_dim(dim, grad.dim()); - - auto grad_input_options = - grad.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT); - - bool half_to_float = grad.scalar_type() != input_dtype; - if (half_to_float) { - if (grad.scalar_type() == ScalarType::Float && - input_dtype == ScalarType::Half) { - grad_input_options = grad_input_options.dtype(ScalarType::Half); - } - } - - int64_t grad_dim = grad.dim() > 0 ? 
grad.dim() : 1; - TORCH_CHECK( - dim_ >= 0 && dim_ < grad_dim, - "dim must be non-negative and less than input dimensions"); - - if (grad_input.defined()) { - xpu::resize_out(grad_input, grad.sizes(), {}, grad_input_options); - } else { - grad_input = xpu::create_out(grad.sizes(), {}, grad_input_options); - } - - return grad_input; -} - -Tensor XPUNativeFunctions::_softmax_backward_data( - const at::Tensor& grad_output, - const at::Tensor& output, - int64_t dim, - at::ScalarType input_dtype) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, grad_output, "xpu::_softmax_backward_data", "grad_output"); - c10::impl::check_and_update_common_device( - common_device, output, "xpu::_softmax_backward_data", "output"); - Tensor grad_input; - grad_input = _softmax_backward_data_meta( - grad_output, output, dim, input_dtype, grad_input); - native::xpu::_softmax_backward_kernel( - grad_output, output, dim, false, grad_input); - return grad_input; -} - -Tensor& XPUNativeFunctions::_softmax_backward_data_out( - const at::Tensor& grad_output, - const at::Tensor& output, - int64_t dim, - at::ScalarType input_dtype, - Tensor& grad_input) { +TORCH_IMPL_FUNC(softmax_backward_xpu_out) +(const Tensor& grad, + const Tensor& output, + int64_t dim, + ScalarType input_dtype, + const Tensor& grad_input) { std::optional common_device = std::nullopt; c10::impl::check_and_update_common_device( common_device, @@ -151,44 +33,21 @@ Tensor& XPUNativeFunctions::_softmax_backward_data_out( "grad_input"); c10::impl::check_and_update_common_device( common_device, - grad_output, + output, "xpu::_softmax_backward_data_out_out", "grad_output"); c10::impl::check_and_update_common_device( common_device, output, "xpu::_softmax_backward_data_out_out", "output"); - grad_input = _softmax_backward_data_meta( - grad_output, output, dim, input_dtype, grad_input); - return native::xpu::_softmax_backward_kernel( - grad_output, output, dim, false, grad_input); -} -Tensor XPUNativeFunctions::_log_softmax_backward_data( - const at::Tensor& grad_output, - const at::Tensor& output, - int64_t dim, - at::ScalarType input_dtype) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, - grad_output, - "xpu::_log_softmax_backward_data", - "grad_output"); - c10::impl::check_and_update_common_device( - common_device, output, "xpu::_log_softmax_backward_data", "output"); - Tensor grad_input; - grad_input = _softmax_backward_data_meta( - grad_output, output, dim, input_dtype, grad_input); - native::xpu::_log_softmax_backward_kernel( - grad_output, output, dim, false, grad_input); - return grad_input; + native::xpu::_softmax_backward_kernel(grad, output, dim, false, grad_input); } -Tensor& XPUNativeFunctions::_log_softmax_backward_data_out( - const at::Tensor& grad_output, - const at::Tensor& output, - int64_t dim, - at::ScalarType input_dtype, - Tensor& grad_input) { +TORCH_IMPL_FUNC(log_softmax_backward_xpu_out) +(const Tensor& grad, + const Tensor& output, + int64_t dim, + ScalarType input_dtype, + const Tensor& grad_input) { std::optional common_device = std::nullopt; c10::impl::check_and_update_common_device( common_device, @@ -197,7 +56,7 @@ Tensor& XPUNativeFunctions::_log_softmax_backward_data_out( "grad_input"); c10::impl::check_and_update_common_device( common_device, - grad_output, + output, "xpu::_log_softmax_backward_data_out_out", "grad_output"); c10::impl::check_and_update_common_device( @@ -205,10 +64,16 @@ Tensor& 
XPUNativeFunctions::_log_softmax_backward_data_out( output, "xpu::_log_softmax_backward_data_out_out", "output"); - grad_input = _softmax_backward_data_meta( - grad_output, output, dim, input_dtype, grad_input); - return native::xpu::_log_softmax_backward_kernel( - grad_output, output, dim, false, grad_input); + native::xpu::_log_softmax_backward_kernel( + grad, output, dim, false, grad_input); +} + +TORCH_IMPL_FUNC(log_softmax_xpu_out) +(const Tensor& input, + const int64_t dim, + const bool half_to_float, + const Tensor& output) { + xpu::_log_softmax_kernel(input, dim, half_to_float, output); } -} // namespace at +} // namespace at::native diff --git a/src/ATen/native/xpu/Sorting.cpp b/src/ATen/native/xpu/Sorting.cpp index e934347c2..5e7bbc0cb 100644 --- a/src/ATen/native/xpu/Sorting.cpp +++ b/src/ATen/native/xpu/Sorting.cpp @@ -1,81 +1,21 @@ -#include + #include #include +#include #include +#include #include + #include -#include -#include #include +#include -namespace at { - -void sort_stable_meta( - const Tensor& self, - Tensor& values, - Tensor& indices, - int64_t dim) { - maybe_wrap_dim(dim, self.dim()); - - // See issue: https://github.com/pytorch/pytorch/issues/65863 - // Strides should be dense, so as not to allocate too much memory. - // We either use 'self' strides, or infer dense strides from them. - std::vector strides = (self.is_non_overlapping_and_dense()) - ? self.strides().vec() - : at::infer_dense_strides(self.sizes(), self.strides()); - auto sizes = self.sizes(); - if (values.defined()) { - at::xpu::resize_out(values, sizes, strides, self.options()); - } else { - values = at::xpu::create_out(sizes, strides, self.options()); - } - if (indices.defined()) { - at::xpu::resize_out(indices, sizes, strides, self.options().dtype(kLong)); - } else { - indices = at::xpu::create_out(sizes, strides, self.options().dtype(kLong)); - } -} - -::std::tuple XPUNativeFunctions::sort( - const Tensor& self, - ::std::optional stable, - int64_t dim, - bool descending) { - Tensor values, indices; - sort_stable_meta(self, values, indices, dim); - return native::xpu::sort_stable_kernel( - self, stable, values, indices, dim, descending); -} +#include +#include -::std::tuple XPUNativeFunctions::sort_out( - const Tensor& self, - ::std::optional stable, - int64_t dim, - bool descending, - Tensor& values, - Tensor& indices) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, values, "xpu::sort_out_values_stable", "values"); - c10::impl::check_and_update_common_device( - common_device, indices, "xpu::sort_out_values_stable", "indices"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::sort_out_values_stable", "self"); - sort_stable_meta(self, values, indices, dim); - return native::xpu::sort_stable_kernel( - self, stable, values, indices, dim, descending); -} - -Tensor XPUNativeFunctions::argsort( - const Tensor& self, - bool stable, - int64_t dim, - bool descending) { - Tensor values, indices; - sort_stable_meta(self, values, indices, dim); - return std::get<1>(native::xpu::sort_stable_kernel( - self, stable, values, indices, dim, descending)); -} +namespace at { +namespace native { +REGISTER_XPU_DISPATCH(sort_stub, xpu::sort_stable_kernel); std::tuple median_with_indices_impl( Tensor& values, @@ -161,7 +101,7 @@ Tensor median_impl(const Tensor& self, bool ignore_nan) { } } -std::tuple XPUNativeFunctions::median_out( +std::tuple median_out_xpu( const Tensor& self, int64_t dim, bool keepdim, @@ -171,11 +111,11 @@ 
std::tuple XPUNativeFunctions::median_out( values, indices, self, dim, keepdim, /*ignore_nan=*/false); } -Tensor XPUNativeFunctions::median(const Tensor& self) { +Tensor median_xpu(const Tensor& self) { return median_impl(self, /*ignore_nan=*/false); } -std::tuple XPUNativeFunctions::nanmedian_out( +std::tuple nanmedian_out_xpu( const Tensor& self, int64_t dim, bool keepdim, @@ -185,8 +125,9 @@ std::tuple XPUNativeFunctions::nanmedian_out( values, indices, self, dim, keepdim, /*ignore_nan=*/true); } -Tensor XPUNativeFunctions::nanmedian(const Tensor& self) { +Tensor nanmedian_xpu(const Tensor& self) { return median_impl(self, /*ignore_nan=*/true); } -} // namespace at +} // namespace native +} // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/SummaryOps.cpp b/src/ATen/native/xpu/SummaryOps.cpp index cf4cf3f27..87ee9f74a 100644 --- a/src/ATen/native/xpu/SummaryOps.cpp +++ b/src/ATen/native/xpu/SummaryOps.cpp @@ -1,9 +1,10 @@ #include -#include #include +#include namespace at { -Tensor XPUNativeFunctions::bincount( +namespace native { +Tensor _bincount_xpu( const Tensor& self, const c10::optional& weights_opt, int64_t minlength) { @@ -20,5 +21,6 @@ Tensor XPUNativeFunctions::bincount( return native::xpu::bincount_kernel(self, weights, minlength); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/TensorAdvancedIndexing.cpp b/src/ATen/native/xpu/TensorAdvancedIndexing.cpp index 99d50acbf..69c09804d 100644 --- a/src/ATen/native/xpu/TensorAdvancedIndexing.cpp +++ b/src/ATen/native/xpu/TensorAdvancedIndexing.cpp @@ -1,131 +1,64 @@ -#include #include #include #include #include +#include #include #include -#include -#include -#include -#include +#include +#include #include #include #include #include #include -#include -#include +#include +#include +#include -namespace at { - -using namespace at::native; -using namespace at::native::xpu; - -// TODO: Should reuse source in stock PyTorch when in-tree. - -static bool all_strides_match(TensorList tensors) { - TORCH_CHECK(!tensors.empty()); - auto strides = tensors[0].strides(); - for (auto& tensor : tensors.slice(1)) { - if (!strides.equals(tensor.strides())) { - return false; - } - } - return true; -} - -// Replace indexed dimensions in src with stride 0 and the size of the result -// tensor. The offset in these dimensions is computed by the kernel using the -// index tensor's values and the stride of src. The new shape is not meaningful. -// It's used to make the shape compatible with the result tensor. -static Tensor restride_src( - const Tensor& src, - int64_t dims_before, - int64_t dims_indexed, - IntArrayRef replacement_shape) { - auto shape = DimVector(src.sizes()); - auto strides = DimVector(src.strides()); - int64_t end = dims_before + dims_indexed; - shape.erase(shape.begin() + dims_before, shape.begin() + end); - strides.erase(strides.begin() + dims_before, strides.begin() + end); - shape.insert( - shape.begin() + dims_before, - replacement_shape.begin(), - replacement_shape.end()); - strides.insert(strides.begin() + dims_before, replacement_shape.size(), 0); - return src.as_strided(shape, strides); -} - -// Add dimensions of size 1 to an index tensor so that it can be broadcast to -// the result shape and iterated over element-wise like the result tensor and -// the restrided src. 
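[Editor's note: the comment above describes the job of the reshape_indexer helper removed just below. As a rough stand-alone sketch of that shape arithmetic, assuming plain std::vector shapes rather than tensors and a hypothetical pad_index_shape name (not the removed helper itself): prepend dims_before singleton dimensions and append dims_after singleton dimensions so the index broadcasts against the result.]

#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> pad_index_shape(const std::vector<int64_t>& index_shape,
                                     int64_t dims_before,
                                     int64_t dims_after) {
  std::vector<int64_t> shape(dims_before, 1);                       // leading 1s
  shape.insert(shape.end(), index_shape.begin(), index_shape.end()); // original index dims
  shape.insert(shape.end(), dims_after, 1);                         // trailing 1s
  return shape;
}

int main() {
  // An index of shape [5, 3] placed after 1 un-indexed dim and before 2
  // un-indexed dims becomes [1, 5, 3, 1, 1].
  for (int64_t d : pad_index_shape({5, 3}, /*dims_before=*/1, /*dims_after=*/2))
    std::cout << d << ' ';
  std::cout << '\n';
}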
-static Tensor reshape_indexer( - const Tensor& index, - int64_t dims_before, - int64_t dims_after) { - auto orig_shape = index.sizes(); - auto shape = DimVector(); - shape.append(dims_before, 1); - shape.append(orig_shape.begin(), orig_shape.end()); - shape.append(dims_after, 1); - return index.reshape(shape); -} - -native::AdvancedIndex::AdvancedIndex( - const Tensor& src, - TensorList indices_list) { - int64_t element_size_bytes = src.element_size(); - int64_t dims_before = 0, dims_after = 0, dims_indexed = 0; - IntArrayRef replacement_shape; - for (const auto dim : c10::irange(indices_list.size())) { - if (!indices_list[dim].defined()) { - if (dims_indexed == 0) { - dims_before++; - } else { - dims_after++; - } - } else { - dims_indexed++; - replacement_shape = indices_list[dim].sizes(); - indexed_sizes.push_back(src.size(dim)); - indexed_strides.push_back(src.stride(dim) * element_size_bytes); - } - } - - // Check if the indexed subspace contains a dim of size 0, but the replacement - // shape does not. This implies that an index is out of bounds, because there - // is no number that's a valid index for an empty tensor. Normally, out of - // bounds is handled in the indexing kernel, but this case fails earlier in - // restride_src with an unhelpful error message. - if (std::find(indexed_sizes.begin(), indexed_sizes.end(), 0) != - indexed_sizes.end() && - std::find(replacement_shape.begin(), replacement_shape.end(), 0) == - replacement_shape.end()) { - TORCH_CHECK_INDEX( - false, "index is out of bounds for dimension with size 0"); - } +#include +#include - this->dims_before = dims_before; - this->dims_after = dims_after; - this->src = restride_src(src, dims_before, dims_indexed, replacement_shape); - - for (auto& index : indices_list) { - if (index.defined()) { - indices.push_back(reshape_indexer(index, dims_before, dims_after)); - } - } +namespace at { - if (indices.size() >= 2 && (this->src.device().type() == kXPU)) { - if (!all_strides_match(indices)) { - for (auto& indice : indices) { - indice = indice.contiguous(); - } - } - } +namespace native { +REGISTER_XPU_DISPATCH(index_stub, &xpu::index_kernel); +REGISTER_XPU_DISPATCH(index_put_stub, &xpu::index_put_kernel); +REGISTER_XPU_DISPATCH( + index_put_with_sort_stub, + &xpu::index_put_deterministic_kernel); +// REGISTER_XPU_DISPATCH(index_stub, &xpu::index_kernel); +REGISTER_XPU_DISPATCH(scatter_stub, &xpu::scatter_kernel); +REGISTER_XPU_DISPATCH(scatter_fill_stub, &xpu::scatter_fill_kernel); +REGISTER_XPU_DISPATCH(scatter_add_stub, &xpu::scatter_add_kernel); +REGISTER_XPU_DISPATCH(scatter_reduce_stub, &xpu::scatter_reduce_kernel); +REGISTER_XPU_DISPATCH(scatter_reduce_two_stub, &xpu::scatter_reduce_two_kernel); +REGISTER_XPU_DISPATCH( + scatter_scalar_reduce_stub, + &xpu::scatter_scalar_reduce_kernel); +REGISTER_XPU_DISPATCH(gather_stub, &xpu::gather_kernel); +REGISTER_XPU_DISPATCH(index_fill_stub, &xpu::index_fill_kernel); + +TORCH_IMPL_FUNC(index_add_xpu_out) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& source, + const Scalar& alpha, + const Tensor& result) { + std::optional common_device = std::nullopt; + c10::impl::check_and_update_common_device( + common_device, self, "xpu::index_add_out", "self"); + c10::impl::check_and_update_common_device( + common_device, index, "xpu::index_add_out", "index"); + c10::impl::check_and_update_common_device( + common_device, source, "xpu::index_add_out", "source"); + dim = maybe_wrap_dim(dim, self.dim()); + // index_func_meta_impl(result, self, dim, index, 
source, "index_add"); + native::xpu::index_add_kernel(self, dim, index, source, alpha, result); } -Tensor& XPUNativeFunctions::masked_fill_( +Tensor& masked_fill__xpu( Tensor& self, const Tensor& mask, const Scalar& value) { @@ -160,12 +93,12 @@ Tensor& XPUNativeFunctions::masked_fill_( .add_const_input(*b_mask) .build(); - native::xpu::masked_fill_kernel(iter, value); + xpu::masked_fill_kernel(iter, value); namedinference::propagate_names_if_nonempty(self, maybe_outnames); return self; } -Tensor& XPUNativeFunctions::masked_fill_( +Tensor& masked_fill__xpu( Tensor& self, const Tensor& mask, const Tensor& value) { @@ -182,1278 +115,12 @@ Tensor& XPUNativeFunctions::masked_fill_( TORCH_CHECK( self.device().is_xpu(), "masked_fill_: Expected inputs to be on same device") - return XPUNativeFunctions::masked_fill_(self, mask, value.item()); -} - -void index_func_meta_impl( - Tensor& result, - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& source, - c10::string_view func) { - auto numel = index.numel(); - - TORCH_CHECK_INDEX( - index.dim() <= 1, - func, - "_(): Index is supposed to be a vector, but got dim: ", - index.dim(), - " with type: ", - index.scalar_type(), - " and size: ", - index.sizes()); - TORCH_CHECK( - index.scalar_type() == ScalarType::Long || - index.scalar_type() == ScalarType::Int, - func, - "_(): Expected dtype int32/int64 for index but got: ", - index.scalar_type()); - TORCH_CHECK( - self.scalar_type() == source.scalar_type(), - func, - "_(): self (", - self.scalar_type(), - ") and source (", - source.scalar_type(), - ") must have the same scalar type"); - TORCH_CHECK( - dim == 0 || dim < source.dim(), - func, - "_(): Indexing dim ", - dim, - " is out of bounds of the source tensor with dim ", - source.dim()); - TORCH_CHECK( - numel == (source.dim() == 0 ? 1 : source.size(dim)), - func, - "_(): Number of indices (", - numel, - ") should be equal to source.size(dim): (", - source.size(dim), - "), for dim: ", - dim); - - auto self_sizes = self.sizes().vec(); - auto source_sizes = source.sizes().vec(); - if (source.dim() != 0 && self.dim() != 0) { - self_sizes.erase(self_sizes.begin() + dim); - source_sizes.erase(source_sizes.begin() + dim); - } - TORCH_CHECK( - self_sizes == source_sizes, - "source tensor shape must match self tensor shape, excluding the specified dimension. Got self.shape = ", - self.sizes(), - " source.shape = ", - source.sizes()); - - bool is_defined = result.defined(); - - // set_output_raw_strided - auto options = self.options(); - auto sizes = self.sizes(); - if (is_defined) { - at::xpu::resize_out(result, sizes, {}, options); - } else { - result = at::xpu::create_out(sizes, {}, options); - } - - if (is_defined) { - at::assert_no_internal_overlap(result); - at::assert_no_overlap(result, index); - at::assert_no_overlap(result, source); - } - - // A hack to run TensorIterator checks in the meta function. - // See comment: - // https://github.com/pytorch/pytorch/pull/65993#discussion_r760307417 - // TODO: (@krshrimali) Try inheriting from TensorIteratorBase instead. 
- if (result.device() == kMeta && result.dim() > 0) { - auto selfSlice = result.select(dim, 0); - auto sourceSlice = source.select(dim, 0); - auto iter = - TensorIterator::borrowing_binary_op(selfSlice, selfSlice, sourceSlice); - } -} - -Tensor& XPUNativeFunctions::index_add_out( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& source, - const Scalar& alpha, - Tensor& out) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, self, "xpu::index_add_out", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::index_add_out", "index"); - c10::impl::check_and_update_common_device( - common_device, source, "xpu::index_add_out", "source"); - dim = maybe_wrap_dim(dim, self.dim()); - index_func_meta_impl(out, self, dim, index, source, "index_add"); - native::xpu::index_add_kernel(self, dim, index, source, alpha, out); - return out; -} - -Tensor& XPUNativeFunctions::index_add_( - Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& source, - const Scalar& alpha) { - return index_add_out(self, dim, index, source, alpha, self); -} - -Tensor XPUNativeFunctions::index_add( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& source, - const Scalar& alpha) { - Tensor out; - return index_add_out(self, dim, index, source, alpha, out); -} - -Tensor& XPUNativeFunctions::index_fill_( - Tensor& self, - int64_t dim, - const Tensor& index, - const Scalar& source) { - at::NoNamesGuard guard; - - TORCH_CHECK_INDEX( - index.scalar_type() == ScalarType::Long, - "index_fill_(): Expected dtype int64 for index."); - - at::assert_no_overlap(self, index); - if (at::has_internal_overlap(self) == at::MemOverlap::Yes) { - TORCH_WARN( - "Use of index_fill_ on expanded tensors is deprecated. " - "Please clone() the tensor before performing this operation. " - "This also applies to advanced indexing e.g. tensor[mask] = scalar"); - } - - if (!self.is_complex() && source.isComplex()) { - TORCH_CHECK( - false, - "index_fill_(): Converting complex Scalar to non-complex type is not supported"); - } - - TORCH_CHECK( - self.device() == index.device(), - "index_fill_(): self and index value tensors ", - "should have same device type, but got self tensor device type ", - self.device(), - " and index value ", - "tensor device type ", - index.device()); - - // Handle the case when `self` is 0-dim - Tensor self_nonzero_dim = (self.dim() == 0) ? 
self.unsqueeze(-1) : self; - dim = at::maybe_wrap_dim(dim, self_nonzero_dim); - - native::xpu::index_fill_kernel(self, dim, index, source); - return self; -} - -Tensor& XPUNativeFunctions::index_fill_( - Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& source) { - TORCH_CHECK( - source.dim() == 0, - "index_fill_ only supports a 0-dimensional value tensor, but got tensor " - "with ", - source.dim(), - " dimension(s)."); - return self.index_fill_(dim, index, source.item()); -} - -void check_indices_on_cpu_or_selfdevice( - const Tensor& self, - const c10::List>& indices) { - auto dev = self.device(); - bool indices_on_cpu_or_dev = std::all_of( - indices.begin(), indices.end(), [=](const c10::optional& opt) { - if (opt.has_value()) { - // for optional cases - if (!opt->defined()) { - return true; - } - return (opt->is_cpu() || opt->device() == dev); - } else { - return true; - } - }); - TORCH_CHECK( - indices_on_cpu_or_dev, - "indices should be either on ", - at::kCPU, - " or on the same device as the indexed tensor (", - dev, - ")"); -} - -static void build_index_op( - TensorIteratorBase& iter, - const native::AdvancedIndex& info, - Tensor& result) { - TensorIteratorConfig config; - // info.src is a restrided view of result - config.set_check_mem_overlap(false) - .check_all_same_dtype(false) - .add_output(result) - .add_input(info.src); - for (auto& index : info.indices) { - config.add_owned_const_input(index); - } - if (!result.defined()) { - config.declare_static_dtype_and_device( - info.src.scalar_type(), info.src.device()); - } - iter.build(config); -} - -Tensor& XPUNativeFunctions::index_out( - const Tensor& self, - const c10::List>& indices, - Tensor& result) { - TORCH_CHECK( - indices.size() <= (size_t)self.dim(), - "too many indices for tensor of dimension ", - self.dim(), - " (got ", - indices.size(), - ")"); - - check_indices_on_cpu_or_selfdevice(self, indices); - - if (result.defined()) { - TORCH_CHECK( - self.scalar_type() == result.scalar_type(), - "index_out: self (", - self.scalar_type(), - ") and result (", - result.scalar_type(), - ") must have the same scalar type"); - at::assert_no_internal_overlap(result); - at::assert_no_overlap(result, self); - for (const c10::optional& index : indices) { - if (index.has_value()) { - at::assert_no_overlap(result, *index); - } - } - } - auto info = native::make_info(self, std::move(indices)); - TensorIterator iter; - build_index_op(iter, info, result); - - native::xpu::index_kernel( - iter, - info.indexed_sizes, - info.indexed_strides, - IntArrayRef{}, - IntArrayRef{}); - - return result; -} - -Tensor XPUNativeFunctions::index( - const Tensor& self, - const c10::List>& indices) { - Tensor result; - TORCH_CHECK( - indices.size() <= (size_t)self.dim(), - "too many indices for tensor of dimension ", - self.dim(), - " (got ", - indices.size(), - ")"); - - check_indices_on_cpu_or_selfdevice(self, indices); - - auto info = native::make_info(self, std::move(indices)); - TensorIterator iter; - build_index_op(iter, info, result); - - native::xpu::index_kernel( - iter, - info.indexed_sizes, - info.indexed_strides, - IntArrayRef{}, - IntArrayRef{}); - - return iter.output(); -} - -// PyTorch defines it in cpp source. Copy it. 
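For orientation, the advanced-indexing entry points in this file (index, index_out, and _index_put_impl_ below) are exercised from C++ through the List<optional<Tensor>> indexing API. A standalone usage sketch, assuming only a stock libtorch build; nothing in it is specific to this patch or to the XPU backend.

#include <ATen/ATen.h>
#include <ATen/core/List.h>
#include <cstdint>
#include <vector>

int main() {
  at::Tensor t = at::zeros({4, 3});
  std::vector<int64_t> idx_data = {0, 2, 2};
  at::Tensor idx = at::tensor(idx_data); // int64 index tensor

  c10::List<c10::optional<at::Tensor>> indices;
  indices.push_back(c10::optional<at::Tensor>(idx));

  // Advanced indexing: gathers rows 0, 2, 2 -> shape [3, 3].
  at::Tensor gathered = at::index(t, indices);

  // index_put_ with accumulate=true sums contributions at duplicate indices,
  // which is why the deterministic scatter paths later in this file can be
  // routed through it.
  t.index_put_(indices, at::ones({3, 3}), /*accumulate=*/true);
  return 0;
}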
-static TensorIterator make_index_put_iterator( - const native::AdvancedIndex& info, - const Tensor& value) { - TORCH_CHECK( - is_expandable_to(value.sizes(), info.src.sizes()), - "shape mismatch: value tensor of shape ", - value.sizes(), - " cannot be broadcast to indexing result of shape ", - info.src.sizes()); - TORCH_CHECK( - value.scalar_type() == info.src.scalar_type(), - "Index put requires the source and destination dtypes match, " - "got ", - info.src.scalar_type(), - " for the destination " - "and ", - value.scalar_type(), - " for the source."); - TensorIteratorConfig config; - // info.src is restrided by restride_src with 0 strided dimensions - config.set_check_mem_overlap(false); - config.resize_outputs(false); - config.check_all_same_dtype(false); - config.add_output(info.src); - config.add_input(value); - for (auto& index : info.indices) { - config.add_input(index); - } - return config.build(); -} - -Tensor& XPUNativeFunctions::_index_put_impl_( - Tensor& self, - const torch::List>& indices, - const Tensor& value, - const bool accumulate, - const bool unsafe) { - TORCH_CHECK_INDEX( - indices.size() <= (size_t)self.dim(), - "too many indices for tensor of dimension ", - self.dim(), - " (got ", - indices.size(), - ")"); - if (at::has_internal_overlap(self) == MemOverlap::Yes) { - TORCH_WARN( - "Use of index_put_ on expanded tensors is deprecated. " - "Please clone() the tensor before performing this operation. " - "This also applies to advanced indexing e.g. tensor[indices] = tensor"); - } - if (!accumulate) { - auto masked_fill_dispatch = - native::canDispatchToMaskedFill(self, indices, value); - if (std::get<0>(masked_fill_dispatch)) { - return self.masked_fill_(std::get<1>(masked_fill_dispatch), value.item()); - } - } - auto value_ = value; - if (value.device() != self.device() && value.numel() == 1 && - value.dim() == 0) { - value_ = value.to(self.device()); - } - at::assert_no_overlap(self, value); - // NOLINTNEXTLINE(performance-implicit-conversion-in-loop) - for (const c10::optional& index : indices) { - if (index.has_value()) { - at::assert_no_overlap(self, *index); - } - } - - if (accumulate || globalContext().deterministicAlgorithms()) { - TORCH_CHECK( - value_.device() == self.device(), - "expected device ", - self.device(), - " but got device ", - value_.device(), - " for value tensor"); - native::xpu::index_put_deterministic_kernel( - self, indices, value_, accumulate, unsafe); - return self; - } - - auto info = native::make_info(self, indices); - auto iter = make_index_put_iterator(info, value_); - native::xpu::index_put_kernel( - iter, - info.indexed_sizes, - info.indexed_strides, - IntArrayRef{}, - IntArrayRef{}, - accumulate); - return self; -} - -// ============================= scatter ============================= - -static void scatter_reduce_exclude_self_helper( - const Tensor& self, - int64_t dim, - const Tensor& index, - const ReductionType& op) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - at::ScalarType::Half, - at::ScalarType::BFloat16, - at::ScalarType::Bool, - self.scalar_type(), - "scatter_reduce_exclude_input_init", - [&] { - scalar_t init_val; - switch (op) { - case ReductionType::SUM: - init_val = (scalar_t)0; - break; - case ReductionType::PROD: - init_val = (scalar_t)1; - break; - case ReductionType::MAX: - init_val = std::numeric_limits::has_infinity - ? -std::numeric_limits::infinity() - : std::numeric_limits::lowest(); - break; - case ReductionType::MIN: - init_val = std::numeric_limits::has_infinity - ? 
std::numeric_limits::infinity() - : std::numeric_limits::max(); - break; - case ReductionType::MEAN: - init_val = (scalar_t)0; - break; - } - self.scatter_(dim, index, init_val); - }); -} - -static void _scatter_via_index_put( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - const Tensor& mut_out, - bool accumulate) { - if (self.dim() == 1) { - torch::List> indices; - indices.reserve(1); - indices.push_back(index); - mut_out.index_put_(indices, src, accumulate); - } else { - Tensor mut_out_contig = mut_out.contiguous(); - - auto index_coords_sizes = index.sizes().vec(); - index_coords_sizes.push_back(self.dim()); - auto index_coords = at::empty( - index_coords_sizes, - at::TensorOptions().dtype(at::ScalarType::Long).device(self.device())); - - for (int64_t dim_other = 0; dim_other < self.dim(); dim_other++) { - if (dim_other == dim) { - continue; - } - auto dim_coord_vals = at::arange( - index.size(dim_other), at::TensorOptions().device(self.device())); - - for (int64_t dim_unsqueeze = 0; dim_unsqueeze < self.dim() - 1; - dim_unsqueeze++) { - dim_coord_vals = - dim_coord_vals.unsqueeze((dim_unsqueeze >= dim_other) ? -1 : 0); - } - - auto view_sizes = index.sizes().vec(); - view_sizes.push_back(1); - auto view_strides = index_coords.strides().vec(); - view_strides[self.dim()] = self.dim(); - - at::as_strided(index_coords, view_sizes, view_strides, dim_other) - .copy_(dim_coord_vals.unsqueeze(-1)); - } - - auto view_sizes = index.sizes().vec(); - view_sizes.push_back(1); - auto view_strides = index_coords.strides().vec(); - view_strides[self.dim()] = self.dim(); - - at::as_strided(index_coords, view_sizes, view_strides, dim) - .copy_(index.unsqueeze(-1)); - - Tensor index_coords_flat = index_coords.flatten(0, -2); - - // Copy mut_out_contig's strides into a tensor - // TODO: Is there a utility function that already does this? 
- IntArrayRef mut_out_contig_strides = mut_out_contig.strides(); - Tensor coord_strides = at::empty( - {mut_out_contig.dim()}, - TensorOptions().dtype(at::ScalarType::Long).device(at::kCPU)); - std::memcpy( - coord_strides.mutable_data_ptr(), - mut_out_contig_strides.data(), - coord_strides.nbytes()); - coord_strides = coord_strides.to(mut_out_contig.device()); - - // `index_flat` contains the 1-D indices corresponding with the - // flattened `mut_out` - Tensor index_flat = (index_coords_flat * coord_strides).sum({-1}); - Tensor mut_out_flat = mut_out_contig.flatten(); - Tensor src_flat = - at::as_strided(src, index.sizes(), src.strides()).flatten(); - - torch::List> indices; - indices.reserve(1); - indices.push_back(index_flat); - - mut_out_flat.index_put_(indices, src_flat, accumulate); - - if (!mut_out.is_contiguous()) { - mut_out.copy_(mut_out_flat.reshape(mut_out.sizes())); - } - } -} - -template < - bool use_new_options = false, - typename T, - typename ReduceStub, - typename FillStub> -void scatter_impl( - const Tensor& self, - int64_t dim, - const Tensor& index, - const T& src, - const Tensor& out, - ReduceStub& reduce_stub, - FillStub& fill_stub, - const c10::optional reduce = nullopt, - bool reduce_includes_self = true) { - dim = at::maybe_wrap_dim(dim, self.dim()); - auto mut_out = const_cast(out); - - if (!self.is_same(mut_out)) { - mut_out.copy_(self); - } - - if (index.numel() == 0) - return; - - auto op = ReductionType::SUM; - bool deterministic = globalContext().deterministicAlgorithms() && - self.device().type() == DeviceType::XPU; - - if (reduce.has_value()) { - op = get_operator_enum(reduce.value(), use_new_options); - if (!reduce_includes_self) { - // scatter inits for reduction to appropriate indices (used by - // scatter_reduce.two) - scatter_reduce_exclude_self_helper(mut_out, dim, index, op); - } - // _scatter_via_index_put can only handle sum and mean reduction type - deterministic = deterministic && - (op == ReductionType::SUM || op == ReductionType::MEAN); - } - - // Scalar src should already be deterministic - if (deterministic && std::is_same_v) { - // both runtime and compile check are required - if constexpr (std::is_same_v) { - bool accumulate = reduce.has_value(); - _scatter_via_index_put(self, dim, index, src, mut_out, accumulate); - return; - } - } - - if (reduce.has_value()) { - reduce_stub(mut_out, dim, index, src, op); - } else { - fill_stub(mut_out, dim, index, src); - } -} - -template -Tensor& scatter_meta_impl( - Tensor& output, - const Tensor& self, - int64_t dim, - const Tensor& index, - const c10::optional& src = nullopt, - const c10::optional reduce = nullopt) { - int64_t wrapped_dim = at::maybe_wrap_dim(dim, self.dim()); - at::native::scatter_gather_dtype_check("scatter", self, index, src); - at::native::scatter_shape_check(self, wrapped_dim, index, src); - - if (output.defined()) { - at::assert_no_internal_overlap(output); - at::assert_no_overlap(output, index); - if (src.has_value()) { - at::assert_no_overlap(output, src.value()); - } - } - - if (output.defined()) { - at::xpu::resize_out(output, self.sizes(), {}, self.options()); - } else { - output = at::xpu::create_out(self.sizes(), {}, self.options()); - } - - if (reduce.has_value()) { - // Check if we have a valid reduce operator. 
- at::native::get_operator_enum(reduce.value(), use_new_options); - } - - return output; -} - -Tensor& scatter_src_meta( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - Tensor& out) { - return scatter_meta_impl(out, self, dim, index, src); -} - -Tensor& scatter_value_meta( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Scalar& value, - Tensor& out) { - return scatter_meta_impl(out, self, dim, index); -} - -Tensor& scatter_reduce_meta( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - const c10::string_view reduce, - Tensor& out) { - TORCH_WARN_ONCE( - "The reduce argument of torch.scatter with Tensor src is deprecated and will be removed ", - "in a future PyTorch release. Use torch.scatter_reduce instead for more reduction options."); - return scatter_meta_impl(out, self, dim, index, src, reduce); -} - -Tensor& scatter_value_reduce_meta( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Scalar& src, - const c10::string_view reduce, - Tensor& out) { - return scatter_meta_impl(out, self, dim, index, nullopt, reduce); -} - -Tensor& scatter_add_meta( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - Tensor& out) { - return scatter_meta_impl(out, self, dim, index, src, "add"); -} - -Tensor& scatter_reduce_two_meta( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - const c10::string_view reduce, - bool include_self, - Tensor& out) { - (void)include_self; - return scatter_meta_impl( - out, self, dim, index, src, reduce); -} - -Tensor XPUNativeFunctions::scatter( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_src", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_src", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_src", "src"); - Tensor out; - out = scatter_src_meta(self, dim, index, src, out); - scatter_impl( - self, dim, index, src, out, scatter_reduce_kernel, scatter_kernel); - return out; -} - -Tensor& XPUNativeFunctions::scatter_out( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - Tensor& out) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, out, "xpu::scatter_out_src_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_out_src_out", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_out_src_out", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_out_src_out", "src"); - out = scatter_src_meta(self, dim, index, src, out); - scatter_impl( - self, dim, index, src, out, scatter_reduce_kernel, scatter_kernel); - return out; -} - -Tensor& XPUNativeFunctions::scatter_( - Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter__src", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter__src", "index"); - 
c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter__src", "src"); - self = scatter_src_meta(self, dim, index, src, self); - scatter_impl( - self, dim, index, src, self, scatter_reduce_kernel, scatter_kernel); - return self; -} - -Tensor XPUNativeFunctions::scatter( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Scalar& value) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_value", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_value", "index"); - Tensor out; - out = scatter_value_meta(self, dim, index, value, out); - scatter_impl( - self, - dim, - index, - value, - out, - scatter_scalar_reduce_kernel, - scatter_fill_kernel); - return out; -} - -Tensor& XPUNativeFunctions::scatter_out( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Scalar& value, - Tensor& out) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, out, "xpu::scatter_out_value_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_out_value_out", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_out_value_out", "index"); - out = scatter_value_meta(self, dim, index, value, out); - scatter_impl( - self, - dim, - index, - value, - out, - scatter_scalar_reduce_kernel, - scatter_fill_kernel); - return out; -} - -Tensor& XPUNativeFunctions::scatter_( - Tensor& self, - int64_t dim, - const Tensor& index, - const Scalar& value) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter__value", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter__value", "index"); - self = scatter_value_meta(self, dim, index, value, self); - scatter_impl( - self, - dim, - index, - value, - self, - scatter_scalar_reduce_kernel, - scatter_fill_kernel); - return self; -} - -Tensor XPUNativeFunctions::scatter( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - c10::string_view reduce) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_reduce", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_reduce", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_reduce", "src"); - Tensor out; - out = scatter_reduce_meta(self, dim, index, src, reduce, out); - scatter_impl( - self, - dim, - index, - src, - out, - scatter_reduce_kernel, - scatter_kernel, - reduce); - return out; -} - -Tensor& XPUNativeFunctions::scatter_out( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - c10::string_view reduce, - Tensor& out) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, out, "xpu::scatter_out_reduce_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_out_reduce_out", "self"); - c10::impl::check_and_update_common_device( - 
common_device, index, "xpu::scatter_out_reduce_out", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_out_reduce_out", "src"); - out = scatter_reduce_meta(self, dim, index, src, reduce, out); - scatter_impl( - self, - dim, - index, - src, - out, - scatter_reduce_kernel, - scatter_kernel, - reduce); - return out; -} - -Tensor& XPUNativeFunctions::scatter_( - Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - c10::string_view reduce) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter__reduce", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter__reduce", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter__reduce", "src"); - self = scatter_reduce_meta(self, dim, index, src, reduce, self); - scatter_impl( - self, - dim, - index, - src, - self, - scatter_reduce_kernel, - scatter_kernel, - reduce); - return self; -} - -Tensor XPUNativeFunctions::scatter( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Scalar& value, - c10::string_view reduce) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_value_reduce", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_value_reduce", "index"); - Tensor out; - out = scatter_value_reduce_meta(self, dim, index, value, reduce, out); - scatter_impl( - self, - dim, - index, - value, - out, - scatter_scalar_reduce_kernel, - scatter_fill_kernel, - reduce); - return out; -} - -Tensor& XPUNativeFunctions::scatter_out( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Scalar& value, - c10::string_view reduce, - Tensor& out) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, out, "xpu::scatter_out_value_reduce_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_out_value_reduce_out", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_out_value_reduce_out", "index"); - out = scatter_value_reduce_meta(self, dim, index, value, reduce, out); - scatter_impl( - self, - dim, - index, - value, - out, - scatter_scalar_reduce_kernel, - scatter_fill_kernel, - reduce); - return out; -} - -Tensor& XPUNativeFunctions::scatter_( - Tensor& self, - int64_t dim, - const Tensor& index, - const Scalar& value, - c10::string_view reduce) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter__value_reduce", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter__value_reduce", "index"); - self = scatter_value_reduce_meta(self, dim, index, value, reduce, self); - scatter_impl( - self, - dim, - index, - value, - self, - scatter_scalar_reduce_kernel, - scatter_fill_kernel, - reduce); - return self; -} - -Tensor& scatter_add_impl( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - Tensor& out) { - auto mut_out = const_cast(out); - dim = maybe_wrap_dim(dim, self.dim()); - - if (!self.is_same(mut_out)) { - 
mut_out.copy_(self); - } - - if (index.numel() == 0) - return out; - - // See Note [Enabling Deterministic Operations] - // Avoid gpuAtomicAdd for XPU if deterministic mode is turned on - if (globalContext().deterministicAlgorithms() && - self.device().type() == DeviceType::XPU) { - _scatter_via_index_put(self, dim, index, src, mut_out, /*accumulate*/ true); - } else { - // TODO: enable fast paths for GNN usage (scatter_add_expanded_index_kernel) - scatter_add_kernel(mut_out, dim, index, src); - } - return out; -} - -Tensor XPUNativeFunctions::scatter_add( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_add", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_add", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_add", "src"); - Tensor out; - out = scatter_add_meta(self, dim, index, src, out); - out = scatter_add_impl(self, dim, index, src, out); - return out; -} - -Tensor& XPUNativeFunctions::scatter_add_out( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - Tensor& out) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, out, "xpu::scatter_add_out_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_add_out_out", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_add_out_out", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_add_out_out", "src"); - out = scatter_add_meta(self, dim, index, src, out); - out = scatter_add_impl(self, dim, index, src, out); - return out; -} - -Tensor& XPUNativeFunctions::scatter_add_( - Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_add_", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_add_", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_add_", "src"); - self = scatter_add_meta(self, dim, index, src, self); - self = scatter_add_impl(self, dim, index, src, self); - return self; -} - -Tensor& scatter_reduce_two_impl( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - const c10::string_view reduce, - bool include_self, - Tensor& out) { - dim = at::maybe_wrap_dim(dim, self.dim()); - - if (!self.is_same(out)) { - out.copy_(self); - } - - const auto op = get_operator_enum(reduce, true); - - // TODO: enable scatter_reduce_expanded_index_kernel - - scatter_impl( - self, - dim, - index, - src, - out, - scatter_reduce_two_kernel, - scatter_kernel, - reduce, - include_self); - - if (op == ReductionType::MEAN) { - auto ones = at::ones_like(src); - auto count = include_self ? 
at::ones_like(out) : at::zeros_like(out); - count.scatter_add_(dim, index, ones); - count.masked_fill_(count == 0, 1); - - if (out.is_floating_point() || out.is_complex()) { - out.div_(count); - } else { - out.div_(count, "floor"); - } - } - - return out; -} - -Tensor XPUNativeFunctions::scatter_reduce( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - c10::string_view reduce, - bool include_self) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_reduce_two", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_reduce_two", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_reduce_two", "src"); - Tensor out; - out = - scatter_reduce_two_meta(self, dim, index, src, reduce, include_self, out); - out = - scatter_reduce_two_impl(self, dim, index, src, reduce, include_self, out); - return out; -} - -Tensor& XPUNativeFunctions::scatter_reduce_out( - const Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - c10::string_view reduce, - bool include_self, - Tensor& out) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, out, "xpu::scatter_reduce_out_two_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_reduce_out_two_out", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_reduce_out_two_out", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_reduce_out_two_out", "src"); - out = - scatter_reduce_two_meta(self, dim, index, src, reduce, include_self, out); - out = - scatter_reduce_two_impl(self, dim, index, src, reduce, include_self, out); - return out; -} - -Tensor& XPUNativeFunctions::scatter_reduce_( - Tensor& self, - int64_t dim, - const Tensor& index, - const Tensor& src, - c10::string_view reduce, - bool include_self) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::scatter_reduce__two", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::scatter_reduce__two", "index"); - c10::impl::check_and_update_common_device( - common_device, src, "xpu::scatter_reduce__two", "src"); - self = scatter_reduce_two_meta( - self, dim, index, src, reduce, include_self, self); - self = scatter_reduce_two_impl( - self, dim, index, src, reduce, include_self, self); - return self; -} - -// ============================= gather ============================= - -Tensor& gather_meta( - const Tensor& self, - int64_t dim, - const Tensor& index, - bool sparse_grad, - Tensor& result) { - int64_t wrapped_dim = at::maybe_wrap_dim(dim, self.dim()); - - // Memory overlap checks need to be done after resizing (if required) is done. - // But it only makes sense to do these checks when result was defined, hence - // the boolean variable `check_result` here. 
- // For more details, see: - // https://github.com/pytorch/pytorch/pull/63312#discussion_r694794832 and - // https://github.com/pytorch/pytorch/issues/63837 - bool check_result = result.defined(); - - if (result.defined()) { - at::xpu::resize_out(result, index.sizes(), {}, self.options()); - } else { - result = at::xpu::create_out(index.sizes(), {}, self.options()); - } - - if (check_result) { - at::assert_no_internal_overlap(result); - at::assert_no_overlap(result, self); - at::assert_no_partial_overlap(result, index); - } - - auto is_index_empty = index.numel() == 0; - if (!is_index_empty) { - TORCH_CHECK( - index.scalar_type() == at::ScalarType::Long, - "gather", - "(): Expected dtype int64 for index"); - } - if (is_index_empty) - return result; - at::native::gather_shape_check(self, wrapped_dim, index); - - return result; -} - -Tensor XPUNativeFunctions::gather( - const Tensor& self, - int64_t dim, - const Tensor& index, - bool sparse_grad) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, self, "xpu::gather", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::gather", "index"); - Tensor out; - out = gather_meta(self, dim, index, sparse_grad, out); - - if (index.numel() == 0) - return out; - dim = at::maybe_wrap_dim(dim, self.dim()); - // TODO: enable gather_expanded_index_kernel - gather_kernel(out, self, dim, index); - return out; -} - -Tensor& XPUNativeFunctions::gather_out( - const Tensor& self, - int64_t dim, - const Tensor& index, - bool sparse_grad, - Tensor& out) { - std::optional common_device = std::nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device( - common_device, out, "xpu::gather_out_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::gather_out_out", "self"); - c10::impl::check_and_update_common_device( - common_device, index, "xpu::gather_out_out", "index"); - out = gather_meta(self, dim, index, sparse_grad, out); - - if (index.numel() == 0) - return out; - dim = at::maybe_wrap_dim(dim, self.dim()); - // TODO: enable gather_expanded_index_kernel - gather_kernel(out, self, dim, index); - return out; + return masked_fill__xpu(self, mask, value.item()); } -Tensor XPUNativeFunctions::count_nonzero(const Tensor& self, IntArrayRef dims) { +Tensor count_nonzero_xpu(const Tensor& self, IntArrayRef dims) { return (self != 0).sum(dims); } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/TensorCompare.cpp b/src/ATen/native/xpu/TensorCompare.cpp index 3331c31a2..ef6798c14 100644 --- a/src/ATen/native/xpu/TensorCompare.cpp +++ b/src/ATen/native/xpu/TensorCompare.cpp @@ -5,492 +5,17 @@ #include #include #include -#include - -#include #include #include #include #include -namespace at { - -template -Device out_device(Args&... inps) { - for (const auto& i : {inps...}) { - if (i.device() != at::kCPU) { - return i.device(); - } - } - return at::kCPU; -} - -Tensor& where_self_out( - const Tensor& condition, - const Tensor& self, - const Tensor& other, - Tensor& out) { - const auto result_type = at::native::result_type(self, other); - TORCH_CHECK( - out.scalar_type() == result_type, - "Expected out type to be ", - result_type, - " but got ", - out.scalar_type()); - - auto self_ = self.scalar_type() != result_type ? self.to(result_type) : self; - auto other_ = - other.scalar_type() != result_type ? 
other.to(result_type) : other; - auto condition_ = condition; - auto device = out_device(condition, self_, other_); - if (device != at::kCPU) { // allow CPU scalars on non-cpu device - if (condition.device() != device && condition.ndimension() == 0) { - condition_ = condition.to(device); - } - if (self_.device() != device && self_.ndimension() == 0) { - self_ = self_.to(device); - } - if (other_.device() != device && other_.ndimension() == 0) { - other_ = other_.to(device); - } - } - if (condition_.scalar_type() == ScalarType::Byte) { - TORCH_WARN_ONCE( - "where received a uint8 condition tensor. This behavior is deprecated and will be removed in a future version of PyTorch. Use a boolean condition instead."); - condition_ = condition_.to(kBool); - } - TORCH_CHECK( - condition_.scalar_type() == kBool, - "where expected condition to be a boolean tensor, but got a tensor with dtype ", - condition_.scalar_type()); - // if there's still a device mismatch, let tensoriterator error out with it - auto iter = at::TensorIteratorConfig() - .check_all_same_dtype(false) - .add_output(out) - .add_const_input(condition_) - .add_const_input(self_) - .add_const_input(other_) - .build(); - native::xpu::where_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::where_out( - const Tensor& condition, - const Tensor& self, - const Tensor& other, - Tensor& out) { - return where_self_out(condition, self, other, out); -} - -Tensor XPUNativeFunctions::where( - const Tensor& condition, - const Tensor& self, - const Tensor& other) { - auto device = out_device(condition, self, other); - auto result_type = at::native::result_type(self, other); - Tensor ret = at::empty({0}, self.options().dtype(result_type).device(device)); - where_self_out(condition, self, other, ret); - return ret; -} - -TensorIterator clamp_meta( - const Tensor& self, - const OptionalScalarRef min, - const OptionalScalarRef max, - Tensor& result) { - TensorIterator iter; - if (!min && !max) { - TORCH_CHECK( - false, "torch.clamp: At least one of 'min' or 'max' must not be None"); - } - // Manual type promotion, since scalars have to participate in it - ScalarType result_type = self.scalar_type(); - TORCH_CHECK( - !isComplexType(result_type), "clamp is not supported for complex types"); - // Floating is the highest supported - if (!isFloatingType(result_type)) { - at::native::ResultTypeState state = {}; - state = at::native::update_result_type_state(self, state); - - if (min) { - state = at::native::update_result_type_state(min.get(), state); - } - if (max) { - state = at::native::update_result_type_state(max.get(), state); - } - result_type = at::native::result_type(state); - // disallow type promoting inplace op - TORCH_CHECK( - (result_type == self.scalar_type()) || - (!(result.defined()) || !(result.is_same(self))), - "result type ", - result_type, - " can't be cast to the desired output type ", - self.dtype()); - } - // make sure scalars weren't complex - TORCH_CHECK( - !isComplexType(result_type), "clamp is not supported for complex types"); - iter.build_unary_op(result, self.to(result_type)); - return iter; -} - -Tensor& clamp_out_impl( - const Tensor& self, - TensorIteratorBase& iter, - const OptionalScalarRef min, - const OptionalScalarRef max, - Tensor& result) { - using at::native::detail::ClampLimits; - if (min && max) { - if (min.get().toDouble() != min.get().toDouble() || - max.get().toDouble() != max.get().toDouble()) { - at::fill_( - const_cast(result), - std::numeric_limits::quiet_NaN()); - } else { - 
native::xpu::clamp_scalar_kernel(iter, min.get(), max.get()); - } - } else if (max) { - native::xpu::clamp_max_scalar_kernel(iter, max.get()); - } else if (min) { - native::xpu::clamp_min_scalar_kernel(iter, min.get()); - } - return result; -} - -Tensor XPUNativeFunctions::clamp( - const Tensor& self, - const ::std::optional& min, - const ::std::optional& max) { - auto min_ = - (min.has_value() ? at::OptionalScalarRef(&(min.value())) - : at::OptionalScalarRef()); - auto max_ = - (max.has_value() ? at::OptionalScalarRef(&(max.value())) - : at::OptionalScalarRef()); - Tensor result; - auto iter = clamp_meta(self, min_, max_, result); - result = clamp_out_impl(self, iter, min_, max_, result); - return iter.output(); -} - -Tensor& XPUNativeFunctions::clamp_out( - const Tensor& self, - const ::std::optional& min, - const ::std::optional& max, - Tensor& result) { - auto min_ = - (min.has_value() ? at::OptionalScalarRef(&(min.value())) - : at::OptionalScalarRef()); - auto max_ = - (max.has_value() ? at::OptionalScalarRef(&(max.value())) - : at::OptionalScalarRef()); - auto iter = clamp_meta(self, min_, max_, result); - result = clamp_out_impl(self, iter, min_, max_, result); - return result; -} - -Tensor& XPUNativeFunctions::clamp_( - Tensor& self, - const ::std::optional& min, - const ::std::optional& max) { - auto min_ = - (min.has_value() ? at::OptionalScalarRef(&(min.value())) - : at::OptionalScalarRef()); - auto max_ = - (max.has_value() ? at::OptionalScalarRef(&(max.value())) - : at::OptionalScalarRef()); - auto iter = clamp_meta(self, min_, max_, self); - self = clamp_out_impl(self, iter, min_, max_, self); - return self; -} - -TensorIterator clamp_tensor_meta( - const Tensor& self, - const OptionalTensorRef min, - const OptionalTensorRef max, - Tensor& result) { - TensorIterator iter; - TORCH_CHECK( - min || max, - "torch.clamp: At least one of 'min' or 'max' must not be None"); - TORCH_CHECK( - !isComplexType(self.scalar_type()), - "clamp is not supported for complex types"); -#define CLAMP_CONFIG() \ - TensorIteratorConfig() \ - .set_check_mem_overlap(true) \ - .add_output(result) \ - .add_const_input(self) \ - .promote_inputs_to_common_dtype(true) \ - .cast_common_dtype_to_outputs(true) \ - .enforce_safe_casting_to_output(true) - - if (min && max) { - iter.build(CLAMP_CONFIG().add_const_input(*min).add_const_input(*max)); - } else if (min) { - iter.build(CLAMP_CONFIG().add_const_input(*min)); - } else if (max) { - iter.build(CLAMP_CONFIG().add_const_input(*max)); - } - return iter; -} - -Tensor& clamp_tensor_out_impl( - const Tensor& self, - TensorIteratorBase& iter, - const OptionalTensorRef min, - const OptionalTensorRef max, - Tensor& result) { - if (min && max) { - native::xpu::clamp_kernel(iter); - } else if (min) { - native::xpu::maximum_kernel(iter); - } else if (max) { - native::xpu::minimum_kernel(iter); - } - return result; -} - -Tensor XPUNativeFunctions::clamp( - const Tensor& self, - const ::std::optional& min, - const ::std::optional& max) { - auto min_ = - ((min.has_value() && (*min).defined()) ? at::OptionalTensorRef(*min) - : at::OptionalTensorRef()); - auto max_ = - ((max.has_value() && (*max).defined()) ? 
at::OptionalTensorRef(*max) - : at::OptionalTensorRef()); - Tensor result; - auto iter = clamp_tensor_meta(self, min_, max_, result); - result = clamp_tensor_out_impl(self, iter, min_, max_, result); - return iter.output(); -} - -Tensor& XPUNativeFunctions::clamp_out( - const Tensor& self, - const ::std::optional& min, - const ::std::optional& max, - Tensor& result) { - auto min_ = - ((min.has_value() && (*min).defined()) ? at::OptionalTensorRef(*min) - : at::OptionalTensorRef()); - auto max_ = - ((max.has_value() && (*max).defined()) ? at::OptionalTensorRef(*max) - : at::OptionalTensorRef()); - auto iter = clamp_tensor_meta(self, min_, max_, result); - result = clamp_tensor_out_impl(self, iter, min_, max_, result); - return result; -} - -Tensor& XPUNativeFunctions::clamp_( - Tensor& self, - const ::std::optional& min, - const ::std::optional& max) { - auto min_ = - ((min.has_value() && (*min).defined()) ? at::OptionalTensorRef(*min) - : at::OptionalTensorRef()); - auto max_ = - ((max.has_value() && (*max).defined()) ? at::OptionalTensorRef(*max) - : at::OptionalTensorRef()); - auto iter = clamp_tensor_meta(self, min_, max_, self); - self = clamp_tensor_out_impl(self, iter, min_, max_, self); - return self; -} - -TensorIterator clamp_max_meta( - const Tensor& self, - const Scalar& max, - Tensor& result) { - TensorIterator iter; - // we could wrap max into tensor and send to tensor overload, - // but relu is implemented via clamp_min, so for perf an uniformity reasons - // do a faster but correct thing - ScalarType result_type = self.scalar_type(); - TORCH_CHECK( - !isComplexType(result_type), "clamp is not supported for complex types"); - TORCH_CHECK(!max.isComplex(), "clamp is not supported for complex types"); - // Floating is the highest supported - if (!isFloatingType(result_type)) { - auto result_type = at::native::result_type(self, max); - TORCH_CHECK( - (result_type == self.scalar_type()) || - (!(result.defined()) || !(result.is_same(self))), - "result type ", - result_type, - " can't be cast to the desired output type ", - self.dtype()); - iter.build_unary_op(result, self.to(result_type)); - } else { - iter.build_borrowing_unary_op(result, self); - } - return iter; -} - -Tensor& clamp_max_out_impl( - const Tensor& self, - TensorIteratorBase& iter, - const Scalar& max, - Tensor& result) { - if (max.toDouble() != max.toDouble()) { - // TODO this is not great, building TI again is expensive, but I can't use - // fill_stub because fill is not structured - // this is a corner case anyway - at::fill_(const_cast(result), native::wrapped_scalar_tensor(max)); - } else { - native::xpu::clamp_max_scalar_kernel(iter, max); - } - return result; -} - -Tensor XPUNativeFunctions::clamp_max(const Tensor& self, const Scalar& max) { - Tensor result; - auto iter = clamp_max_meta(self, max, result); - result = clamp_max_out_impl(self, iter, max, result); - return iter.output(); -} - -Tensor& XPUNativeFunctions::clamp_max_out( - const Tensor& self, - const Scalar& max, - Tensor& result) { - auto iter = clamp_max_meta(self, max, result); - result = clamp_max_out_impl(self, iter, max, result); - return result; -} - -Tensor& XPUNativeFunctions::clamp_max_(Tensor& self, const Scalar& max) { - auto iter = clamp_max_meta(self, max, self); - self = clamp_max_out_impl(self, iter, max, self); - return self; -} - -TensorIterator clamp_max_tensor_meta( - const Tensor& self, - const Tensor& max, - Tensor& result) { - TensorIterator iter; - iter.build_borrowing_binary_op(result, self, max); - return iter; -} - 
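A side note on the scalar clamp helpers above: clamp_out_impl and clamp_max_out_impl detect a NaN bound with the self-inequality check (max.toDouble() != max.toDouble()) and fill the output directly instead of invoking the kernel. A standalone usage sketch of that observable behavior follows; it is backend-independent and assumes only a stock libtorch build.

#include <ATen/ATen.h>
#include <cmath>

int main() {
  at::Tensor x = at::arange(5, at::kFloat); // [0, 1, 2, 3, 4]

  // Ordinary scalar clamp: both bounds applied element-wise -> [1, 1, 2, 3, 3].
  at::Tensor a = at::clamp(x, /*min=*/1.0, /*max=*/3.0);

  // A NaN bound short-circuits to a fill, so every element becomes NaN.
  at::Tensor b = at::clamp_max(x, std::nan(""));
  return 0;
}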
-Tensor XPUNativeFunctions::clamp_max(const Tensor& self, const Tensor& max) { - Tensor result; - auto iter = clamp_max_tensor_meta(self, max, result); - native::xpu::minimum_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::clamp_max_out( - const Tensor& self, - const Tensor& max, - Tensor& result) { - auto iter = clamp_max_tensor_meta(self, max, result); - native::xpu::minimum_kernel(iter); - return result; -} - -Tensor& XPUNativeFunctions::clamp_max_(Tensor& self, const Tensor& max) { - auto iter = clamp_max_tensor_meta(self, max, self); - native::xpu::minimum_kernel(iter); - return self; -} - -TensorIterator clamp_min_meta( - const Tensor& self, - const Scalar& min, - Tensor& result) { - TensorIterator iter; - ScalarType result_type = self.scalar_type(); - TORCH_CHECK( - !isComplexType(result_type), "clamp is not supported for complex types"); - TORCH_CHECK(!min.isComplex(), "clamp is not supported for complex types"); - // Floating is the highest supported - if (!isFloatingType(result_type)) { - auto result_type = at::native::result_type(self, min); - TORCH_CHECK( - (result_type == self.scalar_type() || !(result.defined()) || - !(result.is_same(self))), - "result type ", - result_type, - " can't be cast to the desired output type ", - self.dtype()); - iter.build_unary_op(result, self.to(result_type)); - } else { - iter.build_borrowing_unary_op(result, self); - } - return iter; -} - -Tensor& clamp_min_out_impl( - const Tensor& self, - TensorIteratorBase& iter, - const Scalar& min, - Tensor& result) { - if (min.toDouble() != min.toDouble()) { - at::fill_(const_cast(result), min); - } else { - native::xpu::clamp_min_scalar_kernel(iter, min); - } - return result; -} - -Tensor XPUNativeFunctions::clamp_min(const Tensor& self, const Scalar& min) { - Tensor result; - auto iter = clamp_min_meta(self, min, result); - result = clamp_min_out_impl(self, iter, min, result); - return iter.output(); -} - -Tensor& XPUNativeFunctions::clamp_min_out( - const Tensor& self, - const Scalar& min, - Tensor& result) { - auto iter = clamp_min_meta(self, min, result); - result = clamp_min_out_impl(self, iter, min, result); - return result; -} - -Tensor& XPUNativeFunctions::clamp_min_(Tensor& self, const Scalar& min) { - auto iter = clamp_min_meta(self, min, self); - self = clamp_min_out_impl(self, iter, min, self); - return self; -} +#include -TensorIterator clamp_min_tensor_meta( - const Tensor& self, - const Tensor& min, - Tensor& result) { - TensorIterator iter; - iter.build_borrowing_binary_op(result, self, min); - return iter; -} - -Tensor XPUNativeFunctions::clamp_min(const Tensor& self, const Tensor& min) { - Tensor result; - auto iter = clamp_min_tensor_meta(self, min, result); - native::xpu::maximum_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::clamp_min_out( - const Tensor& self, - const Tensor& min, - Tensor& result) { - auto iter = clamp_min_tensor_meta(self, min, result); - native::xpu::maximum_kernel(iter); - return result; -} - -Tensor& XPUNativeFunctions::clamp_min_(Tensor& self, const Tensor& min) { - auto iter = clamp_min_tensor_meta(self, min, self); - native::xpu::maximum_kernel(iter); - return self; -} +namespace at { +namespace native { +namespace xpu { void min_kernel_impl( const Tensor& result, const Tensor& indice, @@ -531,281 +56,15 @@ void minmax_out_impl( } } } +} // namespace xpu -static void check_unsupported_complex(const char* name, const Tensor& self) { - TORCH_CHECK(!self.is_complex(), name, ": does not support complex input"); 
-} - -::std::tuple XPUNativeFunctions::min_out( - const Tensor& self, - int64_t dim, - bool keepdim, - Tensor& values, - Tensor& indices) { - dim = maybe_wrap_dim(dim, self.dim()); - at::native::zero_numel_check_dims(self, dim, "min()"); - check_unsupported_complex("min()", self); - at::xpu::resize_reduction_with_indices( - values, indices, self, dim, keepdim, self.scalar_type()); - - minmax_out_impl(self, dim, keepdim, values, indices, min_kernel_impl); - return {values, indices}; -} - -std::tuple XPUNativeFunctions::max_out( - const Tensor& self, - int64_t dim, - bool keepdim, - Tensor& values, - Tensor& indices) { - dim = maybe_wrap_dim(dim, self.dim()); - at::native::zero_numel_check_dims(self, dim, "max()"); - check_unsupported_complex("max()", self); - at::xpu::resize_reduction_with_indices( - values, indices, self, dim, keepdim, self.scalar_type()); - - minmax_out_impl(self, dim, keepdim, values, indices, max_kernel_impl); - return {values, indices}; -} - -std::tuple XPUNativeFunctions::_aminmax( - const Tensor& self, - int64_t dim, - bool keepdim) { - TORCH_WARN_ONCE( - "_aminmax is deprecated as of PyTorch 1.11 and will be removed in a future release. Use aminmax instead." - " This warning will only appear once per process."); - return XPUNativeFunctions::aminmax(self, dim, keepdim); -} - -static inline void check_for_unsupported_isin_dtype(const ScalarType type) { - // Bail out for dtypes unsupported by the sorting algorithm to keep the - // interface consistent. - TORCH_CHECK( - type != ScalarType::Bool && type != ScalarType::BFloat16 && - type != ScalarType::ComplexFloat && type != ScalarType::ComplexDouble, - "Unsupported input type encountered for isin(): ", - type); -} - -// Sorting-based algorithm for isin(); used when the number of test elements is -// large. -static void isin_sorting( - const Tensor& elements, - const Tensor& test_elements, - bool assume_unique, - bool invert, - const Tensor& out) { - // 1. Concatenate unique elements with unique test elements in 1D form. If - // assume_unique is true, skip calls to unique(). - Tensor elements_flat, test_elements_flat, unique_order; - if (assume_unique) { - elements_flat = elements.ravel(); - test_elements_flat = test_elements.ravel(); - } else { - std::tie(elements_flat, unique_order) = - at::_unique(elements, /*sorted=*/false, /*return_inverse=*/true); - std::tie(test_elements_flat, std::ignore) = - at::_unique(test_elements, /*sorted=*/false); - } - - // 2. Stable sort all elements, maintaining order indices to reverse the - // operation. Stable sort is necessary to keep elements before test - // elements within the sorted list. - Tensor all_elements = - at::cat({std::move(elements_flat), std::move(test_elements_flat)}); - auto [sorted_elements, sorted_order] = all_elements.sort( - /*stable=*/true, /*dim=*/0, /*descending=*/false); - - // 3. Create a mask for locations of adjacent duplicate values within the - // sorted list. Duplicate values are in both elements and test elements. - Tensor duplicate_mask = - at::empty_like(sorted_elements, TensorOptions(ScalarType::Bool)); - Tensor sorted_except_first = sorted_elements.slice(0, 1, at::indexing::None); - Tensor sorted_except_last = sorted_elements.slice(0, 0, -1); - duplicate_mask.slice(0, 0, -1).copy_( - invert ? sorted_except_first.ne(sorted_except_last) - : sorted_except_first.eq(sorted_except_last)); - duplicate_mask.index_put_({-1}, invert); - - // 4. Reorder the mask to match the pre-sorted element order. 
- Tensor mask = at::empty_like(duplicate_mask); - mask.index_copy_(0, sorted_order, duplicate_mask); - - // 5. Index the mask to match the pre-unique element order. If - // assume_unique is true, just take the first N items of the mask, - // where N is the original number of elements. - if (assume_unique) { - out.copy_(mask.slice(0, 0, elements.numel()).view_as(out)); - } else { - out.copy_(at::index(mask, {std::optional(unique_order)})); - } -} - -void isin_Tensor_Tensor_meta( - const Tensor& elements, - Tensor test_elements, - bool assume_unique, - bool invert, - Tensor& out) { - check_for_unsupported_isin_dtype(elements.scalar_type()); - check_for_unsupported_isin_dtype(test_elements.scalar_type()); - auto output_options = - TensorOptions(elements.device()).dtype(ScalarType::Bool); - if (out.defined()) { - xpu::resize_out(out, elements.sizes(), {}, output_options); - } else { - out = xpu::create_out(elements.sizes(), {}, output_options); - } -} - -void isin_Tensor_Tensor_impl( - const Tensor& elements, - Tensor test_elements, - bool assume_unique, - bool invert, - const Tensor& out) { - if (elements.numel() == 0) { - return; - } - - // Heuristic taken from numpy's implementation. - if (test_elements.numel() < - static_cast( - 10.0f * std::pow(static_cast(elements.numel()), 0.145))) { - out.fill_(invert); - native::xpu::isin_kernel(elements, test_elements, invert, out); - } else { - isin_sorting(elements, test_elements, assume_unique, invert, out); - } -} - -Tensor& XPUNativeFunctions::isin_out( - const Tensor& elements, - const Tensor& test_elements, - bool assume_unique, - bool invert, - Tensor& out) { - isin_Tensor_Tensor_meta(elements, test_elements, assume_unique, invert, out); - isin_Tensor_Tensor_impl(elements, test_elements, assume_unique, invert, out); - return out; -} - -Tensor XPUNativeFunctions::isin( - const Tensor& elements, - const Tensor& test_elements, - bool assume_unique, - bool invert) { - Tensor out; - isin_Tensor_Tensor_meta(elements, test_elements, assume_unique, invert, out); - isin_Tensor_Tensor_impl(elements, test_elements, assume_unique, invert, out); - return out; -} - -void isin_Tensor_Scalar_meta( - const Tensor& elements, - const Scalar& test_elements, - bool assume_unique, - bool invert, - Tensor& out) { - check_for_unsupported_isin_dtype(elements.scalar_type()); - check_for_unsupported_isin_dtype(test_elements.type()); - auto output_options = - TensorOptions(elements.device()).dtype(ScalarType::Bool); - if (out.defined()) { - xpu::resize_out(out, elements.sizes(), {}, output_options); - } else { - out = xpu::create_out(elements.sizes(), {}, output_options); - } -} - -void isin_Tensor_Scalar_impl( - const Tensor& elements, - const Scalar& test_elements, - bool assume_unique, - bool invert, - const Tensor& out) { - if (invert) { - at::ne_out(const_cast(out), elements, test_elements); - } else { - at::eq_out(const_cast(out), elements, test_elements); - } -} - -Tensor& XPUNativeFunctions::isin_out( - const Tensor& elements, - const Scalar& test_elements, - bool assume_unique, - bool invert, - Tensor& out) { - isin_Tensor_Scalar_meta(elements, test_elements, assume_unique, invert, out); - isin_Tensor_Scalar_impl(elements, test_elements, assume_unique, invert, out); - return out; -} - -Tensor XPUNativeFunctions::isin( - const Tensor& elements, - const Scalar& test_elements, - bool assume_unique, - bool invert) { - Tensor out; - isin_Tensor_Scalar_meta(elements, test_elements, assume_unique, invert, out); - isin_Tensor_Scalar_impl(elements, test_elements, 
assume_unique, invert, out); - return out; -} - -void isin_Scalar_Tensor_meta( - const Scalar& elements, - const Tensor& test_elements, - bool assume_unique, - bool invert, - Tensor& out) { - check_for_unsupported_isin_dtype(elements.type()); - check_for_unsupported_isin_dtype(test_elements.scalar_type()); - auto output_options = - TensorOptions(test_elements.device()).dtype(ScalarType::Bool); - if (out.defined()) { - xpu::resize_out(out, {0}, {}, output_options); - } else { - out = xpu::create_out({0}, {}, output_options); - } -} - -void isin_Scalar_Tensor_impl( - const Scalar& elements, - const Tensor& test_elements, - bool assume_unique, - bool invert, - const Tensor& out) { - // redispatch - at::isin_out( - const_cast(out), - at::native::wrapped_scalar_tensor(elements, test_elements.device()), - test_elements, - assume_unique, - invert); -} - -Tensor& XPUNativeFunctions::isin_out( - const Scalar& elements, - const Tensor& test_elements, - bool assume_unique, - bool invert, - Tensor& out) { - isin_Scalar_Tensor_meta(elements, test_elements, assume_unique, invert, out); - isin_Scalar_Tensor_impl(elements, test_elements, assume_unique, invert, out); - return out; -} - -Tensor XPUNativeFunctions::isin( - const Scalar& elements, - const Tensor& test_elements, - bool assume_unique, - bool invert) { - Tensor out; - isin_Scalar_Tensor_meta(elements, test_elements, assume_unique, invert, out); - isin_Scalar_Tensor_impl(elements, test_elements, assume_unique, invert, out); - return out; -} - +REGISTER_XPU_DISPATCH(where_kernel, &xpu::where_kernel); +REGISTER_XPU_DISPATCH(clamp_min_scalar_stub, &xpu::clamp_min_scalar_kernel); +REGISTER_XPU_DISPATCH(clamp_max_scalar_stub, &xpu::clamp_max_scalar_kernel); +REGISTER_XPU_DISPATCH(clamp_scalar_stub, &xpu::clamp_scalar_kernel); +REGISTER_XPU_DISPATCH(clamp_stub, &xpu::clamp_kernel); +REGISTER_XPU_DISPATCH(max_stub, &xpu::max_kernel_impl); +REGISTER_XPU_DISPATCH(min_stub, &xpu::min_kernel_impl) +REGISTER_XPU_DISPATCH(isin_default_stub, &xpu::isin_kernel); +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/TensorFactories.cpp b/src/ATen/native/xpu/TensorFactories.cpp index 44da487f7..62e9fdfce 100644 --- a/src/ATen/native/xpu/TensorFactories.cpp +++ b/src/ATen/native/xpu/TensorFactories.cpp @@ -1,16 +1,11 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include -#include #include -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else #include #include -#endif +#include #include #include @@ -18,11 +13,12 @@ namespace at { -Tensor& XPUNativeFunctions::eye_out(int64_t n, Tensor& result) { - return XPUNativeFunctions::eye_out(n, n, result); -} +namespace native { + +REGISTER_XPU_DISPATCH(complex_stub, &xpu::complex_kernel); +REGISTER_XPU_DISPATCH(polar_stub, &xpu::polar_kernel); -Tensor& XPUNativeFunctions::eye_out(int64_t n, int64_t m, Tensor& result) { +Tensor& eye_out_xpu(int64_t n, int64_t m, Tensor& result) { TORCH_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); TORCH_CHECK(m >= 0, "m must be greater or equal to 0, got ", m); @@ -37,7 +33,11 @@ Tensor& XPUNativeFunctions::eye_out(int64_t n, int64_t m, Tensor& result) { return result; } -Tensor XPUNativeFunctions::empty( +Tensor& eye_out_xpu(int64_t n, Tensor& result) { + return eye_out_xpu(n, n, result); +} + +Tensor empty_xpu( IntArrayRef size, c10::optional dtype_opt, c10::optional layout_opt, @@ -60,7 +60,7 @@ Tensor XPUNativeFunctions::empty( return result; } -Tensor XPUNativeFunctions::empty_strided( +Tensor empty_strided_xpu( IntArrayRef size, 
IntArrayRef stride, c10::optional dtype_opt, @@ -78,13 +78,7 @@ Tensor XPUNativeFunctions::empty_strided( return result; } -Tensor XPUNativeFunctions::clone( - const Tensor& self, - c10::optional memory_format) { - return at::native::clone(self, memory_format); -} - -Tensor XPUNativeFunctions::_efficientzerotensor( +Tensor _efficientzerotensor_xpu( IntArrayRef size, std::optional dtype, std::optional layout, @@ -103,70 +97,7 @@ Tensor XPUNativeFunctions::_efficientzerotensor( return out; } -static void complex_check_floating(const Tensor& a, const Tensor& b) { - TORCH_CHECK( - (a.scalar_type() == kFloat || a.scalar_type() == kDouble || - a.scalar_type() == kHalf) && - (b.scalar_type() == kFloat || b.scalar_type() == kDouble || - b.scalar_type() == kHalf), - "Expected both inputs to be Half, Float or Double tensors but got ", - a.scalar_type(), - " and ", - b.scalar_type()); -} - -static void complex_check_dtype( - const Tensor& result, - const Tensor& a, - const Tensor& b) { - complex_check_floating(a, b); - TORCH_CHECK( - a.scalar_type() == b.scalar_type(), - "Expected object of scalar type ", - a.scalar_type(), - " but got scalar type ", - b.scalar_type(), - " for second argument"); - TORCH_CHECK( - result.scalar_type() == toComplexType(a.scalar_type()), - "Expected object of scalar type ", - toComplexType(a.scalar_type()), - " but got scalar type ", - result.scalar_type(), - " for argument 'out'"); -} - -Tensor& XPUNativeFunctions::complex_out( - const Tensor& real, - const Tensor& imag, - Tensor& result) { - complex_check_dtype(result, real, imag); - auto iter = TensorIteratorConfig() - .add_output(result) - .add_const_input(real) - .add_const_input(imag) - .check_all_same_dtype(false) - .build(); - native::xpu::complex_kernel(iter); - return result; -} - -Tensor& XPUNativeFunctions::polar_out( - const Tensor& abs, - const Tensor& angle, - Tensor& result) { - complex_check_dtype(result, abs, angle); - auto iter = TensorIteratorConfig() - .add_output(result) - .add_const_input(abs) - .add_const_input(angle) - .check_all_same_dtype(false) - .build(); - native::xpu::polar_kernel(iter); - return result; -} - -Tensor& XPUNativeFunctions::randperm_out( +Tensor& randperm_out_xpu( int64_t n, c10::optional generator, Tensor& result) { @@ -183,4 +114,5 @@ Tensor& XPUNativeFunctions::randperm_out( return result; } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/TensorProperties.cpp b/src/ATen/native/xpu/TensorProperties.cpp index 428d18fcd..ca35f0a41 100644 --- a/src/ATen/native/xpu/TensorProperties.cpp +++ b/src/ATen/native/xpu/TensorProperties.cpp @@ -1,16 +1,2 @@ -#include -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else -#include -#endif - -namespace at { - -bool XPUNativeFunctions::is_set_to(const Tensor& self, const Tensor& src) { - return at::native::is_set_to(self, src); -} - -} // namespace at +namespace at {} // namespace at diff --git a/src/ATen/native/xpu/TensorShape.cpp b/src/ATen/native/xpu/TensorShape.cpp index 84c005121..0e5ee6e62 100644 --- a/src/ATen/native/xpu/TensorShape.cpp +++ b/src/ATen/native/xpu/TensorShape.cpp @@ -7,31 +7,22 @@ #include #include #include -#include #include - -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else -#include -#endif +#include +#include +#include namespace at { -Tensor XPUNativeFunctions::view(const Tensor& self, IntArrayRef size) { - return at::native::view(self, size); -} +namespace native { -Tensor XPUNativeFunctions::view_as_real(const at::Tensor& self) { - return at::native::view_as_real(self); 
-} - -Tensor XPUNativeFunctions::view_as_complex(const Tensor& self) { - return at::native::view_as_complex(self); -} +TORCH_API at::Tensor as_strided_qtensorimpl( + const at::Tensor& self, + at::IntArrayRef size, + at::IntArrayRef stride, + ::std::optional storage_offset); -Tensor XPUNativeFunctions::as_strided( +Tensor as_strided_xpu( const Tensor& self, IntArrayRef size, IntArrayRef stride, @@ -43,227 +34,20 @@ Tensor XPUNativeFunctions::as_strided( return at::native::as_strided_tensorimpl(self, size, stride, storage_offset); } -Tensor XPUNativeFunctions::_reshape_alias( - const Tensor& self, - IntArrayRef size, - IntArrayRef stride) { - return at::native::_reshape_alias(self, size, stride); -} - -Tensor XPUNativeFunctions::unfold( - const Tensor& self, - int64_t dimension, - int64_t size, - int64_t step) { - return at::native::unfold(self, dimension, size, step); -} - -inline c10::MemoryFormat cat_compute_output_memory_format( - const MaterializedITensorListRef& inputs) { - c10::optional format = c10::nullopt; - for (const Tensor& t : inputs) { - auto f = t.suggest_memory_format(); - if (f == c10::MemoryFormat::Contiguous) { - return f; - } - if (format.has_value() && format.value() != f) { - return c10::MemoryFormat::Contiguous; - } - format = f; - } - return format.value(); -} - -inline void cat_check_no_zero_dim(const MaterializedITensorListRef& tensors) { - size_t i = 0; - for (const Tensor& t : tensors) { - TORCH_CHECK( - t.dim() > 0, - "zero-dimensional tensor (at position ", - i, - ") cannot be concatenated"); - i++; - } -} - -void cat_meta( - const ITensorListRef& tensors, - int64_t& dim, - Tensor& result, - size_t& valid, - bool& all_contiguous, - bool& all_same_dtype, - bool& all_same_sizes_and_stride, - c10::MemoryFormat& memory_format) { - // previously, size [0] tensors were the only possible empty tensors; thus, it - // wasn't possible to cat empty tensors unless all the other tensors were - // 1-dimensional, so we allowed these tensors to be "skipped". We maintain - // this behavior for backwards compatibility, but only for this specific size - // (i.e. other empty sizes are not skipped). - auto materialized = tensors.materialize(); - - cat_check_no_zero_dim(materialized); - dim = at::legacy_cat_wrap_dim(dim, materialized); - - // Checking names before the actual dimensions. - auto maybe_outnames = namedinference::compute_cat_outnames(materialized); - - TORCH_CHECK( - !materialized.empty(), - "torch.cat(): expected a non-empty list of Tensors"); - - // Look for the first valid tensor. - valid = materialized.size(); - for (const auto i : c10::irange(materialized.size())) { - if (!at::native::cat_should_skip_tensor(materialized[i].get())) { - valid = i; - break; - } - } - - all_contiguous = true; - all_same_dtype = true; - all_same_sizes_and_stride = true; - memory_format = cat_compute_output_memory_format(materialized); - - // Compute what the output dtype should be: - auto is_out_defined = result.defined(); - auto out_dtype = at::native::result_type(tensors); - - // If the output tensor is defined, we need to take it into account - // when computing the actual output dtype and the flags. - if (is_out_defined) { - // Check for type promotion, if the output tensor is defined. - TORCH_CHECK( - canCast(out_dtype, result.scalar_type()), - "torch.cat(): input types can't be cast to the desired output type ", - result.scalar_type()); - out_dtype = result.scalar_type(); - all_contiguous = result.is_contiguous(memory_format); - } - - // Fallback 'set_output' parameters. 
- // (in case we don't find a valid tensor) - DimVector sizes{0}; - TensorOptions options = - materialized[0].get().options().dtype(out_dtype).memory_format( - memory_format); - - // If we found a valid tensor, check whether the input tensors - // are compatible, i.e. we can execute `cat` on them. - bool found_valid_tensor = valid < materialized.size(); - if (found_valid_tensor) { - TORCH_CHECK( - dim <= materialized[valid].get().dim(), - "torch.cat(): dimension ", - dim, - "out of range"); - - // Compute the output tensor size. - // It should have the same shape as any other valid tensor, - // except in the dimension 'dim'. - size_t size_at_dim = 0; - for (const auto i : c10::irange(materialized.size())) { - const Tensor& t = materialized[i]; - all_same_dtype = all_same_dtype && out_dtype == t.scalar_type(); - if (!at::native::cat_should_skip_tensor(t)) { - at::native::check_cat_shape_except_dim(materialized[valid], t, dim, i); - size_at_dim += t.size(dim); - all_contiguous = all_contiguous && t.is_contiguous(memory_format); - all_same_sizes_and_stride = all_same_sizes_and_stride && - t.sizes() == materialized[valid].get().sizes() && - t.strides() == materialized[valid].get().strides(); - } else { - all_contiguous = false; - } - } - - // Actually set the output. - sizes = materialized[valid].get().sizes().vec(); - sizes[dim] = size_at_dim; - options = - materialized[valid].get().options().dtype(out_dtype).memory_format( - memory_format); - } - - if (is_out_defined) { - at::xpu::resize_out(result, sizes, {}, options); - } else { - result = at::xpu::create_out(sizes, {}, options); +TORCH_IMPL_FUNC(cat_out_xpu) +(const ITensorListRef& tensors, + int64_t dim, + int64_t valid, + bool all_contiguous, + bool all_same_dtype, + bool all_same_sizes_and_stride, + MemoryFormat memory_format, + const Tensor& result) { + if (result.numel() == 0) { + return; } - if (!maybe_outnames.empty()) { - namedinference::propagate_names(result, maybe_outnames); - } - // Checks for overlaps between the inputs and the output tensor. 
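  // (Note: everything this helper computes -- the promoted dtype, the chosen
  // memory format, the output sizes and the overlap checks right below -- is
  // what the structured meta function for cat already provides to every
  // backend. That is why it can be deleted here: the new
  // TORCH_IMPL_FUNC(cat_out_xpu) further down receives those values
  // precomputed and only has to launch xpu::cat_out_kernel on the
  // already-resized result.)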
- if (is_out_defined && found_valid_tensor) { - at::assert_no_internal_overlap(result); - for (const Tensor& t : materialized) { - at::assert_no_overlap(result, t); - } - } -} - -Tensor& XPUNativeFunctions::cat_out( - const ITensorListRef& tensors, - int64_t dim, - Tensor& result) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, result, "xpu::cat_out", "out"); - c10::impl::check_and_update_common_device( - common_device, tensors, "xpu::cat_out", "tensors"); - - size_t valid; - bool all_contiguous; - bool all_same_dtype; - bool all_same_sizes_and_stride; - c10::MemoryFormat memory_format; - cat_meta( - tensors, - dim, - result, - valid, - all_contiguous, - all_same_dtype, - all_same_sizes_and_stride, - memory_format); - - at::native::xpu::cat_out_kernel( - tensors, - dim, - valid, - all_contiguous, - all_same_dtype, - all_same_sizes_and_stride, - memory_format, - result); - - return result; -} - -Tensor XPUNativeFunctions::cat(const ITensorListRef& tensors, int64_t dim) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, tensors, "xpu::cat", "tensors"); - - Tensor result; - size_t valid; - bool all_contiguous; - bool all_same_dtype; - bool all_same_sizes_and_stride; - c10::MemoryFormat memory_format; - cat_meta( - tensors, - dim, - result, - valid, - all_contiguous, - all_same_dtype, - all_same_sizes_and_stride, - memory_format); - - at::native::xpu::cat_out_kernel( + xpu::cat_out_kernel( tensors, dim, valid, @@ -272,8 +56,7 @@ Tensor XPUNativeFunctions::cat(const ITensorListRef& tensors, int64_t dim) { all_same_sizes_and_stride, memory_format, result); - - return result; } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/TensorTopK.cpp b/src/ATen/native/xpu/TensorTopK.cpp index 3961160bf..ab3fc5250 100644 --- a/src/ATen/native/xpu/TensorTopK.cpp +++ b/src/ATen/native/xpu/TensorTopK.cpp @@ -2,55 +2,22 @@ #include #include #include -#include -#include - -namespace at { - -void topk_meta( - const Tensor& self, - int64_t k, - int64_t dim_, - bool largest, - bool sorted, - Tensor& values, - Tensor& indices) { - int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true); - TORCH_CHECK( - k >= 0 && k <= (self.dim() > 0 ? self.size(dim) : 1), - "selected index k out of range"); - int64_t sliceSize = self.dim() == 0 ? 1 : self.size(dim); - TORCH_CHECK(k >= 0 && k <= sliceSize, "k not in range for dimension"); - // Build the output size, which is the dim being selected set to - // size k - DimVector topKSize(self.sizes().vec()); - if (!topKSize.empty()) { - topKSize[dim] = k; - } +#include - if (values.defined()) { - at::xpu::resize_out(values, topKSize, {}, self.options()); - } else { - values = at::xpu::create_out(topKSize, {}, self.options()); - } +#include - if (indices.defined()) { - at::xpu::resize_out(indices, topKSize, {}, self.options().dtype(at::kLong)); - } else { - indices = - at::xpu::create_out(topKSize, {}, self.options().dtype(at::kLong)); - } -} +namespace at { -void topk_out_impl( - const Tensor& self, - int64_t k, - int64_t dim_, - bool largest, - bool sorted, - Tensor& values, - Tensor& indices) { +namespace native { +TORCH_IMPL_FUNC(topk_out_xpu) +(const Tensor& self, + int64_t k, + int64_t dim_, + bool largest, + bool sorted, + const Tensor& values, + const Tensor& indices) { int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true); TORCH_CHECK( k >= 0 && k <= (self.dim() > 0 ? 
self.size(dim) : 1), @@ -69,30 +36,6 @@ void topk_out_impl( native::xpu::topk_kernel(self, k, dim, largest, sorted, values, indices); } } - -std::tuple XPUNativeFunctions::topk( - const Tensor& self, - int64_t k, - int64_t dim, - bool largest, - bool sorted) { - Tensor values, indices; - topk_meta(self, k, dim, largest, sorted, values, indices); - topk_out_impl(self, k, dim, largest, sorted, values, indices); - return std::tuple(values, indices); -} - -std::tuple XPUNativeFunctions::topk_out( - const Tensor& self, - int64_t k, - int64_t dim, - bool largest, - bool sorted, - Tensor& values, - Tensor& indices) { - topk_meta(self, k, dim, largest, sorted, values, indices); - topk_out_impl(self, k, dim, largest, sorted, values, indices); - return std::forward_as_tuple(values, indices); -} +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/TensorTransformations.cpp b/src/ATen/native/xpu/TensorTransformations.cpp index 2ac3bee4f..eb6950e74 100644 --- a/src/ATen/native/xpu/TensorTransformations.cpp +++ b/src/ATen/native/xpu/TensorTransformations.cpp @@ -1,88 +1,18 @@ #include #include #include +#include +#include #include -#include - #include +#include namespace at { +namespace native { -Tensor XPUNativeFunctions::flip(const Tensor& self, IntArrayRef dims) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, self, "xpu::flip", "self"); - - const int64_t total_dims = self.dim(); - // It wraps the dims and checks that there are no repeated dims - auto flip_dims_b = at::dim_list_to_bitset(dims, total_dims); - - Tensor out_tensor = at::empty_like(self, MemoryFormat::Preserve); - - // Count dimensions in which we need to do work - int n = 0; - auto strides = DimVector(self.strides()); - for (int64_t i = 0; i < total_dims; i++) { - if (flip_dims_b[i] && self.size(i) > 1 && self.stride(i) != 0) { - n++; - strides[i] = 0; - } - } - - // Nothing to do, we return fast - if (n == 0 || self.numel() <= 1) { - out_tensor.copy_(self); - return out_tensor; - } - - // create dummy output with 0 strides at flipped dimension, to prevent - // tensorIterator from coalescing flipped dims - const auto restrided_self = self.as_strided(self.sizes(), strides); - auto iter = - TensorIteratorConfig() - .set_check_mem_overlap(false) - .check_all_same_dtype(false) - .declare_static_dtype_and_device(self.scalar_type(), self.device()) - .add_output(out_tensor) - .add_input(self) - .add_input(restrided_self) - .build(); - - auto* data = reinterpret_cast(iter.data_ptr(0)); - const auto sizes = iter.shape(); - // This is a SmallVector of _signed_ ints - auto strides_bytes = DimVector(iter.strides(0)); - const auto strides_self = iter.strides(1); - const auto strides_dummy = iter.strides(2); - - // To understand this transformation, think of a 3D cube. - // - The data ptr points to the lower-left most vertex of the cube - // - The strides tell us how to move in each dimension, - // that is, data + stride[i] advances one element in the dimension i - // To flip a dimension: - // - We move the pointer to the opposite vertex of the cube - // - We iterate in the opposite direction (invert the strides) - for (int i = 0; i < iter.ndim(); i++) { - // We know that an dimension has a zero stride and self[i] does not, as we - // defined above Note that it may be the case that strides_dummy[i] = 0 - // not because we set it, but because strides_self[i] == 0. 
We do not want - // to do anything there - if (strides_dummy[i] == 0 && strides_self[i] != 0) { - data += strides_bytes[i] * (sizes[i] - 1); - strides_bytes[i] *= -1; - } - } - iter._unsafe_set_arg_strides(0, strides_bytes); - iter._unsafe_set_arg_data(0, reinterpret_cast(data)); - - at::native::xpu::flip_kernel(iter); - return out_tensor; -} +REGISTER_XPU_DISPATCH(flip_stub, &xpu::flip_kernel); -Tensor XPUNativeFunctions::roll( - const Tensor& self, - IntArrayRef shifts, - IntArrayRef dims) { +Tensor roll_xpu(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { if (dims.size() != 1 || shifts.size() != 1) { return at::native::roll_common(self, shifts, dims); } @@ -96,9 +26,10 @@ Tensor XPUNativeFunctions::roll( return out_tensor; } - native::xpu::roll_kernel(in_tensor, out_tensor, shifts, dims); + xpu::roll_kernel(in_tensor, out_tensor, shifts, dims); return out_tensor; } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/TriangluarOps.cpp b/src/ATen/native/xpu/TriangluarOps.cpp index affba5665..3db5e967b 100644 --- a/src/ATen/native/xpu/TriangluarOps.cpp +++ b/src/ATen/native/xpu/TriangluarOps.cpp @@ -1,77 +1,30 @@ -#include + #include #include #include -#include #include +#include -namespace at { - -void tril_meta(const Tensor& self, int64_t k) { - TORCH_CHECK( - self.dim() >= 2, "tril: input tensor must have at least 2 dimensions"); -} - -Tensor& XPUNativeFunctions::tril_out( - const Tensor& self, - int64_t diagonal, - Tensor& out) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, out, "xpu::tril_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::tril_out", "self"); - tril_meta(self, diagonal); - xpu::resize_out(out, self.sizes(), {}, self.options()); - return native::xpu::tril_kernel(out, self, diagonal); -} - -Tensor XPUNativeFunctions::tril(const Tensor& self, int64_t diagonal) { - tril_meta(self, diagonal); - Tensor out = xpu::create_out(self.sizes(), {}, self.options()); - return tril_out(self, diagonal, out); -} - -Tensor& XPUNativeFunctions::tril_(Tensor& self, int64_t diagonal) { - tril_meta(self, diagonal); - xpu::check_inplace(self, self.sizes(), self.options()); - return tril_out(self, diagonal, self); -} +#include +#include -void triu_meta(const Tensor& self, int64_t k) { - TORCH_CHECK( - self.dim() >= 2, "triu: input tensor must have at least 2 dimensions"); -} - -Tensor& XPUNativeFunctions::triu_out( - const Tensor& self, - int64_t diagonal, - Tensor& out) { - std::optional common_device = std::nullopt; - c10::impl::check_and_update_common_device( - common_device, out, "xpu::triu_out", "out"); - c10::impl::check_and_update_common_device( - common_device, self, "xpu::triu_out", "self"); - triu_meta(self, diagonal); - xpu::resize_out(out, self.sizes(), {}, self.options()); - return native::xpu::triu_kernel(out, self, diagonal); -} +namespace at::native { -Tensor XPUNativeFunctions::triu(const Tensor& self, int64_t diagonal) { - triu_meta(self, diagonal); - Tensor out = xpu::create_out(self.sizes(), {}, self.options()); - return triu_out(self, diagonal, out); +TORCH_IMPL_FUNC(tril_xpu)(const Tensor& self, int64_t k, const Tensor& result) { + if (self.numel() != 0) { + xpu::tril_kernel(result, self, k); + } } -Tensor& XPUNativeFunctions::triu_(Tensor& self, int64_t diagonal) { - triu_meta(self, diagonal); - xpu::check_inplace(self, self.sizes(), self.options()); - return triu_out(self, diagonal, self); +TORCH_IMPL_FUNC(triu_xpu)(const Tensor& self, 
int64_t k, const Tensor& result) { + if (self.numel() != 0) { + xpu::triu_kernel(result, self, k); + } } -Tensor XPUNativeFunctions::trace(const Tensor& self) { +Tensor trace_xpu(const Tensor& self) { TORCH_CHECK(self.dim() == 2, "expected a matrix"); return self.diagonal().sum(); } -} // namespace at +} // namespace at::native \ No newline at end of file diff --git a/src/ATen/native/xpu/UnaryOps.cpp b/src/ATen/native/xpu/UnaryOps.cpp index f1bb12ba3..119b7bab9 100644 --- a/src/ATen/native/xpu/UnaryOps.cpp +++ b/src/ATen/native/xpu/UnaryOps.cpp @@ -1,8 +1,10 @@ #include #include + +#include #include #include -#include +#include #include #include @@ -20,1153 +22,61 @@ #include #include #include + #include #include #include #include -namespace at { - -template -static inline Tensor& unary_op_impl_out( - Tensor& result, - const Tensor& self, - Stub& stub) { - auto iter = TensorIterator::unary_op(result, self); - stub(iter); - return result; -} - -template -static inline Tensor& unary_op_impl_float_out( - Tensor& result, - const Tensor& self, - Stub& stub, - Args... args) { - auto iter = TensorIterator::unary_float_op(result, self); - stub(iter, args...); - iter.cast_outputs(); - return result; -} - -template -static inline Tensor& unary_op_impl_with_complex_to_float_out( - Tensor& result, - const Tensor& self, - Stub& stub, - bool promotes_integer_to_float) { - if (self.is_complex() && !result.is_complex()) { - // Checks if the corresponding float type can be cast to the desired dtype - const auto float_type = c10::toRealValueType(self.scalar_type()); - TORCH_CHECK( - canCast(float_type, result.scalar_type()), - "result type ", - float_type, - " can't be cast to the desired output type ", - result.scalar_type()); - - // Runs the function complex->complex, as TensorIterator expects - Tensor complex_result = at::empty({0}, self.options()); - auto iter = TensorIterator::unary_op(complex_result, self); - stub(iter); - - // Copies the complex result to the actual result and returns it - at::native::resize_output(result, complex_result.sizes()); - result.copy_(at::real(complex_result)); - return result; - } - - if (promotes_integer_to_float) { - return unary_op_impl_float_out(result, self, stub); - } - - return unary_op_impl_out(result, self, stub); -} - -// out_impl passed into unary_op_impl and unary_op_impl_ must go through at:: -// device dispatch otherwise it won't dispatch to out-of-source devices like -// XLA. For example it must be at::bitwise_not_out instead of -// bitwise_not_out(which is at::native!). -template -static inline Tensor unary_op_impl(const Tensor& self, OutImpl& out_impl) { - Tensor result = at::empty({0}, self.options()); - return out_impl(result, self); -} - -// An alternate version of unary_op_impl that follows the same pattern -// for non-complex inputs, but returns a floating point tensor -// for complex inputs by default. 
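// A rough sketch of the pattern that replaces all of these hand-rolled
// wrappers (simplified from the usual ATen DispatchStub machinery; the exact
// upstream declarations may differ):
//
//   // In ATen's UnaryOps.h (upstream):
//   using unary_fn = void (*)(TensorIteratorBase&);
//   DECLARE_DISPATCH(unary_fn, abs_stub);
//
//   // The shared at::native wrapper builds the TensorIterator and calls
//   //   abs_stub(iter.device_type(), iter);
//   // so once this file registers
//   //   REGISTER_XPU_DISPATCH(abs_stub, &xpu::abs_kernel);
//   // (see the block at the end of this file), XPU tensors are routed to
//   // xpu::abs_kernel without any per-operator glue code here.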
-template -static inline Tensor unary_op_impl_with_complex_to_float( - const Tensor& self, - OutImpl& out_impl) { - if (self.is_complex()) { - const auto float_type = c10::toRealValueType(self.scalar_type()); - Tensor result = at::empty_like(self, self.options().dtype(float_type)); - return out_impl(result, self); - } - - Tensor result = at::empty({0}, self.options()); - return out_impl(result, self); -} - -template -static inline Tensor& unary_op_impl_(Tensor& self, OutImpl& out_impl) { - return out_impl(self, self); -} - -Tensor XPUNativeFunctions::abs(const Tensor& self) { - return unary_op_impl_with_complex_to_float(self, at::abs_out); -} - -Tensor& XPUNativeFunctions::abs_(Tensor& self) { - TORCH_CHECK( - !self.is_complex(), "In-place abs is not supported for complex tensors."); - return unary_op_impl_(self, at::abs_out); -} - -Tensor& XPUNativeFunctions::abs_out(const Tensor& self, Tensor& out) { - return unary_op_impl_with_complex_to_float_out( - out, - self, - native::xpu::abs_kernel, - /*promotes_integer_to_float=*/false); -} - -Tensor XPUNativeFunctions::sin(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::sin_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::sin_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::sin_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::sin_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::sin_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::cos(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::cos_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::cos_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::cos_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::cos_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::cos_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::digamma(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::digamma_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::digamma_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::digamma_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::digamma_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::digamma_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::polygamma(int64_t n, const Tensor& self) { - TORCH_CHECK(n >= 0, "polygamma(n, x) does not support negative n."); - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::polygamma_kernel(iter, n); - return iter.output(); -} - -Tensor& XPUNativeFunctions::polygamma_out( - int64_t n, - const Tensor& self, - Tensor& out) { - TORCH_CHECK(n >= 0, "polygamma(n, x) does not support negative n."); - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::polygamma_kernel(iter, n); - return out; -} - -Tensor& XPUNativeFunctions::polygamma_(Tensor& self, int64_t n) { - return polygamma_out(n, self, self); -} - -Tensor 
XPUNativeFunctions::lgamma(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::lgamma_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::lgamma_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::lgamma_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::lgamma_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::lgamma_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::log(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::log_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::log_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::log_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::log_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::log_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::log10(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::log10_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::log10_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::log10_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::log10_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::log10_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::log1p(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::log1p_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::log1p_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::log1p_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::log1p_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::log1p_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::log2(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::log2_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::log2_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::log2_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::log2_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::log2_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::sqrt(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::sqrt_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::sqrt_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::sqrt_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::sqrt_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::sqrt_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::rsqrt(const 
Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::rsqrt_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::rsqrt_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::rsqrt_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::rsqrt_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::rsqrt_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::tanh(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::tanh_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::tanh_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::tanh_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::tanh_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::tanh_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::neg(const Tensor& self) { - TORCH_CHECK( - self.scalar_type() != kBool, - "Negation, the `-` operator, on a bool tensor is not supported. " - "If you are trying to invert a mask, use the `~` or `logical_not()` operator instead."); - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - native::xpu::neg_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::neg_(Tensor& self) { - TORCH_CHECK( - self.scalar_type() != kBool, - "Negation, the `-` operator, on a bool tensor is not supported. " - "If you are trying to invert a mask, use the `~` or `logical_not()` operator instead."); - TensorIterator iter; - iter.build_borrowing_unary_op(self, self); - native::xpu::neg_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::neg_out(const Tensor& self, Tensor& out) { - TORCH_CHECK( - self.scalar_type() != kBool, - "Negation, the `-` operator, on a bool tensor is not supported. 
" - "If you are trying to invert a mask, use the `~` or `logical_not()` operator instead."); - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - native::xpu::neg_kernel(iter); - return out; -} - -TensorIterator logical_not_meta(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build(TensorIteratorConfig() - .check_all_same_dtype(false) - .add_output(out) - .add_const_input(self)); - return iter; -} - -Tensor XPUNativeFunctions::logical_not(const Tensor& self) { - Tensor out = at::empty({0}, self.options().dtype(kBool)); - return at::logical_not_out(out, self); -} - -Tensor& XPUNativeFunctions::logical_not_(Tensor& self) { - return at::logical_not_out(self, self); -} - -Tensor& XPUNativeFunctions::logical_not_out(const Tensor& self, Tensor& out) { - auto iter = logical_not_meta(self, out); - native::xpu::logical_not_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::reciprocal(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::reciprocal_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::reciprocal_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::reciprocal_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::reciprocal_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::reciprocal_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::bitwise_not_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - native::xpu::bitwise_not_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::exp(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::exp_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::exp_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::exp_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::exp_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::exp_kernel(iter); - return self; -} - -Tensor XPUNativeFunctions::sigmoid(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::sigmoid_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::sigmoid_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::sigmoid_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::sigmoid_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::sigmoid_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::sign(const Tensor& self) { - TORCH_CHECK( - !self.is_complex(), - "Unlike NumPy, torch.sign is not intended to support complex numbers. Please use torch.sgn instead."); - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - native::xpu::sign_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::sign_(Tensor& self) { - TORCH_CHECK( - !self.is_complex(), - "Unlike NumPy, torch.sign is not intended to support complex numbers. 
Please use torch.sgn instead."); - TensorIterator iter; - iter.build_borrowing_unary_op(self, self); - native::xpu::sign_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::sign_out(const Tensor& self, Tensor& out) { - TORCH_CHECK( - !self.is_complex(), - "Unlike NumPy, torch.sign is not intended to support complex numbers. Please use torch.sgn instead."); - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - native::xpu::sign_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::signbit(const Tensor& self) { - TORCH_CHECK( - !self.is_complex(), "signbit is not implemented for complex tensors."); - - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_force_boolean_op(out, self); - - if (self.dtype() == at::kBool) { - iter.output().fill_(false); - } else { - native::xpu::signbit_kernel(iter); - } - return iter.output(); -} - -Tensor& XPUNativeFunctions::signbit_out(const Tensor& self, Tensor& out) { - TORCH_CHECK( - !self.is_complex(), "signbit is not implemented for complex tensors."); - TORCH_CHECK( - out.dtype() == at::kBool, - "signbit does not support non-boolean outputs."); - - TensorIterator iter; - iter.build_borrowing_unary_force_boolean_op(out, self); - - if (self.dtype() == at::kBool) { - out.fill_(false); - } else { - native::xpu::signbit_kernel(iter); - } - return out; -} - -Tensor& XPUNativeFunctions::logit_out( - const Tensor& self, - std::optional eps, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::logit_kernel(iter, Scalar(eps ? eps.value() : -1.0)); - return out; -} - -Tensor XPUNativeFunctions::logit( - const Tensor& self, - std::optional eps) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::logit_kernel(iter, Scalar(eps ? 
eps.value() : -1.0)); - return iter.output(); -} - -Tensor& XPUNativeFunctions::logit_(Tensor& self, std::optional eps) { - return at::logit_out(self, self, eps); -} - -Tensor XPUNativeFunctions::sgn(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - if (self.is_complex()) { - native::xpu::sgn_kernel(iter); - } else { - native::xpu::sign_kernel(iter); - } - return iter.output(); -} - -Tensor& XPUNativeFunctions::sgn_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_op(self, self); - if (self.is_complex()) { - native::xpu::sgn_kernel(iter); - } else { - native::xpu::sign_kernel(iter); - } - return self; -} - -Tensor& XPUNativeFunctions::sgn_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - if (self.is_complex()) { - native::xpu::sgn_kernel(iter); - } else { - native::xpu::sign_kernel(iter); - } - return out; -} - -Tensor XPUNativeFunctions::acos(const Tensor& self) { - Tensor out; - TensorIterator iter; - - iter.build_borrowing_unary_float_op(out, self); - native::xpu::acos_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::acos_(Tensor& self) { - TensorIterator iter; - - iter.build_borrowing_unary_float_op(self, self); - native::xpu::acos_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::acos_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - - iter.build_borrowing_unary_float_op(out, self); - native::xpu::acos_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::acosh(const Tensor& self) { - Tensor out; - TensorIterator iter; - - iter.build_borrowing_unary_float_op(out, self); - native::xpu::acosh_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::acosh_(Tensor& self) { - TensorIterator iter; - - iter.build_borrowing_unary_float_op(self, self); - native::xpu::acosh_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::acosh_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - - iter.build_borrowing_unary_float_op(out, self); - native::xpu::acosh_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::erf(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::erf_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::erf_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::erf_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::erf_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::erf_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::erfc(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::erfc_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::erfc_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::erfc_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::erfc_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::erfc_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::erfinv(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::erfinv_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::erfinv_(Tensor& 
self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::erfinv_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::erfinv_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::erfinv_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::exp2(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::exp2_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::exp2_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::exp2_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::exp2_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::exp2_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::expm1(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::expm1_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::expm1_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::expm1_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::expm1_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::expm1_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::frac(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - native::xpu::frac_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::frac_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_op(self, self); - native::xpu::frac_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::frac_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - native::xpu::frac_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::sinh(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::sinh_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::sinh_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::sinh_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::sinh_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::sinh_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::asinh(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::asinh_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::asinh_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::asinh_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::asinh_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::asinh_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::asin(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::asin_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::asin_(Tensor& self) { - TensorIterator iter; - 
iter.build_borrowing_unary_float_op(self, self); - native::xpu::asin_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::asin_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::asin_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::tan(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::tan_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::tan_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::tan_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::tan_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::tan_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::atan(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::atan_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::atan_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::atan_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::atan_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::atan_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::atanh(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::atanh_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::atanh_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::atanh_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::atanh_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::atanh_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::cosh(const Tensor& self) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::cosh_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::cosh_(Tensor& self) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(self, self); - native::xpu::cosh_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::cosh_out(const Tensor& self, Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_float_op(out, self); - native::xpu::cosh_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::conj_physical_out(const Tensor& self, Tensor& out) { - auto iter = TensorIterator::unary_op(out, self); - native::xpu::conj_physical_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::conj_physical_(Tensor& self) { - if (!self.is_complex()) - return self; - return XPUNativeFunctions::conj_physical_out(self, self); -} - -TensorIterator ceil_meta(const Tensor& self, Tensor& out) { - TORCH_CHECK(!self.is_complex(), "ceil is not supported for complex inputs"); - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - return iter; -} - -Tensor XPUNativeFunctions::ceil(const Tensor& self) { - if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { - return self.clone(); - } - Tensor out; - auto iter = ceil_meta(self, out); - native::xpu::ceil_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::ceil_(Tensor& self) { - if 
(c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { - return self; - } - auto iter = ceil_meta(self, self); - native::xpu::ceil_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::ceil_out(const Tensor& self, Tensor& out) { - auto iter = ceil_meta(self, out); - - if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { - out.copy_(self); - return out; - } - native::xpu::ceil_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::round(const Tensor& self) { - if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { - return self.clone(); - } - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - native::xpu::round_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::round_(Tensor& self) { - if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { - return self; - } - TensorIterator iter; - iter.build_borrowing_unary_op(self, self); - native::xpu::round_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::round_out(const Tensor& self, Tensor& out) { - if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { - out.copy_(self); - return out; - } - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - native::xpu::round_kernel(iter); - return out; -} - -Tensor XPUNativeFunctions::round(const Tensor& self, int64_t decimals) { - Tensor out; - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - if (decimals != 0) { - native::xpu::round_decimals_kernel(iter, decimals); - } else { - native::xpu::round_kernel(iter); - } - return iter.output(); -} - -Tensor& XPUNativeFunctions::round_(Tensor& self, int64_t decimals) { - TensorIterator iter; - iter.build_borrowing_unary_op(self, self); - if (decimals != 0) { - native::xpu::round_decimals_kernel(iter, decimals); - } else { - native::xpu::round_kernel(iter); - } - return self; -} - -Tensor& XPUNativeFunctions::round_out( - const Tensor& self, - int64_t decimals, - Tensor& out) { - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - if (decimals != 0) { - native::xpu::round_decimals_kernel(iter, decimals); - } else { - native::xpu::round_kernel(iter); - } - return out; -} - -TensorIterator meta_floor(const Tensor& self, Tensor& out) { - // Note: this is consistent with NumPy - TORCH_CHECK(!self.is_complex(), "floor is not supported for complex inputs"); - TensorIterator iter; - iter.build_borrowing_unary_op(out, self); - return iter; -} - -Tensor XPUNativeFunctions::floor(const Tensor& self) { - if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { - return self.clone(); - } - Tensor out; - auto iter = meta_floor(self, out); - native::xpu::floor_kernel(iter); - return iter.output(); -} - -Tensor& XPUNativeFunctions::floor_(Tensor& self) { - if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { - return self; - } - auto iter = meta_floor(self, self); - native::xpu::floor_kernel(iter); - return self; -} - -Tensor& XPUNativeFunctions::floor_out(const Tensor& self, Tensor& out) { - auto iter = meta_floor(self, out); - if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { - out.copy_(self); - return out; - } - - native::xpu::floor_kernel(iter); - return out; -} - -Tensor& XPUNativeFunctions::nan_to_num_out( - const Tensor& self, - std::optional nan, - std::optional pos_inf, - std::optional neg_inf, - Tensor& result) { - TORCH_CHECK( - self.scalar_type() == result.scalar_type(), - "nan_to_num: dtype of out: ", - result.scalar_type(), 
- " should be same as input: ", - self.scalar_type()); - - if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/true)) { - at::native::resize_output(result, self.sizes()); - result.copy_(self); - return result; - } - - auto iter = TensorIterator::unary_op(result, self); - native::xpu::nan_to_num_kernel(iter, nan, pos_inf, neg_inf); - return result; -} +#include +#include +#include +namespace at { +namespace native { +REGISTER_XPU_DISPATCH(abs_stub, &xpu::abs_kernel); +REGISTER_XPU_DISPATCH(sin_stub, &xpu::sin_kernel); +REGISTER_XPU_DISPATCH(cos_stub, &xpu::cos_kernel); +REGISTER_XPU_DISPATCH(digamma_stub, &xpu::digamma_kernel); +REGISTER_XPU_DISPATCH(polygamma_stub, &xpu::polygamma_kernel); +REGISTER_XPU_DISPATCH(lgamma_stub, &xpu::lgamma_kernel); +REGISTER_XPU_DISPATCH(log_stub, &xpu::log_kernel); +REGISTER_XPU_DISPATCH(log10_stub, &xpu::log10_kernel); +REGISTER_XPU_DISPATCH(log1p_stub, &xpu::log1p_kernel); +REGISTER_XPU_DISPATCH(log2_stub, &xpu::log2_kernel); +REGISTER_XPU_DISPATCH(sqrt_stub, &xpu::sqrt_kernel); +REGISTER_XPU_DISPATCH(rsqrt_stub, &xpu::rsqrt_kernel); +REGISTER_XPU_DISPATCH(tanh_stub, &xpu::tanh_kernel); +REGISTER_XPU_DISPATCH(neg_stub, &xpu::neg_kernel); +REGISTER_XPU_DISPATCH(logical_not_stub, &xpu::logical_not_kernel); +REGISTER_XPU_DISPATCH(reciprocal_stub, &xpu::reciprocal_kernel); +REGISTER_XPU_DISPATCH(bitwise_not_stub, &xpu::bitwise_not_kernel); +REGISTER_XPU_DISPATCH(exp_stub, &xpu::exp_kernel); +REGISTER_XPU_DISPATCH(sigmoid_stub, &xpu::sigmoid_kernel); +REGISTER_XPU_DISPATCH(logit_stub, &xpu::logit_kernel); +REGISTER_XPU_DISPATCH(sgn_stub, &xpu::sgn_kernel); +REGISTER_XPU_DISPATCH(sign_stub, &xpu::sign_kernel); +REGISTER_XPU_DISPATCH(signbit_stub, &xpu::signbit_kernel); +REGISTER_XPU_DISPATCH(acos_stub, &xpu::acos_kernel); +REGISTER_XPU_DISPATCH(acosh_stub, &xpu::acosh_kernel); +REGISTER_XPU_DISPATCH(erf_stub, &xpu::erf_kernel); +REGISTER_XPU_DISPATCH(erfc_stub, &xpu::erfc_kernel); +REGISTER_XPU_DISPATCH(erfinv_stub, &xpu::erfinv_kernel); +REGISTER_XPU_DISPATCH(exp2_stub, &xpu::exp2_kernel); +REGISTER_XPU_DISPATCH(expm1_stub, &xpu::expm1_kernel); +REGISTER_XPU_DISPATCH(frac_stub, &xpu::frac_kernel); +REGISTER_XPU_DISPATCH(conj_physical_stub, &xpu::conj_physical_kernel); +REGISTER_XPU_DISPATCH(ceil_stub, &xpu::ceil_kernel); +REGISTER_XPU_DISPATCH(sinh_stub, &xpu::sinh_kernel); +REGISTER_XPU_DISPATCH(asinh_stub, &xpu::asinh_kernel); +REGISTER_XPU_DISPATCH(asin_stub, &xpu::asin_kernel); +REGISTER_XPU_DISPATCH(tan_stub, &xpu::tan_kernel); +REGISTER_XPU_DISPATCH(atan_stub, &xpu::atan_kernel); +REGISTER_XPU_DISPATCH(atanh_stub, &xpu::atanh_kernel); +REGISTER_XPU_DISPATCH(cosh_stub, &xpu::cosh_kernel); +REGISTER_XPU_DISPATCH(nan_to_num_stub, &xpu::nan_to_num_kernel); +REGISTER_XPU_DISPATCH(round_stub, &xpu::round_kernel); +REGISTER_XPU_DISPATCH(round_decimals_stub, &xpu::round_decimals_kernel); +REGISTER_XPU_DISPATCH(floor_stub, &xpu::floor_kernel); +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/UnfoldBackward.cpp b/src/ATen/native/xpu/UnfoldBackward.cpp index d97546367..4f54e07ca 100644 --- a/src/ATen/native/xpu/UnfoldBackward.cpp +++ b/src/ATen/native/xpu/UnfoldBackward.cpp @@ -1,26 +1,12 @@ -#include #include -#include - +#include +#include #include +#include namespace at { -Tensor XPUNativeFunctions::unfold_backward( - const Tensor& grad, - IntArrayRef input_sizes, - int64_t dim, - int64_t size, - int64_t step) { - auto grad_input = at::zeros(input_sizes, grad.options()); - if (step >= size) { - auto gI_unfolded = 
grad_input.unfold(dim, size, step); - gI_unfolded.copy_(grad); - return grad_input; - } - native::xpu::unfold_backward_kernel(grad_input, grad, dim, size, step); - - return grad_input; +namespace native { +REGISTER_XPU_DISPATCH(unfold_backward_stub, &xpu::unfold_backward_kernel); } - } // namespace at diff --git a/src/ATen/native/xpu/Unique.cpp b/src/ATen/native/xpu/Unique.cpp index 2442370bd..423bae002 100644 --- a/src/ATen/native/xpu/Unique.cpp +++ b/src/ATen/native/xpu/Unique.cpp @@ -1,53 +1,55 @@ #include -#include namespace at { -std::tuple XPUNativeFunctions::unique_consecutive( +namespace native { + +std::tuple unique_dim_xpu( const Tensor& self, + const int64_t dim, + const bool sorted, const bool return_inverse, - const bool return_counts, - std::optional dim) { - if (!dim.has_value()) { - return native::xpu::unique_consecutive_kernel( - self, return_inverse, return_counts, dim); - } - return native::xpu::unique_dim_consecutive_kernel( - self, dim.value(), return_inverse, return_counts); + const bool return_counts) { + return xpu::unique_dim_kernel(self, dim, return_inverse, return_counts); +} + +std::tuple _unique_xpu( + const Tensor& self, + const bool sorted, + const bool return_inverse) { + return xpu::_unique_kernel(self, return_inverse); } -std::tuple XPUNativeFunctions::unique_dim_consecutive( +std::tuple unique_dim_consecutive_xpu( const at::Tensor& self, int64_t dim, bool return_inverse, bool return_counts) { - return native::xpu::unique_dim_consecutive_kernel( + return xpu::unique_dim_consecutive_kernel( self, dim, return_inverse, return_counts); } -std::tuple XPUNativeFunctions::unique_dim( +std::tuple unique_consecutive_xpu( const Tensor& self, - const int64_t dim, - const bool sorted, const bool return_inverse, - const bool return_counts) { - return native::xpu::unique_dim_kernel( - self, dim, return_inverse, return_counts); -} - -std::tuple XPUNativeFunctions::_unique( - const Tensor& self, - const bool sorted, - const bool return_inverse) { - return native::xpu::_unique_kernel(self, return_inverse); + const bool return_counts, + std::optional dim) { + if (!dim.has_value()) { + return xpu::unique_consecutive_kernel( + self, return_inverse, return_counts, dim); + } + return xpu::unique_dim_consecutive_kernel( + self, dim.value(), return_inverse, return_counts); } -std::tuple XPUNativeFunctions::_unique2( +std::tuple _unique2_xpu( const Tensor& self, const bool sorted, const bool return_inverse, const bool return_counts) { - return native::xpu::_unique2_kernel(self, return_inverse, return_counts); + return xpu::_unique2_kernel(self, return_inverse, return_counts); } +} // namespace native + } // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/UpSampleBicubic2d.cpp b/src/ATen/native/xpu/UpSampleBicubic2d.cpp index 509d6e449..8ab810eb9 100644 --- a/src/ATen/native/xpu/UpSampleBicubic2d.cpp +++ b/src/ATen/native/xpu/UpSampleBicubic2d.cpp @@ -2,66 +2,20 @@ #include #include #include -#include #include +#include namespace at { - -void upsample_bicubic2d_meta( - Tensor& output, - const Tensor& input, - IntArrayRef output_size, - bool align_corners, - std::optional scales_h, - std::optional scales_w) { - auto full_output_size = - native::xpu::upsample_2d_common_check(input.sizes(), output_size); - - // Allow for empty batch size but not other dimensions - TORCH_CHECK( - input.numel() != 0 || - c10::multiply_integers( - input.sizes().begin() + 1, input.sizes().end()), - "Non-empty 4D data tensor expected but got a tensor with sizes ", - 
input.sizes()); - auto memory_format = input.suggest_memory_format(); - if (output.defined()) { - xpu::resize_out( - output, - full_output_size, - {}, - input.options().memory_format(memory_format)); - } else { - output = at::xpu::create_out( - full_output_size, {}, input.options().memory_format(memory_format)); - } -} - -Tensor& XPUNativeFunctions::upsample_bicubic2d_out( - const Tensor& self, - IntArrayRef output_size, - bool align_corners, - std::optional scales_h, - std::optional scales_w, - Tensor& output) { - upsample_bicubic2d_meta( - output, self, output_size, align_corners, scales_h, scales_w); - native::xpu::upsample_bicubic2d_kernel( - output, self, output_size, align_corners, scales_h, scales_w); - return output; +namespace native { +TORCH_IMPL_FUNC(upsample_bicubic2d_out_xpu) +(const Tensor& input, + IntArrayRef output_size, + bool align_corners, + std::optional scales_h, + std::optional scales_w, + const Tensor& output) { + xpu::upsample_bicubic2d_kernel( + output, input, output_size, align_corners, scales_h, scales_w); } - -Tensor XPUNativeFunctions::upsample_bicubic2d( - const Tensor& self, - IntArrayRef output_size, - bool align_corners, - std::optional scales_h, - std::optional scales_w) { - Tensor output; - upsample_bicubic2d_out( - self, output_size, align_corners, scales_h, scales_w, output); - - return output; -} - +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/UpSampleBilinear2d.cpp b/src/ATen/native/xpu/UpSampleBilinear2d.cpp index f0ace4344..67fed551c 100644 --- a/src/ATen/native/xpu/UpSampleBilinear2d.cpp +++ b/src/ATen/native/xpu/UpSampleBilinear2d.cpp @@ -1,133 +1,34 @@ #include #include -#include - #include #include #include -namespace at { - -void upsample_bilinear2d_meta( - const Tensor& input, - IntArrayRef output_size, - bool align_corners, - std::optional scales_h, - std::optional scales_w, - Tensor& output) { - auto full_output_size = - native::xpu::upsample_2d_common_check(input.sizes(), output_size); - - // Allow for empty batch size but not other dimensions - TORCH_CHECK( - input.numel() != 0 || - c10::multiply_integers( - input.sizes().begin() + 1, input.sizes().end()), - "Non-empty 4D data tensor expected but got a tensor with sizes ", - input.sizes()); - - auto memory_format = input.suggest_memory_format(); - if (output.defined()) { - xpu::resize_out( - output, - full_output_size, - {}, - input.options().memory_format(memory_format)); - } else { - output = at::xpu::create_out( - full_output_size, {}, input.options().memory_format(memory_format)); - } -} - -void upsample_bilinear2d_backward_meta( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - bool align_corners, - std::optional scales_h, - std::optional scales_w, - Tensor& grad_input) { - auto full_output_size = - native::xpu::upsample_2d_common_check(input_size, output_size); +#include +#include - TORCH_CHECK( - grad_output.dim() == 4, - "Expected grad_output to be a tensor of dimension 4 but got: dimension ", - grad_output.dim()); - - for (const auto i : c10::irange(4)) { - TORCH_CHECK( - grad_output.size(i) == full_output_size[i], - "Expected grad_output to have the same shape as output;", - " output.size(", - i, - ") = ", - full_output_size[i], - " but got grad_output.size(", - i, - ") = ", - grad_output.size(i)); - } - - auto memory_format = grad_output.suggest_memory_format(); - if (grad_input.defined()) { - xpu::resize_out( - grad_input, - input_size, - {}, - grad_output.options().memory_format(memory_format)); - } else { - 
grad_input = at::xpu::create_out( - input_size, {}, grad_output.options().memory_format(memory_format)); - } -} - -Tensor& XPUNativeFunctions::upsample_bilinear2d_out( - const Tensor& self, - IntArrayRef output_size, - bool align_corners, - std::optional scales_h, - std::optional scales_w, - Tensor& output) { - upsample_bilinear2d_meta( - self, output_size, align_corners, scales_h, scales_w, output); - native::xpu::upsample_bilinear2d_out_kernel( - output, self, output_size, align_corners, scales_h, scales_w); - return output; -} - -Tensor XPUNativeFunctions::upsample_bilinear2d( - const Tensor& self, - IntArrayRef output_size, - bool align_corners, - std::optional scales_h, - std::optional scales_w) { - Tensor output; - upsample_bilinear2d_out( - self, output_size, align_corners, scales_h, scales_w, output); - return output; +namespace at { +namespace native { +TORCH_IMPL_FUNC(upsample_bilinear2d_out_xpu) +(const Tensor& input, + IntArrayRef output_size, + bool align_corners, + std::optional scales_h, + std::optional scales_w, + const Tensor& output) { + xpu::upsample_bilinear2d_out_kernel( + output, input, output_size, align_corners, scales_h, scales_w); } -Tensor& XPUNativeFunctions::upsample_bilinear2d_backward_out( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - bool align_corners, - c10::optional scales_h, - c10::optional scales_w, - Tensor& grad_input) { - globalContext().alertNotDeterministic("upsample_bilinear2d_backward_xpu"); - - upsample_bilinear2d_backward_meta( - grad_output, - output_size, - input_size, - align_corners, - scales_h, - scales_w, - grad_input); - - native::xpu::upsample_bilinear2d_backward_out_kernel( +TORCH_IMPL_FUNC(upsample_bilinear2d_backward_out_xpu) +(const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners, + std::optional scales_h, + std::optional scales_w, + const Tensor& grad_input) { + xpu::upsample_bilinear2d_backward_out_kernel( grad_input, grad_output, output_size, @@ -135,26 +36,7 @@ Tensor& XPUNativeFunctions::upsample_bilinear2d_backward_out( align_corners, scales_h, scales_w); - return grad_input; -} - -Tensor XPUNativeFunctions::upsample_bilinear2d_backward( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { - Tensor grad_input; - upsample_bilinear2d_backward_out( - grad_output, - output_size, - input_size, - align_corners, - scales_h, - scales_w, - grad_input); - return grad_input; } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/UpSampleLinear1d.cpp b/src/ATen/native/xpu/UpSampleLinear1d.cpp index fcce31524..13dfa33de 100644 --- a/src/ATen/native/xpu/UpSampleLinear1d.cpp +++ b/src/ATen/native/xpu/UpSampleLinear1d.cpp @@ -1,111 +1,42 @@ #include #include #include -#include + #include #include "ATen/core/ATen_fwd.h" -namespace at { - -void upsample_linear1d_meta( - const Tensor& input, - IntArrayRef output_size, - bool align_corners, - std::optional scales, - Tensor& output) { - auto full_output_size = - at::native::xpu::upsample_1d_common_check(input.sizes(), output_size); - - // Allow for empty batch size but not other dimensions - TORCH_CHECK( - (input.size(1) != 0 && input.size(2) != 0) && input.dim() == 3, - "Non-empty 3D data tensor expected but got a tensor with sizes ", - input.sizes()); - - if (output.defined()) { - at::xpu::resize_out(output, full_output_size, {}, input.options()); - } else { - output = 
at::xpu::create_out(full_output_size, {}, input.options()); - } -} -void upsample_linear1d_backward_meta( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - bool align_corners, - std::optional scales, - Tensor& grad_input) { - auto full_output_size = - at::native::xpu::upsample_1d_common_check(input_size, output_size); - - TORCH_CHECK( - input_size.size() == 3, - "It is expected input_size equals to 3, but got size ", - input_size.size()); - - check_dim_size(grad_output, 3, 0, full_output_size[0]); - check_dim_size(grad_output, 3, 1, full_output_size[1]); - check_dim_size(grad_output, 3, 2, full_output_size[2]); - - if (grad_input.defined()) { - at::xpu::resize_out(grad_input, input_size, {}, grad_output.options()); - } else { - grad_input = at::xpu::create_out(input_size, {}, grad_output.options()); - } -} - -Tensor XPUNativeFunctions::upsample_linear1d( - const Tensor& input, - IntArrayRef output_size, - bool align_corners, - std::optional scales) { - Tensor output; - return upsample_linear1d_out( - input, output_size, align_corners, scales, output); -} - -Tensor& XPUNativeFunctions::upsample_linear1d_out( - const Tensor& input, - IntArrayRef output_size, - bool align_corners, - std::optional scales, - Tensor& output) { - upsample_linear1d_meta(input, output_size, align_corners, scales, output); +#include +#include +namespace at { +namespace native { + +TORCH_IMPL_FUNC(upsample_linear1d_out_xpu) +(const Tensor& input, + IntArrayRef output_size, + bool align_corners, + std::optional scales, + const Tensor& output) { TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; checkAllSameGPU(__func__, {input_arg, output_arg}); - native::xpu::upsample_linear1d_kernel( + xpu::upsample_linear1d_kernel( input, output_size, align_corners, scales, output); - return output; } -Tensor XPUNativeFunctions::upsample_linear1d_backward( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - bool align_corners, - std::optional scales) { - Tensor grad_input; - return upsample_linear1d_backward_out( - grad_output, output_size, input_size, align_corners, scales, grad_input); -} - -Tensor& XPUNativeFunctions::upsample_linear1d_backward_out( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - bool align_corners, - std::optional scales, - Tensor& grad_input) { - upsample_linear1d_backward_meta( - grad_output, output_size, input_size, align_corners, scales, grad_input); +TORCH_IMPL_FUNC(upsample_linear1d_backward_out_xpu) +(const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners, + std::optional scales, + const Tensor& grad_input) { TensorArg grad_output_arg{grad_output, "grad_output", 1}, grad_input_arg{grad_input, "grad_input", 2}; checkAllSameGPU(__func__, {grad_output_arg, grad_input_arg}); - native::xpu::upsample_linear1d_backward_kernel( + xpu::upsample_linear1d_backward_kernel( grad_output, output_size, input_size, align_corners, scales, grad_input); - return grad_input; } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/UpSampleNearest1d.cpp b/src/ATen/native/xpu/UpSampleNearest1d.cpp index e2684fcac..30287e4b2 100644 --- a/src/ATen/native/xpu/UpSampleNearest1d.cpp +++ b/src/ATen/native/xpu/UpSampleNearest1d.cpp @@ -1,173 +1,51 @@ -#include #include #include -#include +#include -namespace at { - -Tensor& upsample_nearest1d_meta( - const Tensor& input, - Tensor& output, - IntArrayRef output_size) { - auto input_size = 
input.sizes(); - TORCH_CHECK( - output_size.size() == 1, - "It is expected output_size equals to 1, but got size ", - output_size.size()); - - TORCH_CHECK( - input_size.size() == 3, - "It is expected input_size equals to 3, but got size ", - input_size.size()); - - int64_t output_width = output_size[0]; - - int64_t nbatch = input_size[0]; - int64_t channels = input_size[1]; - int64_t input_width = input_size[2]; - - TORCH_CHECK( - input_width > 0 && output_width > 0, - "Input and output sizes should be greater than 0, but got input (W: ", - input_width, - ") and output (W: ", - output_width, - ")"); - TORCH_CHECK( - (input.size(1) != 0 && input.size(2) != 0) && input.dim() == 3, - "Non-empty 3D data tensor expected but got a tensor with sizes ", - input.sizes()); - - if (!output.defined()) - output = at::empty({nbatch, channels, output_width}, input.options()); - return output; -} - -Tensor& upsample_nearest1d_backward_meta( - const Tensor& grad_output, - Tensor& grad_input, - IntArrayRef input_size, - IntArrayRef output_size) { - TORCH_CHECK( - output_size.size() == 1, - "It is expected output_size equals to 1, but got size ", - output_size.size()); - TORCH_CHECK( - input_size.size() == 3, - "It is expected input_size equals to 3, but got size ", - input_size.size()); - int64_t output_width = output_size[0]; - int64_t nbatch = input_size[0]; - int64_t channels = input_size[1]; - int64_t input_width = input_size[2]; - TORCH_CHECK( - input_width > 0 && output_width > 0, - "Input and output sizes should be greater than 0, but got input (W: ", - input_width, - ") and output (W: ", - output_width, - ")"); - check_dim_size(grad_output, 3, 0, nbatch); - check_dim_size(grad_output, 3, 1, channels); - check_dim_size(grad_output, 3, 2, output_width); - if (!grad_input.defined()) - grad_input = at::empty(input_size, grad_output.options()); - return grad_input; -} - -Tensor XPUNativeFunctions::_upsample_nearest_exact1d( - const Tensor& input, - IntArrayRef output_size, - c10::optional scales) { - Tensor output; - output = upsample_nearest1d_meta(input, output, output_size); - at::native::xpu::upsample_nearest1d_kernel( - output, input, output_size, scales, true); - return output; -} +#include +#include +#include +#include -Tensor& XPUNativeFunctions::_upsample_nearest_exact1d_out( - const Tensor& input, - IntArrayRef output_size, - c10::optional scales, - Tensor& output) { - upsample_nearest1d_meta(input, output, output_size); - at::native::xpu::upsample_nearest1d_kernel( - output, input, output_size, scales, true); - return output; -} - -Tensor XPUNativeFunctions::upsample_nearest1d( - const Tensor& input, - IntArrayRef output_size, - c10::optional scales) { - Tensor output; - output = upsample_nearest1d_meta(input, output, output_size); - at::native::xpu::upsample_nearest1d_kernel( - output, input, output_size, scales, false); - return output; -} - -Tensor& XPUNativeFunctions::upsample_nearest1d_out( - const Tensor& input, - IntArrayRef output_size, - c10::optional scales, - Tensor& output) { - upsample_nearest1d_meta(input, output, output_size); - at::native::xpu::upsample_nearest1d_kernel( - output, input, output_size, scales, false); - return output; -} - -Tensor XPUNativeFunctions::_upsample_nearest_exact1d_backward( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - std::optional scales) { - Tensor grad_input; - grad_input = upsample_nearest1d_backward_meta( - grad_output, grad_input, input_size, output_size); - 
at::native::xpu::upsample_nearest1d_backward_kernel( - grad_input, grad_output, output_size, input_size, scales, true); - return grad_input; -} -Tensor& XPUNativeFunctions::_upsample_nearest_exact1d_backward_out( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - std::optional scales, - Tensor& grad_input) { - upsample_nearest1d_backward_meta( - grad_output, grad_input, input_size, output_size); - at::native::xpu::upsample_nearest1d_backward_kernel( +namespace at { +namespace native { +TORCH_IMPL_FUNC(_upsample_nearest_exact1d_out_xpu) +(const Tensor& input, + IntArrayRef output_size, + std::optional scales, + const Tensor& output) { + xpu::upsample_nearest1d_kernel(output, input, output_size, scales, true); +} + +TORCH_IMPL_FUNC(upsample_nearest1d_out_xpu) +(const Tensor& input, + IntArrayRef output_size, + std::optional scales, + const Tensor& output) { + xpu::upsample_nearest1d_kernel(output, input, output_size, scales, false); +} + +TORCH_IMPL_FUNC(_upsample_nearest_exact1d_backward_out_xpu) +(const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + std::optional scales, + const Tensor& grad_input) { + grad_input.zero_(); + xpu::upsample_nearest1d_backward_kernel( grad_input, grad_output, output_size, input_size, scales, true); - return grad_input; } -Tensor XPUNativeFunctions::upsample_nearest1d_backward( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - std::optional scales) { - Tensor grad_input; - grad_input = upsample_nearest1d_backward_meta( - grad_output, grad_input, input_size, output_size); - at::native::xpu::upsample_nearest1d_backward_kernel( +TORCH_IMPL_FUNC(upsample_nearest1d_backward_out_xpu) +(const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + std::optional scales, + const Tensor& grad_input) { + grad_input.zero_(); + xpu::upsample_nearest1d_backward_kernel( grad_input, grad_output, output_size, input_size, scales, false); - return grad_input; -} - -Tensor& XPUNativeFunctions::upsample_nearest1d_backward_out( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - std::optional scales, - Tensor& grad_input) { - upsample_nearest1d_backward_meta( - grad_output, grad_input, input_size, output_size); - at::native::xpu::upsample_nearest1d_backward_kernel( - grad_input, grad_output, output_size, input_size, scales, true); - return grad_input; } +} // namespace native } // namespace at diff --git a/src/ATen/native/xpu/UpSampleNearest2d.cpp b/src/ATen/native/xpu/UpSampleNearest2d.cpp index da9f9474f..9ebbd74b1 100644 --- a/src/ATen/native/xpu/UpSampleNearest2d.cpp +++ b/src/ATen/native/xpu/UpSampleNearest2d.cpp @@ -1,224 +1,44 @@ -#include #include #include -#include +#include +#include +#include +#include +#include namespace at { -Tensor& upsample_nearest2d_meta( - const Tensor& input, - Tensor& output, - IntArrayRef output_size) { - auto input_size = input.sizes(); +namespace native { - TORCH_CHECK( - output_size.size() == 2, - "It is expected output_size equals to 2, but got size ", - output_size.size()); - - TORCH_CHECK( - input_size.size() == 4, - "It is expected input_size equals to 4, but got size ", - input_size.size()); - - int64_t output_height = output_size[0]; - int64_t output_width = output_size[1]; - - int64_t nbatch = input_size[0]; - int64_t channels = input_size[1]; - int64_t input_height = input_size[2]; - int64_t input_width = input_size[3]; - - TORCH_CHECK( - input_height > 0 && input_width > 0 && output_height 
> 0 && - output_width > 0, - "Input and output sizes should be greater than 0," - " but got input (H: ", - input_height, - ", W: ", - input_width, - ") output (H: ", - output_height, - ", W: ", - output_width, - ")"); - - // Allow for empty batch size but not other dimensions - TORCH_CHECK( - input.numel() != 0 || - c10::multiply_integers( - input.sizes().begin() + 1, input.sizes().end()), - "Non-empty 4D data tensor expected but got a tensor with sizes ", - input.sizes()); - - if (!output.defined()) - output = at::empty( - {nbatch, channels, output_height, output_width}, - input.options().memory_format(input.suggest_memory_format())); - return output; -} - -Tensor& upsample_nearest2d_backward_meta( - const Tensor& grad_output, - Tensor& grad_input, - IntArrayRef input_size, - IntArrayRef output_size) { - TORCH_CHECK( - output_size.size() == 2, - "It is expected output_size equals to 2, but got size ", - output_size.size()); - - TORCH_CHECK( - input_size.size() == 4, - "It is expected input_size equals to 4, but got size ", - input_size.size()); - - int64_t output_height = output_size[0]; - int64_t output_width = output_size[1]; - - int64_t nbatch = input_size[0]; - int64_t channels = input_size[1]; - int64_t input_height = input_size[2]; - int64_t input_width = input_size[3]; - - TORCH_CHECK( - input_height > 0 && input_width > 0 && output_height > 0 && - output_width > 0, - "Input and output sizes should be greater than 0," - " but got input (H: ", - input_height, - ", W: ", - input_width, - ") output (H: ", - output_height, - ", W: ", - output_width, - ")"); - - TORCH_CHECK( - grad_output.dim() == 4, - "Expected grad_output to be a tensor of dimension 4 but got: dimension ", - grad_output.dim()); - std::array full_output_size = { - nbatch, channels, output_height, output_width}; - for (const auto i : c10::irange(4)) { - TORCH_CHECK( - grad_output.size(i) == full_output_size[i], - "Expected grad_output to have the same shape as output;", - " output.size(", - i, - ") = ", - full_output_size[i], - " but got grad_output.size(", - i, - ") = ", - grad_output.size(i)); - } - if (!grad_input.defined()) - grad_input = at::empty( - input_size, - grad_output.options().memory_format( - grad_output.suggest_memory_format())); - return grad_input; -} - -Tensor XPUNativeFunctions::_upsample_nearest_exact2d( - const Tensor& input, - IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w) { - Tensor output; - output = upsample_nearest2d_meta(input, output, output_size); - at::native::xpu::upsample_nearest2d_kernel( - output, input, output_size, scales_h, scales_w, true); - return output; -} - -Tensor& XPUNativeFunctions::_upsample_nearest_exact2d_out( - const Tensor& input, - IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w, - Tensor& output) { - upsample_nearest2d_meta(input, output, output_size); - at::native::xpu::upsample_nearest2d_kernel( - output, input, output_size, scales_h, scales_w, true); - return output; -} - -Tensor XPUNativeFunctions::upsample_nearest2d( - const Tensor& input, - IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w) { - Tensor output; - output = upsample_nearest2d_meta(input, output, output_size); - at::native::xpu::upsample_nearest2d_kernel( +TORCH_IMPL_FUNC(upsample_nearest2d_out_xpu) +(const Tensor& input, + IntArrayRef output_size, + std::optional scales_h, + std::optional scales_w, + const Tensor& output) { + xpu::upsample_nearest2d_kernel( output, input, output_size, scales_h, scales_w, false); - 
return output;
 }
-Tensor& XPUNativeFunctions::upsample_nearest2d_out(
-    const Tensor& input,
-    IntArrayRef output_size,
-    c10::optional<double> scales_h,
-    c10::optional<double> scales_w,
-    Tensor& output) {
-  upsample_nearest2d_meta(input, output, output_size);
-  at::native::xpu::upsample_nearest2d_kernel(
-      output, input, output_size, scales_h, scales_w, false);
-  return output;
-}
-
-Tensor XPUNativeFunctions::_upsample_nearest_exact2d_backward(
-    const Tensor& grad_output,
-    IntArrayRef output_size,
-    IntArrayRef input_size,
-    c10::optional<double> scales_h,
-    c10::optional<double> scales_w) {
-  Tensor grad_input;
-  grad_input = upsample_nearest2d_backward_meta(
-      grad_output, grad_input, input_size, output_size);
-  at::native::xpu::upsample_nearest2d_backward_kernel(
-      grad_input,
-      grad_output,
-      output_size,
-      input_size,
-      scales_h,
-      scales_w,
-      true);
-  return grad_input;
-}
-Tensor& XPUNativeFunctions::_upsample_nearest_exact2d_backward_out(
-    const Tensor& grad_output,
-    IntArrayRef output_size,
-    IntArrayRef input_size,
-    c10::optional<double> scales_h,
-    c10::optional<double> scales_w,
-    Tensor& grad_input) {
-  upsample_nearest2d_backward_meta(
-      grad_output, grad_input, input_size, output_size);
-  at::native::xpu::upsample_nearest2d_backward_kernel(
-      grad_input,
-      grad_output,
-      output_size,
-      input_size,
-      scales_h,
-      scales_w,
-      true);
-  return grad_input;
+TORCH_IMPL_FUNC(_upsample_nearest_exact2d_out_xpu)
+(const Tensor& input,
+ IntArrayRef output_size,
+ std::optional<double> scales_h,
+ std::optional<double> scales_w,
+ const Tensor& output) {
+  xpu::upsample_nearest2d_kernel(
+      output, input, output_size, scales_h, scales_w, true);
 }
-Tensor XPUNativeFunctions::upsample_nearest2d_backward(
-    const Tensor& grad_output,
-    IntArrayRef output_size,
-    IntArrayRef input_size,
-    c10::optional<double> scales_h,
-    c10::optional<double> scales_w) {
-  Tensor grad_input;
-  grad_input = upsample_nearest2d_backward_meta(
-      grad_output, grad_input, input_size, output_size);
-  at::native::xpu::upsample_nearest2d_backward_kernel(
+TORCH_IMPL_FUNC(upsample_nearest2d_backward_out_xpu)
+(const Tensor& grad_output,
+ IntArrayRef output_size,
+ IntArrayRef input_size,
+ std::optional<double> scales_h,
+ std::optional<double> scales_w,
+ const Tensor& grad_input) {
+  grad_input.zero_();
+  xpu::upsample_nearest2d_backward_kernel(
       grad_input,
       grad_output,
       output_size,
@@ -226,19 +46,17 @@ Tensor XPUNativeFunctions::upsample_nearest2d_backward(
       scales_h,
       scales_w,
       false);
-  return grad_input;
 }
-Tensor& XPUNativeFunctions::upsample_nearest2d_backward_out(
-    const Tensor& grad_output,
-    IntArrayRef output_size,
-    IntArrayRef input_size,
-    c10::optional<double> scales_h,
-    c10::optional<double> scales_w,
-    Tensor& grad_input) {
-  upsample_nearest2d_backward_meta(
-      grad_output, grad_input, input_size, output_size);
-  at::native::xpu::upsample_nearest2d_backward_kernel(
+TORCH_IMPL_FUNC(_upsample_nearest_exact2d_backward_out_xpu)
+(const Tensor& grad_output,
+ IntArrayRef output_size,
+ IntArrayRef input_size,
+ std::optional<double> scales_h,
+ std::optional<double> scales_w,
+ const Tensor& grad_input) {
+  grad_input.zero_();
+  xpu::upsample_nearest2d_backward_kernel(
       grad_input,
       grad_output,
       output_size,
@@ -246,7 +64,7 @@ Tensor& XPUNativeFunctions::upsample_nearest2d_backward_out(
      scales_h,
      scales_w,
      true);
-  return grad_input;
 }
+} // namespace native
 } // namespace at
diff --git a/src/ATen/native/xpu/WeightNorm.cpp b/src/ATen/native/xpu/WeightNorm.cpp
index 7fec9ecfe..81f5288ab 100644
--- a/src/ATen/native/xpu/WeightNorm.cpp
+++ b/src/ATen/native/xpu/WeightNorm.cpp
@@ -1,14 +1,15 @@
 #include
-#include
+
 namespace at {
-std::tuple<Tensor, Tensor> XPUNativeFunctions::_weight_norm_interface(
+namespace native {
+std::tuple<Tensor, Tensor> weight_norm_xpu(
     const Tensor& v,
     const Tensor& g,
     int64_t dim) {
   return native::xpu::weight_norm_kernel(v, g, dim);
 }
-std::tuple<Tensor, Tensor> XPUNativeFunctions::_weight_norm_interface_backward(
+std::tuple<Tensor, Tensor> weight_norm_backward_xpu(
     const Tensor& grad_w,
     const Tensor& saved_v,
     const Tensor& saved_g,
@@ -24,4 +25,6 @@ std::tuple<Tensor, Tensor> XPUNativeFunctions::_weight_norm_interface_backward(
   return native::xpu::weight_norm_backward_kernel(
       grad_w, saved_v, saved_g, saved_norms, dim);
 }
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/ATen/native/xpu/XPUScalar.cpp b/src/ATen/native/xpu/XPUScalar.cpp
index 17cbe66a3..d47dd7871 100644
--- a/src/ATen/native/xpu/XPUScalar.cpp
+++ b/src/ATen/native/xpu/XPUScalar.cpp
@@ -2,13 +2,12 @@
 #include
 #include
 #include
-#include
-
 #include
+#include
-namespace at {
+namespace at::native {
-Scalar XPUNativeFunctions::_local_scalar_dense(const Tensor& self) {
+Scalar _local_scalar_dense_xpu(const Tensor& self) {
   Scalar r;
   AT_DISPATCH_V2(
       self.scalar_type(),
@@ -41,4 +40,4 @@ Scalar XPUNativeFunctions::_local_scalar_dense(const Tensor& self) {
   return r;
 }
-} // namespace at
+} // namespace at::native
diff --git a/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp
index 57b5e34b3..ee73856ee 100644
--- a/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ActivationEluKernels.cpp
@@ -1,5 +1,5 @@
-#include
 #include
+#include
 #include
diff --git a/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp b/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp
index 5bbaa1ab0..2bfcccb69 100644
--- a/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp
+++ b/src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp
@@ -1,11 +1,10 @@
-#include
 #include
 #include
 #include
 #include
-
 #include
 #include
+#include
 #include
diff --git a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp
index 6399e6b23..c3fe41fd8 100644
--- a/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.cpp
@@ -1,6 +1,6 @@
-#include
 #include
 #include
+#include
 #include
diff --git a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp
index 7dadbf3aa..8c6e47f77 100644
--- a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp
@@ -1,10 +1,10 @@
-#include
 #include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
diff --git a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp
index 2f009757b..1dcd78b88 100644
--- a/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ActivationHardtanhKernels.cpp
@@ -1,8 +1,8 @@
-#include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
diff --git a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp
index 81e2cb5ba..88d170352 100644
--- a/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.cpp
@@ -1,8 +1,8 @@
-#include
 #include
 #include
 #include
 #include
+#include
 #include
diff --git a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp
index
09487462e..f4051184e 100644 --- a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.cpp @@ -45,7 +45,7 @@ struct LogSigmoidBackwardFunctor { } }; -void log_sigmoid_backward_kernel(TensorIteratorBase& iter) { +void log_sigmoid_backward_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, diff --git a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h index be8e7266c..42ee9dbea 100644 --- a/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h @@ -6,6 +6,6 @@ namespace at::native::xpu { TORCH_XPU_API void log_sigmoid_forward_kernel(TensorIteratorBase& iter); -TORCH_XPU_API void log_sigmoid_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void log_sigmoid_backward_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp b/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp index 5e517911b..9033b103c 100644 --- a/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp @@ -42,7 +42,7 @@ struct MishBackwardFunctor { } }; -void mish_backward_kernel(TensorIteratorBase& iter) { +void mish_backward_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, diff --git a/src/ATen/native/xpu/sycl/ActivationMishKernels.h b/src/ATen/native/xpu/sycl/ActivationMishKernels.h index a2c7e499d..7c4efc0af 100644 --- a/src/ATen/native/xpu/sycl/ActivationMishKernels.h +++ b/src/ATen/native/xpu/sycl/ActivationMishKernels.h @@ -6,6 +6,6 @@ namespace at::native::xpu { TORCH_XPU_API void mish_kernel(TensorIteratorBase& iter); -TORCH_XPU_API void mish_backward_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void mish_backward_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp index 924b75d81..7d9f0872a 100644 --- a/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationSiluKernels.cpp @@ -1,8 +1,8 @@ -#include #include #include #include #include +#include #include #include diff --git a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp index 175db7753..3915a5560 100644 --- a/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp +++ b/src/ATen/native/xpu/sycl/ActivationThresholdKernel.cpp @@ -1,4 +1,3 @@ -#include #include #include #include diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernel.cpp b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernel.cpp new file mode 100644 index 000000000..08a1456ad --- /dev/null +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernel.cpp @@ -0,0 +1,322 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native::xpu { + +using namespace at::xpu; + +template +struct AdaptiveAvgPool2dBwdKernelFunctor { + void operator()(sycl::nd_item<1> item) const { + int64_t gi = item.get_global_linear_id(); + + for (int64_t i = gi; i < numel; i += global_range) { + int64_t _iw, _ih, _ic, _ib; + if constexpr (is_channels_last) { + _ic = i % ic; + _iw = i / ic % iw; + _ih = i / ic / iw % ih; + _ib = i / ic / iw / ih; + } else { + _iw = i % iw; + _ih = i / iw % ih; + _ic = i / iw / ih % 
ic; + _ib = i / iw / ih / ic; + } + + int64_t _oh0 = native::start_index(_ih, ih, oh); + int64_t _oh1 = native::end_index(_ih, ih, oh); + int64_t _ow0 = native::start_index(_iw, iw, ow); + int64_t _ow1 = native::end_index(_iw, iw, ow); + int64_t _ob = _ib; + int64_t _oc = _ic; + + accscalar_t gx = 0; + accscalar_t _ikh, _ikw; + for (int _oh = _oh0; _oh < _oh1; _oh++) { + _ikh = accscalar_t(1.0) / + (accscalar_t)(native::end_index(_oh, oh, ih) - native::start_index(_oh, oh, ih)); + for (int _ow = _ow0; _ow < _ow1; _ow++) { + _ikw = accscalar_t(1.0) / + (accscalar_t)(native::end_index(_ow, ow, iw) - native::start_index(_ow, ow, iw)); + gx += gyacc[_ob][_oc][_oh][_ow] * _ikh * _ikw; + } + } + + const auto store = [](PackedTensorAccessor64 gxacc, + int64_t _ib, + int64_t _ic, + int64_t _ih, + int64_t _iw, + scalar_t res) { gxacc[_ib][_ic][_ih][_iw] = res; }; + store(gxacc, _ib, _ic, _ih, _iw, (scalar_t)gx); + } + } + + AdaptiveAvgPool2dBwdKernelFunctor( + PackedTensorAccessor64 gyacc_, + PackedTensorAccessor64 gxacc_) + : gyacc(gyacc_), gxacc(gxacc_) { + ib = gxacc.size(0); + ic = gxacc.size(1); + ih = gxacc.size(2); + iw = gxacc.size(3); + oh = gyacc.size(2); + ow = gyacc.size(3); + + numel = ib * ic * ih * iw; + int total_item = std::min(numel, syclMaxWorkItemsPerTile()); + local_range = syclMaxWorkItemsPerEU(); + global_range = total_item < local_range + ? local_range + : (total_item / local_range) * local_range; + } + + sycl::range<1> glb_range() { + return sycl::range<1>(global_range); + } + + sycl::range<1> loc_range() { + return sycl::range<1>(local_range); + } + + private: + int ib; + int ic; + int ih; + int iw; + int oh; + int ow; + int64_t numel; + int global_range; + int local_range; + PackedTensorAccessor64 gyacc; + PackedTensorAccessor64 gxacc; +}; + +template +struct AdaptiveAvgPool2dBwdSLMKernelFunctor + : public __SYCL_KER_CONFIG_CONVENTION__ { + void operator()(sycl::nd_item<1> item) const { + int64_t gi = item.get_global_linear_id(); + int64_t li = item.get_local_id(0); + + // for-loop order: oh*ow->ih->iw + // reuse oh*ow(oh0, oh1, ow0, ow1), ih(ikh), iw(ikw) in inner loop. 
+ for (int _ih = li; _ih < ih; _ih += local_range) { + _oh0_cached[_ih] = (int)native::start_index(_ih, ih, oh); + _oh1_cached[_ih] = (int)native::end_index(_ih, ih, oh); + } + for (int _iw = li; _iw < iw; _iw += local_range) { + _ow0_cached[_iw] = (int)native::start_index(_iw, iw, ow); + _ow1_cached[_iw] = (int)native::end_index(_iw, iw, ow); + } + for (int _oh = li; _oh < oh; _oh += local_range) { + _ikh_cached[_oh] = accscalar_t(1.0) / + (accscalar_t)(native::end_index(_oh, oh, ih) - + native::start_index(_oh, oh, ih)); + } + for (int _ow = li; _ow < ow; _ow += local_range) { + _ikw_cached[_ow] = accscalar_t(1.0) / + (accscalar_t)(native::end_index(_ow, ow, iw) - + native::start_index(_ow, ow, iw)); + } + + item.barrier(sycl_local_fence); + + for (int64_t i = gi; i < numel; i += global_range) { + int64_t _iw, _ih, _ic, _ib; + if constexpr (is_channels_last) { + _ic = i % ic; + _iw = i / ic % iw; + _ih = i / ic / iw % ih; + _ib = i / ic / iw / ih; + } else { + _iw = i % iw; + _ih = i / iw % ih; + _ic = i / iw / ih % ic; + _ib = i / iw / ih / ic; + } + + int64_t _oh0, _oh1, _ow0, _ow1; + _oh0 = _oh0_cached[_ih]; + _oh1 = _oh1_cached[_ih]; + _ow0 = _ow0_cached[_iw]; + _ow1 = _ow1_cached[_iw]; + int64_t _ob = _ib; + int64_t _oc = _ic; + + accscalar_t gx = 0; + accscalar_t _ikh, _ikw; + for (int _oh = _oh0; _oh < _oh1; _oh++) { + _ikh = _ikh_cached[_oh]; + for (int _ow = _ow0; _ow < _ow1; _ow++) { + _ikw = _ikw_cached[_ow]; + gx += gyacc[_ob][_oc][_oh][_ow] * _ikh * _ikw; + } + } + + const auto store = [](PackedTensorAccessor64 gxacc, + int64_t _ib, + int64_t _ic, + int64_t _ih, + int64_t _iw, + scalar_t res) { gxacc[_ib][_ic][_ih][_iw] = res; }; + store(gxacc, _ib, _ic, _ih, _iw, (scalar_t)gx); + } + } + + void sycl_ker_config_convention(sycl::handler& cgh) { + _oh0_cached = sycl_local_acc_t(ih, cgh); + _oh1_cached = sycl_local_acc_t(ih, cgh); + _ow0_cached = sycl_local_acc_t(iw, cgh); + _ow1_cached = sycl_local_acc_t(iw, cgh); + _ikh_cached = sycl_local_acc_t(oh, cgh); + _ikw_cached = sycl_local_acc_t(ow, cgh); + } + + AdaptiveAvgPool2dBwdSLMKernelFunctor( + PackedTensorAccessor64 gyacc_, + PackedTensorAccessor64 gxacc_) + : gyacc(gyacc_), gxacc(gxacc_) { + ib = gxacc.size(0); + ic = gxacc.size(1); + ih = gxacc.size(2); + iw = gxacc.size(3); + oh = gyacc.size(2); + ow = gyacc.size(3); + + numel = ib * ic * ih * iw; + int total_item = std::min(numel, syclMaxWorkItemsPerTile()); + + local_range = syclMaxWorkGroupSize(*this); + global_range = total_item < local_range + ? 
local_range + : (total_item / local_range) * local_range; + } + + sycl::range<1> glb_range() { + return sycl::range<1>(global_range); + } + + sycl::range<1> loc_range() { + return sycl::range<1>(local_range); + } + + private: + int ib; + int ic; + int ih; + int iw; + int oh; + int ow; + int64_t numel; + int local_range; + int global_range; + PackedTensorAccessor64 gyacc; + PackedTensorAccessor64 gxacc; + sycl_local_acc_t _oh0_cached; + sycl_local_acc_t _oh1_cached; + sycl_local_acc_t _ow0_cached; + sycl_local_acc_t _ow1_cached; + sycl_local_acc_t _ikh_cached; + sycl_local_acc_t _ikw_cached; +}; + +void adaptive_avg_pool2d_backward_out_kernel( + Tensor& gradInput, + const Tensor& gradOutput, + const Tensor& input) { + TensorArg grad_input_arg{gradInput, "gradInput", 1}, + grad_output_arg{gradOutput, "gradOutput", 2}, + input_arg{input, "input", 3}; + adaptive_pool_empty_output_check(gradOutput, "adaptive_avg_pool2d_backward"); + checkAllSameGPU(__func__, {grad_input_arg, grad_output_arg, input_arg}); + + TORCH_CHECK( + (input.ndimension() == 3 || input.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + + auto outputHeight = gradOutput.size(-2); + auto outputWidth = gradOutput.size(-1); + + const auto nInputPlane = input.size(-3); + const auto inputHeight = input.size(-2); + const auto inputWidth = input.size(-1); + + int dH = std::floor((float)2 * inputHeight / outputHeight) - + (inputHeight / outputHeight); + int dW = std::floor((float)2 * inputWidth / outputWidth) - + (inputWidth / outputWidth); + std::vector stride_vec = {dH, dW}; + + int kH = std::ceil((float)2 * inputHeight / outputHeight) - + (inputHeight / outputHeight); + int kW = std::ceil((float)2 * inputWidth / outputWidth) - + (inputWidth / outputWidth); + std::vector kernel_size_vec = {kH, kW}; + + int padH = (dH * (outputHeight - 1) + kH - inputHeight) / 2; + int padW = (dW * (outputWidth - 1) + kW - inputWidth) / 2; + std::vector padding_vec = {padH, padW}; + + bool is_3d = gradOutput.ndimension() == 3; + if (is_3d) { + gradOutput.resize_({1, nInputPlane, outputHeight, outputWidth}); + gradInput.resize_({1, nInputPlane, inputHeight, inputWidth}); + } + + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::BFloat16, + at::ScalarType::Half, + gradOutput.scalar_type(), + "adaptive_avg_pool2d_backward_xpu", + [&]() { + using accscalar_t = acc_type; + auto gyacc = gradOutput.packed_accessor64(); + auto gxacc = gradInput.packed_accessor64(); + + int64_t ohw01_shared_size = + ((inputHeight + inputWidth) * 2) * sizeof(int); + int64_t ikhw_shared_size = + (outputHeight + outputWidth) * sizeof(accscalar_t); + bool using_shared = + syclLocalMemSize() >= ohw01_shared_size + ikhw_shared_size; + + auto& q = getCurrentSYCLQueue(); + if (is_smf_channels_last(gradOutput)) { + if (using_shared) { + AdaptiveAvgPool2dBwdSLMKernelFunctor + kfn(gyacc, gxacc); + sycl_kernel_submit(kfn.glb_range(), kfn.loc_range(), q, kfn); + } else { + AdaptiveAvgPool2dBwdKernelFunctor kfn( + gyacc, gxacc); + sycl_kernel_submit(kfn.glb_range(), kfn.loc_range(), q, kfn); + } + } else { + if (using_shared) { + AdaptiveAvgPool2dBwdSLMKernelFunctor + kfn(gyacc, gxacc); + sycl_kernel_submit(kfn.glb_range(), kfn.loc_range(), q, kfn); + } else { + AdaptiveAvgPool2dBwdKernelFunctor kfn( + gyacc, gxacc); + sycl_kernel_submit(kfn.glb_range(), kfn.loc_range(), q, kfn); + } + } + }); + + if (is_3d) { + gradOutput.resize_({nInputPlane, outputHeight, outputWidth}); + gradInput.resize_({nInputPlane, inputHeight, inputWidth}); + } +} + +} // 
namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp index 0221ceec0..86fd7edfe 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp @@ -1,9 +1,9 @@ -#include #include #include #include #include #include +#include #include #include diff --git a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp index cb2c6b083..dacae3e68 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp @@ -153,8 +153,8 @@ void launch_adaptive_max_pool2d_kernel( void adaptive_max_pool2d_kernel( const Tensor& input, IntArrayRef output_size, - Tensor& output, - Tensor& indices) { + const Tensor& output, + const Tensor& indices) { int64_t osizeH = output_size[0]; int64_t osizeW = output_size[1]; @@ -327,7 +327,7 @@ void adaptive_max_pool2d_backward_kernel( const Tensor& grad_output, const Tensor& input, const Tensor& indices, - Tensor& grad_input) { + const Tensor& grad_input) { globalContext().alertNotDeterministic("adaptive_max_pool2d_backward_xpu"); const at::Tensor grad_output_ = grad_output.contiguous(); diff --git a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h index abba4e354..2714e6627 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h +++ b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h @@ -7,13 +7,13 @@ namespace at::native::xpu { TORCH_XPU_API void adaptive_max_pool2d_kernel( const Tensor& input, IntArrayRef output_size, - Tensor& output, - Tensor& indices); + const Tensor& output, + const Tensor& indices); TORCH_XPU_API void adaptive_max_pool2d_backward_kernel( const Tensor& grad_output, const Tensor& input, const Tensor& indices, - Tensor& grad_input); + const Tensor& grad_input); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/AmpKernels.h b/src/ATen/native/xpu/sycl/AmpKernels.h index 4f828c1f2..7c703e70e 100644 --- a/src/ATen/native/xpu/sycl/AmpKernels.h +++ b/src/ATen/native/xpu/sycl/AmpKernels.h @@ -1,5 +1,5 @@ #pragma once -#include +#include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp index 7373935aa..e7a2c2e1f 100644 --- a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp @@ -243,7 +243,7 @@ void launch_avg_pool2d_channels_last_kernel( const int stride_w, const int pad_h, const int pad_w, - Tensor& output, + const Tensor& output, const int divisor_override, const bool count_include_pad, const bool use_divisor) { @@ -291,7 +291,7 @@ void launch_avg_pool2d_kernel( const int stride_w, const int pad_h, const int pad_w, - Tensor& output, + const Tensor& output, const int divisor_override, const bool count_include_pad, const bool use_divisor) { @@ -548,7 +548,7 @@ void launch_avg_pool2d_backward_channels_last_kernel( const int stride_w, const int pad_h, const int pad_w, - Tensor& grad_input, + const Tensor& grad_input, const int divisor_override, bool count_include_pad, bool use_divisor) { @@ -599,7 +599,7 @@ void launch_avg_pool2d_backward_kernel( const int stride_w, const int pad_h, const int pad_w, - Tensor& grad_input, + const Tensor& grad_input, const int divisor_override, bool count_include_pad, bool 
use_divisor) { @@ -634,35 +634,24 @@ void launch_avg_pool2d_backward_kernel( void avg_pool2d_kernel( const Tensor& input_, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, + int64_t kH_, + int64_t kW_, + int64_t dH_, + int64_t dW_, + int64_t padH_, + int64_t padW_, bool ceil_mode, bool count_include_pad, c10::optional divisor_override, - Tensor& output) { - const int kH = safe_downcast(kernel_size[0]); - const int kW = kernel_size.size() == 1 - ? kH - : safe_downcast(kernel_size[1]); - - const int dH = stride.empty() ? kH : safe_downcast(stride[0]); - const int dW = stride.empty() ? kW - : stride.size() == 1 ? dH - : safe_downcast(stride[1]); - - const int padH = safe_downcast(padding[0]); - const int padW = - padding.size() == 1 ? padH : safe_downcast(padding[1]); - + const Tensor& output) { const int64_t nInputPlane = input_.size(-3); const int64_t inputHeight = input_.size(-2); const int64_t inputWidth = input_.size(-1); int64_t outputWidth = - pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode); + pooling_output_shape(inputWidth, kW_, padW_, dW_, 1, ceil_mode); int64_t outputHeight = - pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode); + pooling_output_shape(inputHeight, kH_, padH_, dH_, 1, ceil_mode); const auto memory_format = input_.suggest_memory_format(); Tensor input = input_.contiguous(memory_format); @@ -688,12 +677,12 @@ void avg_pool2d_kernel( inputWidth, outputHeight, outputWidth, - kH, - kW, - dH, - dW, - padH, - padW, + kH_, + kW_, + dH_, + dW_, + padH_, + padW_, output, divisor_override_value, count_include_pad, @@ -709,12 +698,12 @@ void avg_pool2d_kernel( inputWidth, outputHeight, outputWidth, - kH, - kW, - dH, - dW, - padH, - padW, + kH_, + kW_, + dH_, + dW_, + padH_, + padW_, output, divisor_override_value, count_include_pad, @@ -740,7 +729,7 @@ void avg_pool2d_backward_kernel( bool ceil_mode, bool count_include_pad, c10::optional divisor_override, - Tensor& gradInput) { + const Tensor& gradInput) { const int kH = safe_downcast(kernel_size[0]); const int kW = kernel_size.size() == 1 ? 
kH diff --git a/src/ATen/native/xpu/sycl/AveragePool2dKernels.h b/src/ATen/native/xpu/sycl/AveragePool2dKernels.h index 84842355d..7667fe021 100644 --- a/src/ATen/native/xpu/sycl/AveragePool2dKernels.h +++ b/src/ATen/native/xpu/sycl/AveragePool2dKernels.h @@ -1,18 +1,19 @@ -#pragma once - -#include +#include namespace at::native::xpu { TORCH_XPU_API void avg_pool2d_kernel( const Tensor& input_, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, + int64_t kH_, + int64_t kW_, + int64_t dH_, + int64_t dW_, + int64_t padH_, + int64_t padW_, bool ceil_mode, bool count_include_pad, c10::optional divisor_override, - Tensor& output); + const Tensor& output); TORCH_XPU_API void avg_pool2d_backward_kernel( const Tensor& gradOutput_, @@ -23,6 +24,6 @@ TORCH_XPU_API void avg_pool2d_backward_kernel( bool ceil_mode, bool count_include_pad, c10::optional divisor_override, - Tensor& gradInput); + const Tensor& gradInput); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp index 084a9b65f..5e14a0a94 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp @@ -1,8 +1,8 @@ -#include #include #include #include #include +#include #include #include #include @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -3060,7 +3061,7 @@ void batch_norm_mean_var( } // For some reason this isn't an actual operator but it exists anyway... - var_mean_out( + at::native::var_mean_out( save_var, save_mean, self, diff --git a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp index 34e50222e..2c2dd1550 100644 --- a/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.cpp @@ -1,6 +1,6 @@ -#include #include #include +#include #include diff --git a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp index 3224ba54f..e0e8487f6 100644 --- a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp @@ -1,6 +1,7 @@ -#include #include #include +#include + #include #include diff --git a/src/ATen/native/xpu/sycl/BinaryKernels.cpp b/src/ATen/native/xpu/sycl/BinaryKernels.cpp index 2902486bf..daafadd23 100644 --- a/src/ATen/native/xpu/sycl/BinaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryKernels.cpp @@ -1,6 +1,6 @@ -#include #include #include +#include #include #include diff --git a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp index 18bcffca6..3268ab90f 100644 --- a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.cpp @@ -17,7 +17,7 @@ struct LogicalAndFunctor { } }; -void logical_and_kernel(TensorIteratorBase& iter) { +void logical_and_kernel(TensorIterator& iter) { auto dtype = iter.common_dtype(); if (at::isComplexType(dtype)) { AT_DISPATCH_COMPLEX_TYPES(dtype, "logical_and_xpu", [&]() { @@ -40,7 +40,7 @@ struct LogicalOrFunctor { } }; -void logical_or_kernel(TensorIteratorBase& iter) { +void logical_or_kernel(TensorIterator& iter) { auto dtype = iter.common_dtype(); if (at::isComplexType(dtype)) { AT_DISPATCH_COMPLEX_TYPES(dtype, "logical_or_xpu", [&]() { @@ -62,7 +62,7 @@ struct LogicalXorFunctor { } }; -void logical_xor_kernel(TensorIteratorBase& iter) { +void logical_xor_kernel(TensorIterator& iter) { auto dtype = 
iter.common_dtype(); if (at::isComplexType(dtype)) { AT_DISPATCH_COMPLEX_TYPES(dtype, "logical_xor_xpu", [&]() { diff --git a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h index 82271e6a1..dce1a1a5e 100644 --- a/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h @@ -4,10 +4,10 @@ namespace at::native::xpu { -TORCH_XPU_API void logical_and_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logical_and_kernel(TensorIterator& iter); -TORCH_XPU_API void logical_or_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logical_or_kernel(TensorIterator& iter); -TORCH_XPU_API void logical_xor_kernel(TensorIteratorBase& iter); +TORCH_XPU_API void logical_xor_kernel(TensorIterator& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp index a7a676675..73732e02f 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp @@ -1,4 +1,3 @@ -#include #include #include #include diff --git a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp index 3e0989b24..d96e5064e 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp @@ -1,6 +1,6 @@ -#include #include #include +#include #include diff --git a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp index 9a5320c68..92eac3da2 100644 --- a/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp +++ b/src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp @@ -1,4 +1,3 @@ -#include #include #include #include diff --git a/src/ATen/native/xpu/sycl/Col2ImKernel.cpp b/src/ATen/native/xpu/sycl/Col2ImKernel.cpp index 1d64e34d6..d52a65fdf 100644 --- a/src/ATen/native/xpu/sycl/Col2ImKernel.cpp +++ b/src/ATen/native/xpu/sycl/Col2ImKernel.cpp @@ -1,5 +1,5 @@ -#include #include +#include #include #include diff --git a/src/ATen/native/xpu/sycl/Col2ImKernel.h b/src/ATen/native/xpu/sycl/Col2ImKernel.h index a910112e0..62d87b406 100644 --- a/src/ATen/native/xpu/sycl/Col2ImKernel.h +++ b/src/ATen/native/xpu/sycl/Col2ImKernel.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/CompareKernels.cpp b/src/ATen/native/xpu/sycl/CompareKernels.cpp index 1096dfa40..91c3ac614 100644 --- a/src/ATen/native/xpu/sycl/CompareKernels.cpp +++ b/src/ATen/native/xpu/sycl/CompareKernels.cpp @@ -1,7 +1,7 @@ -#include #include #include #include +#include #include diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp index bdddd3f44..dc2991cc5 100644 --- a/src/ATen/native/xpu/sycl/CopyKernel.cpp +++ b/src/ATen/native/xpu/sycl/CopyKernel.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -18,7 +18,7 @@ struct CopyScalarFunc { } }; -void copy_kernel(TensorIterator& iter) { +void copy_kernel(TensorIteratorBase& iter) { ScalarType dtype = iter.common_dtype(); if (isQIntType(dtype)) { AT_DISPATCH_QINT_TYPES(dtype, "copy_xpu", [&] { diff --git a/src/ATen/native/xpu/sycl/CopyKernel.h b/src/ATen/native/xpu/sycl/CopyKernel.h index 30232b27a..3a8e4d263 100644 --- a/src/ATen/native/xpu/sycl/CopyKernel.h +++ b/src/ATen/native/xpu/sycl/CopyKernel.h @@ -4,6 +4,6 @@ namespace at::native::xpu { -TORCH_XPU_API void copy_kernel(TensorIterator& 
iter); +TORCH_XPU_API void copy_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index 8c130a0b9..ba0283b8b 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -4,10 +4,10 @@ #pragma clang diagnostic ignored "-Wreturn-type" #pragma GCC diagnostic ignored "-Wreturn-type" -#include #include #include #include +#include #include #include @@ -498,8 +498,8 @@ void max_pool2d_with_indices_kernel( IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, - Tensor& output_, - Tensor& indices_) { + const Tensor& output_, + const Tensor& indices_) { NoNamesGuard guard; TensorArg output_arg{output_, "output", 1}; @@ -614,8 +614,8 @@ void max_pool2d_with_indices_kernel( } } -Tensor& max_pool2d_with_indices_backward_kernel( - Tensor& gradInput_, +void max_pool2d_with_indices_backward_kernel( + const Tensor& gradInput_, const Tensor& gradOutput_, const Tensor& input_, const Tensor& indices_, @@ -733,8 +733,6 @@ Tensor& max_pool2d_with_indices_backward_kernel( (!is_3d && !gradInput_.is_contiguous(smf))) { gradInput_.copy_(gradInput); } - - return gradInput_; } } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h index 9d827c642..d530560e6 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace at::native::xpu { @@ -11,11 +11,11 @@ TORCH_XPU_API void max_pool2d_with_indices_kernel( IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, - Tensor& output, - Tensor& indices); + const Tensor& output, + const Tensor& indices); -TORCH_XPU_API Tensor& max_pool2d_with_indices_backward_kernel( - Tensor& gradInput, +TORCH_XPU_API void max_pool2d_with_indices_backward_kernel( + const Tensor& gradInput, const Tensor& gradOutput, const Tensor& input, const Tensor& indices, diff --git a/src/ATen/native/xpu/sycl/DistanceKernels.cpp b/src/ATen/native/xpu/sycl/DistanceKernels.cpp index 3deddb8cf..5006dd8b4 100644 --- a/src/ATen/native/xpu/sycl/DistanceKernels.cpp +++ b/src/ATen/native/xpu/sycl/DistanceKernels.cpp @@ -1,6 +1,6 @@ -#include #include #include +#include #include diff --git a/src/ATen/native/xpu/sycl/DistanceKernels.h b/src/ATen/native/xpu/sycl/DistanceKernels.h index a53b84cdf..46a34d031 100644 --- a/src/ATen/native/xpu/sycl/DistanceKernels.h +++ b/src/ATen/native/xpu/sycl/DistanceKernels.h @@ -1,5 +1,5 @@ #pragma once -#include +#include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp b/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp index 1a01a7cfc..c3de4c593 100644 --- a/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp +++ b/src/ATen/native/xpu/sycl/DistributionBernoulli.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -6,6 +5,7 @@ #include #include #include +#include #include diff --git a/src/ATen/native/xpu/sycl/DistributionNormal.cpp b/src/ATen/native/xpu/sycl/DistributionNormal.cpp index 93b938210..3aacf6639 100644 --- a/src/ATen/native/xpu/sycl/DistributionNormal.cpp +++ b/src/ATen/native/xpu/sycl/DistributionNormal.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -8,6 +7,7 @@ #include #include #include +#include #include diff --git a/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp b/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp index 
index 3c15de1d9..e4698a723 100644
--- a/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp
+++ b/src/ATen/native/xpu/sycl/DistributionRandomKernel.cpp
@@ -1,4 +1,3 @@
-#include
 #include
 #include
 #include
@@ -8,6 +7,7 @@
 #include
 #include
 #include
+#include
 #include
diff --git a/src/ATen/native/xpu/sycl/DistributionTemplates.h b/src/ATen/native/xpu/sycl/DistributionTemplates.h
index db117c021..f5a5efdb5 100644
--- a/src/ATen/native/xpu/sycl/DistributionTemplates.h
+++ b/src/ATen/native/xpu/sycl/DistributionTemplates.h
@@ -13,6 +13,8 @@
 #include
 #include
+#include
+
 namespace at {
 namespace native {
 namespace xpu {
diff --git a/src/ATen/native/xpu/sycl/DistributionUniform.cpp b/src/ATen/native/xpu/sycl/DistributionUniform.cpp
index c38626909..17ff4d698 100644
--- a/src/ATen/native/xpu/sycl/DistributionUniform.cpp
+++ b/src/ATen/native/xpu/sycl/DistributionUniform.cpp
@@ -1,4 +1,3 @@
-#include
 #include
 #include
 #include
@@ -8,6 +7,7 @@
 #include
 #include
 #include
+#include
 #include
diff --git a/src/ATen/native/xpu/sycl/Dropout.cpp b/src/ATen/native/xpu/sycl/Dropout.cpp
index ddaee4d4c..54b1b4d4c 100644
--- a/src/ATen/native/xpu/sycl/Dropout.cpp
+++ b/src/ATen/native/xpu/sycl/Dropout.cpp
@@ -1,4 +1,3 @@
-#include
 #include
 #include
 #include
@@ -9,6 +8,10 @@
 #include
 #include
 #include
+#include
+
+#include
+#include
 #include
diff --git a/src/ATen/native/xpu/sycl/Embedding.cpp b/src/ATen/native/xpu/sycl/Embedding.cpp
index d905a4d97..4ea45b4c2 100644
--- a/src/ATen/native/xpu/sycl/Embedding.cpp
+++ b/src/ATen/native/xpu/sycl/Embedding.cpp
@@ -1,4 +1,3 @@
-#include
 #include
 #include
 #include
diff --git a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp
index f4f35061f..0e2d02b6a 100644
--- a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp
+++ b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp
@@ -1,6 +1,6 @@
-#include
 #include
 #include
+#include
 #include
 #include
diff --git a/src/ATen/native/xpu/sycl/EmbeddingBag.h b/src/ATen/native/xpu/sycl/EmbeddingBag.h
index b1ac0038c..07f9de97a 100644
--- a/src/ATen/native/xpu/sycl/EmbeddingBag.h
+++ b/src/ATen/native/xpu/sycl/EmbeddingBag.h
@@ -1,7 +1,7 @@
 #pragma once
-#include
 #include
+#include
 #include
 #include
diff --git a/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h b/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h
index 97ad6f0d0..f73dabb6b 100644
--- a/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h
+++ b/src/ATen/native/xpu/sycl/EmbeddingBagKernels.h
@@ -1,6 +1,6 @@
 #pragma once
-#include
+#include
 namespace at::native::xpu {
diff --git a/src/ATen/native/xpu/sycl/FillKernel.cpp b/src/ATen/native/xpu/sycl/FillKernel.cpp
index 22e845364..6dbb55c67 100644
--- a/src/ATen/native/xpu/sycl/FillKernel.cpp
+++ b/src/ATen/native/xpu/sycl/FillKernel.cpp
@@ -1,6 +1,6 @@
-#include
 #include
 #include
+#include
 #include
diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.cpp b/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.cpp
index 11046d93f..94571607c 100644
--- a/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.cpp
@@ -7,8 +7,9 @@
 #include
 #include
-namespace at::native::xpu {
+#include
+namespace at::native::xpu {
 template class Op>
 std::vector foreach_tensor_list_op(
     TensorList tensors1,
diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h b/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h
index d4448ca1b..2c501a2bc 100644
--- a/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h
+++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h
@@ -1,5 +1,5 @@
 #pragma once
-#include
+#include
 namespace at::native::xpu {
diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.cpp b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.cpp
index b37a4e786..fe8c06cd0 100644
--- a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.cpp
@@ -7,8 +7,9 @@
 #include
 #include
-namespace at::native::xpu {
+#include
+namespace at::native::xpu {
 template class Op>
 std::vector foreach_binary_op(
     TensorList tensors,
diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h
index 4aac91b47..00044fa7d 100644
--- a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h
+++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h
@@ -1,5 +1,5 @@
 #pragma once
-#include
+#include
 namespace at::native::xpu {
diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.cpp b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.cpp
index 21af7e81d..7f6c0ee99 100644
--- a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.cpp
@@ -7,6 +7,7 @@
 #include
 #include
+#include
 namespace at::native::xpu {
 template class Op>
diff --git a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h
index a863983dd..bafd220c2 100644
--- a/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h
+++ b/src/ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h
@@ -1,5 +1,5 @@
 #pragma once
-#include
+#include
 namespace at::native::xpu {
diff --git a/src/ATen/native/xpu/sycl/ForeachPointwiseKernels.cpp b/src/ATen/native/xpu/sycl/ForeachPointwiseKernels.cpp
index 97e5f8245..7385ab7e0 100644
--- a/src/ATen/native/xpu/sycl/ForeachPointwiseKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ForeachPointwiseKernels.cpp
@@ -1,14 +1,15 @@
-#include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
-namespace at::native::xpu {
+#include
+namespace at::native::xpu {
 template