[XPU][PHI Kernels] bind rsqrt, bitwise_or, arange_tensor for xpu #58950

Merged: 3 commits, Nov 16, 2023
7 changes: 7 additions & 0 deletions paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -43,6 +43,10 @@ XPUOpMap& get_kl2_ops() {
{"adagrad", XPUKernelSet({phi::DataType::FLOAT32})},
{"addcmul_xpu",
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
{"arange_tensor",
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::INT32,
phi::DataType::INT64})},
Contributor Author:
PR #58381 renamed the kernel from arange to arange_tensor, so XPU could no longer find the corresponding kernel entry in xpu2/3_op_list, which triggered a fallback.
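A minimal repro sketch of the fallback scenario (illustrative, not from the PR; assumes an XPU build of Paddle and that Tensor start/end/step arguments route to the arange_tensor kernel):

```python
# Before this fix, the "arange_tensor" lookup missed in xpu2/3_op_list and the
# op fell back to CPU; with the op-list entries above it runs natively on XPU.
import paddle

paddle.set_device("xpu")
start = paddle.to_tensor(0, dtype="int64")
end = paddle.to_tensor(10, dtype="int64")
step = paddle.to_tensor(2, dtype="int64")
out = paddle.arange(start, end, step)  # Tensor args dispatch to arange_tensor
print(out.numpy())  # expected: [0 2 4 6 8]
```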

{"arg_max",
XPUKernelSet({phi::DataType::INT32,
phi::DataType::FLOAT32,
@@ -98,6 +102,7 @@ XPUOpMap& get_kl2_ops() {
{"bilinear_interp_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"bitwise_not", XPUKernelSet({phi::DataType::BOOL})},
{"bitwise_and", XPUKernelSet({phi::DataType::BOOL})},
{"bitwise_or", XPUKernelSet({phi::DataType::BOOL})},
{"broadcast", XPUKernelSet({phi::DataType::FLOAT32})},
{"c_allgather",
XPUKernelSet({phi::DataType::FLOAT16,
@@ -715,6 +720,8 @@ XPUOpMap& get_kl2_ops() {
{"roi_align_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"roll", XPUKernelSet({phi::DataType::FLOAT32})},
{"roll_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"rsqrt", XPUKernelSet({phi::DataType::FLOAT32})},
{"rsqrt_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"scale",
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::FLOAT16,
7 changes: 7 additions & 0 deletions paddle/phi/backends/xpu/xpu3_op_list.cc
@@ -38,6 +38,10 @@ XPUOpMap& get_kl3_ops() {
{"adagrad", XPUKernelSet({phi::DataType::FLOAT32})},
{"addcmul_xpu",
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
{"arange_tensor",
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::INT32,
phi::DataType::INT64})},
{"arg_max",
XPUKernelSet({phi::DataType::INT32,
phi::DataType::FLOAT32,
@@ -92,6 +96,7 @@ XPUOpMap& get_kl3_ops() {
{"bilinear_interp_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"bitwise_not", XPUKernelSet({phi::DataType::BOOL})},
{"bitwise_and", XPUKernelSet({phi::DataType::BOOL})},
{"bitwise_or", XPUKernelSet({phi::DataType::BOOL})},
{"broadcast", XPUKernelSet({phi::DataType::FLOAT32})},
{"c_allgather",
XPUKernelSet({phi::DataType::FLOAT16,
@@ -675,6 +680,8 @@ XPUOpMap& get_kl3_ops() {
{"roi_align_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"roll", XPUKernelSet({phi::DataType::FLOAT32})},
{"roll_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"rsqrt", XPUKernelSet({phi::DataType::FLOAT32})},
{"rsqrt_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"scale",
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::FLOAT16,
17 changes: 17 additions & 0 deletions paddle/phi/kernels/xpu/activation_grad_kernel.cc
@@ -611,13 +611,29 @@ struct XPUCosGradFunctor : public funcs::BaseActivationFunctor<T> {
}
};

template <typename T>
struct XPURsqrtGradFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor* x,
const DenseTensor* out,
const DenseTensor* dout,
DenseTensor* dx) const {
int r = xpu_activation_backward<Context, T, XPUType>(
dev_ctx, x, out, dout, dx, xpu::rsqrt_grad<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "rsqrt_grad");
}
};
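A quick derivation of the backward rule (standard calculus; presumably what xpu::rsqrt_grad evaluates on device) shows it depends only on out, matching the DEPOUT registration below:

```latex
\mathrm{out} = x^{-1/2}, \qquad
\frac{d\,\mathrm{out}}{dx} = -\tfrac{1}{2}\,x^{-3/2} = -\tfrac{1}{2}\,\mathrm{out}^{3}
\quad\Longrightarrow\quad
dx = -\tfrac{1}{2}\,\mathrm{out}^{3}\,d\mathrm{out}
```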

DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp, XPUExpGradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, XPUReciprocalGradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, XPUSigmoidGradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, XPUSqrtGradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, XPUTanhGradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, XPUReluGradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu6, XPURelu6GradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, XPURsqrtGradFunctor);

DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, XPULogGradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPX(Square, XPUSquareGradFunctor);
@@ -721,5 +737,6 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(softplus_grad, SoftplusGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sin_grad, SinGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(cos_grad, CosGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_grad, RsqrtGradKernel)

PD_REGISTER_KERNEL(pow_grad, XPU, ALL_LAYOUT, phi::PowGradKernel, float) {}
15 changes: 15 additions & 0 deletions paddle/phi/kernels/xpu/activation_kernel.cc
@@ -514,6 +514,19 @@ struct XPUCosFunctor : public funcs::BaseActivationFunctor<T> {
}
};

template <typename T>
struct XPURsqrtFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) const {
int ret = xpu_activation_func<Context, T, XPUType>(
dev_ctx, x, out, xpu::rsqrt<XPUType>);
PADDLE_ENFORCE_XDNN_SUCCESS(ret, "rsqrt");
}
};

DEFINE_XPU_ACTIVATION_KERNEL(Exp, XPUExpFunctor)
DEFINE_XPU_ACTIVATION_KERNEL(Floor, XPUFloorFunctor)
DEFINE_XPU_ACTIVATION_KERNEL(Log, XPULogFunctor)
@@ -526,6 +539,7 @@ DEFINE_XPU_ACTIVATION_KERNEL(Tanh, XPUTanhFunctor)
DEFINE_XPU_ACTIVATION_KERNEL(Silu, XPUSiluFunctor)
DEFINE_XPU_ACTIVATION_KERNEL(Sin, XPUSinFunctor)
DEFINE_XPU_ACTIVATION_KERNEL(Cos, XPUCosFunctor)
DEFINE_XPU_ACTIVATION_KERNEL(Rsqrt, XPURsqrtFunctor)

DEFINE_XPU_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Mish, XPUMishFunctor, threshold)
DEFINE_XPU_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu,
@@ -617,3 +631,4 @@ PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel)
PD_REGISTER_ACTIVATION_KERNEL(softplus, SoftplusKernel)
PD_REGISTER_ACTIVATION_KERNEL(sin, SinKernel)
PD_REGISTER_ACTIVATION_KERNEL(cos, CosKernel)
PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel)
20 changes: 6 additions & 14 deletions paddle/phi/kernels/xpu/arange_kernel.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/kernels/arange_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"

#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/range_function.h"
@@ -32,19 +33,11 @@ void ArangeTensorKernel(const Context& dev_ctx,
int64_t size = 0;
phi::funcs::GetSize(start_value, end_value, step_value, &size);
out->Resize(phi::make_ddim({size}));
dev_ctx.template Alloc<T>(out);

DenseTensor out_cpu;
out_cpu.Resize({out->numel()});
dev_ctx.template HostAlloc<T>(&out_cpu);
T* out_cpu_data = out_cpu.data<T>();

T value = start_value;
for (int64_t i = 0; i < size; ++i) {
out_cpu_data[i] = value;
value += step_value;
}
phi::Copy(dev_ctx, out_cpu, out->place(), true, out);
auto* out_data = dev_ctx.template Alloc<T>(out);

int ret = xpu::range<T>(
dev_ctx.x_context(), out_data, start_value, step_value, size);
PADDLE_ENFORCE_XDNN_SUCCESS(ret, "range");
Contributor Author:
Replaced the original CPU implementation with xpu::range.
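For reference, a hedged sketch of the semantics involved (inferred from the removed host-side loop, not from XDNN documentation): GetSize counts the elements of the half-open interval [start, end) with stride step, and the device-side range call fills out[i] = start + i * step.

```python
# Reference semantics for phi::funcs::GetSize and xpu::range, as assumed from
# the removed host loop: out[i] = start + i * step for i in range(size).
import math
import numpy as np

def get_size(start, end, step):
    # Number of elements in [start, end) with stride step.
    return max(0, math.ceil((end - start) / step))

def range_reference(start, end, step):
    size = get_size(start, end, step)
    return start + step * np.arange(size)

assert np.array_equal(range_reference(0, 10, 3), np.array([0, 3, 6, 9]))
```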

}

} // namespace phi
@@ -54,7 +47,6 @@ PD_REGISTER_KERNEL(arange_tensor,
ALL_LAYOUT,
phi::ArangeTensorKernel,
float,
double,
int,
int64_t) {
kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
10 changes: 10 additions & 0 deletions paddle/phi/kernels/xpu/bitwise.cc
@@ -44,7 +44,17 @@ void BitwiseAndKernel(const Context& ctx,
// counterpart. Need to be changed when adding support to other types.
LogicalAndKernel<T, Context>(ctx, x, y, out);
}

template <typename T, typename Context>
void BitwiseOrKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
// Same reason as bitwise_and
LogicalOrKernel<T, Context>(ctx, x, y, out);
}
Contributor Author:
bitwise_or already has an existing unit test, so no new test is added here.
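The delegation is sound because for bool operands bitwise OR and logical OR coincide element-wise; a quick NumPy check (illustrative only, not from the PR):

```python
# For bool inputs, bitwise OR and logical OR agree element-wise, which is why
# the XPU kernel can safely delegate to LogicalOrKernel for the BOOL dtype.
import numpy as np

x = np.array([True, False, True, False])
y = np.array([True, True, False, False])
assert np.array_equal(np.bitwise_or(x, y), np.logical_or(x, y))
```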

} // namespace phi

PD_REGISTER_KERNEL(bitwise_not, XPU, ALL_LAYOUT, phi::BitwiseNotKernel, bool) {}
PD_REGISTER_KERNEL(bitwise_and, XPU, ALL_LAYOUT, phi::BitwiseAndKernel, bool) {}
PD_REGISTER_KERNEL(bitwise_or, XPU, ALL_LAYOUT, phi::BitwiseOrKernel, bool) {}
18 changes: 10 additions & 8 deletions paddle/phi/kernels/xpu/embedding_grad_kernel.cc
@@ -28,6 +28,7 @@ void EmbeddingGradKernel(const Context& ctx,
const DenseTensor& out_grad,
int64_t padding_idx,
DenseTensor* weight_grad) {
using XPUT = typename XPUTypeTrait<T>::Type;
DDim table_dim;
table_dim = weight.dims();

@@ -62,14 +63,15 @@
int ym = static_cast<int>(ids_numel);
int n = d_table_t->dims()[1];

int r = xpu::embedding_grad<T, int64_t>(dev_ctx.x_context(),
d_output_data,
ids_data,
d_table_data,
xm,
n,
ym,
padding_idx);
int r = xpu::embedding_grad<XPUT, int64_t>(
dev_ctx.x_context(),
reinterpret_cast<const XPUT*>(d_output_data),
ids_data,
reinterpret_cast<XPUT*>(d_table_data),
xm,
n,
ym,
padding_idx);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "embedding_grad");
}

46 changes: 46 additions & 0 deletions test/xpu/test_activation_op_xpu.py
@@ -1373,5 +1373,51 @@ def init_config(self):
for stype in support_types:
create_test_class(globals(), XPUTestCosOP, stype)


class XPUTestRsqrtOP(XPUOpTestWrapper):
def __init__(self):
self.op_name = "rsqrt"
self.use_dynamic_create_class = False

class XPUTestRsqrtBase(TestActivationOPBase):
def set_case(self):
self.op_type = "rsqrt"
self.dtype = self.in_type
self.init_config()
out = np.reciprocal(np.sqrt(self.x))

self.inputs = {'X': self.x}
self.outputs = {'Out': out}
self.attrs = {'use_xpu': True}

def init_config(self):
self.x = np.random.uniform(0.01, 4, [11, 17]).astype(self.dtype)

class XPUTestRsqrt_ZeroDim(XPUTestRsqrtBase):
def init_config(self):
self.x = np.random.uniform(0.01, 4, []).astype(self.dtype)

class XPUTestRsqrt2(XPUTestRsqrtBase):
def init_config(self):
self.x = np.random.uniform(0.01, 4, [1024, 8]).astype(self.dtype)

class XPUTestRsqrt3(XPUTestRsqrtBase):
def init_config(self):
self.x = np.random.uniform(0.01, 4, [4, 512, 15, 15]).astype(
self.dtype
)

class XPUTestRsqrt4(XPUTestRsqrtBase):
def init_config(self):
self.x = np.random.uniform(0.01, 4, [4, 256, 22, 22]).astype(
self.dtype
)


support_types = get_xpu_op_support_types('rsqrt')
for stype in support_types:
create_test_class(globals(), XPUTestRsqrtOP, stype)


if __name__ == "__main__":
unittest.main()