PaddlePaddle · houj04 · Nov 17, 2023 · Nov 16, 2023 · Nov 16, 2023 · Nov 17, 2023
diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -841,6 +841,10 @@ XPUOpMap& get_kl2_ops() {
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"square",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+      {"squared_l2_norm",
+       XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+      {"squared_l2_norm_grad",
+       XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"squeeze2_grad",
        XPUKernelSet({phi::DataType::FLOAT64,
                      phi::DataType::INT64,

diff --git a/paddle/phi/backends/xpu/xpu3_op_list.cc b/paddle/phi/backends/xpu/xpu3_op_list.cc
@@ -801,6 +801,10 @@ XPUOpMap& get_kl3_ops() {
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"square",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+      {"squared_l2_norm",
+       XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+      {"squared_l2_norm_grad",
+       XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"squeeze2_grad",
        XPUKernelSet({phi::DataType::FLOAT64,
                      phi::DataType::INT64,

diff --git a/paddle/phi/kernels/xpu/squared_l2_norm_grad_kernel.cc b/paddle/phi/kernels/xpu/squared_l2_norm_grad_kernel.cc
@@ -0,0 +1,68 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/squared_l2_norm_kernel.h"
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#include "paddle/phi/common/memory_utils.h"
+
+namespace phi {
+
+// XPU grad of squared_l2_norm: dx = 2 * dout * x (dout has one element).
+template <typename T, typename Context>
+void SquaredL2NormGradKernel(const Context& dev_ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& dout,
+                             DenseTensor* dx) {
+  dev_ctx.template Alloc<T>(dx);
+
+  PADDLE_ENFORCE_EQ(
+      dout.numel(),
+      1,
+      phi::errors::InvalidArgument(
+          "Input(GRAD@Out) of SquaredL2NormGradOP should be a scalar."));
+
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  // xpu::scale needs a host float factor: copy dout's one element to CPU.
+  T dout_host = static_cast<T>(0);
+  memory_utils::Copy(CPUPlace(),
+                     static_cast<void*>(&dout_host),
+                     dev_ctx.GetPlace(),
+                     static_cast<const void*>(dout.data<T>()),
+                     sizeof(T));
+  // Widen to float before doubling so the factor is never formed in fp16.
+  const float grad_scale = static_cast<float>(dout_host) * 2.0f;
+
+  // int scale(Context* ctx, const T* x, T* y, int64_t len, bool
+  // bias_after_scale, float _scale, float _bias);
+  int r = xpu::scale(dev_ctx.x_context(),
+                     reinterpret_cast<const XPUType*>(x.data<T>()),
+                     reinterpret_cast<XPUType*>(dx->data<T>()),
+                     x.numel(),
+                     false,
+                     grad_scale,
+                     0.0f);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(squared_l2_norm_grad,  // same name as the op-list entry
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::SquaredL2NormGradKernel,
+                   float,  // dtypes mirror the FLOAT32/FLOAT16 op-list entry
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/xpu/squared_l2_norm_kernel.cc b/paddle/phi/kernels/xpu/squared_l2_norm_kernel.cc
@@ -0,0 +1,62 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/squared_l2_norm_kernel.h"
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+// XPU forward kernel of squared_l2_norm: reduces x with XDNN's
+// square_reduce_sum (is_sqrt = false) and stores the single result in `out`.
+template <typename T, typename Context>
+void SquaredL2NormKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         DenseTensor* out) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  const bool out_is_fp32 = std::is_same<T, float>::value;
+  T* out_data = dev_ctx.template Alloc<T>(out);
+
+  // square_reduce_sum always emits float, so a non-float T goes through a
+  // one-element float buffer taken from the guard and is cast afterwards.
+  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+  float* sum_fp32 = out_is_fp32 ? reinterpret_cast<float*>(out_data)
+                                : RAII_GUARD.alloc_l3_or_gm<float>(1);
+
+  // int square_reduce_sum(Context* ctx, const T* x, float* y, int64_t len, bool
+  // is_sqrt=false);
+  int r = xpu::square_reduce_sum<XPUType>(
+      dev_ctx.x_context(),
+      reinterpret_cast<const XPUType*>(x.data<T>()),
+      sum_fp32,
+      x.numel(),
+      false);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "square_reduce_sum");
+  if (out_is_fp32) return;
+
+  // int cast(Context* ctx, const TX* x, TY* y, int64_t len);
+  r = xpu::cast<float, XPUType>(
+      dev_ctx.x_context(), sum_fp32, reinterpret_cast<XPUType*>(out_data), 1);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(squared_l2_norm,  // same name as the op-list entry
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::SquaredL2NormKernel,
+                   float,  // dtypes mirror the FLOAT32/FLOAT16 op-list entry
+                   phi::dtype::float16) {}
diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py
@@ -234,11 +234,6 @@ def _squared_l2_norm(x):
 
     x = _cast_to_mp_type_if_enabled(x)
 
-    if core.is_compiled_with_xpu():
-        square = paddle.square(x)
-        sum_square = paddle.sum(square)
-        return sum_square
-
     if in_dynamic_or_pir_mode():
         return _C_ops.squared_l2_norm(x)
 

diff --git a/test/xpu/test_squared_l2_norm_op_xpu.py b/test/xpu/test_squared_l2_norm_op_xpu.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+from get_test_cover_info import (
+    XPUOpTestWrapper,
+    create_test_class,
+    get_xpu_op_support_types,
+)
+from op_test_xpu import XPUOpTest
+
+import paddle
+
+
+class XPUTestSquaredL2NormOp(XPUOpTestWrapper):  # squared_l2_norm XPU cases
+    def __init__(self):
+        self.op_name = 'squared_l2_norm'
+        self.use_dynamic_create_class = False
+
+    class TestSquaredL2NormOp(XPUOpTest):  # base case, input shape (13, 19)
+        def init(self):
+            # in_type is not set in this class; presumably injected by
+            # create_test_class -- TODO confirm.
+            self.dtype = self.in_type
+            self.place = paddle.XPUPlace(0)
+            self.op_type = 'squared_l2_norm'
+
+        def setUp(self):
+            self.init()
+            self.use_mkldnn = False
+            self.max_relative_error = 0.05  # also the cutoff in set_inputs
+            self.set_inputs()  # overridden by the _1 / _2 variants below
+            self.inputs = {'X': self.x}
+            self.outputs = {  # NumPy reference: square of the norm of x
+                'Out': np.array([np.square(np.linalg.norm(self.x))])
+            }
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+        def test_check_grad(self):
+            self.check_grad_with_place(self.place, ['X'], 'Out')
+
+        def set_inputs(self):
+            self.x = np.random.uniform(-1, 1, (13, 19)).astype(self.in_type)
+            # Entries with |x| below the cutoff are overwritten with 0.1.
+            self.x[np.abs(self.x) < self.max_relative_error] = 0.1
+
+    class TestSquaredL2NormOp_1(TestSquaredL2NormOp):  # (8, 128, 24) input
+        def set_inputs(self):
+            self.x = np.random.uniform(-0.2, 0.2, (8, 128, 24)).astype(
+                self.in_type
+            )
+            # NOTE(review): the replacement 0.02 is itself below the 0.05
+            # cutoff -- confirm this is intended.
+            self.x[np.abs(self.x) < self.max_relative_error] = 0.02
+
+    class TestSquaredL2NormOp_2(TestSquaredL2NormOp):  # (2, 128, 256) input
+        def set_inputs(self):
+            self.x = np.random.uniform(-0.1, 0.1, (2, 128, 256)).astype(
+                self.in_type
+            )
+            # With range (-0.1, 0.1) and cutoff 0.05, many entries become 0.01.
+            self.x[np.abs(self.x) < self.max_relative_error] = 0.01
+
+
+support_types = get_xpu_op_support_types('squared_l2_norm')
+for stype in support_types:  # one create_test_class call per returned type
+    create_test_class(globals(), XPUTestSquaredL2NormOp, stype)
+
+if __name__ == "__main__":
+    paddle.enable_static()
+    paddle.seed(10)  # NOTE(review): inputs use np.random, not seeded here
+    unittest.main()