From 4f57da5fa6866a81f47ba90a8c9573648bdff11d Mon Sep 17 00:00:00 2001
From: Zhou Wei <1183042833@qq.com>
Date: Thu, 17 Nov 2022 11:19:58 +0800
Subject: [PATCH 1/5] [Zero-Dim] temporarily revert create_scalar due to input
 0D is not fully supported (#48058)

---
 python/paddle/fluid/layers/math_op_patch.py                 | 3 ++-
 python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py | 6 +++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py
index fb3979434347f4..f9ba6498671617 100644
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
@@ -99,7 +99,8 @@ def create_tensor(block, value, dtype, shape):
         return var
 
     def create_scalar(block, value, dtype):
-        return create_tensor(block, value, dtype, shape=[])
+        # TODO(zhouwei): change shape back to [] (a 0-D Tensor) once 0-D input is fully supported
+        return create_tensor(block, value, dtype, shape=[1])
 
     def create_tensor_with_batchsize(ref_var, value, dtype):
         assert isinstance(ref_var, Variable)
diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py
index c85f5aec42e9f6..174172b026f21f 100644
--- a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py
@@ -350,7 +350,7 @@ def test_dygraph_binary(self):
 
         paddle.enable_static()
 
-    def test_static_unary(self):
+    def test_static_binary(self):
         paddle.enable_static()
         for api in binary_api_list + binary_api_list_without_grad:
             main_prog = fluid.Program()
@@ -377,15 +377,19 @@ def test_static_unary(self):
                 # Test runtime shape
                 self.assertEqual(out_np.shape, ())
 
+                # TODO(zhouwei): re-enable this case once create_scalar creates a 0-D Tensor (shape [])
                 # 2) x is 0D , y is scalar
+                '''
                 x = paddle.rand([])
                 y = 0.5
                 x.stop_gradient = False
+                print(api)
                 if isinstance(api, dict):
                     out = getattr(paddle.static.Variable, api['cls_method'])(
                         x, y
                     )
                     self.assertEqual(out.shape, ())
+                '''
 
         for api in binary_int_api_list_without_grad:
             main_prog = fluid.Program()

From e5ed5257083b92b018330812c33c746bae26fb41 Mon Sep 17 00:00:00 2001
From: Yuang Liu <liuyuang@baidu.com>
Date: Thu, 17 Nov 2022 11:22:47 +0800
Subject: [PATCH 2/5] Support bfloat16 for adamw and adam optimizer. Fit the lr
 for pure bf16 training with tensor fusion. (#48041)

* add bfloat16 for adamw

* set lr not to bfloat16 for pure bf16 training

* update the logic

* update the adamw optimizer

* support bfloat for adam
---
 paddle/fluid/pybind/eager_functions.cc |  3 ++-
 paddle/phi/kernels/gpu/adamw_kernel.cu |  4 ++-
 python/paddle/optimizer/adam.py        | 34 ++++++++++++--------------
 python/paddle/optimizer/adamw.py       | 17 ++++++-------
 python/paddle/optimizer/optimizer.py   | 26 +++++++++++++++++---
 5 files changed, 51 insertions(+), 33 deletions(-)

diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc
index cdace567b2e9d4..3389daf330c7c8 100644
--- a/paddle/fluid/pybind/eager_functions.cc
+++ b/paddle/fluid/pybind/eager_functions.cc
@@ -268,7 +268,8 @@ PyObject* eager_api_get_grads_types(PyObject* self,
     if (meta && grad.initialized()) {
       if (grad.is_dense_tensor() &&
           (tensor.dtype() == paddle::experimental::DataType::FLOAT32 ||
-           tensor.dtype() == paddle::experimental::DataType::FLOAT16)) {
+           tensor.dtype() == paddle::experimental::DataType::FLOAT16 ||
+           tensor.dtype() == paddle::experimental::DataType::BFLOAT16)) {
         ret.emplace_back(
             paddle::framework::TransToProtoVarType(tensor.dtype()));
       }
diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu
index 9ddaacdd5cc6bb..6994c83f53624a 100644
--- a/paddle/phi/kernels/gpu/adamw_kernel.cu
+++ b/paddle/phi/kernels/gpu/adamw_kernel.cu
@@ -21,6 +21,7 @@
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
@@ -300,7 +301,8 @@ PD_REGISTER_KERNEL(adamw,
                    phi::AdamwDenseKernel,
                    float,
                    double,
-                   phi::dtype::float16) {
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
   // Skip beta1_pow, beta2_pow, skip_update data transform
   kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND);
   kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND);
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index 74499b05f24ae5..aa76fb82759f18 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -28,7 +28,7 @@
 
 __all__ = []
 
-GRAD_TYPES = [int(paddle.float32), int(paddle.float16)]
+GRAD_TYPES = [int(paddle.float32), int(paddle.float16), int(paddle.bfloat16)]
 
 
 class Adam(Optimizer):
@@ -265,8 +265,8 @@ def _get_accumulator(self, name, param):
         """
         if self._name is not None:
             name = self._name + "_" + name
-        find_master = (
-            self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
+        find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
+            param.dtype
         )
         target_param = (
             self._master_weights[param.name] if find_master else param
@@ -285,10 +285,7 @@ def _get_accumulator(self, name, param):
 
     def _add_moments_pows(self, p):
         acc_dtype = p.dtype
-        if (
-            acc_dtype == core.VarDesc.VarType.FP16
-            or acc_dtype == core.VarDesc.VarType.BF16
-        ):
+        if self._is_dtype_fp16_or_bf16(acc_dtype):
             acc_dtype = core.VarDesc.VarType.FP32
         self._add_accumulator(self._moment1_acc_str, p, dtype=acc_dtype)
         self._add_accumulator(self._moment2_acc_str, p, dtype=acc_dtype)
@@ -322,16 +319,16 @@ def _create_accumulators(self, block, parameters):
 
         # Create accumulator tensors for first and second moments
         for p in parameters:
-            if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
+            if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
                 master_p = self._create_master_weight(p)
                 self._add_moments_pows(master_p)
                 continue
             if (
-                p.dtype == core.VarDesc.VarType.FP16
+                self._is_dtype_fp16_or_bf16(p.dtype)
                 and not self._multi_precision
             ):
                 warnings.warn(
-                    "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
+                    "Accumulating with FP16 or BF16 in optimizer can lead to poor accuracy or slow convergence."
                     "Consider using multi_precision=True option of the Adam optimizer."
                 )
             self._add_moments_pows(p)
@@ -353,9 +350,8 @@ def _append_optimize_op(self, block, param_and_grad):
         beta2_pow_acc = self._get_accumulator(
             self._beta2_pow_acc_str, param_and_grad[0]
         )
-        find_master = (
-            self._multi_precision
-            and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
+        find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
+            param_and_grad[0].dtype
         )
         master_weight = (
             self._master_weights[param_and_grad[0].name]
@@ -571,7 +567,7 @@ def step(self):
 
     def _multi_tensor_init(self, target_block, parameters, param_group_idx):
         """
-        All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, float32).
+        All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (bfloat16, float16, float32).
         This function will be overridden in the corresponding optimizer file.
         Args:
             target_block: the block in which the loss tensor is present
@@ -604,7 +600,7 @@ def _multi_tensor_init(self, target_block, parameters, param_group_idx):
                 self._beta2_pow_acc_dict['FP32_LODTensor'][
                     param_group_idx
                 ].append(beta2_pow_acc)
-            elif param.dtype == paddle.float16:
+            elif self._is_dtype_fp16_or_bf16(param.dtype):
                 self._param_dict['FP16_LODTensor'][param_group_idx].append(
                     param
                 )
@@ -628,7 +624,7 @@ def _multi_tensor_init(self, target_block, parameters, param_group_idx):
                     self._master_weight_dict['FP16_LODTensor'] = None
             else:
                 raise ValueError(
-                    "Now multi_tensor_momentum only support fp32 and fp16 parameters and grad is LOD_TENSOR."
+                    "Now multi_tensor_momentum only support fp32, fp16 or bf16 parameters and grad is LOD_TENSOR."
                 )
 
     def _append_optimize_multi_tensor_op(
@@ -656,7 +652,7 @@ def _append_optimize_multi_tensor_op(
                         )
                         lr = self._create_param_lr(parameters_and_grads[index])
                         lr_dict['FP32_LODTensor'].append(lr)
-                    elif tp == GRAD_TYPES[1]:
+                    elif tp == GRAD_TYPES[1] or tp == GRAD_TYPES[2]:
                         grad_dict['FP16_LODTensor'].append(
                             parameters_and_grads[index][1]
                         )
@@ -678,7 +674,7 @@ def _append_optimize_multi_tensor_op(
                             lr = self._create_param_lr(param_and_grad)
                             lr_dict['FP32_LODTensor'].append(lr)
                         elif (
-                            param_and_grad[0].dtype == paddle.float16
+                            self._is_dtype_fp16_or_bf16(param_and_grad[0].dtype)
                             and param_and_grad[1].type
                             == core.VarDesc.VarType.LOD_TENSOR
                         ):
@@ -711,7 +707,7 @@ def _append_optimize_multi_tensor_op(
                         lr = self._create_param_lr(param_and_grad)
                         lr_dict['FP32_LODTensor'].append(lr)
                     elif (
-                        param_and_grad[0].dtype == paddle.float16
+                        self._is_dtype_fp16_or_bf16(param_and_grad[0].dtype)
                         and param_and_grad[1].type
                         == core.VarDesc.VarType.LOD_TENSOR
                     ):
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index dca844b6682759..5424331a71fa93 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -369,8 +369,8 @@ def _get_accumulator(self, name, param):
         """
         if self._name is not None:
             name = self._name + "_" + name
-        find_master = (
-            self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
+        find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
+            param.dtype
         )
         target_param = (
             self._master_weights[param.name] if find_master else param
@@ -389,7 +389,7 @@ def _get_accumulator(self, name, param):
 
     def _add_moments_pows(self, p):
         acc_dtype = p.dtype
-        if acc_dtype == core.VarDesc.VarType.FP16:
+        if self._is_dtype_fp16_or_bf16(acc_dtype):
             acc_dtype = core.VarDesc.VarType.FP32
         self._add_accumulator(self._moment1_acc_str, p, dtype=acc_dtype)
         self._add_accumulator(self._moment2_acc_str, p, dtype=acc_dtype)
@@ -423,16 +423,16 @@ def _create_accumulators(self, block, parameters):
 
         # Create accumulator tensors for first and second moments
         for p in parameters:
-            if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
+            if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
                 master_p = self._create_master_weight(p)
                 self._add_moments_pows(master_p)
                 continue
             if (
-                p.dtype == core.VarDesc.VarType.FP16
+                self._is_dtype_fp16_or_bf16(p.dtype)
                 and not self._multi_precision
             ):
                 warnings.warn(
-                    "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
+                    "Accumulating with FP16 or BF16 in optimizer can lead to poor accuracy or slow convergence."
                     "Consider using multi_precision=True option of the Adam optimizer."
                 )
             self._add_moments_pows(p)
@@ -463,9 +463,8 @@ def _append_optimize_op(self, block, param_and_grad):
         beta2_pow_acc = self._get_accumulator(
             self._beta2_pow_acc_str, param_and_grad[0]
         )
-        find_master = (
-            self._multi_precision
-            and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
+        find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
+            param_and_grad[0].dtype
         )
         master_weight = (
             self._master_weights[param_and_grad[0].name]
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index 26ae5b50269b2f..59663bb8190886 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -421,15 +421,21 @@ def get_opti_var_name_list(self):
         return self._opti_name_list
 
     def _create_global_learning_rate(self):
-        # lr var can't be float16, for pure fp16 training, should extra handle the dtype for lr
+        # The lr var can't be float16 or bfloat16; for pure fp16 or bf16 training, the lr dtype needs extra handling
         _lr_dtype = (
             paddle.get_default_dtype() if self._dtype is None else self._dtype
         )
         _lr_dtype = (
             paddle.float32
             if (
-                paddle.get_default_dtype() != "float16"
-                and _lr_dtype == paddle.float16
+                (
+                    paddle.get_default_dtype() != "float16"
+                    and _lr_dtype == paddle.float16
+                )
+                or (
+                    paddle.get_default_dtype() != "bfloat16"
+                    and _lr_dtype == paddle.bfloat16
+                )
             )
             else _lr_dtype
         )
@@ -1526,3 +1532,17 @@ def _append_optimize_multi_tensor_op(
         For Multi Tensor, append optimize merged_operator to block.
         """
         pass
+
+    def _is_dtype_fp16_or_bf16(self, dtype):
+        """
+        Check whether the dtype is fp16 or bf16.
+        :param dtype: instance of core.VarDesc.VarType
+        :return: True if dtype is one of fp16 or bf16, False otherwise
+        """
+        assert isinstance(
+            dtype, core.VarDesc.VarType
+        ), "The dtype should be an instance of core.VarDesc.VarType."
+        return (
+            dtype == core.VarDesc.VarType.FP16
+            or dtype == core.VarDesc.VarType.BF16
+        )

From f62bd3b490b151fce074d1cd11389161b1b0acbd Mon Sep 17 00:00:00 2001
From: huangjiyi <43315610+huangjiyi@users.noreply.github.com>
Date: Thu, 17 Nov 2022 11:29:36 +0800
Subject: [PATCH 3/5] [PHI decoupling] move "paddle/fluid/operators/math.h" to
 phi (#48062)

* rm "paddle/fluid/operators/math.h" in phi

* rm "paddle/fluid/operators/math.h" in fluid
---
 paddle/fluid/operators/cross_entropy_op.h     |  4 +--
 paddle/fluid/operators/dequantize_log_op.cu   |  1 -
 .../detection/sigmoid_focal_loss_op.cu        | 32 +++++++++++--------
 paddle/fluid/operators/math/cross_entropy.cu  | 13 ++++----
 .../sequence_ops/sequence_softmax_op.cu       |  6 ++--
 paddle/phi/kernels/cpu/bce_loss_kernel.cc     |  7 ++--
 .../phi/kernels/cpu/nll_loss_grad_kernel.cc   |  2 +-
 paddle/phi/kernels/funcs/functors.h           |  8 ++---
 .../operators => phi/kernels/funcs}/math.h    | 20 ++++++------
 paddle/phi/kernels/gpu/nll_loss.h             |  2 +-
 .../gpu/sigmoid_cross_entropy_with_logits.h   |  2 +-
 ...d_cross_entropy_with_logits_grad_kernel.cu |  4 +--
 ...igmoid_cross_entropy_with_logits_kernel.cu |  5 ++-
 paddle/phi/kernels/impl/selu_kernel_impl.h    |  4 +--
 14 files changed, 55 insertions(+), 55 deletions(-)
 rename paddle/{fluid/operators => phi/kernels/funcs}/math.h (69%)

diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h
index 2949dc8d1fb2a1..4dcaf7b99f0914 100644
--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
@@ -15,9 +15,9 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/phi/kernels/funcs/math.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
@@ -190,7 +190,7 @@ struct HardLabelCrossEntropyForwardFunctor {
                      label);
 
       auto match_x = x_[idx * feature_size_ + label];
-      y_[idx] = -math::TolerableValue<T>()(real_log(match_x));
+      y_[idx] = -math::TolerableValue<T>()(phi::funcs::real_log(match_x));
       match_x_[idx] = match_x;
     } else {
       y_[idx] = 0;
diff --git a/paddle/fluid/operators/dequantize_log_op.cu b/paddle/fluid/operators/dequantize_log_op.cu
index 360871f9e7251b..4a1976f6fdd685 100644
--- a/paddle/fluid/operators/dequantize_log_op.cu
+++ b/paddle/fluid/operators/dequantize_log_op.cu
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/dequantize_log_op.h"
-#include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/core/hostdevice.h"
 
diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
index bad93fd22b2e9b..76a47581e9f727 100644
--- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
+++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h"
-#include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/kernels/funcs/math.h"
 
 namespace paddle {
 namespace operators {
@@ -55,15 +55,16 @@ __global__ void GPUSigmoidFocalLossForward(const T *x_data,
     T s_pos = alpha / fg_num;
 
     // p = 1. / 1. + expf(-x)
-    T p = 1. / (1. + real_exp(-x));
+    T p = 1. / (1. + phi::funcs::real_exp(-x));
 
     // (1 - p)**gamma * log(p)
     T term_pos = std::pow(static_cast<T>(1. - p), gamma) *
-                 real_log(p > FLT_MIN ? p : FLT_MIN);
+                 phi::funcs::real_log(p > FLT_MIN ? p : FLT_MIN);
     // p**gamma * log(1 - p)
-    T term_neg =
-        std::pow(p, gamma) *
-        (-1. * x * (x >= 0) - real_log(1. + real_exp(x - 2. * x * (x >= 0))));
+    T term_neg = std::pow(p, gamma) *
+                 (-1. * x * (x >= 0) -
+                  phi::funcs::real_log(
+                      1. + phi::funcs::real_exp(x - 2. * x * (x >= 0))));
 
     out_data[i] = 0.0;
     out_data[i] += -c_pos * term_pos * s_pos;
@@ -96,17 +97,20 @@ __global__ void GPUSigmoidFocalLossBackward(const T *x_data,
     T c_pos = static_cast<T>(g == (d + 1));
     T c_neg = static_cast<T>((g != -1) & (g != (d + 1)));
 
-    T p = 1. / (1. + real_exp(-x));
+    T p = 1. / (1. + phi::funcs::real_exp(-x));
 
     // (1-p)**g * (1 - p - g*p*log(p))
-    T term_pos = std::pow(static_cast<T>(1. - p), gamma) *
-                 (1. - p - (p * gamma * real_log(p > FLT_MIN ? p : FLT_MIN)));
+    T term_pos =
+        std::pow(static_cast<T>(1. - p), gamma) *
+        (1. - p -
+         (p * gamma * phi::funcs::real_log(p > FLT_MIN ? p : FLT_MIN)));
     // (p**g) * (g*(1-p)*log(1-p) - p)
-    T term_neg =
-        std::pow(p, gamma) *
-        ((-1. * x * (x >= 0) - real_log(1. + real_exp(x - 2. * x * (x >= 0)))) *
-             (1. - p) * gamma -
-         p);
+    T term_neg = std::pow(p, gamma) *
+                 ((-1. * x * (x >= 0) -
+                   phi::funcs::real_log(
+                       1. + phi::funcs::real_exp(x - 2. * x * (x >= 0)))) *
+                      (1. - p) * gamma -
+                  p);
 
     dx_data[i] = 0.0;
     dx_data[i] += -c_pos * s_pos * term_pos;
diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu
index 0e5b95542455e3..478c4e0cd6611b 100644
--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/kernels/funcs/math.h"
 
 namespace paddle {
 namespace operators {
@@ -39,9 +39,10 @@ __global__ void CrossEntropyKernel(T* Y,
                    D,
                    ignore_index,
                    lbl);
-    Y[i] = ignore_index == lbl
-               ? static_cast<T>(0)
-               : -math::TolerableValue<T>()(real_log(X[i * D + lbl]));
+    Y[i] =
+        ignore_index == lbl
+            ? static_cast<T>(0)
+            : -math::TolerableValue<T>()(phi::funcs::real_log(X[i * D + lbl]));
   }
 }
 
@@ -56,7 +57,7 @@ __global__ void SoftCrossEntropyKernel(T* Y,
   int idx = blockIdx.x * class_num + tid;
   int end = blockIdx.x * class_num + class_num;
   for (; idx < end; idx += blockDim.x) {
-    val += math::TolerableValue<T>()(real_log(X[idx])) * label[idx];
+    val += math::TolerableValue<T>()(phi::funcs::real_log(X[idx])) * label[idx];
   }
 
   val = paddle::platform::reduceSum(val, tid, blockDim.x);
@@ -152,7 +153,7 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
 
 template class CrossEntropyFunctor<phi::GPUContext, float>;
 template class CrossEntropyFunctor<phi::GPUContext, double>;
-template class CrossEntropyFunctor<phi::GPUContext, platform::float16>;
+template class CrossEntropyFunctor<phi::GPUContext, phi::dtype::float16>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu
index 29f562ec5eca24..e58cff60aea485 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu
@@ -23,8 +23,8 @@ limitations under the License. */
 namespace cub = hipcub;
 #endif
 
-#include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/operators/sequence_ops/sequence_softmax_op.h"
+#include "paddle/phi/kernels/funcs/math.h"
 
 namespace paddle {
 namespace operators {
@@ -67,7 +67,7 @@ __global__ void sequence_softmax_kernel(const T *in_data,
     T sum_data = 0;
     for (int tid = threadIdx.x; tid < span; tid += blockDim.x) {
       T ele = in_data[start + tid];
-      sum_data += real_exp(ele - shared_max_data);
+      sum_data += phi::funcs::real_exp(ele - shared_max_data);
     }
     sum_data =
         BlockReduce<T, BlockDim>(temp_storage).Reduce(sum_data, cub::Sum());
@@ -79,7 +79,7 @@ __global__ void sequence_softmax_kernel(const T *in_data,
     // get final resit
     for (int tid = threadIdx.x; tid < span; tid += blockDim.x) {
       T ele = in_data[start + tid];
-      ele = real_exp(ele - shared_max_data) / shared_sum_data;
+      ele = phi::funcs::real_exp(ele - shared_max_data) / shared_sum_data;
       out_data[start + tid] = ele;
     }
   }
diff --git a/paddle/phi/kernels/cpu/bce_loss_kernel.cc b/paddle/phi/kernels/cpu/bce_loss_kernel.cc
index 9d62fabcbe736a..7b980162016664 100644
--- a/paddle/phi/kernels/cpu/bce_loss_kernel.cc
+++ b/paddle/phi/kernels/cpu/bce_loss_kernel.cc
@@ -16,9 +16,9 @@
 
 #include <algorithm>  // for max
 
-#include "paddle/fluid/operators/math.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math.h"
 
 namespace phi {
 
@@ -47,10 +47,9 @@ void BCELossKernel(const Context& dev_ctx,
             "Illegal input, input must be less than or equal to 1"));
     out_data[i] =
         (label_data[i] - static_cast<T>(1)) *
-            std::max(paddle::operators::real_log(static_cast<T>(1) - x_data[i]),
+            std::max(phi::funcs::real_log(static_cast<T>(1) - x_data[i]),
                      (T)(-100)) -
-        label_data[i] *
-            std::max(paddle::operators::real_log(x_data[i]), (T)(-100));
+        label_data[i] * std::max(phi::funcs::real_log(x_data[i]), (T)(-100));
   }
 }
 
diff --git a/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc
index 9048e87d049895..c84b3d4efbb88c 100644
--- a/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc
@@ -17,9 +17,9 @@
 #include <memory>
 #include <string>
 
-#include "paddle/fluid/operators/math.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math.h"
 
 namespace phi {
 template <typename T>
diff --git a/paddle/phi/kernels/funcs/functors.h b/paddle/phi/kernels/funcs/functors.h
index d518a877b26f2c..2e6fe8b2d738bc 100644
--- a/paddle/phi/kernels/funcs/functors.h
+++ b/paddle/phi/kernels/funcs/functors.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
-#include "paddle/fluid/operators/math.h"
+#include "paddle/phi/kernels/funcs/math.h"
 
 namespace phi {
 namespace funcs {
@@ -89,8 +89,7 @@ struct TanhFunctor {
     // y = 2 / (1 + e^-2x) - 1
     T t0 = static_cast<T>(2) * x;
     T t1 = (t0 < kMin) ? kMin : ((t0 > kMax) ? kMax : t0);
-    return static_cast<T>(2) /
-               (static_cast<T>(1) + paddle::operators::real_exp(-t1)) -
+    return static_cast<T>(2) / (static_cast<T>(1) + phi::funcs::real_exp(-t1)) -
            static_cast<T>(1);
   }
 };
@@ -111,8 +110,7 @@ struct SigmoidFunctor {
   inline HOSTDEVICE T operator()(T x) {
     // y = 1 / (1 + e^-x)
     T tmp = (x < kMin) ? kMin : ((x > kMax) ? kMax : x);
-    return static_cast<T>(1) /
-           (static_cast<T>(1) + paddle::operators::real_exp(-tmp));
+    return static_cast<T>(1) / (static_cast<T>(1) + phi::funcs::real_exp(-tmp));
   }
 };
 
diff --git a/paddle/fluid/operators/math.h b/paddle/phi/kernels/funcs/math.h
similarity index 69%
rename from paddle/fluid/operators/math.h
rename to paddle/phi/kernels/funcs/math.h
index 47281fb0280f0f..f8c373badf187a 100644
--- a/paddle/fluid/operators/math.h
+++ b/paddle/phi/kernels/funcs/math.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,22 +15,22 @@
 #pragma once
 
 #include "math.h"  // NOLINT
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/hostdevice.h"
 
-namespace paddle {
-namespace operators {
+namespace phi {
+namespace funcs {
 
-inline HOSTDEVICE platform::float16 real_exp(platform::float16 x) {
-  return static_cast<platform::float16>(::expf(static_cast<float>(x)));
+inline HOSTDEVICE phi::dtype::float16 real_exp(phi::dtype::float16 x) {
+  return static_cast<phi::dtype::float16>(::expf(static_cast<float>(x)));
 }
 
 inline HOSTDEVICE float real_exp(float x) { return ::expf(x); }
 
 inline HOSTDEVICE double real_exp(double x) { return ::exp(x); }
 
-inline HOSTDEVICE platform::float16 real_log(platform::float16 x) {
-  return static_cast<platform::float16>(::logf(static_cast<float>(x)));
+inline HOSTDEVICE phi::dtype::float16 real_log(phi::dtype::float16 x) {
+  return static_cast<phi::dtype::float16>(::logf(static_cast<float>(x)));
 }
 
 inline HOSTDEVICE float real_log(float x) { return ::logf(x); }
@@ -41,5 +41,5 @@ inline HOSTDEVICE float real_min(float x, float y) { return ::fminf(x, y); }
 
 inline HOSTDEVICE double real_min(double x, double y) { return ::fmin(x, y); }
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/nll_loss.h b/paddle/phi/kernels/gpu/nll_loss.h
index 37a67b4767a9bb..9d063d0ef44a0b 100644
--- a/paddle/phi/kernels/gpu/nll_loss.h
+++ b/paddle/phi/kernels/gpu/nll_loss.h
@@ -19,10 +19,10 @@
 #include <functional>
 #include <string>
 
-#include "paddle/fluid/operators/math.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/kernels/funcs/math.h"
 
 namespace phi {
 static constexpr int kNumCUDAThreads = 512;
diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h
index 84a24449b3a1c6..1cc025bac480f4 100644
--- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h
+++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h
@@ -17,13 +17,13 @@
 #include <algorithm>
 
 #include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/math.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_helper.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
+#include "paddle/phi/kernels/funcs/math.h"
 #include "paddle/phi/kernels/gpu/reduce.h"
 
 #ifdef __NVCC__
diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu
index f61cd2c39674ec..736c5608a6ac7c 100644
--- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu
@@ -37,8 +37,8 @@ struct SigmoidBwdFunctor {
       dx_data = static_cast<T>(0.);
       counts = 0;
     } else {
-      T simoid_x = static_cast<T>(1) /
-                   (static_cast<T>(1) + paddle::operators::real_exp(-x));
+      T simoid_x =
+          static_cast<T>(1) / (static_cast<T>(1) + phi::funcs::real_exp(-x));
       T diff = simoid_x - label;
       dx_data = dout * diff;
       counts = 1;
diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu
index b0e9efe5bbafe6..fb0183ce1efd65 100644
--- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu
+++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu
@@ -37,9 +37,8 @@ struct SigmoidFwdFunctor {
     } else {
       T term1 = (x > 0) ? x : 0;
       T term2 = x * label;
-      T term3 = paddle::operators::real_log(
-          static_cast<T>(1) +
-          paddle::operators::real_exp(static_cast<T>(-abs(x))));
+      T term3 = phi::funcs::real_log(
+          static_cast<T>(1) + phi::funcs::real_exp(static_cast<T>(-abs(x))));
 
       out_data = term1 - term2 + term3;
       counts = 1;
diff --git a/paddle/phi/kernels/impl/selu_kernel_impl.h b/paddle/phi/kernels/impl/selu_kernel_impl.h
index c5d756e6eb4fae..14789a7d61ac84 100644
--- a/paddle/phi/kernels/impl/selu_kernel_impl.h
+++ b/paddle/phi/kernels/impl/selu_kernel_impl.h
@@ -15,9 +15,9 @@
 #pragma once
 #include <string>
 
-#include "paddle/fluid/operators/math.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
+#include "paddle/phi/kernels/funcs/math.h"
 
 namespace phi {
 
@@ -32,7 +32,7 @@ struct SeluFunctor {
   HOSTDEVICE void operator()(size_t idx) const {
     T x_ele = x_data_ptr_[idx];
     if (x_ele <= 0) {
-      x_ele = alpha_ * paddle::operators::real_exp(x_ele) - alpha_;
+      x_ele = alpha_ * phi::funcs::real_exp(x_ele) - alpha_;
     }
     y_data_ptr_[idx] = scale_ * x_ele;
   }

From b7e120d264a33d97bb7d946d6197edc488a0976c Mon Sep 17 00:00:00 2001
From: huangjiyi <43315610+huangjiyi@users.noreply.github.com>
Date: Thu, 17 Nov 2022 11:38:50 +0800
Subject: [PATCH 4/5] rm "paddle/phi/kernels/gpu/batch_norm_utils.h" in phi
 (#48057)

---
 .../phi/kernels/cpu/batch_norm_grad_kernel.cc |   2 +-
 .../phi/kernels/gpu/batch_norm_grad_kernel.cu |   2 +-
 paddle/phi/kernels/gpu/batch_norm_kernel.cu   |   2 +-
 paddle/phi/kernels/gpu/batch_norm_utils.h     | 142 ------------------
 4 files changed, 3 insertions(+), 145 deletions(-)
 delete mode 100644 paddle/phi/kernels/gpu/batch_norm_utils.h

diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc
index f2054d4d396c60..efd55dee88cd04 100644
--- a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc
@@ -16,9 +16,9 @@
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/batch_norm_kernel.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
-#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
index 8d072368633ef1..e6c681588e4edb 100644
--- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
@@ -22,10 +22,10 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/batch_norm_kernel.h"
 #include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/norm_utils.h"
 #include "paddle/phi/kernels/funcs/reduce_function.h"
-#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
 
 #ifdef __HIPCC__
 #define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim)
diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu
index 7b553db274d1f3..44fe99046e1585 100644
--- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu
@@ -29,10 +29,10 @@ namespace cub = hipcub;
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/batch_norm_kernel.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/norm_utils.h"
 #include "paddle/phi/kernels/funcs/reduce_function.h"
-#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
 
 #ifdef __HIPCC__
 #define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim)
diff --git a/paddle/phi/kernels/gpu/batch_norm_utils.h b/paddle/phi/kernels/gpu/batch_norm_utils.h
deleted file mode 100644
index c9c62026edfa7a..00000000000000
--- a/paddle/phi/kernels/gpu/batch_norm_utils.h
+++ /dev/null
@@ -1,142 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace phi {
-
-using Tensor = DenseTensor;
-
-template <typename DeviceContext, typename T>
-inline void ResizeToChannelFirst(const DeviceContext& context,
-                                 const Tensor* input,
-                                 Tensor* transformed_input) {
-  int dim = input->dims().size() - 2;
-  if (dim == 3) {
-    // input
-    transformed_input->Resize(input->dims());
-
-    auto in_dims_vec = phi::vectorize(input->dims());
-    in_dims_vec[1] = input->dims()[4];
-    in_dims_vec[2] = input->dims()[1];
-    in_dims_vec[3] = input->dims()[2];
-    in_dims_vec[4] = input->dims()[3];
-    transformed_input->Resize(phi::make_ddim(in_dims_vec));
-    context.template Alloc<T>(transformed_input);
-
-  } else if (dim == 2) {
-    // input
-    transformed_input->Resize(input->dims());
-
-    auto in_dims_vec = phi::vectorize(input->dims());
-    in_dims_vec[1] = input->dims()[3];
-    in_dims_vec[2] = input->dims()[1];
-    in_dims_vec[3] = input->dims()[2];
-    transformed_input->Resize(phi::make_ddim(in_dims_vec));
-    context.template Alloc<T>(transformed_input);
-  } else if (dim == 1) {
-    transformed_input->Resize(input->dims());
-
-    auto in_dims_vec = phi::vectorize(input->dims());
-    in_dims_vec[1] = input->dims()[2];
-    in_dims_vec[2] = input->dims()[1];
-    transformed_input->Resize(phi::make_ddim(in_dims_vec));
-    context.template Alloc<T>(transformed_input);
-  }
-}
-
-template <typename DeviceContext, typename T>
-inline void ResizeToChannelLast(const DeviceContext& context,
-                                const Tensor* input,
-                                Tensor* transformed_input) {
-  int dim = input->dims().size() - 2;
-  if (dim == 3) {
-    // input
-    transformed_input->Resize(input->dims());
-
-    auto in_dims_vec = phi::vectorize(input->dims());
-    in_dims_vec[1] = input->dims()[2];
-    in_dims_vec[2] = input->dims()[3];
-    in_dims_vec[3] = input->dims()[4];
-    in_dims_vec[4] = input->dims()[1];
-    transformed_input->Resize(phi::make_ddim(in_dims_vec));
-    context.template Alloc<T>(transformed_input);
-
-  } else if (dim == 2) {
-    // input
-    transformed_input->Resize(input->dims());
-
-    auto in_dims_vec = phi::vectorize(input->dims());
-    in_dims_vec[1] = input->dims()[2];
-    in_dims_vec[2] = input->dims()[3];
-    in_dims_vec[3] = input->dims()[1];
-    transformed_input->Resize(phi::make_ddim(in_dims_vec));
-    context.template Alloc<T>(transformed_input);
-  } else if (dim == 1) {
-    transformed_input->Resize(input->dims());
-
-    auto in_dims_vec = phi::vectorize(input->dims());
-    in_dims_vec[1] = input->dims()[2];
-    in_dims_vec[2] = input->dims()[1];
-    transformed_input->Resize(phi::make_ddim(in_dims_vec));
-    context.template Alloc<T>(transformed_input);
-  }
-}
-
-template <typename DeviceContext, typename T>
-inline void TransToChannelFirst(const DeviceContext& context,
-                                const Tensor* input,
-                                Tensor* transformed_input) {
-  VLOG(5) << "Why am I called?";
-  int dim = input->dims().size() - 2;
-  if (dim == 3) {
-    std::vector<int> axis{0, 4, 1, 2, 3};
-    funcs::Transpose<DeviceContext, T, 5> trans5;
-    trans5(context, *input, transformed_input, axis);
-
-  } else if (dim == 2) {
-    std::vector<int> axis{0, 3, 1, 2};
-    funcs::Transpose<DeviceContext, T, 4> trans4;
-    trans4(context, *input, transformed_input, axis);
-  } else if (dim == 1) {
-    std::vector<int> axis{0, 2, 1};
-    funcs::Transpose<DeviceContext, T, 3> trans3;
-    trans3(context, *input, transformed_input, axis);
-  }
-}
-
-template <typename DeviceContext, typename T>
-inline void TransToChannelLast(const DeviceContext& context,
-                               const Tensor* input,
-                               Tensor* transformed_input) {
-  int dim = input->dims().size() - 2;
-  if (dim == 3) {
-    std::vector<int> axis{0, 2, 3, 4, 1};
-    funcs::Transpose<DeviceContext, T, 5> trans5;
-    trans5(context, *input, transformed_input, axis);
-
-  } else if (dim == 2) {
-    std::vector<int> axis{0, 2, 3, 1};
-    funcs::Transpose<DeviceContext, T, 4> trans4;
-    trans4(context, *input, transformed_input, axis);
-  } else if (dim == 1) {
-    std::vector<int> axis{0, 2, 1};
-    funcs::Transpose<DeviceContext, T, 3> trans3;
-    trans3(context, *input, transformed_input, axis);
-  }
-}
-
-}  // namespace phi

From 460d5040d2e8fd58ab470ba376438b56a0cb8dd1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kevin=E5=90=B4=E5=98=89=E6=96=87?= <417333277@qq.com>
Date: Thu, 17 Nov 2022 11:44:13 +0800
Subject: [PATCH 5/5] Remove redundant numpy input in Example code,
 test=document_fix (#47916)

---
 python/paddle/distributed/utils/moe_utils.py  |  33 ++---
 python/paddle/fft.py                          | 124 ++++++++++--------
 .../paddle/sparse/nn/functional/activation.py |  39 +++---
 python/paddle/sparse/nn/layer/activation.py   |  43 +++---
 4 files changed, 124 insertions(+), 115 deletions(-)

diff --git a/python/paddle/distributed/utils/moe_utils.py b/python/paddle/distributed/utils/moe_utils.py
index cd7c0e758d4e0b..eb7e73c363bf2c 100644
--- a/python/paddle/distributed/utils/moe_utils.py
+++ b/python/paddle/distributed/utils/moe_utils.py
@@ -71,7 +71,6 @@ def global_scatter(
         .. code-block:: python
 
             # required: distributed
-            import numpy as np
             import paddle
             from paddle.distributed import init_parallel_env
             init_parallel_env()
@@ -79,17 +78,14 @@ def global_scatter(
             world_size = 2
             d_model = 2
             in_feat = d_model
-            local_input_buf = np.array([[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]], \
-            dtype=np.float32)
+            local_input_buf = paddle.to_tensor([[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]], \
+                                            dtype='float32', stop_gradient=False)
             if paddle.distributed.ParallelEnv().local_rank == 0:
-                local_count = np.array([2, 1, 1, 1])
-                global_count = np.array([2, 1, 1, 1])
+                local_count = paddle.to_tensor([2, 1, 1, 1], dtype="int64")
+                global_count = paddle.to_tensor([2, 1, 1, 1], dtype="int64")
             else:
-                local_count = np.array([1, 1, 2, 1])
-                global_count = np.array([1, 1, 2, 1])
-            local_input_buf = paddle.to_tensor(local_input_buf, dtype="float32", stop_gradient=False)
-            local_count = paddle.to_tensor(local_count, dtype="int64")
-            global_count = paddle.to_tensor(global_count, dtype="int64")
+                local_count = paddle.to_tensor([1, 1, 2, 1], dtype="int64")
+                global_count = paddle.to_tensor([1, 1, 2, 1], dtype="int64")
             a = paddle.distributed.utils.global_scatter(local_input_buf, \
             local_count, global_count)
             a.stop_gradient = False
@@ -193,7 +189,6 @@ def global_gather(
         .. code-block:: python
 
             # required: distributed
-            import numpy as np
             import paddle
             from paddle.distributed import init_parallel_env
             init_parallel_env()
@@ -201,17 +196,15 @@ def global_gather(
             world_size = 2
             d_model = 2
             in_feat = d_model
-            local_input_buf = np.array([[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]],\
-                                        dtype=np.float32)
+            local_input_buf = paddle.to_tensor([[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]],\
+                                        dtype='float32', stop_gradient=False)
             if paddle.distributed.ParallelEnv().local_rank == 0:
-                local_count = np.array([2, 1, 1, 1])
-                global_count = np.array([2, 1, 1, 1])
+                local_count = paddle.to_tensor([2, 1, 1, 1], dtype="int64")
+                global_count = paddle.to_tensor([2, 1, 1, 1], dtype="int64")
             else:
-                local_count = np.array([1, 1, 2, 1])
-                global_count = np.array([1, 1, 2, 1])
-            local_input_buf = paddle.to_tensor(local_input_buf, dtype="float32", stop_gradient=False)
-            local_count = paddle.to_tensor(local_count, dtype="int64")
-            global_count = paddle.to_tensor(global_count, dtype="int64")
+                local_count = paddle.to_tensor([1, 1, 2, 1], dtype="int64")
+                global_count = paddle.to_tensor([1, 1, 2, 1], dtype="int64")
+
             a = paddle.distributed.utils.global_gather(local_input_buf, local_count, global_count)
             print(a)
             # out for rank 0: [[1, 2], [3, 4], [7, 8], [1, 2], [7, 8]]
diff --git a/python/paddle/fft.py b/python/paddle/fft.py
index 8bc95cd37e9f24..1e4ca9237469ba 100644
--- a/python/paddle/fft.py
+++ b/python/paddle/fft.py
@@ -521,26 +521,29 @@ def fftn(x, s=None, axes=None, norm="backward", name=None):
 
         .. code-block:: python
 
-            import numpy as np
             import paddle
 
-            x = np.mgrid[:4, :4, :4][1]
-            xp = paddle.to_tensor(x)
-            fftn_xp = paddle.fft.fftn(xp, axes=(1, 2)).numpy()
-            print(fftn_xp)
-            #  [[[24.+0.j  0.+0.j  0.+0.j  0.-0.j]
+            arr = paddle.arange(4, dtype="float64")
+            x = paddle.meshgrid(arr, arr, arr)[1]
+
+            fftn_xp = paddle.fft.fftn(x, axes=(1, 2))
+            print(fftn_xp.numpy())
+            # [[[24.+0.j  0.+0.j  0.+0.j  0.-0.j]
             #   [-8.+8.j  0.+0.j  0.+0.j  0.-0.j]
             #   [-8.+0.j  0.+0.j  0.+0.j  0.-0.j]
             #   [-8.-8.j  0.+0.j  0.+0.j  0.-0.j]]
-            #   [[24.+0.j  0.+0.j  0.+0.j  0.-0.j]
+
+            #  [[24.+0.j  0.+0.j  0.+0.j  0.-0.j]
             #   [-8.+8.j  0.+0.j  0.+0.j  0.-0.j]
             #   [-8.+0.j  0.+0.j  0.+0.j  0.-0.j]
             #   [-8.-8.j  0.+0.j  0.+0.j  0.-0.j]]
-            #   [[24.+0.j  0.+0.j  0.+0.j  0.-0.j]
+
+            #  [[24.+0.j  0.+0.j  0.+0.j  0.-0.j]
             #   [-8.+8.j  0.+0.j  0.+0.j  0.-0.j]
             #   [-8.+0.j  0.+0.j  0.+0.j  0.-0.j]
             #   [-8.-8.j  0.+0.j  0.+0.j  0.-0.j]]
-            #   [[24.+0.j  0.+0.j  0.+0.j  0.-0.j]
+
+            #  [[24.+0.j  0.+0.j  0.+0.j  0.-0.j]
             #   [-8.+8.j  0.+0.j  0.+0.j  0.-0.j]
             #   [-8.+0.j  0.+0.j  0.+0.j  0.-0.j]
             #   [-8.-8.j  0.+0.j  0.+0.j  0.-0.j]]]
@@ -901,15 +904,16 @@ def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
 
         .. code-block:: python
 
-            import numpy as np
             import paddle
 
-            x = np.mgrid[:2, :2][1]
-            xp = paddle.to_tensor(x)
-            fft2_xp = paddle.fft.fft2(xp).numpy()
+            arr = paddle.arange(2, dtype="float64")
+            x = paddle.meshgrid(arr, arr)[0]
+
+            fft2_xp = paddle.fft.fft2(x)
             print(fft2_xp)
-            #  [[ 2.+0.j -2.+0.j]
-            #   [ 0.+0.j  0.+0.j]]
+            # Tensor(shape=[2, 2], dtype=complex128, place=Place(gpu:0), stop_gradient=True,
+            #        [[ (2+0j),  0j    ],
+            #         [(-2+0j),  0j    ]])
 
     """
     _check_at_least_ndim(x, 2)
@@ -971,15 +975,16 @@ def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
 
         .. code-block:: python
 
-            import numpy as np
             import paddle
 
-            x = np.mgrid[:2, :2][1]
-            xp = paddle.to_tensor(x)
-            ifft2_xp = paddle.fft.ifft2(xp).numpy()
+            arr = paddle.arange(2, dtype="float64")
+            x = paddle.meshgrid(arr, arr)[0]
+
+            ifft2_xp = paddle.fft.ifft2(x)
             print(ifft2_xp)
-            #  [[ 0.5+0.j -0.5+0.j]
-            #   [ 0. +0.j  0. +0.j]]
+            # Tensor(shape=[2, 2], dtype=complex128, place=Place(gpu:0), stop_gradient=True,
+            #        [[ (0.5+0j),  0j      ],
+            #         [(-0.5+0j),  0j      ]])
     """
     _check_at_least_ndim(x, 2)
     if s is not None:
@@ -1033,16 +1038,17 @@ def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
     .. code-block:: python
 
         import paddle
-        import numpy as np
-
-        x = paddle.to_tensor(np.mgrid[:5, :5][0].astype(np.float32))
-        print(paddle.fft.rfft2(x))
-        # Tensor(shape=[5, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True,
-        #        [[ (50+0j)                                        ,  (1.1920928955078125e-07+0j)                    ,  0j                                             ],
-        #         [(-12.5+17.204774856567383j)                     , (-9.644234211236835e-08+7.006946134424652e-08j) ,  0j                                             ],
-        #         [(-12.500000953674316+4.061495304107666j)        , (3.6837697336977726e-08-1.1337477445749755e-07j),  0j                                             ],
-        #         [(-12.500000953674316-4.061495304107666j)        , (3.6837697336977726e-08+1.1337477445749755e-07j),  0j                                             ],
-        #         [(-12.5-17.204774856567383j)                     , (-9.644234211236835e-08-7.006946134424652e-08j) ,  0j                                             ]])
+
+        arr = paddle.arange(5, dtype="float64")
+        x = paddle.meshgrid(arr, arr)[0]
+
+        result = paddle.fft.rfft2(x)
+        print(result.numpy())
+        # [[ 50.  +0.j           0.  +0.j           0.  +0.j        ]
+        #  [-12.5+17.20477401j   0.  +0.j           0.  +0.j        ]
+        #  [-12.5 +4.0614962j    0.  +0.j           0.  +0.j        ]
+        #  [-12.5 -4.0614962j    0.  +0.j           0.  +0.j        ]
+        #  [-12.5-17.20477401j   0.  +0.j           0.  +0.j        ]]
     """
     _check_at_least_ndim(x, 2)
     if s is not None:
@@ -1192,13 +1198,20 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
 
         .. code-block:: python
 
-            import numpy as np
             import paddle
 
-            x = np.mgrid[:5, :5][0].astype(np.float64)
-            xp = paddle.to_tensor(x)
-            ihfft2_xp = paddle.fft.ihfft2(xp).numpy()
-            print(ihfft2_xp)
+            arr = paddle.arange(5, dtype="float64")
+            x = paddle.meshgrid(arr, arr)[0]
+            print(x)
+            # Tensor(shape=[5, 5], dtype=float64, place=Place(gpu:0), stop_gradient=True,
+            #        [[0., 0., 0., 0., 0.],
+            #         [1., 1., 1., 1., 1.],
+            #         [2., 2., 2., 2., 2.],
+            #         [3., 3., 3., 3., 3.],
+            #         [4., 4., 4., 4., 4.]])
+
+            ihfft2_xp = paddle.fft.ihfft2(x)
+            print(ihfft2_xp.numpy())
             # [[ 2. +0.j          0. +0.j          0. +0.j        ]
             #  [-0.5-0.68819096j  0. +0.j          0. +0.j        ]
             #  [-0.5-0.16245985j  0. +0.j          0. +0.j        ]
@@ -1250,15 +1263,11 @@ def fftfreq(n, d=1.0, dtype=None, name=None):
 
         .. code-block:: python
 
-            import numpy as np
             import paddle
 
-            x = np.array([3, 1, 2, 2, 3], dtype=float)
             scalar_temp = 0.5
-            n = x.size
-            fftfreq_xp = paddle.fft.fftfreq(n, d=scalar_temp)
+            fftfreq_xp = paddle.fft.fftfreq(5, d=scalar_temp)
             print(fftfreq_xp)
-
             #  Tensor(shape=[5], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
             #           [ 0.        ,  0.40000001,  0.80000001, -0.80000001, -0.40000001])
     """
@@ -1301,13 +1310,10 @@ def rfftfreq(n, d=1.0, dtype=None, name=None):
 
         .. code-block:: python
 
-            import numpy as np
             import paddle
 
-            x = np.array([3, 1, 2, 2, 3], dtype=float)
             scalar_temp = 0.3
-            n = x.size
-            rfftfreq_xp = paddle.fft.rfftfreq(n, d=scalar_temp)
+            rfftfreq_xp = paddle.fft.rfftfreq(5, d=scalar_temp)
             print(rfftfreq_xp)
 
             #  Tensor(shape=[3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
@@ -1343,15 +1349,17 @@ def fftshift(x, axes=None, name=None):
 
         .. code-block:: python
 
-            import numpy as np
             import paddle
 
-            x = np.array([3, 1, 2, 2, 3], dtype=float)
-            n = x.size
-            fftfreq_xp = paddle.fft.fftfreq(n, d=0.3)
-            res = paddle.fft.fftshift(fftfreq_xp).numpy()
+            fftfreq_xp = paddle.fft.fftfreq(5, d=0.3)
+            print(fftfreq_xp)
+            # Tensor(shape=[5], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+            #        [ 0.        ,  0.66666669,  1.33333337, -1.33333337, -0.66666669])
+
+            res = paddle.fft.fftshift(fftfreq_xp)
             print(res)
-            #  [-1.3333334 -0.6666667  0.         0.6666667  1.3333334]
+            # Tensor(shape=[5], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+            #        [-1.33333337, -0.66666669,  0.        ,  0.66666669,  1.33333337])
 
     """
     shape = paddle.shape(x)
@@ -1386,15 +1394,17 @@ def ifftshift(x, axes=None, name=None):
 
         .. code-block:: python
 
-            import numpy as np
             import paddle
 
-            x = np.array([3, 1, 2, 2, 3], dtype=float)
-            n = x.size
-            fftfreq_xp = paddle.fft.fftfreq(n, d=0.3)
-            res = paddle.fft.ifftshift(fftfreq_xp).numpy()
+            fftfreq_xp = paddle.fft.fftfreq(5, d=0.3)
+            print(fftfreq_xp)
+            # Tensor(shape=[5], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+            #        [ 0.        ,  0.66666669,  1.33333337, -1.33333337, -0.66666669])
+
+            res = paddle.fft.ifftshift(fftfreq_xp)
             print(res)
-            #  [ 1.3333334 -1.3333334 -0.6666667  0.         0.6666667]
+            # Tensor(shape=[5], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+            #        [ 1.33333337, -1.33333337, -0.66666669,  0.        ,  0.66666669])
 
     """
     shape = paddle.shape(x)
diff --git a/python/paddle/sparse/nn/functional/activation.py b/python/paddle/sparse/nn/functional/activation.py
index cbe2ddd0d79dbf..93c5e74014f3e0 100644
--- a/python/paddle/sparse/nn/functional/activation.py
+++ b/python/paddle/sparse/nn/functional/activation.py
@@ -87,28 +87,31 @@ def softmax(x, axis=-1, name=None):
         .. code-block:: python
 
             import paddle
-            import numpy as np
             paddle.seed(100)
 
-            mask = np.random.rand(3, 4) < 0.5
-            np_x = np.random.rand(3, 4) * mask
-            # [[0.         0.         0.96823406 0.19722934]
-            #  [0.94373937 0.         0.02060066 0.71456372]
-            #  [0.         0.         0.         0.98275049]]
-
-            csr = paddle.to_tensor(np_x).to_sparse_csr()
-            # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True,
-            #        crows=[0, 2, 5, 6],
-            #        cols=[2, 3, 0, 2, 3, 3],
-            #        values=[0.96823406, 0.19722934, 0.94373937, 0.02060066, 0.71456372,
-            #                0.98275049])
+            mask = paddle.rand((3, 4)) < 0.5
+            x = paddle.rand((3, 4)) * mask
+            print(x)
+            # Tensor(shape=[3, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+            #        [[0.83438963, 0.70008713, 0.        , 0.88831252],
+            #         [0.02200012, 0.        , 0.75432241, 0.65136462],
+            #         [0.96088767, 0.82938021, 0.35367414, 0.86653489]])
+
+            csr = x.to_sparse_csr()
+            print(csr)
+            # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True,
+            #        crows=[0 , 3 , 6 , 10],
+            #        cols=[0, 1, 3, 0, 2, 3, 0, 1, 2, 3],
+            #        values=[0.83438963, 0.70008713, 0.88831252, 0.02200012, 0.75432241,
+            #                0.65136462, 0.96088767, 0.82938021, 0.35367414, 0.86653489])
 
             out = paddle.sparse.nn.functional.softmax(csr)
-            # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True,
-            #        crows=[0, 2, 5, 6],
-            #        cols=[2, 3, 0, 2, 3, 3],
-            #        values=[0.68373820, 0.31626180, 0.45610887, 0.18119845, 0.36269269,
-            #                1.        ])
+            print(out)
+            # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True,
+            #        crows=[0 , 3 , 6 , 10],
+            #        cols=[0, 1, 3, 0, 2, 3, 0, 1, 2, 3],
+            #        values=[0.34132850, 0.29843223, 0.36023921, 0.20176248, 0.41964680,
+            #                0.37859070, 0.30015594, 0.26316854, 0.16354506, 0.27313042])
 
     """
     return _C_ops.sparse_softmax(x, axis)
diff --git a/python/paddle/sparse/nn/layer/activation.py b/python/paddle/sparse/nn/layer/activation.py
index 3ad856f69fbec1..91d5c198189dd9 100644
--- a/python/paddle/sparse/nn/layer/activation.py
+++ b/python/paddle/sparse/nn/layer/activation.py
@@ -86,29 +86,32 @@ class Softmax(Layer):
         .. code-block:: python
 
             import paddle
-            import numpy as np
-            paddle.seed(100)
-
-            mask = np.random.rand(3, 4) < 0.5
-            np_x = np.random.rand(3, 4) * mask
-            # [[0.         0.         0.96823406 0.19722934]
-            #  [0.94373937 0.         0.02060066 0.71456372]
-            #  [0.         0.         0.         0.98275049]]
-
-            csr = paddle.to_tensor(np_x).to_sparse_csr()
-            # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True,
-            #        crows=[0, 2, 5, 6],
-            #        cols=[2, 3, 0, 2, 3, 3],
-            #        values=[0.96823406, 0.19722934, 0.94373937, 0.02060066, 0.71456372,
-            #                0.98275049])
+            paddle.seed(2022)
+
+            mask = paddle.rand((3, 4)) < 0.7
+            x = paddle.rand((3, 4)) * mask
+            print(x)
+            # Tensor(shape=[3, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+            #        [[0.08325022, 0.27030438, 0.        , 0.83883715],
+            #         [0.        , 0.95856029, 0.24004589, 0.        ],
+            #         [0.14500992, 0.17088132, 0.        , 0.        ]])
+
+            csr = x.to_sparse_csr()
+            print(csr)
+            # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True,
+            #        crows=[0, 3, 5, 7],
+            #        cols=[0, 1, 3, 1, 2, 0, 1],
+            #        values=[0.08325022, 0.27030438, 0.83883715, 0.95856029, 0.24004589,
+            #                0.14500992, 0.17088132])
 
             softmax = paddle.sparse.nn.Softmax()
             out = softmax(csr)
-            # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True,
-            #        crows=[0, 2, 5, 6],
-            #        cols=[2, 3, 0, 2, 3, 3],
-            #        values=[0.68373820, 0.31626180, 0.45610887, 0.18119845, 0.36269269,
-            #                1.        ])
+            print(out)
+            # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True,
+            #        crows=[0, 3, 5, 7],
+            #        cols=[0, 1, 3, 1, 2, 0, 1],
+            #        values=[0.23070428, 0.27815846, 0.49113727, 0.67227983, 0.32772022,
+            #                0.49353254, 0.50646752])
     """
 
     def __init__(self, axis=-1, name=None):