diff --git a/paddle/fluid/operators/math/prelu.cu b/paddle/fluid/operators/math/prelu.cu index 534a9ca35e3a6..9dc25e30ce9aa 100644 --- a/paddle/fluid/operators/math/prelu.cu +++ b/paddle/fluid/operators/math/prelu.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/prelu.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -135,14 +136,17 @@ void PreluScalarDirectCUDAFunctor::operator()(gpuStream_t stream, template class PreluChannelWiseDirectCUDAFunctor; template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; template class PreluChannelWiseDirectCUDAFunctor; template class PreluElementWiseDirectCUDAFunctor; template class PreluElementWiseDirectCUDAFunctor; +template class PreluElementWiseDirectCUDAFunctor; template class PreluElementWiseDirectCUDAFunctor; template class PreluScalarDirectCUDAFunctor; template class PreluScalarDirectCUDAFunctor; +template class PreluScalarDirectCUDAFunctor; template class PreluScalarDirectCUDAFunctor; } // namespace math diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h index 35769d0e6e338..e39c860bc8096 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h @@ -1316,6 +1316,74 @@ inline void Blas::GEMM(bool transA, }); } +template <> +template <> +inline void Blas::GEMM(bool transA, + bool transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + int lda, + const phi::dtype::bfloat16 *B, + int ldb, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C, + int ldc) const { +#if CUDA_VERSION >= 11000 + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), + 80, + phi::errors::InvalidArgument( + "cublas bf16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + A, + CUDA_R_16BF, + lda, + &h_beta, + C, + CUDA_R_16BF, + ldc, + CUDA_R_32F, + algo)); + }); +#else + // raise error + PADDLE_THROW(phi::errors::Unimplemented( + "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); + +#endif // CUDA_VERSION >= 11000 +} + template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h index 5edfe3a602c7b..bb02242e2db72 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h @@ -751,7 +751,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, context_.GetComputeCapability(), 80, phi::errors::InvalidArgument( - "rocblas fp16 gemm requires GPU compute capability >= 80," + "rocblas bf16 gemm requires GPU compute capability >= 80," "but received %d", context_.GetComputeCapability())); @@ -982,6 +982,70 @@ inline void Blas::GEMM(bool transA, }); } +template <> +template <> +inline void Blas::GEMM(bool transA, + bool transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + int lda, + const phi::dtype::bfloat16 *B, + int ldb, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C, + int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), + 80, + phi::errors::InvalidArgument( + "rocblas bf16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::rocblas_gemm_ex(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + rocblas_datatype_bf16_r, + ldb, + A, + rocblas_datatype_bf16_r, + lda, + &h_beta, + C, + rocblas_datatype_bf16_r, + ldc, + C, + rocblas_datatype_bf16_r, + ldc, + rocblas_datatype_f32_r, + algo, + 0, + 0)); + }); +} + template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index 3543c0c6aa066..508240a991644 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" @@ -189,6 +190,17 @@ struct FMinFunctor { } }; +template <> +struct FMinFunctor { + inline HOSTDEVICE dtype::bfloat16 operator()(const dtype::bfloat16 a, + const dtype::bfloat16 b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmin(float_a, float_b); + return static_cast(result); + } +}; + template <> struct FMinFunctor { inline HOSTDEVICE int operator()(const int a, const int b) const { @@ -228,6 +240,17 @@ struct FMaxFunctor { } }; +template <> +struct FMaxFunctor { + inline HOSTDEVICE dtype::bfloat16 operator()(const dtype::bfloat16 a, + const dtype::bfloat16 b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmax(float_a, float_b); + return static_cast(result); + } +}; + template <> struct FMaxFunctor { inline HOSTDEVICE int operator()(const int a, const int b) const { diff --git a/paddle/phi/kernels/funcs/gather_scatter_functor.h b/paddle/phi/kernels/funcs/gather_scatter_functor.h index 00e8f45d8ffb0..56068f9459ebd 100644 --- a/paddle/phi/kernels/funcs/gather_scatter_functor.h +++ b/paddle/phi/kernels/funcs/gather_scatter_functor.h @@ -21,12 +21,14 @@ limitations under the License. */ namespace phi { namespace funcs { -#define Instantiate_Template_Function(func) \ - Instantiate_Template_Function_index_t( \ - func, int) Instantiate_Template_Function_index_t(func, float) \ - Instantiate_Template_Function_index_t(func, double) \ - Instantiate_Template_Function_index_t(func, int64_t) \ - Instantiate_Template_Function_index_t(func, phi::dtype::float16) \ +#define Instantiate_Template_Function(func) \ + Instantiate_Template_Function_index_t( \ + func, int) Instantiate_Template_Function_index_t(func, float) \ + Instantiate_Template_Function_index_t( \ + func, double) Instantiate_Template_Function_index_t(func, int64_t) \ + Instantiate_Template_Function_index_t(func, phi::dtype::float16) \ + Instantiate_Template_Function_index_t(func, \ + phi::dtype::bfloat16) \ Instantiate_Template_Function_index_t(func, unsigned char) #define Instantiate_Template_Function_index_t(func, tensor_t) \ diff --git a/paddle/phi/kernels/funcs/im2col.cu b/paddle/phi/kernels/funcs/im2col.cu index 78a0b345e389f..87c82adbb7fbe 100644 --- a/paddle/phi/kernels/funcs/im2col.cu +++ b/paddle/phi/kernels/funcs/im2col.cu @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/funcs/im2col.h" namespace phi { @@ -71,7 +72,7 @@ __global__ void im2col(const T* data_im, } *data_col = (rIdx >= im_height || rIdx < 0 || cIdx >= im_width || cIdx < 0) - ? 0 + ? T(0) : data_im[im_idx]; data_col += col_height * col_width; } @@ -173,7 +174,7 @@ __global__ void col2im(int n, int input_channels = n / im_height / im_width; if (index < n) { - T val = 0; + T val = static_cast(0); int w = (data_layout != DataLayout::kNHWC ? 
index % im_width + padding_width : (index / input_channels) % im_width + padding_width); @@ -309,12 +310,24 @@ template class Im2ColFunctor; +template class Im2ColFunctor; +template class Im2ColFunctor; template class Col2ImFunctor; template class Col2ImFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; template __global__ void im2colOCF(const T* im_data, @@ -560,13 +573,24 @@ template class Im2ColFunctor; - +template class Im2ColFunctor; +template class Im2ColFunctor; template class Col2ImFunctor; template class Col2ImFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu index d0a0416994169..2f89b51815e64 100644 --- a/paddle/phi/kernels/funcs/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -1963,7 +1963,7 @@ __global__ void KernelMaxPool2dWithIdx(const int nthreads, wstart = max(wstart, 0); } - T1 ele = -FLT_MAX; + T1 ele = static_cast(-FLT_MAX); int max_index = -1; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -2015,7 +2015,7 @@ __global__ void AdaptiveKernelMaxPool2dWithIdx(const int nthreads, wstart = AdaptStartIndex(w_offset, input_width, output_width); wend = AdaptEndIndex(w_offset, input_width, output_width); - T1 ele = -FLT_MAX; + T1 ele = static_cast(-FLT_MAX); int max_index = -1; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -2089,7 +2089,7 @@ __global__ void KernelMaxPool2DWithIdxGrad(const int nthreads, pwend = min((w_offset + padding_width) / stride_width + 1, output_width); } - T1 input_grad_data = 0; + T1 input_grad_data = static_cast(0); int input_current_featuremap_idx = h_offset * input_width + w_offset; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { @@ -2259,6 +2259,14 @@ template class MaxPool2dWithIndexFunctor; template class MaxPool2dWithIndexGradFunctor; template class MaxPool2dWithIndexFunctor; template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; template __global__ void KernelMaxPool3DWithIdx(const int ncd, @@ -2324,7 +2332,7 @@ __global__ void KernelMaxPool3DWithIdx(const int ncd, wstart = max(wstart, 0); } - T1 ele = -FLT_MAX; + T1 ele = static_cast(-FLT_MAX); int max_index = -1; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { @@ -2560,6 +2568,14 @@ template class MaxPool3dWithIndexFunctor; template class MaxPool3dWithIndexGradFunctor; template class MaxPool3dWithIndexFunctor; template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/addmm_grad_kernel.cu b/paddle/phi/kernels/gpu/addmm_grad_kernel.cu index 65978da1374e4..9d915af9170f6 100644 --- a/paddle/phi/kernels/gpu/addmm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/addmm_grad_kernel.cu @@ -18,5 +18,11 @@ limitations under the License. 
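Before the addmm kernels below (which go through these BLAS paths), one note on the bf16 GEMM specializations added earlier in blas_impl.cu.h / blas_impl.hip.h: the matrices stay in bfloat16 storage (CUDA_R_16BF / rocblas_datatype_bf16_r) while alpha, beta and the accumulation run in float32. The numpy sketch below only imitates that contract; `truncate_to_bf16` is a helper local to the sketch (it drops the low 16 mantissa bits, much like the test-side `copy_bits_from_float_to_uint16`), not a Paddle API.

```python
import numpy as np

def truncate_to_bf16(x):
    # Keep only the high 16 bits of each float32 value, which is how a
    # bfloat16 store behaves (ignoring rounding-mode details).
    u = np.ascontiguousarray(x, dtype=np.float32).view(np.uint32)
    return (u & np.uint32(0xFFFF0000)).view(np.float32)

def bf16_gemm(alpha, a, b, beta, c):
    # Inputs/outputs are (simulated) bfloat16, the multiply-accumulate runs
    # in float32 -- the same split cublasGemmEx is configured for here.
    a32, b32, c32 = truncate_to_bf16(a), truncate_to_bf16(b), truncate_to_bf16(c)
    acc = np.float32(alpha) * (a32 @ b32) + np.float32(beta) * c32
    return truncate_to_bf16(acc)

rng = np.random.default_rng(0)
a = rng.random((64, 32), dtype=np.float32)
b = rng.random((32, 16), dtype=np.float32)
c = np.zeros((64, 16), dtype=np.float32)
# error stays small: only the 16-bit storage loses precision, not the accumulation
print(np.abs(bf16_gemm(1.0, a, b, 0.0, c) - a @ b).max())
```

The compute-capability >= 80 check in both new GEMM paths lines up with bf16 tensor cores first appearing on Ampere (SM 8.0).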
*/ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/addmm_grad_kernel_impl.h" -PD_REGISTER_KERNEL( - addmm_grad, GPU, ALL_LAYOUT, phi::AddmmGradKernel, float, double) {} +PD_REGISTER_KERNEL(addmm_grad, + GPU, + ALL_LAYOUT, + phi::AddmmGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/addmm_kernel.cu b/paddle/phi/kernels/gpu/addmm_kernel.cu index 7b589ce20acca..563b137040ac7 100644 --- a/paddle/phi/kernels/gpu/addmm_kernel.cu +++ b/paddle/phi/kernels/gpu/addmm_kernel.cu @@ -18,4 +18,11 @@ limitations under the License. */ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/addmm_kernel_impl.h" -PD_REGISTER_KERNEL(addmm, GPU, ALL_LAYOUT, phi::AddmmKernel, float, double) {} +PD_REGISTER_KERNEL(addmm, + GPU, + ALL_LAYOUT, + phi::AddmmKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu index 7acfd33e94a9a..a202c428394d7 100644 --- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -16,7 +16,7 @@ #include -#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" @@ -104,8 +104,10 @@ PD_REGISTER_KERNEL(broadcast_tensors_grad, GPU, ALL_LAYOUT, phi::BroadcastTensorsGradKernel, + bool, int, int64_t, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu index 82dac4552a42f..3d16797cb66c0 100644 --- a/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu @@ -14,7 +14,7 @@ #include "paddle/phi/kernels/broadcast_tensors_kernel.h" -#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h" @@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(broadcast_tensors, int64_t, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/clip_by_norm_kernel.cu b/paddle/phi/kernels/gpu/clip_by_norm_kernel.cu index 6c3abf843f998..cd80246944967 100644 --- a/paddle/phi/kernels/gpu/clip_by_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/clip_by_norm_kernel.cu @@ -17,7 +17,7 @@ #include #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/reduce_function.h" @@ -34,7 +34,7 @@ void ClipByNormKernel(const Context& dev_ctx, return ClipByNormFunctor(dev_ctx, in, max_norm, output); } auto input = ∈ - dev_ctx.template Alloc(output); + dev_ctx.template Alloc(output); PADDLE_ENFORCE_NOT_NULL(input, phi::errors::InvalidArgument( @@ -49,20 +49,14 @@ void ClipByNormKernel(const Context& dev_ctx, auto* tmp = &tmp_tensor; tmp->Resize({1}); dev_ctx.template Alloc(tmp); - phi::funcs::ReduceKernel>( - dev_ctx, - *input, - tmp, - kps::SquareFunctor(), - reduce_dims); + phi::funcs:: + ReduceKernel>( + dev_ctx, *input, tmp, kps::SquareFunctor(), 
reduce_dims); auto tmp_eigen = phi::EigenVector::Flatten(*tmp); auto x_norm = tmp_eigen.sqrt(); - auto x = phi::EigenVector::Flatten(*input); - auto out = phi::EigenVector::Flatten(*output); + auto x = phi::EigenVector::Flatten(*input); + auto out = phi::EigenVector::Flatten(*output); auto* place = dev_ctx.eigen_device(); auto temp = (x_norm <= max_norm).template cast(); @@ -72,7 +66,7 @@ void ClipByNormKernel(const Context& dev_ctx, auto scaling = (temp + (static_cast(1) - temp) * max_norm / (x_norm + epsilon)) - .template cast(); + .template cast(); Eigen::array one_dim{{1}}; Eigen::DSizes m_dsize(input->numel()); @@ -86,4 +80,5 @@ PD_REGISTER_KERNEL(clip_by_norm, ALL_LAYOUT, phi::ClipByNormKernel, float, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index 4ff1164e64e08..0dac34f5baccd 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -98,6 +98,7 @@ PD_REGISTER_KERNEL(fmax_grad, double, int, phi::dtype::float16, + phi::dtype::bfloat16, int64_t) {} PD_REGISTER_KERNEL(fmin_grad, @@ -108,6 +109,7 @@ PD_REGISTER_KERNEL(fmin_grad, double, int, phi::dtype::float16, + phi::dtype::bfloat16, int64_t) {} PD_REGISTER_KERNEL(maximum_grad, diff --git a/paddle/phi/kernels/gpu/index_add_grad_kernel.cu b/paddle/phi/kernels/gpu/index_add_grad_kernel.cu index 88e42a16dba36..c0d5b737c5fbb 100644 --- a/paddle/phi/kernels/gpu/index_add_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_add_grad_kernel.cu @@ -105,5 +105,6 @@ PD_REGISTER_KERNEL(index_add_grad, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/index_add_kernel.cu b/paddle/phi/kernels/gpu/index_add_kernel.cu index f4894fc6d9308..9ba5bde7f6136 100644 --- a/paddle/phi/kernels/gpu/index_add_kernel.cu +++ b/paddle/phi/kernels/gpu/index_add_kernel.cu @@ -123,5 +123,6 @@ PD_REGISTER_KERNEL(index_add, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/logspace_kernel.cu b/paddle/phi/kernels/gpu/logspace_kernel.cu index b5e4904fdcf69..458600e2f29e1 100644 --- a/paddle/phi/kernels/gpu/logspace_kernel.cu +++ b/paddle/phi/kernels/gpu/logspace_kernel.cu @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/logspace_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/data_type_transform.h" @@ -25,25 +26,34 @@ namespace phi { template __global__ void LogspaceKernelInner( T start, T stop, double step, T base, int64_t size, T* out) { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType mt_start = static_cast(start); + MPType mt_stop = static_cast(stop); + MPType mt_base = static_cast(base); + int64_t index = blockIdx.x * blockDim.x + threadIdx.x; for (; index < size; index += blockDim.x * gridDim.x) { if (index < size / 2) { out[index] = - static_cast(pow(static_cast(base), - static_cast(start + step * index))); + static_cast(pow(static_cast(mt_base), + static_cast(mt_start + step * index))); } else { out[index] = static_cast( - pow(static_cast(base), - static_cast(stop - step * (size - index - 1)))); + pow(static_cast(mt_base), + static_cast(mt_stop - step * (size - index - 1)))); } } } template __global__ void LogspaceSpecialKernel(T start, T 
base, T* out) { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType mt_start = static_cast(start); + MPType mt_base = static_cast(base); + out[0] = static_cast( - pow(static_cast(base), static_cast(start))); + pow(static_cast(mt_base), static_cast(mt_start))); } template @@ -54,6 +64,8 @@ void LogspaceKernel(const Context& ctx, const DenseTensor& base, DataType dtype, DenseTensor* out) { + using MPType = typename phi::dtype::MPTypeTrait::Type; + auto start_t = phi::funcs::TransDataType(ctx, start, dtype); auto stop_t = phi::funcs::TransDataType(ctx, stop, dtype); auto base_t = phi::funcs::TransDataType(ctx, base, dtype); @@ -71,6 +83,9 @@ void LogspaceKernel(const Context& ctx, phi::Copy(ctx, base_t, phi::CPUPlace(), false, &n_base); T base_data = n_base.data()[0]; + MPType mt_start_data = static_cast(start_data); + MPType mt_stop_data = static_cast(stop_data); + PADDLE_ENFORCE_GT( num, 0, @@ -86,7 +101,7 @@ void LogspaceKernel(const Context& ctx, int block = 512; int grid = (num + block - 1) / block; if (num != 1) { - step = (static_cast(stop_data - start_data)) / (num - 1); + step = (static_cast(mt_stop_data - mt_start_data)) / (num - 1); LogspaceKernelInner<<>>( start_data, stop_data, step, base_data, num, out_data); } else { @@ -104,4 +119,6 @@ PD_REGISTER_KERNEL(logspace, float, int32_t, int64_t, - double) {} + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu index 4b48e2eb17322..e27d5251a9144 100644 --- a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu @@ -64,4 +64,5 @@ PD_REGISTER_KERNEL(matmul_with_flatten_double_grad, phi::MatmulWithFlattenDoubleGradKernel, float, double, + phi::dtype::bfloat16, phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu b/paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu index 9c665ed1233e0..b1da8858a8bcd 100644 --- a/paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu @@ -15,16 +15,15 @@ limitations under the License. */ #include "paddle/phi/kernels/multi_dot_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h" -using float16 = phi::dtype::float16; - PD_REGISTER_KERNEL(multi_dot_grad, GPU, ALL_LAYOUT, phi::MultiDotGradKernel, float, double, - float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/multi_dot_kernel.cu b/paddle/phi/kernels/gpu/multi_dot_kernel.cu index be73e2d162a4b..f6328f400cd2d 100644 --- a/paddle/phi/kernels/gpu/multi_dot_kernel.cu +++ b/paddle/phi/kernels/gpu/multi_dot_kernel.cu @@ -15,11 +15,15 @@ limitations under the License. 
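The logspace change above follows the usual mixed-precision recipe: start/stop/base are widened to MPTypeTrait<T>::Type (float for float16/bfloat16) before the step and pow() arithmetic, and only the final values are narrowed back to T. A rough numpy sketch of that recipe, with np.float16 standing in for the narrow dtype; the real kernel additionally fills the upper half of the range from the stop endpoint, which the sketch skips.

```python
import numpy as np

def logspace_lowp(start, stop, num, base, dtype=np.float16):
    # Widen the endpoints to float32 (the "MPType"), compute the exponents and
    # pow() there, and only narrow the final values back to the storage dtype.
    start32, stop32, base32 = np.float32(start), np.float32(stop), np.float32(base)
    step = (stop32 - start32) / (num - 1) if num > 1 else np.float32(0.0)
    exponents = start32 + step * np.arange(num, dtype=np.float32)
    return np.power(base32, exponents).astype(dtype)

print(logspace_lowp(np.float16(0.0), np.float16(3.0), 4, np.float16(10.0)))
# [   1.   10.  100. 1000.]
```

Doing the `stop - start` subtraction directly in float16/bfloat16 would already lose bits before the kernel starts, which is what the mt_start/mt_stop promotion avoids.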
*/ #include "paddle/phi/kernels/multi_dot_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h" -using float16 = phi::dtype::float16; - -PD_REGISTER_KERNEL( - multi_dot, GPU, ALL_LAYOUT, phi::MultiDotKernel, float, double, float16) {} +PD_REGISTER_KERNEL(multi_dot, + GPU, + ALL_LAYOUT, + phi::MultiDotKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/pool_grad_kernel.cu b/paddle/phi/kernels/gpu/pool_grad_kernel.cu index e4cfcb23b730e..c625977543558 100644 --- a/paddle/phi/kernels/gpu/pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/pool_grad_kernel.cu @@ -38,7 +38,9 @@ PD_REGISTER_KERNEL(max_pool2d_with_index_grad, ALL_LAYOUT, phi::MaxPool2dWithIndexGradKernel, float, - double) { + double, + phi::dtype::float16, + phi::dtype::bfloat16) { kernel->InputAt(1).SetDataType(phi::CppTypeToDataType::Type()); } @@ -55,6 +57,8 @@ PD_REGISTER_KERNEL(max_pool3d_with_index_grad, ALL_LAYOUT, phi::MaxPool3dWithIndexGradKernel, float, - double) { + double, + phi::dtype::float16, + phi::dtype::bfloat16) { kernel->InputAt(1).SetDataType(phi::CppTypeToDataType::Type()); } diff --git a/paddle/phi/kernels/gpu/pool_kernel.cu b/paddle/phi/kernels/gpu/pool_kernel.cu index 65d0ef4bdc916..511cc263bc760 100644 --- a/paddle/phi/kernels/gpu/pool_kernel.cu +++ b/paddle/phi/kernels/gpu/pool_kernel.cu @@ -32,7 +32,9 @@ PD_REGISTER_KERNEL(max_pool2d_with_index, ALL_LAYOUT, phi::MaxPool2dWithIndexKernel, float, - double) { + double, + phi::dtype::float16, + phi::dtype::bfloat16) { kernel->OutputAt(1).SetDataType(phi::CppTypeToDataType::Type()); } @@ -49,6 +51,8 @@ PD_REGISTER_KERNEL(max_pool3d_with_index, ALL_LAYOUT, phi::MaxPool3dWithIndexKernel, float, - double) { + double, + phi::dtype::float16, + phi::dtype::bfloat16) { kernel->OutputAt(1).SetDataType(phi::CppTypeToDataType::Type()); } diff --git a/paddle/phi/kernels/gpu/prelu_grad_kernel.cu b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu index 2a9512e495cc9..b613cdc606ece 100644 --- a/paddle/phi/kernels/gpu/prelu_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu @@ -189,4 +189,5 @@ PD_REGISTER_KERNEL(prelu_grad, phi::PReluGradKernel, float, phi::dtype::float16, + phi::dtype::bfloat16, double) {} diff --git a/paddle/phi/kernels/gpu/prelu_kernel.cu b/paddle/phi/kernels/gpu/prelu_kernel.cu index 679d7ba3f2e7d..246b6fdc95d28 100644 --- a/paddle/phi/kernels/gpu/prelu_kernel.cu +++ b/paddle/phi/kernels/gpu/prelu_kernel.cu @@ -79,4 +79,5 @@ PD_REGISTER_KERNEL(prelu, phi::PReluKernel, float, phi::dtype::float16, + phi::dtype::bfloat16, double) {} diff --git a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu index 77276f244a2b7..8321bcd1aa7ac 100644 --- a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu @@ -75,4 +75,5 @@ PD_REGISTER_KERNEL(put_along_axis_grad, double, int64_t, int, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu index e44580fd2da85..b63047973e9b8 100644 --- a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu +++ b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu @@ -82,4 +82,5 @@ PD_REGISTER_KERNEL(put_along_axis, double, int64_t, int, - 
phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index 6ae0e6df07cfb..456d541451e64 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -28,6 +28,7 @@ namespace cub = hipcub; #include "gflags/gflags.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -165,4 +166,6 @@ PD_REGISTER_KERNEL(randperm, float, double, int, - int64_t) {} + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu index ea1d377c45976..86cccc5e03b1c 100644 --- a/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu @@ -16,8 +16,63 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +namespace phi { + +template +void ReduceMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + const IntArray& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad) { + dev_ctx.Alloc(x_grad, x.dtype()); + reduce_all = recompute_reduce_all(x, dims, reduce_all); + + // get reduce_dim + int dim_size = x.dims().size(); + auto reduce_dims = + funcs::details::GetReduceDim(dims.GetData(), dim_size, reduce_all); + auto update_dims = vectorize(x.dims()); + for (auto i : reduce_dims) { + update_dims[i] = 1; + } + + // make new tensor of out and out_grad + phi::DenseTensor new_out(out.type()); + new_out.ShareDataWith(out); + new_out.Resize(phi::make_ddim(update_dims)); + + phi::DenseTensor new_out_grad(out_grad.type()); + new_out_grad.ShareDataWith(out_grad); + new_out_grad.Resize(phi::make_ddim(update_dims)); + + // make equal_out + phi::DenseTensor* equal_out = new phi::DenseTensor(); + equal_out->Resize(x.dims()); + dev_ctx.template Alloc(equal_out); + + // compute + // 1. equal_out = Equal(x, y) + std::vector equal_inputs = {&new_out, &x}; + std::vector equal_outputs = {equal_out}; + funcs::BroadcastKernel( + dev_ctx, equal_inputs, &equal_outputs, 0, funcs::EqualFunctor()); + + // 2. 
dx = dout * 1 + std::vector mul_inputs = {&new_out_grad, equal_out}; + std::vector mul_outputs = {x_grad}; + funcs::BroadcastKernel( + dev_ctx, mul_inputs, &mul_outputs, 0, funcs::MultiplyFunctor()); + delete equal_out; +} +} // namespace phi PD_REGISTER_KERNEL(min_grad, GPU, ALL_LAYOUT, @@ -25,4 +80,6 @@ PD_REGISTER_KERNEL(min_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu index c4fffce6ccc07..f38ab6416693f 100644 --- a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu @@ -68,4 +68,5 @@ PD_REGISTER_KERNEL(take_along_axis_grad, double, int64_t, int, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu index 3c919f5acd1cd..6ad828f2680b3 100644 --- a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu +++ b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu @@ -54,4 +54,5 @@ PD_REGISTER_KERNEL(take_along_axis, double, int64_t, int, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/trace_grad_kernel.cu b/paddle/phi/kernels/gpu/trace_grad_kernel.cu index 6692c1e19b033..a97e71a01874e 100644 --- a/paddle/phi/kernels/gpu/trace_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/trace_grad_kernel.cu @@ -27,5 +27,6 @@ PD_REGISTER_KERNEL(trace_grad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/trace_kernel.cu b/paddle/phi/kernels/gpu/trace_kernel.cu index 4a749c5b3347d..671ca490e136a 100644 --- a/paddle/phi/kernels/gpu/trace_kernel.cu +++ b/paddle/phi/kernels/gpu/trace_kernel.cu @@ -52,5 +52,6 @@ PD_REGISTER_KERNEL(trace, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/unfold_grad_kernel.cu b/paddle/phi/kernels/gpu/unfold_grad_kernel.cu index 9cd874ec6a6f7..3484fe8fdc75e 100644 --- a/paddle/phi/kernels/gpu/unfold_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/unfold_grad_kernel.cu @@ -18,5 +18,11 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/unfold_grad_kernel_impl.h" -PD_REGISTER_KERNEL( - unfold_grad, GPU, ALL_LAYOUT, phi::UnfoldGradKernel, float, double) {} +PD_REGISTER_KERNEL(unfold_grad, + GPU, + ALL_LAYOUT, + phi::UnfoldGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/unfold_kernel.cu b/paddle/phi/kernels/gpu/unfold_kernel.cu index db0d259791b60..f816db028cbc1 100644 --- a/paddle/phi/kernels/gpu/unfold_kernel.cu +++ b/paddle/phi/kernels/gpu/unfold_kernel.cu @@ -15,7 +15,15 @@ #include "paddle/phi/kernels/unfold_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/unfold_kernel_impl.h" -PD_REGISTER_KERNEL(unfold, GPU, ALL_LAYOUT, phi::UnfoldKernel, float, double) {} +PD_REGISTER_KERNEL(unfold, + GPU, + ALL_LAYOUT, + phi::UnfoldKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/uniform_inplace_grad_kernel.cu b/paddle/phi/kernels/gpu/uniform_inplace_grad_kernel.cu index 61efe8807643d..3b7f8a931278e 100644 
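The new ReduceMinGradKernel above is just two broadcast passes: EqualFunctor marks every position of x that attained the reduced minimum, and MultiplyFunctor routes out_grad to those positions (ties all receive the full upstream gradient). An equivalent numpy sketch:

```python
import numpy as np

def reduce_min_grad(x, out, out_grad, axis):
    # Reshape the reduced tensors so they broadcast against x (the kernel does
    # this by setting the reduced dims of out / out_grad to 1).
    out_b = np.expand_dims(out, axis)
    dout_b = np.expand_dims(out_grad, axis)
    # 1. equal_out = Equal(out, x)   2. dx = out_grad * equal_out
    equal = (x == out_b).astype(x.dtype)
    return dout_b * equal

x = np.array([[3., 1., 2.],
              [0., 5., 4.]], dtype=np.float32)
out = x.min(axis=1)
print(reduce_min_grad(x, out, np.ones_like(out), axis=1))
# [[0. 1. 0.]
#  [1. 0. 0.]]
```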
--- a/paddle/phi/kernels/gpu/uniform_inplace_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_inplace_grad_kernel.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/phi/kernels/uniform_inplace_grad_kernel.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/full_kernel.h" @@ -41,4 +42,6 @@ PD_REGISTER_KERNEL(uniform_inplace_grad, ALL_LAYOUT, phi::UniformInplaceGradKernel, float, - double) {} + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/uniform_inplace_kernel.cu b/paddle/phi/kernels/gpu/uniform_inplace_kernel.cu index 29bc2f4de5b86..5c3a886ad87e9 100644 --- a/paddle/phi/kernels/gpu/uniform_inplace_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_inplace_kernel.cu @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "gflags/gflags.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" @@ -72,8 +73,12 @@ void UniformInplaceKernel(const Context& ctx, funcs::distribution_and_transform(ctx, out, dist, trans); } else { // Use OP seed - auto func = - UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); + auto func = UniformGenerator(static_cast(min), + static_cast(max), + seed, + diag_num, + diag_step, + static_cast(diag_val)); IndexKernel>(ctx, out, func); } } @@ -85,4 +90,6 @@ PD_REGISTER_KERNEL(uniform_inplace, ALL_LAYOUT, phi::UniformInplaceKernel, float, - double) {} + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h index a72fb6062ceff..7b05b08eef5fb 100644 --- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -18,13 +18,34 @@ limitations under the License. 
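For uniform_inplace, the OP-seed branch now casts the float min/max/diag_val arguments to T explicitly, but the observable behaviour for the newly registered float16/bfloat16 types is still "sample uniformly, store in the tensor's own dtype". A minimal sketch of that behaviour (the helper name is local to the sketch; np.float16 stands in for both 16-bit types):

```python
import numpy as np

def uniform_inplace_lowp(out, low, high, seed=None):
    # Sample in wide precision, then write back in the tensor's own dtype.
    rng = np.random.default_rng(seed)
    out[...] = rng.uniform(low, high, size=out.shape).astype(out.dtype)
    return out

buf = np.empty((2, 3), dtype=np.float16)
print(uniform_inplace_lowp(buf, -1.0, 1.0, seed=2023))
```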
*/ #include "glog/logging.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/addmm_grad_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { +template +struct CopyOrScaleFunctor { + CopyOrScaleFunctor(const float scale, const T* x, T* output, int64_t numel) + : scale_(scale), x_(x), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + using MPType = typename phi::dtype::MPTypeTrait::Type; + const MPType mp_scale = static_cast(scale_); + const MPType mp_x = static_cast(x_[idx]); + output_[idx] = static_cast(mp_scale * mp_x); + } + + private: + const float scale_; + const T* x_; + T* output_; + int64_t numel_; +}; + template ::Type; + bool is_float16_or_bfloat16 = false; + if (std::is_same::value || + std::is_same::value) { + is_float16_or_bfloat16 = true; + } + auto in_dims = input.dims(); if (input.dims().size() == 1) { in_dims = {1, input.dims()[0]}; @@ -65,6 +93,7 @@ void AddmmGradKernel(const Context& dev_ctx, } auto blas = funcs::GetBlas(dev_ctx); + auto mt_blas = funcs::GetBlas(dev_ctx); if (input_grad) { dev_ctx.template Alloc(input_grad); total_elems = in_dims[0] * in_dims[1]; @@ -78,19 +107,60 @@ void AddmmGradKernel(const Context& dev_ctx, Array2(input_grad->dims()[0], input_grad->dims()[1]); if (row_compress && col_compress) { - eigen_dinput.device(place) = - eigen_dout.sum().eval().reshape(eigen_dinput_shape); + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum().eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + .sum() + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } } else if (row_compress) { - eigen_dinput.device(place) = - eigen_dout.sum(Array1(0)).eval().reshape(eigen_dinput_shape); + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum(Array1(0)).eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + .sum(Array1(0)) + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } } else if (col_compress) { - eigen_dinput.device(place) = - eigen_dout.sum(Array1(1)).eval().reshape(eigen_dinput_shape); + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum(Array1(1)).eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + .sum(Array1(1)) + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } } else { - blas.VCOPY(total_elems, out_grad.data(), input_grad->data()); + // The VCOPY does not support the float16, bfloat16 + if (!is_float16_or_bfloat16) { + mt_blas.VCOPY( + total_elems, out_grad.data(), input_grad->data()); + } else { + phi::funcs::ForRange for_range(dev_ctx, total_elems); + CopyOrScaleFunctor functor( + 1, out_grad.data(), input_grad->data(), total_elems); + for_range(functor); + } } - blas.SCAL(total_elems, beta, input_grad->data()); + // The SCAL does not support the float16, bfloat16 + if (!is_float16_or_bfloat16) { + mt_blas.SCAL(total_elems, beta, input_grad->data()); + } else { + phi::funcs::ForRange for_range(dev_ctx, total_elems); + CopyOrScaleFunctor functor( + beta, input_grad->data(), input_grad->data(), total_elems); + for_range(functor); + } if (input.dims().size() == 1) { input_grad->Resize(input.dims()); @@ -101,14 +171,28 @@ void 
AddmmGradKernel(const Context& dev_ctx, total_elems = x.dims()[0] * x.dims()[1]; // x_grad = out_grad * y'. x_grad: M x K, out_grad : M x N, y : K x N blas.MatMul(out_grad, false, y, true, x_grad); - blas.SCAL(total_elems, alpha, x_grad->data()); + if (!is_float16_or_bfloat16) { + mt_blas.SCAL(total_elems, alpha, x_grad->data()); + } else { + phi::funcs::ForRange for_range(dev_ctx, total_elems); + CopyOrScaleFunctor functor( + alpha, x_grad->data(), x_grad->data(), total_elems); + for_range(functor); + } } if (y_grad) { dev_ctx.template Alloc(y_grad); total_elems = x.dims()[1] * y.dims()[1]; // y_grad = x' * out_grad. y_grad K x N, out_grad : M x N, x : M x K blas.MatMul(x, true, out_grad, false, y_grad); - blas.SCAL(total_elems, alpha, y_grad->data()); + if (!is_float16_or_bfloat16) { + mt_blas.SCAL(total_elems, alpha, y_grad->data()); + } else { + phi::funcs::ForRange for_range(dev_ctx, total_elems); + CopyOrScaleFunctor functor( + alpha, y_grad->data(), y_grad->data(), total_elems); + for_range(functor); + } } } diff --git a/paddle/phi/kernels/impl/addmm_kernel_impl.h b/paddle/phi/kernels/impl/addmm_kernel_impl.h index c86cea80e47e8..957e02de6af62 100644 --- a/paddle/phi/kernels/impl/addmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/addmm_kernel_impl.h @@ -112,17 +112,19 @@ void AddmmKernel(const Context& dev_ctx, funcs::EigenBroadcast, T, 2>::Eval( place, eigen_out, eigen_input, bcast_dims); + T t_alpha = static_cast(alpha); + T t_beta = static_cast(beta); blas.GEMM(false, false, x_dims[0], y_dims[1], x_dims[1], - alpha, + t_alpha, x.data(), x_dims[1], y.data(), y_dims[1], - beta, + t_beta, out->data(), y_dims[1]); } diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu index 245137943d571..dcab3a0d90e67 100644 --- a/paddle/phi/kernels/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -117,6 +117,7 @@ PD_REGISTER_KERNEL(fmax, double, int, float16, + bfloat16, int64_t) {} PD_REGISTER_KERNEL(fmin, @@ -127,6 +128,7 @@ PD_REGISTER_KERNEL(fmin, double, int, float16, + bfloat16, int64_t) {} PD_REGISTER_KERNEL(maximum_raw, diff --git a/paddle/phi/kernels/kps/reduce_min_kernel.cu b/paddle/phi/kernels/kps/reduce_min_kernel.cu index 450fee16b4ca9..8ed9ec30c1920 100644 --- a/paddle/phi/kernels/kps/reduce_min_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_min_kernel.cu @@ -36,6 +36,14 @@ void MinRawKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_XPU_KP PD_REGISTER_KERNEL(min_raw, KPS, ALL_LAYOUT, phi::MinRawKernel, float) {} #else -PD_REGISTER_KERNEL( - min_raw, KPS, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(min_raw, + KPS, + ALL_LAYOUT, + phi::MinRawKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} #endif diff --git a/paddle/phi/kernels/reduce_min_kernel.cc b/paddle/phi/kernels/reduce_min_kernel.cc index c4c58c8342e60..ff50e9d1077b0 100644 --- a/paddle/phi/kernels/reduce_min_kernel.cc +++ b/paddle/phi/kernels/reduce_min_kernel.cc @@ -39,7 +39,20 @@ void MinKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( min, CPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) +PD_REGISTER_KERNEL(min, + GPU, + ALL_LAYOUT, + phi::MinKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif + +#if defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL( min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, 
int64_t) {} #endif diff --git a/python/paddle/fluid/tests/unittests/eager_op_test.py b/python/paddle/fluid/tests/unittests/eager_op_test.py index 0bcb34f956b23..1be451d65e83b 100644 --- a/python/paddle/fluid/tests/unittests/eager_op_test.py +++ b/python/paddle/fluid/tests/unittests/eager_op_test.py @@ -2689,6 +2689,26 @@ def fluid_dtype_to_np_dtype(self, dtype): def np_value_to_fluid_value(input): return input + def cast_bf16_output(self, block, cast_inputs): + output_names = [] + for i in range(0, len(cast_inputs)): + cast_output = block.create_var( + dtype="float32", shape=cast_inputs[i].shape + ) + cast_op = block.append_op( + inputs={"X": cast_inputs[i]}, + outputs={"Out": cast_output}, + type="cast", + attrs={ + "in_dtype": core.VarDesc.VarType.BF16, + "out_dtype": core.VarDesc.VarType.FP32, + }, + ) + cast_op.desc.infer_var_type(block.desc) + cast_op.desc.infer_shape(block.desc) + output_names.append(cast_output.name) + return output_names + def _get_gradient( self, input_to_check, @@ -2712,21 +2732,24 @@ def _get_gradient( if user_defined_grad_outputs is None: if self.dtype == np.uint16: cast_inputs = list(map(block.var, output_names)) - cast_outputs = block.create_var( - dtype="float32", shape=cast_inputs[0].shape - ) - cast_op = block.append_op( - inputs={"X": cast_inputs}, - outputs={"Out": cast_outputs}, - type="cast", - attrs={ - "in_dtype": core.VarDesc.VarType.BF16, - "out_dtype": core.VarDesc.VarType.FP32, - }, - ) - cast_op.desc.infer_var_type(block.desc) - cast_op.desc.infer_shape(block.desc) - output_names = [cast_outputs.name] + if self.op_type == "broadcast_tensors": + output_names = self.cast_bf16_output(block, cast_inputs) + else: + cast_outputs = block.create_var( + dtype="float32", shape=cast_inputs[0].shape + ) + cast_op = block.append_op( + inputs={"X": cast_inputs}, + outputs={"Out": cast_outputs}, + type="cast", + attrs={ + "in_dtype": core.VarDesc.VarType.BF16, + "out_dtype": core.VarDesc.VarType.FP32, + }, + ) + cast_op.desc.infer_var_type(block.desc) + cast_op.desc.infer_shape(block.desc) + output_names = [cast_outputs.name] loss = append_loss_ops(block, output_names) param_grad_list = append_backward( loss=loss, diff --git a/python/paddle/fluid/tests/unittests/test_addmm_op.py b/python/paddle/fluid/tests/unittests/test_addmm_op.py index 3041841cdf84f..66a86961e885d 100644 --- a/python/paddle/fluid/tests/unittests/test_addmm_op.py +++ b/python/paddle/fluid/tests/unittests/test_addmm_op.py @@ -15,11 +15,11 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle import fluid -from paddle.fluid import Program, program_guard +from paddle.fluid import Program, core, program_guard class TestAddMMOp(OpTest): @@ -27,7 +27,6 @@ class TestAddMMOp(OpTest): def setUp(self): self.op_type = "addmm" self.python_api = paddle.addmm - self.dtype = np.float64 self.init_dtype_type() self.inputs = { 'Input': np.random.random((100, 1)).astype(self.dtype), @@ -40,7 +39,7 @@ def setUp(self): } def init_dtype_type(self): - pass + self.dtype = np.float64 def test_check_output(self): self.check_output() @@ -58,6 +57,62 @@ def test_check_grad_input(self): self.check_grad(['Input'], 'Out', no_grad_set=None) +class TestAddMMFP16Op(TestAddMMOp): + def init_dtype_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-2) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not 
compiled with CUDA or not support bfloat16", +) +class TestAddMMBF16Op(OpTest): + def setUp(self): + self.op_type = "addmm" + self.python_api = paddle.addmm + self.init_dtype_type() + self.inputs = { + 'Input': np.random.random((100, 1)).astype(self.np_dtype), + 'X': np.random.random((100, 10)).astype(self.np_dtype), + 'Y': np.random.random((10, 20)).astype(self.np_dtype), + } + self.outputs = { + 'Out': self.inputs['Input'] + + np.dot(self.inputs['X'], self.inputs['Y']) + } + + self.inputs['Input'] = convert_float_to_uint16(self.inputs['Input']) + self.inputs['X'] = convert_float_to_uint16(self.inputs['X']) + self.inputs['Y'] = convert_float_to_uint16(self.inputs['Y']) + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + self.place = core.CUDAPlace(0) + + def init_dtype_type(self): + self.dtype = np.uint16 + self.np_dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['Input', 'X', 'Y'], 'Out') + + def test_check_grad_x(self): + self.check_grad_with_place(self.place, ['X'], 'Out', no_grad_set=None) + + def test_check_grad_y(self): + self.check_grad_with_place(self.place, ['Y'], 'Out', no_grad_set=None) + + def test_check_grad_input(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', no_grad_set=None + ) + + class TestAddMMOpError(unittest.TestCase): # test error def test_errors(self): diff --git a/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py b/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py index d2af1933469aa..be37ff0157898 100644 --- a/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py +++ b/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle.fluid import core @@ -43,7 +43,7 @@ def find_output_shape(input_list): return list(reversed(output_shape)) -def make_inputs_outputs(input_shapes, dtype): +def make_inputs_outputs(input_shapes, dtype, is_bfloat16=False): """Automatically generate formatted inputs and outputs from input_shapes""" input_list = [ np.random.random(shape).astype(dtype) for shape in input_shapes @@ -53,6 +53,16 @@ def make_inputs_outputs(input_shapes, dtype): x + np.zeros(output_shape).astype(x.dtype) for x in input_list ] + if is_bfloat16: + input_list = [ + convert_float_to_uint16(input_list[i]) + for i in range(len(input_list)) + ] + output_list = [ + convert_float_to_uint16(output_list[i]) + for i in range(len(output_list)) + ] + output_formatted = { "Out": [(f"out{i}", output_list[i]) for i in range(len(output_list))] } @@ -63,24 +73,24 @@ def make_inputs_outputs(input_shapes, dtype): return input_formatted, output_formatted -def gen_rank_diff_test(dtype): +def gen_rank_diff_test(dtype, is_bfloat16=False): input_shapes = [(2, 60, 1), (6, 2, 1, 10)] - return make_inputs_outputs(input_shapes, dtype) + return make_inputs_outputs(input_shapes, dtype, is_bfloat16) -def gen_no_broadcast_test(dtype): +def gen_no_broadcast_test(dtype, is_bfloat16=False): input_shapes = [(12, 1, 10, 1), (12, 1, 10, 1)] - return make_inputs_outputs(input_shapes, dtype) + return make_inputs_outputs(input_shapes, dtype, is_bfloat16) -def gen_mixed_tensors_test(dtype): +def gen_mixed_tensors_test(dtype, is_bfloat16=False): input_shapes = [(2, 60, 1), (2, 2, 1, 30), (1, 2, 60, 1)] - return 
make_inputs_outputs(input_shapes, dtype) + return make_inputs_outputs(input_shapes, dtype, is_bfloat16) -def gen_empty_tensors_test(dtype): +def gen_empty_tensors_test(dtype, is_bfloat16=False): input_shapes = [(0), (0), (0)] - return make_inputs_outputs(input_shapes, dtype) + return make_inputs_outputs(input_shapes, dtype, is_bfloat16) class TestCPUBroadcastTensorsOp(OpTest): @@ -125,7 +135,7 @@ def run_triple_in_test(self, test_func, args): def test_check_output(self): self.run_dual_test( self.check_output_with_place, - {"place": self.place, "atol": 1e-1}, + {"place": self.place}, ) def test_check_grad_normal(self): @@ -135,7 +145,6 @@ def test_check_grad_normal(self): "place": self.place, "inputs_to_check": ['x0', 'x1'], "output_names": ['out0', 'out1'], - "max_relative_error": 0.05, }, ) self.run_triple_in_test( @@ -144,7 +153,6 @@ def test_check_grad_normal(self): "place": self.place, "inputs_to_check": ['x0', 'x1', 'x2'], "output_names": ['out0', 'out1', "out2"], - "max_relative_error": 0.05, }, ) @@ -152,14 +160,77 @@ def test_check_grad_normal(self): @unittest.skipIf( not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ) -class TestCUDABroadcastTensorsOp(TestCPUBroadcastTensorsOp): +class TestBroadcastTensorsFP16Op(TestCPUBroadcastTensorsOp): def set_place(self): self.place = core.CUDAPlace(0) def set_dtypes(self): - self.dtypes = ['float64'] - if core.is_float16_supported(self.place): - self.dtypes.append('float16') + self.dtypes = ['float16'] + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestBroadcastTensorsBF16Op(OpTest): + def setUp(self): + self.op_type = "broadcast_tensors" + self.dtype = np.uint16 + self.np_dtype = "float32" + self.use_mkldnn = False + self.attrs = {'use_mkldnn': self.use_mkldnn} + self.test_gen_func_list = [ + gen_rank_diff_test, + gen_no_broadcast_test, + gen_mixed_tensors_test, + ] + self.python_api = paddle.broadcast_tensors + self.place = core.CUDAPlace(0) + + def run_dual_test(self, test_func, args): + for gen_func in self.test_gen_func_list: + self.inputs, self.outputs = gen_func(self.np_dtype, True) + if len(self.outputs["Out"]) < 3: + self.python_out_sig = [ + f"out{i}" for i in range(len(self.outputs["Out"])) + ] + test_func(**args) + + def run_triple_in_test(self, test_func, args): + self.inputs, self.outputs = self.test_gen_func_list[2]( + self.np_dtype, True + ) + self.python_out_sig = [ + f"out{i}" for i in range(len(self.outputs["Out"])) + ] + test_func(**args) + + def test_check_output(self): + self.run_dual_test( + self.check_output_with_place, + {"place": self.place}, + ) + + def test_check_grad_normal(self): + self.run_dual_test( + self.check_grad_with_place, + { + "place": self.place, + "inputs_to_check": ['x0', 'x1'], + "output_names": ['out0', 'out1'], + "check_dygraph": False, + }, + ) + self.run_triple_in_test( + self.check_grad_with_place, + { + "place": self.place, + "inputs_to_check": ['x0', 'x1', 'x2'], + "output_names": ['out0', 'out1', 'out2'], + "check_dygraph": False, + }, + ) class TestBroadcastTensorsAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py index 12c161ca9e56f..da78ee681aac0 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py @@ -15,7 +15,7 @@ import unittest import numpy 
as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 from op import Operator import paddle @@ -102,6 +102,48 @@ def initTestCase(self): self.max_norm = 1.0 +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestClipByNormBF16Op(OpTest): + def setUp(self): + self.max_relative_error = 0.006 + self.python_api = clip.clip_by_norm + self.init_dtype() + self.initTestCase() + input = np.random.random(self.shape).astype(self.np_dtype) + input[np.abs(input) < self.max_relative_error] = 0.5 + self.op_type = "clip_by_norm" + self.inputs = { + 'X': input, + } + self.attrs = {} + self.attrs['max_norm'] = self.max_norm + norm = np.sqrt(np.sum(np.square(input))) + if norm > self.max_norm: + output = self.max_norm * input / norm + else: + output = input + self.outputs = {'Out': output} + + self.inputs['X'] = convert_float_to_uint16(self.inputs['X']) + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + self.place = core.CUDAPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def initTestCase(self): + self.shape = (100,) + self.max_norm = 1.0 + + def init_dtype(self): + self.dtype = np.uint16 + self.np_dtype = np.float32 + + class TestClipByNormOpWithSelectedRows(unittest.TestCase): def check_with_place(self, place): self.config_test_case() diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py index 0b843663827c1..d2b6545a81a4c 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py @@ -15,10 +15,16 @@ import unittest import numpy as np -from eager_op_test import OpTest, paddle_static_guard +from eager_op_test import ( + OpTest, + convert_float_to_uint16, + get_numeric_gradient, + paddle_static_guard, +) import paddle from paddle.fluid import core +from paddle.fluid.tests.unittests.testsuite import create_op def conv3d_forward_naive( @@ -179,6 +185,77 @@ def init_kernel_type(self): globals()[cls_name] = TestCUDNNCase +def create_test_cudnn_bf16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestConv3DCUDNNBF16(parent): + def get_numeric_grad(self, place, check_name): + scope = core.Scope() + self._check_grad_helper() + op = create_op( + scope, self.op_type, self.inputs, self.outputs, self.attrs + ) + return get_numeric_gradient( + place, scope, op, self.inputs_fp32, check_name, ['Output'] + ) + + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place( + place, check_dygraph=(not self.use_mkldnn) + ) + + def test_check_grad_no_filter(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'Input') + + self.check_grad_with_place( + place, + ['Input'], + 'Output', + no_grad_set={'Filter'}, + check_dygraph=(not self.use_mkldnn), + user_defined_grads=[numeric_grads], + ) + + def test_check_grad_no_input(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'Filter') + + self.check_grad_with_place( + place, + ['Filter'], + 'Output', + no_grad_set={'Input'}, + check_dygraph=(not self.use_mkldnn), + user_defined_grads=[numeric_grads], 
+ ) + + def test_check_grad(self): + place = core.CUDAPlace(0) + numeric_input_grads = self.get_numeric_grad(place, 'Input') + numeric_fliter_grads = self.get_numeric_grad(place, 'Filter') + + self.check_grad_with_place( + place, + {'Input', 'Filter'}, + 'Output', + user_defined_grads=[numeric_input_grads, numeric_fliter_grads], + check_dygraph=(not self.use_mkldnn), + ) + + cls_name = "{}_{}".format(parent.__name__, "CUDNNBF16OP") + TestConv3DCUDNNBF16.__name__ = cls_name + globals()[cls_name] = TestConv3DCUDNNBF16 + + def create_test_padding_SAME_class(parent): class TestPaddingSMAECase(parent): def init_paddings(self): @@ -323,19 +400,37 @@ def setUp(self): 'dilations': self.dilations, } - input = np.random.random(self.input_size).astype(self.dtype) - filter = np.random.random(self.filter_size).astype(self.dtype) + if self.is_bfloat16_op(): + input = np.random.random(self.input_size).astype(np.float32) + filter = np.random.random(self.filter_size).astype(np.float32) + else: + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) + output = conv3d_forward_naive( input, filter, self.groups, conv3d_param, - ).astype(self.dtype) + ) + + if self.is_bfloat16_op(): + output = convert_float_to_uint16(output) + self.inputs = { + 'Input': convert_float_to_uint16(input), + 'Filter': convert_float_to_uint16(filter), + } + self.inputs_fp32 = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter), + } + else: + output = output.astype(self.dtype) + self.inputs = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter), + } - self.inputs = { - 'Input': OpTest.np_dtype_to_fluid_dtype(input), - 'Filter': OpTest.np_dtype_to_fluid_dtype(filter), - } self.attrs = { 'strides': self.stride, 'paddings': self.pad, @@ -358,8 +453,6 @@ def test_check_output(self): ) def test_check_grad(self): - if self.dtype == np.float16: - return place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace() # TODO(wangzhongpu): support mkldnn op in dygraph mode self.check_grad_with_place( @@ -371,8 +464,7 @@ def test_check_grad(self): ) def test_check_grad_no_filter(self): - if self.dtype == np.float16: - return + place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace() # TODO(wangzhongpu): support mkldnn op in dygraph mode self.check_grad_with_place( @@ -385,8 +477,7 @@ def test_check_grad_no_filter(self): ) def test_check_grad_no_input(self): - if self.dtype == np.float16: - return + place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace() # TODO(wangzhongpu): support mkldnn op in dygraph mode self.check_grad_with_place( @@ -617,6 +708,14 @@ def init_kernel_type(self): self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 +# ----------------Conv3DCUDNN bf16---------------- +create_test_cudnn_bf16_class(TestConv3DOp) +create_test_cudnn_bf16_class(TestWithGroup1) +create_test_cudnn_bf16_class(TestWithGroup2) +create_test_cudnn_bf16_class(TestWith1x1) +create_test_cudnn_bf16_class(TestWithInput1x1Filter1x1) + + # ---- test asymmetric padding ---- @@ -1114,4 +1213,5 @@ def run_8(): if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py index 22e22d9b2f66a..c0814754cc231 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py @@ -19,11 +19,25 @@ import paddle paddle.enable_static() -from eager_op_test import OpTest +from eager_op_test import OpTest, copy_bits_from_float_to_uint16 from paddle.fluid import core +def convert_float_to_uint16(float_list, data_format="NCHW"): + if data_format == "NHWC": + float_list = np.transpose(float_list, [0, 4, 1, 2, 3]) + + new_output = [] + for x in np.nditer(float_list): + new_output.append(np.uint16(copy_bits_from_float_to_uint16(x))) + new_output = np.reshape(new_output, float_list.shape).view(np.uint16) + + if data_format == "NHWC": + new_output = np.transpose(new_output, [0, 2, 3, 4, 1]) + return new_output + + def conv3dtranspose_forward_naive(input_, filter_, attrs): padding_algorithm = attrs['padding_algorithm'] if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: @@ -134,6 +148,86 @@ def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride): return out +def create_test_cudnn_fp16_class(parent, grad_check=True): + @unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + ) + class TestConv3DTransposeCUDNNFP16(parent): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) + + def test_check_grad_no_filter(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ['Input'], 'Output', no_grad_set={'Filter'} + ) + + def test_check_grad_no_input(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ['Filter'], 'Output', no_grad_set={'Input'} + ) + + cls_name = "{}_{}".format(parent.__name__, "CUDNNFP16OP") + TestConv3DTransposeCUDNNFP16.__name__ = cls_name + globals()[cls_name] = TestConv3DTransposeCUDNNFP16 + + +def create_test_cudnn_bf16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestConv3DTransposeCUDNNBF16(parent): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + {'Input', 'Filter'}, + 'Output', + ) + + def test_check_grad_no_filter(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + ['Input'], + 'Output', + no_grad_set={'Filter'}, + ) + + def test_check_grad_no_input(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + ['Filter'], + 'Output', + no_grad_set={'Input'}, + ) + + cls_name = "{}_{}".format(parent.__name__, "CUDNNBF16OP") + TestConv3DTransposeCUDNNBF16.__name__ = cls_name + globals()[cls_name] = TestConv3DTransposeCUDNNBF16 + + def conv3d_transpose_wrapper( x, weight, @@ -172,12 +266,16 @@ def setUp(self): self.pad = [0, 0, 0] self.padding_algorithm = "EXPLICIT" self.init_op_type() + self.init_kernel_type() self.init_test_case() - input_ = np.random.random(self.input_size).astype("float32") - filter_ = np.random.random(self.filter_size).astype("float32") + if self.is_bfloat16_op(): + input = np.random.random(self.input_size).astype(np.float32) + filter = 
np.random.random(self.filter_size).astype(np.float32) + else: + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) - self.inputs = {'Input': input_, 'Filter': filter_} self.attrs = { 'strides': self.stride, 'paddings': self.pad, @@ -189,9 +287,21 @@ def setUp(self): } output = conv3dtranspose_forward_naive( - input_, filter_, self.attrs + input, filter, self.attrs ).astype("float32") + if self.is_bfloat16_op(): + self.inputs = { + 'Input': convert_float_to_uint16(input), + 'Filter': convert_float_to_uint16(filter), + } + else: + self.inputs = { + 'Input': input, + 'Filter': filter, + } + output = output.astype(self.dtype) + self.outputs = {'Output': output} def test_check_output(self): @@ -264,6 +374,9 @@ def init_op_type(self): self.op_type = "conv3d_transpose" self.python_api = conv3d_transpose_wrapper + def init_kernel_type(self): + self.dtype = np.float32 + class TestWithSymmetricPad(TestConv3DTransposeOp): def init_test_case(self): @@ -596,6 +709,30 @@ def init_op_type(self): self.python_api = conv3d_transpose_wrapper +# ----------------Conv3DTransposeCUDNN fp16---------------- +create_test_cudnn_fp16_class(TestConv3DTransposeOp) +create_test_cudnn_fp16_class(TestWithSymmetricPad) +create_test_cudnn_fp16_class(TestWithAsymmetricPad) +create_test_cudnn_fp16_class(TestWithSAMEPad) +create_test_cudnn_fp16_class(TestWithVALIDPad) +create_test_cudnn_fp16_class(TestWithStride) +create_test_cudnn_fp16_class(TestWithGroups) +create_test_cudnn_fp16_class(TestWithDilation) +create_test_cudnn_fp16_class(Test_NHWC) + + +# ----------------Conv3DTransposeCUDNN bf16---------------- +create_test_cudnn_bf16_class(TestConv3DTransposeOp) +create_test_cudnn_bf16_class(TestWithSymmetricPad) +create_test_cudnn_bf16_class(TestWithAsymmetricPad) +create_test_cudnn_bf16_class(TestWithSAMEPad) +create_test_cudnn_bf16_class(TestWithVALIDPad) +create_test_cudnn_bf16_class(TestWithStride) +create_test_cudnn_bf16_class(TestWithGroups) +create_test_cudnn_bf16_class(TestWithDilation) +create_test_cudnn_bf16_class(Test_NHWC) + + class TestConv3dTranspose(unittest.TestCase): def error_weight_input(self): array = np.array([1], dtype=np.float32) diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py index b51cdd9b1087a..54d31a7bbd22b 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py @@ -15,7 +15,11 @@ import unittest import numpy as np -from test_conv3d_transpose_op import TestConv3DTransposeOp +from test_conv3d_transpose_op import ( + TestConv3DTransposeOp, + create_test_cudnn_bf16_class, + create_test_cudnn_fp16_class, +) import paddle from paddle import fluid @@ -84,6 +88,22 @@ def init_test_case(self): self.data_format = 'NHWC' +# ----------------Conv3DTransposeCUDNN fp16---------------- +create_test_cudnn_fp16_class(TestWithSymmetricPad_NHWC) +create_test_cudnn_fp16_class(TestWithAsymmetricPad_NHWC) +create_test_cudnn_fp16_class(TestWithGroups_NHWC) +create_test_cudnn_fp16_class(TestWithStride_NHWC) +create_test_cudnn_fp16_class(TestWithDilation_NHWC) + + +# ----------------Conv3DTransposeCUDNN bf16---------------- +create_test_cudnn_bf16_class(TestWithSymmetricPad_NHWC) +create_test_cudnn_bf16_class(TestWithAsymmetricPad_NHWC) +create_test_cudnn_bf16_class(TestWithGroups_NHWC) +create_test_cudnn_bf16_class(TestWithStride_NHWC) 
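# Editor's note (illustrative sketch, not part of the patch): every
# create_test_cudnn_fp16_class / create_test_cudnn_bf16_class helper in these
# test files follows the same pattern -- subclass the given parent test,
# override its dtype/kernel hooks, rename the class, and publish it through
# globals() so unittest discovery registers the low-precision variant next to
# the original. A minimal, framework-free version of that pattern (all names
# below are hypothetical, not Paddle APIs):

import unittest


class _BaseCase(unittest.TestCase):
    def init_kernel_type(self):
        self.dtype = "float32"

    def test_dtype_hook(self):
        self.init_kernel_type()
        self.assertIn(self.dtype, ("float32", "float16", "bfloat16"))


def create_low_precision_case(parent, dtype_name):
    class LowPrecisionCase(parent):
        def init_kernel_type(self):
            self.dtype = dtype_name  # override the parent's dtype hook

    cls_name = "{}_{}".format(parent.__name__, dtype_name.upper())
    LowPrecisionCase.__name__ = cls_name
    globals()[cls_name] = LowPrecisionCase  # visible to unittest discovery


# e.g. create_low_precision_case(_BaseCase, "bfloat16") defines and registers
# a new _BaseCase_BFLOAT16 case alongside the original one.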
+create_test_cudnn_bf16_class(TestWithDilation_NHWC) + + class TestConv3DTransposeAPI(unittest.TestCase): def test_case1(self): data1 = paddle.static.data( diff --git a/python/paddle/fluid/tests/unittests/test_fmax_op.py b/python/paddle/fluid/tests/unittests/test_fmax_op.py index 19417919df248..0271854aebb72 100644 --- a/python/paddle/fluid/tests/unittests/test_fmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_fmax_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle.fluid import core @@ -241,5 +241,34 @@ def test_check_grad_normal(self): self.check_grad(['X', 'Y'], 'Out') +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16", +) +class TestFmaxBF16OP(OpTest): + def setUp(self): + self.op_type = "elementwise_fmax" + self.python_api = paddle.fmax + self.dtype = np.uint16 + x = np.random.uniform(0.1, 1, [13, 17]).astype("float32") + sgn = np.random.choice([-1, 1], [13, 17]).astype("float32") + y = x + sgn * np.random.uniform(0.1, 1, [13, 17]).astype("float32") + out = np.fmax(x, y) + self.inputs = { + 'X': convert_float_to_uint16(x), + 'Y': convert_float_to_uint16(y), + } + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fmin_op.py b/python/paddle/fluid/tests/unittests/test_fmin_op.py index 19e43fa3d1d6c..1956d5b4fc433 100644 --- a/python/paddle/fluid/tests/unittests/test_fmin_op.py +++ b/python/paddle/fluid/tests/unittests/test_fmin_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle.fluid import core @@ -243,6 +243,35 @@ def test_check_grad_normal(self): self.check_grad(['X', 'Y'], 'Out') +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16", +) +class TestFminBF16OP(OpTest): + def setUp(self): + self.op_type = "elementwise_fmin" + self.python_api = paddle.fmin + self.dtype = np.uint16 + x = np.random.uniform(1, 1, [13, 17]).astype("float32") + sgn = np.random.choice([-1, 1], [13, 17]).astype("float32") + y = x + sgn * np.random.uniform(1, 1, [13, 17]).astype("float32") + out = np.fmin(x, y) + self.inputs = { + 'X': convert_float_to_uint16(x), + 'Y': convert_float_to_uint16(y), + } + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_index_add_op.py b/python/paddle/fluid/tests/unittests/test_index_add_op.py index 6b96493cfcf78..0e9b15f45438f 100644 --- a/python/paddle/fluid/tests/unittests/test_index_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_index_add_op.py @@ -15,10 +15,10 @@ import unittest import numpy as np -from eager_op_test import 
OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle -from paddle.fluid import Program +from paddle.fluid import Program, core def compute_index_add_ref( @@ -99,6 +99,69 @@ def test_check_grad_normal(self): self.check_grad(['X', 'AddValue'], 'Out') +class TestIndexAddFP16Op(TestIndexAddOp): + def init_dtype_type(self): + self.axis = 0 + self.x_type = np.float16 + self.index_type = np.int64 + self.x_shape = (101, 3) + self.index_size = 3 + self.add_value_shape = (3, 3) + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestIndexAddBF16Op(OpTest): + def setUp(self): + self.python_api = raw_index_add + self.op_type = "index_add" + self.init_dtype_type() + index_np = np.random.randint( + low=0, high=self.x_shape[self.axis], size=self.index_size + ) + x_np = np.random.random(self.x_shape).astype(self.x_type) + add_value_np = np.random.random(self.add_value_shape).astype( + self.x_type + ) + + self.inputs = { + 'X': convert_float_to_uint16(x_np), + 'Index': index_np, + 'AddValue': convert_float_to_uint16(add_value_np), + } + self.attrs = {'axis': self.axis} + out = compute_index_add_ref( + self.axis, + self.x_shape, + x_np, + self.add_value_shape, + add_value_np, + self.index_size, + index_np, + ) + self.outputs = {'Out': convert_float_to_uint16(out)} + self.place = core.CUDAPlace(0) + + def init_dtype_type(self): + self.axis = 0 + self.x_type = np.float32 + self.index_type = np.int64 + self.x_shape = (101, 3) + self.index_size = 3 + self.add_value_shape = (3, 3) + self.dtype = np.uint16 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'AddValue'], 'Out') + + class TestIndexAddAPI(unittest.TestCase): def setUp(self): self.setType() diff --git a/python/paddle/fluid/tests/unittests/test_index_sample_op.py b/python/paddle/fluid/tests/unittests/test_index_sample_op.py index 33491720af9c5..d9551e0bea8d8 100755 --- a/python/paddle/fluid/tests/unittests/test_index_sample_op.py +++ b/python/paddle/fluid/tests/unittests/test_index_sample_op.py @@ -15,10 +15,11 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle import fluid +from paddle.fluid import core class TestIndexSampleOp(OpTest): @@ -121,6 +122,49 @@ def config(self): self.index_type = "int64" +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestIndexSampleBF16Op(OpTest): + def setUp(self): + self.op_type = "index_sample" + self.python_api = paddle.index_sample + self.config() + xnp = np.random.random(self.x_shape).astype(self.x_type) + indexnp = np.random.randint( + low=0, high=self.x_shape[1], size=self.index_shape + ).astype(self.index_type) + self.inputs = {'X': xnp, 'Index': indexnp} + index_array = [] + for i in range(self.index_shape[0]): + for j in indexnp[i]: + index_array.append(xnp[i, j]) + index_array = np.array(index_array).astype(self.x_type) + out = np.reshape(index_array, self.index_shape) + self.outputs = {'Out': out} + self.inputs['X'] = convert_float_to_uint16(self.inputs['X']) + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + self.place = core.CUDAPlace(0) + + def 
test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (10, 20) + self.x_type = "float32" + self.dtype = np.uint16 + self.index_shape = (10, 10) + self.index_type = "int32" + + class TestIndexSampleShape(unittest.TestCase): def test_shape(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_logspace.py b/python/paddle/fluid/tests/unittests/test_logspace.py index 19461fc938f20..2982a12968c95 100644 --- a/python/paddle/fluid/tests/unittests/test_logspace.py +++ b/python/paddle/fluid/tests/unittests/test_logspace.py @@ -15,9 +15,10 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle +from paddle.fluid import core class TestLogspaceOpCommonCase(OpTest): @@ -41,6 +42,54 @@ def test_check_output(self): self.check_output() +class TestLogspaceFP16Op(TestLogspaceOpCommonCase): + def init_data(self): + self.dtype = np.float16 + self.inputs = { + 'Start': np.array([0]).astype(self.dtype), + 'Stop': np.array([10]).astype(self.dtype), + 'Num': np.array([11]).astype('int32'), + 'Base': np.array([2]).astype(self.dtype), + } + self.attrs = {'dtype': int(paddle.float16)} + self.outputs = {'Out': np.power(2, np.arange(0, 11)).astype(self.dtype)} + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestLogspaceBF16Op(OpTest): + def setUp(self): + self.op_type = "logspace" + self.python_api = paddle.logspace + self.init_data() + + def init_data(self): + self.dtype = np.uint16 + self.np_dtype = np.float32 + self.inputs = { + 'Start': np.array([0]).astype(self.np_dtype), + 'Stop': np.array([10]).astype(self.np_dtype), + 'Num': np.array([11]).astype('int32'), + 'Base': np.array([2]).astype(self.np_dtype), + } + self.attrs = {'dtype': int(paddle.bfloat16)} + self.outputs = { + 'Out': np.power(2, np.arange(0, 11)).astype(self.np_dtype) + } + + self.inputs["Start"] = convert_float_to_uint16(self.inputs["Start"]) + self.inputs["Stop"] = convert_float_to_uint16(self.inputs["Stop"]) + self.inputs["Base"] = convert_float_to_uint16(self.inputs["Base"]) + self.outputs["Out"] = convert_float_to_uint16(self.outputs["Out"]) + self.place = core.CUDAPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + class TestLogspaceOpReverseCase(TestLogspaceOpCommonCase): def init_data(self): dtype = 'float32' diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py index 1e2b909d90dab..9ce95d354d7af 100644 --- a/python/paddle/fluid/tests/unittests/test_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_mul_op.py @@ -20,7 +20,7 @@ from paddle.fluid import core sys.path.append("..") -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 class TestMulOp(OpTest): @@ -114,14 +114,14 @@ def test_check_grad_ignore_y(self): @unittest.skipIf( not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ) -class TestFP16MulOp1(TestMulOp): +class TestMulFP16Op1(TestMulOp): def init_dtype_type(self): self.dtype = np.float16 def test_check_output(self): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_output_with_place(place, atol=1e-1, 
check_dygraph=False) + self.check_output_with_place(place, check_dygraph=False) def test_check_grad_normal(self): place = core.CUDAPlace(0) @@ -130,7 +130,6 @@ def test_check_grad_normal(self): place, ['X', 'Y'], 'Out', - max_relative_error=0.5, check_dygraph=False, ) @@ -141,7 +140,6 @@ def test_check_grad_ingore_x(self): place, ['Y'], 'Out', - max_relative_error=0.5, no_grad_set=set("X"), check_dygraph=False, ) @@ -153,7 +151,6 @@ def test_check_grad_ingore_y(self): place, ['X'], 'Out', - max_relative_error=0.5, no_grad_set=set('Y'), check_dygraph=False, ) @@ -162,14 +159,14 @@ def test_check_grad_ingore_y(self): @unittest.skipIf( not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ) -class TestFP16MulOp2(TestMulOp2): +class TestMulFP16Op2(TestMulOp2): def init_dtype_type(self): self.dtype = np.float16 def test_check_output(self): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_output_with_place(place, atol=2e-1, check_dygraph=False) + self.check_output_with_place(place, check_dygraph=False) def test_check_grad_normal(self): place = core.CUDAPlace(0) @@ -178,7 +175,6 @@ def test_check_grad_normal(self): place, ['X', 'Y'], 'Out', - max_relative_error=0.9, check_dygraph=False, ) @@ -189,7 +185,6 @@ def test_check_grad_ingore_x(self): place, ['Y'], 'Out', - max_relative_error=0.5, no_grad_set=set("X"), check_dygraph=False, ) @@ -201,11 +196,120 @@ def test_check_grad_ingore_y(self): place, ['X'], 'Out', - max_relative_error=0.9, no_grad_set=set('Y'), check_dygraph=False, ) +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestMulBF16Op1(OpTest): + def setUp(self): + self.op_type = "mul" + self.init_dtype_type() + self.inputs = { + 'X': np.random.random((20, 5)).astype(self.np_dtype), + 'Y': np.random.random((5, 21)).astype(self.np_dtype), + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + + self.inputs['X'] = convert_float_to_uint16(self.inputs['X']) + self.inputs['Y'] = convert_float_to_uint16(self.inputs['Y']) + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + self.place = core.CUDAPlace(0) + + def init_dtype_type(self): + self.dtype = np.uint16 + self.np_dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['X', 'Y'], 'Out', check_dygraph=False + ) + + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + self.place, + ['Y'], + 'Out', + no_grad_set=set("X"), + check_dygraph=False, + ) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + no_grad_set=set('Y'), + check_dygraph=False, + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestMulBF16Op2(TestMulBF16Op1): + def setUp(self): + self.op_type = "mul" + self.init_dtype_type() + self.inputs = { + 'X': np.random.random((3, 4, 2, 9)).astype(self.np_dtype), + 'Y': np.random.random((3, 6, 1, 2, 3)).astype(self.np_dtype), + } + self.attrs = { + 'x_num_col_dims': 2, + 'y_num_col_dims': 2, + } + result = np.dot( + self.inputs['X'].reshape(3 * 4, 2 * 9), + self.inputs['Y'].reshape(3 * 6, 1 * 2 * 3), + ) + result = result.reshape(3, 4, 1, 2, 3) + self.outputs = {'Out': result} + + 
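# Editor's note (illustrative, not part of the patch): the reference result
# above is computed in float32 and only afterwards packed into uint16, because
# bfloat16 is stored as the upper 16 bits of an IEEE float32. A simplified
# stand-in for convert_float_to_uint16 and its inverse (truncating here; the
# real eager_op_test helper may round instead) could look like:
#
#     import numpy as np
#
#     def to_bf16_bits(x):                         # float32 -> uint16 storage
#         x = np.ascontiguousarray(x, dtype=np.float32)
#         return (x.view(np.uint32) >> 16).astype(np.uint16)
#
#     def from_bf16_bits(bits):                    # uint16 storage -> float32
#         return (bits.astype(np.uint32) << 16).view(np.float32)
#
#     x = np.random.random((20, 5)).astype(np.float32)
#     # bf16 keeps only ~3 significant decimal digits, hence the loose atol
#     assert np.allclose(from_bf16_bits(to_bf16_bits(x)), x, atol=1e-2)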
self.inputs['X'] = convert_float_to_uint16(self.inputs['X']) + self.inputs['Y'] = convert_float_to_uint16(self.inputs['Y']) + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + self.place = core.CUDAPlace(0) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, + ['X', 'Y'], + 'Out', + numeric_grad_delta=0.02, + check_dygraph=False, + ) + + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + self.place, + ['Y'], + 'Out', + numeric_grad_delta=0.02, + no_grad_set=set("X"), + check_dygraph=False, + ) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + numeric_grad_delta=0.02, + no_grad_set=set('Y'), + check_dygraph=False, + ) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py index a5555657c7f15..dc6f5c4810cf9 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py +++ b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py @@ -15,10 +15,11 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 from numpy.linalg import multi_dot import paddle +from paddle.fluid import core paddle.enable_static() @@ -49,6 +50,53 @@ def test_check_grad(self): self.check_grad(['x1'], 'Out') +class TestMultiDotFP16Op(TestMultiDotOp): + def get_dtype(self): + return "float16" + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestMultiDotBF16Op(OpTest): + def setUp(self): + self.op_type = "multi_dot" + self.python_api = paddle.linalg.multi_dot + self.dtype = self.get_dtype() + self.get_inputs_and_outputs() + self.place = core.CUDAPlace(0) + + def get_dtype(self): + self.np_dtype = "float32" + return np.uint16 + + def get_inputs_and_outputs(self): + self.A = np.random.random((2, 8)).astype(self.np_dtype) + self.B = np.random.random((8, 4)).astype(self.np_dtype) + self.inputs = { + 'X': [ + ('x0', convert_float_to_uint16(self.A)), + ('x1', convert_float_to_uint16(self.B)), + ] + } + self.outputs = { + 'Out': convert_float_to_uint16(multi_dot([self.A, self.B])) + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['x0'], 'Out', numeric_grad_delta=0.01 + ) + self.check_grad_with_place( + self.place, ['x1'], 'Out', numeric_grad_delta=0.01 + ) + + # (A*B)*C class TestMultiDotOp3Mat(TestMultiDotOp): def get_inputs_and_outputs(self): diff --git a/python/paddle/fluid/tests/unittests/test_pool_max_op.py b/python/paddle/fluid/tests/unittests/test_pool_max_op.py index d8d61f4fb2904..16d1f356537bc 100644 --- a/python/paddle/fluid/tests/unittests/test_pool_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool_max_op.py @@ -15,9 +15,16 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import ( + OpTest, + convert_float_to_uint16, + convert_uint16_to_float, + get_numeric_gradient, +) import paddle +from paddle.fluid import core +from paddle.fluid.tests.unittests.testsuite import create_op def adaptive_start_index(index, input_size, output_size): @@ -149,9 +156,18 @@ def setUp(self): self.init_test_case() self.init_global() self.init_adaptive() + self.init_dtype() + + if self.is_bfloat16_op(): + input = 
np.random.random(self.shape).astype(np.float32) + input = convert_uint16_to_float( + convert_float_to_uint16(np.round(input * 100.0, 2)) + ) + + else: + input = np.random.random(self.shape).astype(self.dtype) + input = np.round(input * 100.0, 2) - input = np.random.random(self.shape).astype("float64") - input = np.round(input * 100.0, 2) output, mask = self.pool_forward_naive( input, self.ksize, @@ -160,8 +176,11 @@ def setUp(self): self.global_pool, self.adaptive, ) - output = output.astype("float64") mask = mask.astype("int32") + if self.is_bfloat16_op(): + output = output.astype(np.float32) + else: + output = output.astype(self.dtype) self.attrs = { 'strides': self.strides, @@ -171,8 +190,20 @@ def setUp(self): 'adaptive': self.adaptive, } - self.inputs = {'X': input} - self.outputs = {'Out': output, "Mask": mask} + if self.is_bfloat16_op(): + self.inputs = {'X': convert_float_to_uint16(input)} + self.outputs = { + 'Out': convert_float_to_uint16(output), + "Mask": mask, + } + self.inputs_fp32 = {'X': input} + + else: + self.inputs = {'X': input} + self.outputs = {'Out': output, "Mask": mask} + + def init_dtype(self): + self.dtype = np.float64 def test_check_output(self): self.check_output() @@ -220,9 +251,90 @@ def init_global(self): self.global_pool = False -# ----------------max_pool2d_with_index---------------- +class TestCastAdaptive3d(TestMaxPoolWithIndex_Op): + def init_adaptive(self): + self.adaptive = True +# ----------------max_pool3d_with_index_fp16---------------- +def create_test_fp16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + ) + class TestMaxPool3dFP16(parent): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_grad_with_place(place, {'X'}, ['Out']) + + cls_name = "{}_{}".format(parent.__name__, "FP16OP") + TestMaxPool3dFP16.__name__ = cls_name + globals()[cls_name] = TestMaxPool3dFP16 + + +create_test_fp16_class(TestMaxPoolWithIndex_Op) +create_test_fp16_class(TestCase1) +create_test_fp16_class(TestCase2) +create_test_fp16_class(TestCase3) +create_test_fp16_class(TestCastAdaptive3d) + + +# ----------------max_pool3d_with_index_bf16---------------- +def create_test_bf16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestMaxPool3dBF16(parent): + def init_dtype(self): + self.dtype = np.uint16 + + def get_numeric_grad(self, place, check_name): + scope = core.Scope() + self._check_grad_helper() + op = create_op( + scope, self.op_type, self.inputs, self.outputs, self.attrs + ) + return get_numeric_gradient( + place, scope, op, self.inputs_fp32, check_name, ['Out'] + ) + + def test_check_output(self): + place = core.CUDAPlace(0) + if core.is_bfloat16_supported(place): + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'X') + if core.is_bfloat16_supported(place): + self.check_grad_with_place( + place, {'X'}, ['Out'], user_defined_grads=[numeric_grads] + ) + + cls_name = "{}_{}".format(parent.__name__, "BF16OP") + TestMaxPool3dBF16.__name__ = cls_name + globals()[cls_name] = TestMaxPool3dBF16 + + 
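# Editor's note (illustrative sketch, not part of the patch): the bf16 pooling
# cases above feed user_defined_grads computed numerically from the float32
# copy kept in self.inputs_fp32, presumably because finite differences on
# uint16-packed bfloat16 data would be too coarse to check against. A minimal
# standalone version of that central-difference idea (plain NumPy; the
# framework-aware equivalent is eager_op_test.get_numeric_gradient):

import numpy as np


def numeric_grad(f, x, eps=1e-3):
    """Central-difference gradient of a scalar-valued f with respect to x."""
    x = np.array(x, dtype=np.float32)        # work on a float32 copy
    grad = np.zeros_like(x)
    flat_x, flat_g = x.reshape(-1), grad.reshape(-1)
    for i in range(flat_x.size):
        orig = flat_x[i]
        flat_x[i] = orig + eps
        f_plus = f(x)
        flat_x[i] = orig - eps
        f_minus = f(x)
        flat_x[i] = orig                     # restore before moving on
        flat_g[i] = (f_plus - f_minus) / (2.0 * eps)
    return grad


# e.g. numeric_grad(lambda v: float((v ** 2).sum()), np.ones((2, 3))) is close
# to 2 * x; the bf16 tests pass such fp32 gradients in as user_defined_grads.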
+create_test_bf16_class(TestMaxPoolWithIndex_Op) +create_test_bf16_class(TestCase1) +create_test_bf16_class(TestCase2) +create_test_bf16_class(TestCase3) +create_test_bf16_class(TestCastAdaptive3d) + + +# ----------------max_pool2d_with_index---------------- def max_pool2d_with_index_wapper( x, kernel_size=[], @@ -279,9 +391,82 @@ def init_adaptive(self): self.adaptive = True -class TestCastAdaptive3d(TestMaxPoolWithIndex_Op): - def init_adaptive(self): - self.adaptive = True +# ----------------max_pool2d_with_index_fp16---------------- +def create_test_fp16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + ) + class TestMaxPool2dFP16(parent): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_grad_with_place(place, {'X'}, ['Out']) + + cls_name = "{}_{}".format(parent.__name__, "FP16OP") + TestMaxPool2dFP16.__name__ = cls_name + globals()[cls_name] = TestMaxPool2dFP16 + + +create_test_fp16_class(TestCase4) +create_test_fp16_class(TestCase5) +create_test_fp16_class(TestCase6) +create_test_fp16_class(TestCase7) +create_test_fp16_class(TestCastAdaptive2d) + + +# ----------------max_pool2d_with_index_bf16---------------- +def create_test_bf16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestMaxPool2dBF16(parent): + def init_dtype(self): + self.dtype = np.uint16 + + def get_numeric_grad(self, place, check_name): + scope = core.Scope() + self._check_grad_helper() + op = create_op( + scope, self.op_type, self.inputs, self.outputs, self.attrs + ) + return get_numeric_gradient( + place, scope, op, self.inputs_fp32, check_name, ['Out'] + ) + + def test_check_output(self): + place = core.CUDAPlace(0) + if core.is_bfloat16_supported(place): + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'X') + if core.is_bfloat16_supported(place): + self.check_grad_with_place( + place, {'X'}, ['Out'], user_defined_grads=[numeric_grads] + ) + + cls_name = "{}_{}".format(parent.__name__, "BF16OP") + TestMaxPool2dBF16.__name__ = cls_name + globals()[cls_name] = TestMaxPool2dBF16 + + +create_test_bf16_class(TestCase4) +create_test_bf16_class(TestCase5) +create_test_bf16_class(TestCase6) +create_test_bf16_class(TestCase7) +create_test_bf16_class(TestCastAdaptive2d) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py index a9d00c42219af..8588ed498d703 100644 --- a/python/paddle/fluid/tests/unittests/test_prelu_op.py +++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest, skip_check_grad_ci +from eager_op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci import paddle import paddle.nn.functional as F @@ -174,7 +174,11 @@ def setUp(self): self.op_type = "prelu" self.python_api = prelu_api_wrapper - x_np = np.random.uniform(-1, 1, self.x_shape).astype(self.dtype) + if self.dtype == np.uint16: + as_type = self.np_dtype + else: + as_type = self.dtype 
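# Editor's note (illustrative, not part of the patch): np.uint16 is only the
# storage dtype for bfloat16 in these tests, so random samples cannot be drawn
# in that dtype directly -- they would collapse to integers. The data is
# therefore generated in the float32 "compute" dtype (self.np_dtype) and packed
# to uint16 later by the bf16 subclass. For instance:
#
#     import numpy as np
#
#     np.random.uniform(0.1, 1.0, 3).astype(np.uint16)   # collapses to [0, 0, 0]
#     np.random.uniform(0.1, 1.0, 3).astype(np.float32)  # keeps the random values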
+ x_np = np.random.uniform(-1, 1, self.x_shape).astype(as_type) # Since zero point in prelu is not differentiable, avoid randomize # zero. x_np[np.abs(x_np) < 0.005] = 0.02 @@ -190,7 +194,7 @@ def setUp(self): alpha_np = np.random.uniform(-1, -0.5, [1, 1, 1, self.x_shape[-1]]) else: alpha_np = np.random.uniform(-1, -0.5, [1] + self.x_shape[1:]) - alpha_np = alpha_np.astype(self.dtype) + alpha_np = alpha_np.astype(as_type) self.inputs = {'X': x_np, 'Alpha': alpha_np} @@ -393,18 +397,48 @@ def test_check_output(self): def test_check_grad(self): place = core.CUDAPlace(0) if core.is_float16_supported(place) and check_grad: - self.check_grad_with_place( - place, - ['X', 'Alpha'], - 'Out', - max_relative_error=max_relative_error, - ) + # Use the default max_relative_error, not use max_relative_error + self.check_grad_with_place(place, ['X', 'Alpha'], 'Out') cls_name = "{}_{}".format(parent.__name__, "Fp16Op") TestPReluFp16Case.__name__ = cls_name globals()[cls_name] = TestPReluFp16Case +def create_test_bf16_class( + parent, check_grad=True, atol=1e-3, max_relative_error=0.05 +): + @unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the bfloat16", + ) + class TestPReluBF16Op(parent): + def setUp(self): + super().setUp() + self.inputs['X'] = convert_float_to_uint16(self.inputs['X']) + self.inputs['Alpha'] = convert_float_to_uint16(self.inputs['Alpha']) + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + + def init_dtype(self): + self.dtype = np.uint16 + self.np_dtype = np.float32 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=atol) + + def test_check_grad(self): + place = core.CUDAPlace(0) + if check_grad: + # Use the default max_relative_error, not use max_relative_error + self.check_grad_with_place(place, ['X', 'Alpha'], 'Out') + + cls_name = "{}_{}".format(parent.__name__, "BF16Op") + TestPReluBF16Op.__name__ = cls_name + globals()[cls_name] = TestPReluBF16Op + + create_test_fp16_class(TestModeElt) create_test_fp16_class(TestModeAllRank3) create_test_fp16_class(TestModeAllRank6) @@ -420,6 +454,21 @@ def test_check_grad(self): create_test_fp16_class(TestModeElementRank3NHWC) create_test_fp16_class(TestModeElementRank6NHWC) +create_test_bf16_class(TestModeElt) +create_test_bf16_class(TestModeAllRank3) +create_test_bf16_class(TestModeAllRank6) +create_test_bf16_class(TestModeChannelRank3) +create_test_bf16_class(TestModeChannelRank6) +create_test_bf16_class(TestModeElementRank3) +create_test_bf16_class(TestModeElementRank6) +create_test_bf16_class(TestModeEltNHWC) +create_test_bf16_class(TestModeAllRank3NHWC) +create_test_bf16_class(TestModeAllRank6NHWC) +create_test_bf16_class(TestModeChannelRank3NHWC) +create_test_bf16_class(TestModeChannelRank6NHWC) +create_test_bf16_class(TestModeElementRank3NHWC) +create_test_bf16_class(TestModeElementRank6NHWC) + def prelu_t(x, mode, param_attr=None, name=None, data_format='NCHW'): helper = fluid.layer_helper.LayerHelper('prelu', **locals()) diff --git a/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py b/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py index 75be4531820fb..a2085cc416a19 100644 --- a/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py +++ b/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, 
convert_float_to_uint16 import paddle from paddle.framework import core @@ -28,19 +28,18 @@ class TestPutAlongAxisOp(OpTest): def setUp(self): self.init_data() self.reduce_op = "assign" - self.dtype = 'float64' self.op_type = "put_along_axis" self.python_api = paddle.tensor.put_along_axis self.xnp = np.random.random(self.x_shape).astype(self.x_type) - # numpy put_along_axis is an inplace opearion. + # numpy put_along_axis is an inplace operation. self.xnp_result = copy.deepcopy(self.xnp) np.put_along_axis(self.xnp_result, self.index, self.value, self.axis) self.target = self.xnp_result broadcast_shape_list = list(self.x_shape) broadcast_shape_list[self.axis] = 1 - self.braodcast_shape = tuple(broadcast_shape_list) - self.index_broadcast = np.broadcast_to(self.index, self.braodcast_shape) - self.value_broadcast = np.broadcast_to(self.value, self.braodcast_shape) + self.broadcast_shape = tuple(broadcast_shape_list) + self.index_broadcast = np.broadcast_to(self.index, self.broadcast_shape) + self.value_broadcast = np.broadcast_to(self.value, self.broadcast_shape) self.inputs = { 'Input': self.xnp, 'Index': self.index_broadcast, @@ -56,6 +55,7 @@ def test_check_grad(self): self.check_grad(["Input", "Value"], "Result") def init_data(self): + self.dtype = 'float64' self.x_type = "float64" self.x_shape = (10, 10, 10) self.value_type = "float64" @@ -66,6 +66,71 @@ def init_data(self): self.axis_type = "int64" +class TestPutAlongAxisFP16Op(TestPutAlongAxisOp): + def init_data(self): + self.dtype = np.float16 + self.x_type = "float16" + self.x_shape = (10, 10, 10) + self.value_type = "float16" + self.value = np.array([99]).astype(self.value_type) + self.index_type = "int32" + self.index = np.array([[[0]]]).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the bfloat16", +) +class TestPutAlongAxisBF16Op(OpTest): + def setUp(self): + self.init_data() + self.reduce_op = "assign" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. 
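# Editor's note (illustrative, not part of the patch): np.put_along_axis really
# does write into its first argument, so the deep copy below is what keeps the
# untouched self.xnp available as the op input while the copy becomes the
# expected result. For example:
#
#     import copy
#     import numpy as np
#
#     arr = np.zeros((2, 3), dtype=np.float32)
#     ref = copy.deepcopy(arr)                  # expected result, safe to mutate
#     np.put_along_axis(ref, np.array([[1], [2]]), 9.0, axis=1)
#     assert arr.sum() == 0.0 and ref[0, 1] == 9.0 and ref[1, 2] == 9.0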
+ self.xnp_result = copy.deepcopy(self.xnp) + np.put_along_axis(self.xnp_result, self.index, self.value, self.axis) + self.target = self.xnp_result + broadcast_shape_list = list(self.x_shape) + broadcast_shape_list[self.axis] = 1 + self.broadcast_shape = tuple(broadcast_shape_list) + self.index_broadcast = np.broadcast_to(self.index, self.broadcast_shape) + self.value_broadcast = np.broadcast_to(self.value, self.broadcast_shape) + self.inputs = { + 'Input': self.xnp, + 'Index': self.index_broadcast, + 'Value': self.value_broadcast, + } + self.attrs = {'Axis': self.axis, 'Reduce': self.reduce_op} + self.outputs = {'Result': self.target} + + self.inputs['Input'] = convert_float_to_uint16(self.inputs['Input']) + self.inputs['Value'] = convert_float_to_uint16(self.inputs['Value']) + self.outputs['Result'] = convert_float_to_uint16(self.outputs['Result']) + self.place = core.CUDAPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["Input", "Value"], "Result") + + def init_data(self): + self.dtype = np.uint16 + self.x_type = "float32" + self.x_shape = (10, 10, 10) + self.value_type = "float32" + self.value = np.array([99]).astype(self.value_type) + self.index_type = "int32" + self.index = np.array([[[0]]]).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + class TestPutAlongAxisAPI(unittest.TestCase): def setUp(self): np.random.seed(0) diff --git a/python/paddle/fluid/tests/unittests/test_randperm_op.py b/python/paddle/fluid/tests/unittests/test_randperm_op.py index eaecf087f9f04..5df2873b93ec4 100644 --- a/python/paddle/fluid/tests/unittests/test_randperm_op.py +++ b/python/paddle/fluid/tests/unittests/test_randperm_op.py @@ -15,7 +15,11 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import ( + OpTest, + convert_float_to_uint16, + convert_uint16_to_float, +) import paddle from paddle.fluid import core @@ -40,12 +44,21 @@ def error_msg(data_np): def convert_dtype(dtype_str): - dtype_str_list = ["int32", "int64", "float32", "float64"] + dtype_str_list = [ + "int32", + "int64", + "float16", + "float32", + "float64", + "uint16", + ] dtype_num_list = [ core.VarDesc.VarType.INT32, core.VarDesc.VarType.INT64, + core.VarDesc.VarType.FP16, core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP64, + core.VarDesc.VarType.BF16, ] assert dtype_str in dtype_str_list, ( dtype_str + " should in " + str(dtype_str_list) @@ -62,9 +75,9 @@ def setUp(self): self.n = 200 self.dtype = "int64" + self.init_attrs() self.inputs = {} self.outputs = {"Out": np.zeros(self.n).astype(self.dtype)} - self.init_attrs() self.attrs = { "n": self.n, "dtype": convert_dtype(self.dtype), @@ -103,6 +116,47 @@ def init_attrs(self): self.dtype = "float64" +class TestRandpermFP16Op(TestRandpermOp): + def init_attrs(self): + self.dtype = "float16" + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestRandpermBF16Op(OpTest): + def setUp(self): + self.op_type = "randperm" + self.python_api = paddle.randperm + self.n = 200 + + self.init_attrs() + self.inputs = {} + self.outputs = {"Out": np.zeros(self.n).astype(self.np_dtype)} + self.attrs = { + "n": self.n, + "dtype": convert_dtype(self.dtype), + } + + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + self.place = core.CUDAPlace(0) + + def init_attrs(self): + self.dtype = 
"uint16" + self.np_dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place_customized(self.verify_output, self.place) + + def verify_output(self, outs): + out_np = convert_uint16_to_float(np.array(outs[0])) + self.assertTrue( + check_randperm_out(self.n, out_np), msg=error_msg(out_np) + ) + + class TestRandpermOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index 050879369244d..631b760a7b8da 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -418,6 +418,51 @@ def test_check_output(self): self.check_output() +@skip_check_grad_ci( + reason="reduce_min is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework." +) +class TestMinFP16Op(OpTest): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.python_api = paddle.min + self.public_python_api = paddle.min + self.init_dtype() + if self.dtype == np.uint16: + x = np.random.random((5, 6, 10)).astype(np.float32) + self.inputs = {'X': convert_float_to_uint16(x)} + else: + x = np.random.random((5, 6, 10)).astype(self.dtype) + self.inputs = {'X': x} + self.attrs = {'dim': [2], 'keep_dim': True} + out = x.min(axis=tuple(self.attrs['dim']), keepdims=True) + if self.dtype == np.uint16: + self.outputs = {'Out': convert_float_to_uint16(out)} + else: + self.outputs = {'Out': out} + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output() + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestMinBF16Op(TestMinFP16Op): + def init_dtype(self): + self.dtype = np.uint16 + + def test_check_output(self): + self.check_output_with_place(core.CUDAPlace(0)) + + def raw_reduce_prod(x, dim=[0], keep_dim=False): return paddle.prod(x, dim, keep_dim) diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py index 3149ca82b3f62..29446bafbbfeb 100644 --- a/python/paddle/fluid/tests/unittests/test_split_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_op.py @@ -65,7 +65,7 @@ def test_check_grad(self): # test with attr(num) -class TestSplitOp_2(OpTest): +class TestSplitWithNumOp(OpTest): def setUp(self): self.python_api = paddle.split self.public_python_api = paddle.split @@ -74,18 +74,32 @@ def setUp(self): self.prim_op_type = "prim" self.dtype = self.get_dtype() self.init_data() - self.inputs = {'X': self.x} self.attrs = { 'axis': self.axis, 'sections': self.sections, 'num': self.num, } - - out = np.split(self.x, self.indices_or_sections, self.axis) - self.outputs = {'Out': [('out%d' % i, out[i]) for i in range(len(out))]} + if self.dtype == np.uint16: + self.inputs = {'X': convert_float_to_uint16(self.x)} + out = np.split(self.x, self.indices_or_sections, self.axis) + self.outputs = { + 'Out': [ + ('out%d' % i, convert_float_to_uint16(out[i])) + for i in range(len(out)) + ] + } + else: + self.inputs = {'X': self.x} + out = np.split(self.x, self.indices_or_sections, self.axis) + self.outputs = { + 'Out': [('out%d' % i, out[i]) for i in range(len(out))] + } def init_data(self): - self.x = np.random.random((4, 5, 
6)).astype(self.dtype) + if self.dtype == np.uint16: + self.x = np.random.random((4, 5, 6)).astype(np.float32) + else: + self.x = np.random.random((4, 5, 6)).astype(self.dtype) self.axis = 2 self.sections = [] self.num = 3 @@ -240,28 +254,28 @@ def create_test_fp16(parent): @unittest.skipIf( not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ) - class TestSplitFp16(parent): + class TestSplitFP16Op(parent): def get_dtype(self): return np.float16 - def test_check_grad(self): - pass - - cls_name = "{}_{}".format(parent.__name__, "Fp16") - TestSplitFp16.__name__ = cls_name - globals()[cls_name] = TestSplitFp16 + cls_name = "{}_{}".format(parent.__name__, "FP16Op") + TestSplitFP16Op.__name__ = cls_name + globals()[cls_name] = TestSplitFP16Op create_test_fp16(TestSplitOp) +create_test_fp16(TestSplitWithNumOp) # ----------------Split Bf16---------------- def create_test_bf16(parent): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", ) - class TestSplitBf16(parent): + class TestSplitBF16Op(parent): def get_dtype(self): return np.uint16 @@ -270,14 +284,16 @@ def test_check_output(self): self.check_output_with_place(place) def test_check_grad(self): - pass + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X'], 'out2') - cls_name = "{}_{}".format(parent.__name__, "Bf16") - TestSplitBf16.__name__ = cls_name - globals()[cls_name] = TestSplitBf16 + cls_name = "{}_{}".format(parent.__name__, "BF16Op") + TestSplitBF16Op.__name__ = cls_name + globals()[cls_name] = TestSplitBF16Op create_test_bf16(TestSplitOp) +create_test_bf16(TestSplitWithNumOp) class TestSplitAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py b/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py index b5a9c2169ff96..4180a7c4dc758 100644 --- a/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py +++ b/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle.framework import core @@ -32,8 +32,8 @@ def setUp(self): self.target = np.take_along_axis(self.xnp, self.index, self.axis) broadcast_shape_list = list(self.x_shape) broadcast_shape_list[self.axis] = 1 - self.braodcast_shape = tuple(broadcast_shape_list) - self.index_broadcast = np.broadcast_to(self.index, self.braodcast_shape) + self.broadcast_shape = tuple(broadcast_shape_list) + self.index_broadcast = np.broadcast_to(self.index, self.broadcast_shape) self.inputs = { 'Input': self.xnp, 'Index': self.index_broadcast, @@ -58,6 +58,64 @@ def init_data(self): self.axis_type = "int64" +class TestTakeAlongAxisFP16Op(TestTakeAlongAxisOp): + def init_data(self): + self.dtype = np.float16 + self.x_type = "float16" + self.x_shape = (5, 5, 5) + self.index_type = "int32" + self.index = np.array([[[1]], [[1]], [[2]], [[4]], [[3]]]).astype( + self.index_type + ) + self.axis = 2 + self.axis_type = "int64" + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the bfloat16", +) +class TestTakeAlongAxisBF16Op(OpTest): + def setUp(self): + self.init_data() + self.op_type = "take_along_axis" + self.python_api = 
paddle.tensor.take_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + self.target = np.take_along_axis(self.xnp, self.index, self.axis) + broadcast_shape_list = list(self.x_shape) + broadcast_shape_list[self.axis] = 1 + self.broadcast_shape = tuple(broadcast_shape_list) + self.index_broadcast = np.broadcast_to(self.index, self.broadcast_shape) + self.inputs = { + 'Input': self.xnp, + 'Index': self.index_broadcast, + } + self.attrs = {'Axis': self.axis} + self.outputs = {'Result': self.target} + + self.inputs['Input'] = convert_float_to_uint16(self.inputs['Input']) + self.outputs['Result'] = convert_float_to_uint16(self.outputs['Result']) + self.place = core.CUDAPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['Input'], 'Result') + + def init_data(self): + self.dtype = np.uint16 + self.x_type = "float32" + self.x_shape = (5, 5, 5) + self.index_type = "int32" + self.index = np.array([[[1]], [[1]], [[2]], [[4]], [[3]]]).astype( + self.index_type + ) + self.axis = 2 + self.axis_type = "int64" + + class TestCase1(TestTakeAlongAxisOp): def init_data(self): self.x_type = "float64" diff --git a/python/paddle/fluid/tests/unittests/test_trace_op.py b/python/paddle/fluid/tests/unittests/test_trace_op.py index e910658721628..f13dd1701affe 100644 --- a/python/paddle/fluid/tests/unittests/test_trace_op.py +++ b/python/paddle/fluid/tests/unittests/test_trace_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle import fluid, tensor @@ -68,6 +68,82 @@ def init_config(self): ) +class TestTraceFP16Op1(TestTraceOp): + def init_config(self): + self.dtype = np.float16 + self.case = np.random.randn(20, 6).astype(self.dtype) + self.inputs = {'Input': self.case} + self.attrs = {'offset': 0, 'axis1': 0, 'axis2': 1} + self.target = np.trace(self.inputs['Input']) + + +class TestTraceFP16Op2(TestTraceOp): + def init_config(self): + self.dtype = np.float16 + self.case = np.random.randn(2, 20, 2, 3).astype(self.dtype) + self.inputs = {'Input': self.case} + self.attrs = {'offset': -5, 'axis1': 1, 'axis2': -1} + self.target = np.trace( + self.inputs['Input'], + offset=self.attrs['offset'], + axis1=self.attrs['axis1'], + axis2=self.attrs['axis2'], + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestTraceBF16Op1(OpTest): + def setUp(self): + self.op_type = "trace" + self.python_api = paddle.trace + self.init_config() + self.outputs = {'Out': self.target} + + self.inputs['Input'] = convert_float_to_uint16(self.inputs['Input']) + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + self.place = core.CUDAPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', numeric_grad_delta=0.02 + ) + + def init_config(self): + self.dtype = np.uint16 + self.np_dtype = np.float32 + self.case = np.random.randn(20, 6).astype(self.np_dtype) + self.inputs = {'Input': self.case} + self.attrs = {'offset': 0, 'axis1': 0, 'axis2': 1} + self.target = np.trace(self.inputs['Input']) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with 
CUDA or not support bfloat16",
+)
+class TestTraceBF16Op2(TestTraceBF16Op1):
+    def init_config(self):
+        self.dtype = np.uint16
+        self.np_dtype = np.float32
+        self.case = np.random.randn(2, 20, 2, 3).astype(self.np_dtype)
+        self.inputs = {'Input': self.case}
+        self.attrs = {'offset': -5, 'axis1': 1, 'axis2': -1}
+        self.target = np.trace(
+            self.inputs['Input'],
+            offset=self.attrs['offset'],
+            axis1=self.attrs['axis1'],
+            axis2=self.attrs['axis2'],
+        )
+
+
 class TestTraceAPICase(unittest.TestCase):
     def test_case1(self):
         case = np.random.randn(2, 20, 2, 3).astype('float32')
diff --git a/python/paddle/fluid/tests/unittests/test_unfold_op.py b/python/paddle/fluid/tests/unittests/test_unfold_op.py
index 1e7a91ef960d1..2ed5bf3f82c85 100644
--- a/python/paddle/fluid/tests/unittests/test_unfold_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unfold_op.py
@@ -15,7 +15,7 @@
 import unittest

 import numpy as np
-from eager_op_test import OpTest
+from eager_op_test import OpTest, convert_float_to_uint16

 import paddle
 from paddle import fluid
@@ -42,7 +42,11 @@ def init_data(self):
             self.input_height,
             self.input_width,
         ]
-        self.x = np.random.rand(*input_shape).astype(np.float64)
+        if self.dtype == np.uint16:
+            as_type = self.np_dtype
+        else:
+            as_type = self.dtype
+        self.x = np.random.rand(*input_shape).astype(as_type)

     def calc_unfold(self):
         output_shape = [0] * 3
@@ -77,7 +81,11 @@ def calc_unfold(self):
             + 1
         )
         output_shape[2] = out_height * out_width
-        output = np.zeros(output_shape).astype(np.float64)
+        if self.dtype == np.uint16:
+            as_type = self.np_dtype
+        else:
+            as_type = self.dtype
+        output = np.zeros(output_shape).astype(as_type)
         # ------------ calculate output -------------- #
         for i in range(output_shape[0]):
             for j in range(output_shape[1]):
@@ -123,9 +131,13 @@ def set_data(self):

     def setUp(self):
         self.op_type = 'unfold'
+        self.init_dtype()
         self.python_api = paddle.nn.functional.unfold
         self.set_data()

+    def init_dtype(self):
+        self.dtype = np.float64
+
     def test_check_output(self):
         self.check_output()

@@ -133,6 +145,55 @@ def test_check_grad(self):
         self.check_grad(['X'], 'Y')


+class TestUnfoldFP16Op(TestUnfoldOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or not support bfloat16",
+)
+class TestUnfoldBF16Op(TestUnfoldOp):
+    # Note: this test is time-consuming and may time out, so the parameters are reduced to shorten the run time
+    def init_data(self):
+        self.batch_size = 3
+        self.input_channels = 3
+        self.input_height = 5
+        self.input_width = 5
+        self.kernel_sizes = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 1, 1, 1]
+        self.dilations = [1, 1]
+        input_shape = [
+            self.batch_size,
+            self.input_channels,
+            self.input_height,
+            self.input_width,
+        ]
+        self.x = np.random.rand(*input_shape).astype(self.np_dtype)
+
+    def init_dtype(self):
+        self.dtype = np.uint16
+        self.np_dtype = np.float32
+
+    def setUp(self):
+        self.op_type = 'unfold'
+        self.init_dtype()
+        self.python_api = paddle.nn.functional.unfold
+        self.set_data()
+        self.inputs['X'] = convert_float_to_uint16(self.inputs['X'])
+        self.outputs['Y'] = convert_float_to_uint16(self.outputs['Y'])
+        self.place = core.CUDAPlace(0)
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad(self):
+        self.check_grad_with_place(self.place, ['X'], 'Y')
+
+
 class TestUnfoldAPI(TestUnfoldOp):
     """
     This is for test on paddle.nn.Unfold
diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_inplace_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_inplace_op.py
index a704ae9a4ee21..c4a243f47897b 100644
--- a/python/paddle/fluid/tests/unittests/test_uniform_random_inplace_op.py
+++ b/python/paddle/fluid/tests/unittests/test_uniform_random_inplace_op.py
@@ -15,9 +15,19 @@
 import unittest

 import numpy as np
+from eager_op_test import OpTest, convert_uint16_to_float

 import paddle
 from paddle import fluid
+from paddle.fluid import core
+
+
+def output_hist(out):
+    hist, _ = np.histogram(out, range=(-1, 1))
+    hist = hist.astype("float32")
+    hist /= float(out.size)
+    prob = 0.1 * np.ones(10)
+    return hist, prob


 class TestUniformRandomInplaceOpDtype(unittest.TestCase):
@@ -44,6 +54,72 @@ def test_fp64():
         test_fp64()


+class TestUniformRandomInplaceFP16Op(OpTest):
+    def setUp(self):
+        self.op_type = "uniform_random_inplace"
+        self.dtype = np.float16
+        self.shape = (1000, 784)
+        x = np.random.random(self.shape).astype(self.dtype)
+        y = np.random.random(self.shape).astype(self.dtype)
+        self.inputs = {"X": x}
+        self.outputs = {"Out": y}
+        self.init_attrs()
+
+    def init_attrs(self):
+        self.output_hist = output_hist
+
+    def test_check_output(self):
+        self.check_output_customized(self.verify_output)
+
+    def verify_output(self, outs):
+        hist, prob = self.output_hist(np.array(outs[0]))
+        np.testing.assert_allclose(hist, prob, rtol=0, atol=0.001)
+
+    # TODO: check_dygraph is False because self.python_api = paddle.uniform_random_inplace is not set; enable the dygraph check once it is
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', check_dygraph=False)
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or not support bfloat16",
+)
+class TestUniformRandomInplaceBF16Op(OpTest):
+    def setUp(self):
+        self.op_type = "uniform_random_inplace"
+        self.dtype = np.uint16
+        self.shape = (1000, 784)
+        x = np.random.random(self.shape).astype(self.dtype)
+        y = np.random.random(self.shape).astype(self.dtype)
+        self.inputs = {'X': x}
+        self.outputs = {'Out': y}
+        self.init_attrs()
+        self.place = core.CUDAPlace(0)
+
+    def init_attrs(self):
+        self.output_hist = output_hist
+
+    def test_check_output(self):
+        self.check_output_with_place_customized(self.verify_output, self.place)
+
+    def verify_output(self, outs):
+        result = convert_uint16_to_float(np.array(outs[0]))
+        hist, prob = self.output_hist(result)
+        np.testing.assert_allclose(hist, prob, rtol=0, atol=0.002)
+
+    # TODO: check_dygraph is False because self.python_api = paddle.uniform_random_inplace is not set; enable the dygraph check once it is
+    def test_check_grad(self):
+        grads = [paddle.zeros(self.shape, dtype=self.dtype)]
+        self.check_grad_with_place(
+            self.place,
+            ['X'],
+            'Out',
+            check_dygraph=False,
+            user_defined_grads=grads,
+        )
+
+
 class TestUniformRandomInplaceOpIsInplace(unittest.TestCase):
     def setUp(self):
         self.shape = (1000, 784)
diff --git a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py
index 5ed9d0c85915d..6060236c5c5c9 100644
--- a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py
+++ b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py
@@ -15,10 +15,11 @@
 import unittest

 import numpy as np
-from eager_op_test import OpTest, paddle_static_guard
+from eager_op_test import OpTest, convert_float_to_uint16, paddle_static_guard

 import paddle
 from paddle import fluid
+from paddle.fluid import core
 from paddle.static.amp import amp_nn


@@ -81,8 +82,10 @@ def setUp(self):
     def init(self):
         self.incr_ratio = 2.0
         self.decr_ratio = 0.8
-        self.dtype = np.float32
-        self.prev_loss_scaling = np.array([2048]).astype(self.dtype)
+        self.init_dtype()
+        self.prev_loss_scaling = np.array([2048]).astype(
+            self.loss_scaling_dtype
+        )
         self.num_good_steps = np.array([999], dtype=np.int32)
         self.num_bad_steps = np.array([1], dtype=np.int32)
         self.zero_steps = np.array([0], dtype=np.int32)
@@ -94,10 +97,77 @@ def init(self):
             'decr_ratio': self.decr_ratio,
         }

+    def init_dtype(self):
+        self.dtype = np.float32
+        self.loss_scaling_dtype = np.float32
+
     def test_check_output(self):
         self.check_output(no_check_set=['Out'])


+class TestUpdateLossScalingFP16Op(TestUpdateLossScalingOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+        self.loss_scaling_dtype = np.float32
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or not support bfloat16",
+)
+class TestUpdateLossScalingBF16Op(OpTest):
+    def init(self):
+        self.incr_ratio = 2.0
+        self.decr_ratio = 0.8
+        self.dtype = np.uint16
+        self.np_dtype = "float32"
+        self.prev_loss_scaling = np.array([2048]).astype(self.np_dtype)
+        self.num_good_steps = np.array([999], dtype=np.int32)
+        self.num_bad_steps = np.array([1], dtype=np.int32)
+        self.zero_steps = np.array([0], dtype=np.int32)
+        self.stop_update = np.array([False], dtype=np.bool_)
+        self.attrs = {
+            'incr_every_n_steps': 1000,
+            'decr_every_n_nan_or_inf': 2,
+            'incr_ratio': self.incr_ratio,
+            'decr_ratio': self.decr_ratio,
+        }
+
+    def setUp(self):
+        self.op_type = "update_loss_scaling"
+        self.init()
+        self.python_api = update_loss_scaling_wrapper
+        self.python_out_sig = [
+            "out0",
+            "LossScaling",
+            "OutGoodSteps",
+            "OutBadSteps",
+        ]
+        found_inf = np.array([False], dtype=np.bool_)
+        x = np.random.random((1024, 1024)).astype(self.np_dtype)
+
+        self.inputs = {
+            'X': [('x0', convert_float_to_uint16(x))],
+            'FoundInfinite': found_inf,
+            # do not convert
+            'PrevLossScaling': self.prev_loss_scaling,
+            'InGoodSteps': self.num_good_steps,
+            'InBadSteps': self.num_bad_steps,
+        }
+
+        self.outputs = {
+            'Out': [('out0', convert_float_to_uint16(x))],
+            # do not convert
+            'LossScaling': self.prev_loss_scaling * self.incr_ratio,
+            'OutGoodSteps': self.zero_steps,
+            'OutBadSteps': self.zero_steps,
+        }
+
+    def test_check_output(self):
+        self.check_output_with_place(core.CUDAPlace(0), no_check_set=['Out'])
+
+
 class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp):
     def setUp(self):
         self.op_type = "update_loss_scaling"
diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py
index a6e70a5d08ec2..1952ea0514b50 100644
--- a/python/paddle/nn/clip.py
+++ b/python/paddle/nn/clip.py
@@ -63,7 +63,9 @@ def clip_by_norm(x, max_norm, name=None):
         return _legacy_C_ops.clip_by_norm(x, 'max_norm', max_norm)

     helper = LayerHelper("clip_by_norm", **locals())
-    check_variable_and_dtype(x, 'X', ['float32', 'float16'], 'clip_by_norm')
+    check_variable_and_dtype(
+        x, 'X', ['float16', 'float32', 'uint16'], 'clip_by_norm'
+    )
     check_type(max_norm, 'max_norm', (float), 'clip_by_norm')

     if name is None:
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index 8fd603f4379f0..d89ce9cb4537a 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -538,10 +538,13 @@ def prelu(x, weight, data_format="NCHW", name=None):
         return _C_ops.prelu(x, weight, data_format, mode)
     else:
         check_variable_and_dtype(
-            x, 'x', ['float16', 'float32', 'float64'], 'prelu'
+            x, 'x', ['float16', 'float32', 'float64', 'uint16'], 'prelu'
         )
         check_variable_and_dtype(
-            weight, 'weight', ['float16', 'float32', 'float64'], 'prelu'
+            weight,
+            'weight',
+            ['float16', 'float32', 'float64', 'uint16'],
+            'prelu',
         )
         helper = LayerHelper('prelu', **locals())
         out = helper.create_variable_for_type_inference(x.dtype)
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index dacbeb7f30857..e36af4a8eae53 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -2498,7 +2498,7 @@ def multi_dot(x, name=None):
         check_variable_and_dtype(
             item,
             'x[' + str(id) + ']',
-            ['float16', 'float32', 'float64'],
+            ['float16', 'float32', 'float64', 'uint16'],
             'multi_dot',
         )
         if item.dtype != x[0].dtype:
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index fee880f5bf695..c2b4ec7126388 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -1240,7 +1240,15 @@ def broadcast_tensors(input, name=None):
             check_variable_and_dtype(
                 x,
                 'input[' + str(id) + ']',
-                ['bool', 'float32', 'float64', 'int32', 'int64'],
+                [
+                    'bool',
+                    'float16',
+                    'float32',
+                    'float64',
+                    'int32',
+                    'int64',
+                    'uint16',
+                ],
                 'broadcast_tensors',
             )
             if x.dtype != input[0].dtype:
@@ -1977,6 +1985,7 @@ def split(x, num_or_sections, axis=0, name=None):
             'int32',
             'int64',
             'uint8',
+            'uint16',
             'int8',
         ],
         'split',
@@ -4559,7 +4568,15 @@ def take_along_axis(arr, indices, axis):
     check_variable_and_dtype(
         arr,
         'x',
-        ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'],
+        [
+            'float16',
+            'float32',
+            'float64',
+            'int32',
+            'int64',
+            'uint8',
+            'uint16',
+        ],
         'take_along_axis',
     )
     check_variable_and_dtype(
@@ -4631,7 +4648,15 @@ def put_along_axis(arr, indices, values, axis, reduce='assign'):
     check_variable_and_dtype(
         arr,
         'x',
-        ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'],
+        [
+            'float16',
+            'float32',
+            'float64',
+            'int32',
+            'int64',
+            'uint8',
+            'uint16',
+        ],
         'put_along_axis',
     )
     check_variable_and_dtype(
@@ -4713,7 +4738,7 @@ def index_add(x, index, axis, value, name=None):
     check_variable_and_dtype(
         x,
         'x',
-        ['float16', 'float32', 'float64', 'int32', 'int64'],
+        ['float16', 'float32', 'float64', 'int32', 'int64', 'uint16'],
         'paddle.tensor.manipulation.index_add',
     )
     check_variable_and_dtype(
@@ -4725,7 +4750,7 @@ def index_add(x, index, axis, value, name=None):
     check_variable_and_dtype(
         value,
         'add_value',
-        ['float16', 'float32', 'float64', 'int32', 'int64'],
+        ['float16', 'float32', 'float64', 'int32', 'int64', 'uint16'],
         'paddle.tensor.manipulation.index_add',
     )

diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index efa8cbfc54e02..bd117715bed49 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -1958,10 +1958,14 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None):

     helper = LayerHelper("addmm", **locals())
     check_variable_and_dtype(
-        input, 'Input', ['float32', 'float64'], 'addmm'
+        input, 'Input', ['float16', 'float32', 'float64', 'uint16'], 'addmm'
+    )
+    check_variable_and_dtype(
+        x, 'X', ['float16', 'float32', 'float64', 'uint16'], 'addmm'
+    )
+    check_variable_and_dtype(
+        y, 'Y', ['float16', 'float32', 'float64', 'uint16'], 'addmm'
     )
-    check_variable_and_dtype(x, 'X', ['float32', 'float64'], 'addmm')
-    check_variable_and_dtype(y, 'Y', ['float32', 'float64'], 'addmm')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)

     helper.append_op(
@@ -2456,7 +2460,10 @@ def min(x, axis=None, keepdim=False, name=None):

     reduce_all, axis = _get_reduce_axis_with_tensor(axis, x)
     helper = LayerHelper('min', **locals())
     check_variable_and_dtype(
-        x, 'x', ['float32', 'float64', 'int32', 'int64'], 'min'
+        x,
+        'x',
+        ['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'],
+        'min',
     )
     out = helper.create_variable_for_type_inference(dtype=x.dtype)